5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 SSize_t whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
130 implies compiling, so don't emit */
131 regnode emit_dummy; /* placeholder for emit to point to */
132 I32 naughty; /* How bad is this pattern? */
133 I32 sawback; /* Did we see \1, ...? */
135 SSize_t size; /* Code size. */
136 I32 npar; /* Capture buffer count, (OPEN). */
137 I32 cpar; /* Capture buffer count, (CLOSE). */
138 I32 nestroot; /* root parens we are in - used by accept */
141 regnode **open_parens; /* pointers to open parens */
142 regnode **close_parens; /* pointers to close parens */
143 regnode *opend; /* END node in program */
144 I32 utf8; /* whether the pattern is utf8 or not */
145 I32 orig_utf8; /* whether the pattern was originally in utf8 */
146 /* XXX use this for future optimisation of case
147 * where pattern must be upgraded to utf8. */
148 I32 uni_semantics; /* If a d charset modifier should use unicode
149 rules, even if the pattern is not in
151 HV *paren_names; /* Paren names */
153 regnode **recurse; /* Recurse regops */
154 I32 recurse_count; /* Number of recurse regops */
157 I32 override_recoding;
158 I32 in_multi_char_class;
159 struct reg_code_block *code_blocks; /* positions of literal (?{})
161 int num_code_blocks; /* size of code_blocks[] */
162 int code_index; /* next code_blocks[] slot */
164 char *starttry; /* -Dr: where regtry was called. */
165 #define RExC_starttry (pRExC_state->starttry)
167 SV *runtime_code_qr; /* qr with the runtime code blocks */
169 const char *lastparse;
171 AV *paren_name_list; /* idx -> name */
172 #define RExC_lastparse (pRExC_state->lastparse)
173 #define RExC_lastnum (pRExC_state->lastnum)
174 #define RExC_paren_name_list (pRExC_state->paren_name_list)
178 #define RExC_flags (pRExC_state->flags)
179 #define RExC_pm_flags (pRExC_state->pm_flags)
180 #define RExC_precomp (pRExC_state->precomp)
181 #define RExC_rx_sv (pRExC_state->rx_sv)
182 #define RExC_rx (pRExC_state->rx)
183 #define RExC_rxi (pRExC_state->rxi)
184 #define RExC_start (pRExC_state->start)
185 #define RExC_end (pRExC_state->end)
186 #define RExC_parse (pRExC_state->parse)
187 #define RExC_whilem_seen (pRExC_state->whilem_seen)
188 #ifdef RE_TRACK_PATTERN_OFFSETS
189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
191 #define RExC_emit (pRExC_state->emit)
192 #define RExC_emit_dummy (pRExC_state->emit_dummy)
193 #define RExC_emit_start (pRExC_state->emit_start)
194 #define RExC_emit_bound (pRExC_state->emit_bound)
195 #define RExC_naughty (pRExC_state->naughty)
196 #define RExC_sawback (pRExC_state->sawback)
197 #define RExC_seen (pRExC_state->seen)
198 #define RExC_size (pRExC_state->size)
199 #define RExC_npar (pRExC_state->npar)
200 #define RExC_nestroot (pRExC_state->nestroot)
201 #define RExC_extralen (pRExC_state->extralen)
202 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203 #define RExC_utf8 (pRExC_state->utf8)
204 #define RExC_uni_semantics (pRExC_state->uni_semantics)
205 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
206 #define RExC_open_parens (pRExC_state->open_parens)
207 #define RExC_close_parens (pRExC_state->close_parens)
208 #define RExC_opend (pRExC_state->opend)
209 #define RExC_paren_names (pRExC_state->paren_names)
210 #define RExC_recurse (pRExC_state->recurse)
211 #define RExC_recurse_count (pRExC_state->recurse_count)
212 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
213 #define RExC_contains_locale (pRExC_state->contains_locale)
214 #define RExC_override_recoding (pRExC_state->override_recoding)
215 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
218 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
219 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
220 ((*s) == '{' && regcurly(s, FALSE)))
223 #undef SPSTART /* dratted cpp namespace... */
226 * Flags to be passed up and down.
228 #define WORST 0 /* Worst case. */
229 #define HASWIDTH 0x01 /* Known to match non-null strings. */
231 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
232 * character. (There needs to be a case: in the switch statement in regexec.c
233 * for any node marked SIMPLE.) Note that this is not the same thing as
236 #define SPSTART 0x04 /* Starts with * or + */
237 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
238 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
239 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
243 /* whether trie related optimizations are enabled */
244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245 #define TRIE_STUDY_OPT
246 #define FULL_TRIE_STUDY
252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253 #define PBITVAL(paren) (1 << ((paren) & 7))
254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
258 #define REQUIRE_UTF8 STMT_START { \
260 *flagp = RESTART_UTF8; \
265 /* This converts the named class defined in regcomp.h to its equivalent class
266 * number defined in handy.h. */
267 #define namedclass_to_classnum(class) ((int) ((class) / 2))
268 #define classnum_to_namedclass(classnum) ((classnum) * 2)
270 /* About scan_data_t.
272 During optimisation we recurse through the regexp program performing
273 various inplace (keyhole style) optimisations. In addition study_chunk
274 and scan_commit populate this data structure with information about
275 what strings MUST appear in the pattern. We look for the longest
276 string that must appear at a fixed location, and we look for the
277 longest string that may appear at a floating location. So for instance
282 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
283 strings (because they follow a .* construct). study_chunk will identify
284 both FOO and BAR as being the longest fixed and floating strings respectively.
286 The strings can be composites, for instance
290 will result in a composite fixed substring 'foo'.
292 For each string some basic information is maintained:
294 - offset or min_offset
295 This is the position the string must appear at, or not before.
296 It also implicitly (when combined with minlenp) tells us how many
297 characters must match before the string we are searching for.
298 Likewise when combined with minlenp and the length of the string it
299 tells us how many characters must appear after the string we have
303 Only used for floating strings. This is the rightmost point that
304 the string can appear at. If set to SSize_t_MAX it indicates that the
305 string can occur infinitely far to the right.
308 A pointer to the minimum number of characters of the pattern that the
309 string was found inside. This is important as in the case of positive
310 lookahead or positive lookbehind we can have multiple patterns
315 The minimum length of the pattern overall is 3, the minimum length
316 of the lookahead part is 3, but the minimum length of the part that
317 will actually match is 1. So 'FOO's minimum length is 3, but the
318 minimum length for the F is 1. This is important as the minimum length
319 is used to determine offsets in front of and behind the string being
320 looked for. Since strings can be composites this is the length of the
321 pattern at the time it was committed with a scan_commit. Note that
322 the length is calculated by study_chunk, so that the minimum lengths
323 are not known until the full pattern has been compiled, thus the
324 pointer to the value.
328 In the case of lookbehind the string being searched for can be
329 offset past the start point of the final matching string.
330 If this value was just blithely removed from the min_offset it would
331 invalidate some of the calculations for how many chars must match
332 before or after (as they are derived from min_offset and minlen and
333 the length of the string being searched for).
334 When the final pattern is compiled and the data is moved from the
335 scan_data_t structure into the regexp structure the information
336 about lookbehind is factored in, with the information that would
337 have been lost precalculated in the end_shift field for the
340 The fields pos_min and pos_delta are used to store the minimum offset
341 and the delta to the maximum offset at the current point in the pattern.
345 typedef struct scan_data_t {
346 /*I32 len_min; unused */
347 /*I32 len_delta; unused */
351 SSize_t last_end; /* min value, <0 unless valid. */
352 SSize_t last_start_min;
353 SSize_t last_start_max;
354 SV **longest; /* Either &l_fixed, or &l_float. */
355 SV *longest_fixed; /* longest fixed string found in pattern */
356 SSize_t offset_fixed; /* offset where it starts */
357 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
359 SV *longest_float; /* longest floating string found in pattern */
360 SSize_t offset_float_min; /* earliest point in string it can appear */
361 SSize_t offset_float_max; /* latest point in string it can appear */
362 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
363 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
366 SSize_t *last_closep;
367 struct regnode_charclass_class *start_class;
370 /* The below is perhaps overboard, but this allows us to save a test at the
371 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
372 * and 'a' differ by a single bit; the same with the upper and lower case of
373 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
374 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
375 * then inverts it to form a mask, with just a single 0, in the bit position
376 * where the upper- and lowercase differ. XXX There are about 40 other
377 * instances in the Perl core where this micro-optimization could be used.
378 * Should decide if maintenance cost is worse, before changing those
380 * Returns a boolean as to whether or not 'v' is either a lowercase or
381 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
382 * compile-time constant, the generated code is better than some optimizing
383 * compilers figure out, amounting to a mask and test. The results are
384 * meaningless if 'c' is not one of [A-Za-z] */
385 #define isARG2_lower_or_UPPER_ARG1(c, v) \
386 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
389 * Forward declarations for pregcomp()'s friends.
392 static const scan_data_t zero_scan_data =
393 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
395 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
396 #define SF_BEFORE_SEOL 0x0001
397 #define SF_BEFORE_MEOL 0x0002
398 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
399 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
402 # define SF_FIX_SHIFT_EOL (0+2)
403 # define SF_FL_SHIFT_EOL (0+4)
405 # define SF_FIX_SHIFT_EOL (+2)
406 # define SF_FL_SHIFT_EOL (+4)
409 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
410 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
412 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
413 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
414 #define SF_IS_INF 0x0040
415 #define SF_HAS_PAR 0x0080
416 #define SF_IN_PAR 0x0100
417 #define SF_HAS_EVAL 0x0200
418 #define SCF_DO_SUBSTR 0x0400
419 #define SCF_DO_STCLASS_AND 0x0800
420 #define SCF_DO_STCLASS_OR 0x1000
421 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
422 #define SCF_WHILEM_VISITED_POS 0x2000
424 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
425 #define SCF_SEEN_ACCEPT 0x8000
426 #define SCF_TRIE_DOING_RESTUDY 0x10000
428 #define UTF cBOOL(RExC_utf8)
430 /* The enums for all these are ordered so things work out correctly */
431 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
432 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
433 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
434 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
435 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
436 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
437 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
439 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
441 #define OOB_NAMEDCLASS -1
443 /* There is no code point that is out-of-bounds, so this is problematic. But
444 * its only current use is to initialize a variable that is always set before
446 #define OOB_UNICODE 0xDEADBEEF
448 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
449 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
452 /* length of regex to show in messages that don't mark a position within */
453 #define RegexLengthToShowInErrorMessages 127
456 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
457 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
458 * op/pragma/warn/regcomp.
460 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
461 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
463 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
466 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
467 * arg. Show regex, up to a maximum length. If it's too long, chop and add
470 #define _FAIL(code) STMT_START { \
471 const char *ellipses = ""; \
472 IV len = RExC_end - RExC_precomp; \
475 SAVEFREESV(RExC_rx_sv); \
476 if (len > RegexLengthToShowInErrorMessages) { \
477 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
478 len = RegexLengthToShowInErrorMessages - 10; \
484 #define FAIL(msg) _FAIL( \
485 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
486 msg, (int)len, RExC_precomp, ellipses))
488 #define FAIL2(msg,arg) _FAIL( \
489 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
490 arg, (int)len, RExC_precomp, ellipses))
493 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
495 #define Simple_vFAIL(m) STMT_START { \
496 const IV offset = RExC_parse - RExC_precomp; \
497 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
498 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
502 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
504 #define vFAIL(m) STMT_START { \
506 SAVEFREESV(RExC_rx_sv); \
511 * Like Simple_vFAIL(), but accepts two arguments.
513 #define Simple_vFAIL2(m,a1) STMT_START { \
514 const IV offset = RExC_parse - RExC_precomp; \
515 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
516 (int)offset, RExC_precomp, RExC_precomp + offset); \
520 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
522 #define vFAIL2(m,a1) STMT_START { \
524 SAVEFREESV(RExC_rx_sv); \
525 Simple_vFAIL2(m, a1); \
530 * Like Simple_vFAIL(), but accepts three arguments.
532 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
533 const IV offset = RExC_parse - RExC_precomp; \
534 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
535 (int)offset, RExC_precomp, RExC_precomp + offset); \
539 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
541 #define vFAIL3(m,a1,a2) STMT_START { \
543 SAVEFREESV(RExC_rx_sv); \
544 Simple_vFAIL3(m, a1, a2); \
548 * Like Simple_vFAIL(), but accepts four arguments.
550 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
551 const IV offset = RExC_parse - RExC_precomp; \
552 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
553 (int)offset, RExC_precomp, RExC_precomp + offset); \
556 #define vFAIL4(m,a1,a2,a3) STMT_START { \
558 SAVEFREESV(RExC_rx_sv); \
559 Simple_vFAIL4(m, a1, a2, a3); \
562 /* m is not necessarily a "literal string", in this macro */
563 #define reg_warn_non_literal_string(loc, m) STMT_START { \
564 const IV offset = loc - RExC_precomp; \
565 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
566 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
569 #define ckWARNreg(loc,m) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572 (int)offset, RExC_precomp, RExC_precomp + offset); \
575 #define vWARN_dep(loc, m) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
578 (int)offset, RExC_precomp, RExC_precomp + offset); \
581 #define ckWARNdep(loc,m) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
585 (int)offset, RExC_precomp, RExC_precomp + offset); \
588 #define ckWARNregdep(loc,m) STMT_START { \
589 const IV offset = loc - RExC_precomp; \
590 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
592 (int)offset, RExC_precomp, RExC_precomp + offset); \
595 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
599 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
602 #define ckWARN2reg(loc, m, a1) STMT_START { \
603 const IV offset = loc - RExC_precomp; \
604 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
608 #define vWARN3(loc, m, a1, a2) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
614 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
620 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
621 const IV offset = loc - RExC_precomp; \
622 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
626 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
627 const IV offset = loc - RExC_precomp; \
628 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
632 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
639 /* Allow for side effects in s */
640 #define REGC(c,s) STMT_START { \
641 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
644 /* Macros for recording node offsets. 20001227 mjd@plover.com
645 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
646 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
647 * Element 0 holds the number n.
648 * Position is 1 indexed.
650 #ifndef RE_TRACK_PATTERN_OFFSETS
651 #define Set_Node_Offset_To_R(node,byte)
652 #define Set_Node_Offset(node,byte)
653 #define Set_Cur_Node_Offset
654 #define Set_Node_Length_To_R(node,len)
655 #define Set_Node_Length(node,len)
656 #define Set_Node_Cur_Length(node,start)
657 #define Node_Offset(n)
658 #define Node_Length(n)
659 #define Set_Node_Offset_Length(node,offset,len)
660 #define ProgLen(ri) ri->u.proglen
661 #define SetProgLen(ri,x) ri->u.proglen = x
663 #define ProgLen(ri) ri->u.offsets[0]
664 #define SetProgLen(ri,x) ri->u.offsets[0] = x
665 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
667 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
668 __LINE__, (int)(node), (int)(byte))); \
670 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
672 RExC_offsets[2*(node)-1] = (byte); \
677 #define Set_Node_Offset(node,byte) \
678 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
679 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
681 #define Set_Node_Length_To_R(node,len) STMT_START { \
683 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
684 __LINE__, (int)(node), (int)(len))); \
686 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
688 RExC_offsets[2*(node)] = (len); \
693 #define Set_Node_Length(node,len) \
694 Set_Node_Length_To_R((node)-RExC_emit_start, len)
695 #define Set_Node_Cur_Length(node, start) \
696 Set_Node_Length(node, RExC_parse - start)
698 /* Get offsets and lengths */
699 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
700 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
702 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
703 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
704 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
708 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
709 #define EXPERIMENTAL_INPLACESCAN
710 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
712 #define DEBUG_STUDYDATA(str,data,depth) \
713 DEBUG_OPTIMISE_MORE_r(if(data){ \
714 PerlIO_printf(Perl_debug_log, \
715 "%*s" str "Pos:%"IVdf"/%"IVdf \
716 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
717 (int)(depth)*2, "", \
718 (IV)((data)->pos_min), \
719 (IV)((data)->pos_delta), \
720 (UV)((data)->flags), \
721 (IV)((data)->whilem_c), \
722 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
723 is_inf ? "INF " : "" \
725 if ((data)->last_found) \
726 PerlIO_printf(Perl_debug_log, \
727 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
728 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
729 SvPVX_const((data)->last_found), \
730 (IV)((data)->last_end), \
731 (IV)((data)->last_start_min), \
732 (IV)((data)->last_start_max), \
733 ((data)->longest && \
734 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
735 SvPVX_const((data)->longest_fixed), \
736 (IV)((data)->offset_fixed), \
737 ((data)->longest && \
738 (data)->longest==&((data)->longest_float)) ? "*" : "", \
739 SvPVX_const((data)->longest_float), \
740 (IV)((data)->offset_float_min), \
741 (IV)((data)->offset_float_max) \
743 PerlIO_printf(Perl_debug_log,"\n"); \
746 /* Mark that we cannot extend a found fixed substring at this point.
747 Update the longest found anchored substring and the longest found
748 floating substrings if needed. */
751 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
752 SSize_t *minlenp, int is_inf)
754 const STRLEN l = CHR_SVLEN(data->last_found);
755 const STRLEN old_l = CHR_SVLEN(*data->longest);
756 GET_RE_DEBUG_FLAGS_DECL;
758 PERL_ARGS_ASSERT_SCAN_COMMIT;
760 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
761 SvSetMagicSV(*data->longest, data->last_found);
762 if (*data->longest == data->longest_fixed) {
763 data->offset_fixed = l ? data->last_start_min : data->pos_min;
764 if (data->flags & SF_BEFORE_EOL)
766 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
768 data->flags &= ~SF_FIX_BEFORE_EOL;
769 data->minlen_fixed=minlenp;
770 data->lookbehind_fixed=0;
772 else { /* *data->longest == data->longest_float */
773 data->offset_float_min = l ? data->last_start_min : data->pos_min;
774 data->offset_float_max = (l
775 ? data->last_start_max
776 : (data->pos_delta == SSize_t_MAX
778 : data->pos_min + data->pos_delta));
780 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
781 data->offset_float_max = SSize_t_MAX;
782 if (data->flags & SF_BEFORE_EOL)
784 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
786 data->flags &= ~SF_FL_BEFORE_EOL;
787 data->minlen_float=minlenp;
788 data->lookbehind_float=0;
791 SvCUR_set(data->last_found, 0);
793 SV * const sv = data->last_found;
794 if (SvUTF8(sv) && SvMAGICAL(sv)) {
795 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
801 data->flags &= ~SF_BEFORE_EOL;
802 DEBUG_STUDYDATA("commit: ",data,0);
805 /* These macros set, clear and test whether the synthetic start class ('ssc',
806 * given by the parameter) matches an empty string (EOS). This uses the
807 * 'next_off' field in the node, to save a bit in the flags field. The ssc
808 * stands alone, so there is never a next_off, so this field is otherwise
809 * unused. The EOS information is used only for compilation, but theoretically
810 * it could be passed on to the execution code. This could be used to store
811 * more than one bit of information, but only this one is currently used. */
812 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
813 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
814 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
816 /* Can match anything (initialization) */
818 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
820 PERL_ARGS_ASSERT_CL_ANYTHING;
822 ANYOF_BITMAP_SETALL(cl);
823 cl->flags = ANYOF_UNICODE_ALL;
826 /* If any portion of the regex is to operate under locale rules,
827 * initialization includes it. The reason this isn't done for all regexes
828 * is that the optimizer was written under the assumption that locale was
829 * all-or-nothing. Given the complexity and lack of documentation in the
830 * optimizer, and that there are inadequate test cases for locale, so many
831 * parts of it may not work properly, it is safest to avoid locale unless
833 if (RExC_contains_locale) {
834 ANYOF_CLASS_SETALL(cl); /* /l uses class */
835 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
838 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
842 /* Can match anything (initialization) */
844 S_cl_is_anything(const struct regnode_charclass_class *cl)
848 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
850 for (value = 0; value < ANYOF_MAX; value += 2)
851 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
853 if (!(cl->flags & ANYOF_UNICODE_ALL))
855 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
860 /* Can match anything (initialization) */
862 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
864 PERL_ARGS_ASSERT_CL_INIT;
866 Zero(cl, 1, struct regnode_charclass_class);
868 cl_anything(pRExC_state, cl);
869 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
872 /* These two functions currently do the exact same thing */
873 #define cl_init_zero cl_init
875 /* 'AND' a given class with another one. Can create false positives. 'cl'
876 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
877 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
879 S_cl_and(struct regnode_charclass_class *cl,
880 const struct regnode_charclass_class *and_with)
882 PERL_ARGS_ASSERT_CL_AND;
884 assert(PL_regkind[and_with->type] == ANYOF);
886 /* I (khw) am not sure all these restrictions are necessary XXX */
887 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
888 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
889 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
890 && !(and_with->flags & ANYOF_LOC_FOLD)
891 && !(cl->flags & ANYOF_LOC_FOLD)) {
894 if (and_with->flags & ANYOF_INVERT)
895 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
896 cl->bitmap[i] &= ~and_with->bitmap[i];
898 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
899 cl->bitmap[i] &= and_with->bitmap[i];
900 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
902 if (and_with->flags & ANYOF_INVERT) {
904 /* Here, the and'ed node is inverted. Get the AND of the flags that
905 * aren't affected by the inversion. Those that are affected are
906 * handled individually below */
907 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
908 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
909 cl->flags |= affected_flags;
911 /* We currently don't know how to deal with things that aren't in the
912 * bitmap, but we know that the intersection is no greater than what
913 * is already in cl, so let there be false positives that get sorted
914 * out after the synthetic start class succeeds, and the node is
915 * matched for real. */
917 /* The inversion of these two flags indicate that the resulting
918 * intersection doesn't have them */
919 if (and_with->flags & ANYOF_UNICODE_ALL) {
920 cl->flags &= ~ANYOF_UNICODE_ALL;
922 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
923 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
926 else { /* and'd node is not inverted */
927 U8 outside_bitmap_but_not_utf8; /* Temp variable */
929 if (! ANYOF_NONBITMAP(and_with)) {
931 /* Here 'and_with' doesn't match anything outside the bitmap
932 * (except possibly ANYOF_UNICODE_ALL), which means the
933 * intersection can't either, except for ANYOF_UNICODE_ALL, in
934 * which case we don't know what the intersection is, but it's no
935 * greater than what cl already has, so can just leave it alone,
936 * with possible false positives */
937 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
938 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
939 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
942 else if (! ANYOF_NONBITMAP(cl)) {
944 /* Here, 'and_with' does match something outside the bitmap, and cl
945 * doesn't have a list of things to match outside the bitmap. If
946 * cl can match all code points above 255, the intersection will
947 * be those above-255 code points that 'and_with' matches. If cl
948 * can't match all Unicode code points, it means that it can't
949 * match anything outside the bitmap (since the 'if' that got us
950 * into this block tested for that), so we leave the bitmap empty.
952 if (cl->flags & ANYOF_UNICODE_ALL) {
953 ARG_SET(cl, ARG(and_with));
955 /* and_with's ARG may match things that don't require UTF8.
956 * And now cl's will too, in spite of this being an 'and'. See
957 * the comments below about the kludge */
958 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
962 /* Here, both 'and_with' and cl match something outside the
963 * bitmap. Currently we do not do the intersection, so just match
964 * whatever cl had at the beginning. */
968 /* Take the intersection of the two sets of flags. However, the
969 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
970 * kludge around the fact that this flag is not treated like the others
971 * which are initialized in cl_anything(). The way the optimizer works
972 * is that the synthetic start class (SSC) is initialized to match
973 * anything, and then the first time a real node is encountered, its
974 * values are AND'd with the SSC's with the result being the values of
975 * the real node. However, there are paths through the optimizer where
976 * the AND never gets called, so those initialized bits are set
977 * inappropriately, which is not usually a big deal, as they just cause
978 * false positives in the SSC, which will just mean a probably
979 * imperceptible slow down in execution. However this bit has a
980 * higher false positive consequence in that it can cause utf8.pm,
981 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
982 * bigger slowdown and also causes significant extra memory to be used.
983 * In order to prevent this, the code now takes a different tack. The
984 * bit isn't set unless some part of the regular expression needs it,
985 * but once set it won't get cleared. This means that these extra
986 * modules won't get loaded unless there was some path through the
987 * pattern that would have required them anyway, and so any false
988 * positives that occur by not ANDing them out when they could be
989 * aren't as severe as they would be if we treated this bit like all
991 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
992 & ANYOF_NONBITMAP_NON_UTF8;
993 cl->flags &= and_with->flags;
994 cl->flags |= outside_bitmap_but_not_utf8;
998 /* 'OR' a given class with another one. Can create false positives. 'cl'
999 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
1000 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1002 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1004 PERL_ARGS_ASSERT_CL_OR;
1006 if (or_with->flags & ANYOF_INVERT) {
1008 /* Here, the or'd node is to be inverted. This means we take the
1009 * complement of everything not in the bitmap, but currently we don't
1010 * know what that is, so give up and match anything */
1011 if (ANYOF_NONBITMAP(or_with)) {
1012 cl_anything(pRExC_state, cl);
1015 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1016 * <= (B1 | !B2) | (CL1 | !CL2)
1017 * which is wasteful if CL2 is small, but we ignore CL2:
1018 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1019 * XXXX Can we handle case-fold? Unclear:
1020 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1021 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1023 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 && !(or_with->flags & ANYOF_LOC_FOLD)
1025 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1028 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1029 cl->bitmap[i] |= ~or_with->bitmap[i];
1030 } /* XXXX: logic is complicated otherwise */
1032 cl_anything(pRExC_state, cl);
1035 /* And, we can just take the union of the flags that aren't affected
1036 * by the inversion */
1037 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1039 /* For the remaining flags:
1040 ANYOF_UNICODE_ALL and inverted means to not match anything above
1041 255, which means that the union with cl should just be
1042 what cl has in it, so can ignore this flag
1043 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1044 is (ASCII) 127-255 to match them, but then invert that, so
1045 the union with cl should just be what cl has in it, so can
1048 } else { /* 'or_with' is not inverted */
1049 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1050 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1051 && (!(or_with->flags & ANYOF_LOC_FOLD)
1052 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1055 /* OR char bitmap and class bitmap separately */
1056 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1057 cl->bitmap[i] |= or_with->bitmap[i];
1058 if (or_with->flags & ANYOF_CLASS) {
1059 ANYOF_CLASS_OR(or_with, cl);
1062 else { /* XXXX: logic is complicated, leave it along for a moment. */
1063 cl_anything(pRExC_state, cl);
1066 if (ANYOF_NONBITMAP(or_with)) {
1068 /* Use the added node's outside-the-bit-map match if there isn't a
1069 * conflict. If there is a conflict (both nodes match something
1070 * outside the bitmap, but what they match outside is not the same
1071 * pointer, and hence not easily compared until XXX we extend
1072 * inversion lists this far), give up and allow the start class to
1073 * match everything outside the bitmap. If that stuff is all above
1074 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1075 if (! ANYOF_NONBITMAP(cl)) {
1076 ARG_SET(cl, ARG(or_with));
1078 else if (ARG(cl) != ARG(or_with)) {
1080 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1081 cl_anything(pRExC_state, cl);
1084 cl->flags |= ANYOF_UNICODE_ALL;
1089 /* Take the union */
1090 cl->flags |= or_with->flags;
1094 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1095 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1096 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1097 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1102 dump_trie(trie,widecharmap,revcharmap)
1103 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1104 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1106 These routines dump out a trie in a somewhat readable format.
1107 The _interim_ variants are used for debugging the interim
1108 tables that are used to generate the final compressed
1109 representation which is what dump_trie expects.
1111 Part of the reason for their existence is to provide a form
1112 of documentation as to how the different representations function.
1117 Dumps the final compressed table form of the trie to Perl_debug_log.
1118 Used for debugging make_trie().
1122 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1123 AV *revcharmap, U32 depth)
1126 SV *sv=sv_newmortal();
1127 int colwidth= widecharmap ? 6 : 4;
1129 GET_RE_DEBUG_FLAGS_DECL;
1131 PERL_ARGS_ASSERT_DUMP_TRIE;
1133 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1134 (int)depth * 2 + 2,"",
1135 "Match","Base","Ofs" );
1137 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1138 SV ** const tmp = av_fetch( revcharmap, state, 0);
1140 PerlIO_printf( Perl_debug_log, "%*s",
1142 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1143 PL_colors[0], PL_colors[1],
1144 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1145 PERL_PV_ESCAPE_FIRSTCHAR
1150 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1151 (int)depth * 2 + 2,"");
1153 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1154 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1155 PerlIO_printf( Perl_debug_log, "\n");
1157 for( state = 1 ; state < trie->statecount ; state++ ) {
1158 const U32 base = trie->states[ state ].trans.base;
1160 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1162 if ( trie->states[ state ].wordnum ) {
1163 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1165 PerlIO_printf( Perl_debug_log, "%6s", "" );
1168 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1173 while( ( base + ofs < trie->uniquecharcount ) ||
1174 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1175 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1178 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1180 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1181 if ( ( base + ofs >= trie->uniquecharcount ) &&
1182 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1183 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1185 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1187 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1189 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1193 PerlIO_printf( Perl_debug_log, "]");
1196 PerlIO_printf( Perl_debug_log, "\n" );
1198 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1199 for (word=1; word <= trie->wordcount; word++) {
1200 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1201 (int)word, (int)(trie->wordinfo[word].prev),
1202 (int)(trie->wordinfo[word].len));
1204 PerlIO_printf(Perl_debug_log, "\n" );
1207 Dumps a fully constructed but uncompressed trie in list form.
1208 List tries normally only are used for construction when the number of
1209 possible chars (trie->uniquecharcount) is very high.
1210 Used for debugging make_trie().
1213 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1214 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1218 SV *sv=sv_newmortal();
1219 int colwidth= widecharmap ? 6 : 4;
1220 GET_RE_DEBUG_FLAGS_DECL;
1222 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1224 /* print out the table precompression. */
1225 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1226 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1227 "------:-----+-----------------\n" );
1229 for( state=1 ; state < next_alloc ; state ++ ) {
1232 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1233 (int)depth * 2 + 2,"", (UV)state );
1234 if ( ! trie->states[ state ].wordnum ) {
1235 PerlIO_printf( Perl_debug_log, "%5s| ","");
1237 PerlIO_printf( Perl_debug_log, "W%4x| ",
1238 trie->states[ state ].wordnum
1241 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1242 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1244 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1246 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1247 PL_colors[0], PL_colors[1],
1248 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1249 PERL_PV_ESCAPE_FIRSTCHAR
1251 TRIE_LIST_ITEM(state,charid).forid,
1252 (UV)TRIE_LIST_ITEM(state,charid).newstate
1255 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1256 (int)((depth * 2) + 14), "");
1259 PerlIO_printf( Perl_debug_log, "\n");
1264 Dumps a fully constructed but uncompressed trie in table form.
1265 This is the normal DFA style state transition table, with a few
1266 twists to facilitate compression later.
1267 Used for debugging make_trie().
1270 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1271 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1276 SV *sv=sv_newmortal();
1277 int colwidth= widecharmap ? 6 : 4;
1278 GET_RE_DEBUG_FLAGS_DECL;
1280 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1283 print out the table precompression so that we can do a visual check
1284 that they are identical.
1287 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1289 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1292 PerlIO_printf( Perl_debug_log, "%*s",
1294 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1295 PL_colors[0], PL_colors[1],
1296 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1297 PERL_PV_ESCAPE_FIRSTCHAR
1303 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1305 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1306 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1309 PerlIO_printf( Perl_debug_log, "\n" );
1311 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1313 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1314 (int)depth * 2 + 2,"",
1315 (UV)TRIE_NODENUM( state ) );
1317 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1318 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1320 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1322 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1324 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1325 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1327 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1328 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1336 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1337 startbranch: the first branch in the whole branch sequence
1338 first : start branch of sequence of branch-exact nodes.
1339 May be the same as startbranch
1340 last : Thing following the last branch.
1341 May be the same as tail.
1342 tail : item following the branch sequence
1343 count : words in the sequence
1344 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1345 depth : indent depth
1347 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1349 A trie is an N'ary tree where the branches are determined by digital
1350 decomposition of the key. IE, at the root node you look up the 1st character and
1351 follow that branch repeat until you find the end of the branches. Nodes can be
1352 marked as "accepting" meaning they represent a complete word. Eg:
1356 would convert into the following structure. Numbers represent states, letters
1357 following numbers represent valid transitions on the letter from that state, if
1358 the number is in square brackets it represents an accepting state, otherwise it
1359 will be in parenthesis.
1361 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1365 (1) +-i->(6)-+-s->[7]
1367 +-s->(3)-+-h->(4)-+-e->[5]
1369 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1371 This shows that when matching against the string 'hers' we will begin at state 1
1372 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1373 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1374 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1375 single traverse. We store a mapping from accepting to state to which word was
1376 matched, and then when we have multiple possibilities we try to complete the
1377 rest of the regex in the order in which they occured in the alternation.
1379 The only prior NFA like behaviour that would be changed by the TRIE support is
1380 the silent ignoring of duplicate alternations which are of the form:
1382 / (DUPE|DUPE) X? (?{ ... }) Y /x
1384 Thus EVAL blocks following a trie may be called a different number of times with
1385 and without the optimisation. With the optimisations dupes will be silently
1386 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1387 the following demonstrates:
1389 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1391 which prints out 'word' three times, but
1393 'words'=~/(word|word|word)(?{ print $1 })S/
1395 which doesnt print it out at all. This is due to other optimisations kicking in.
1397 Example of what happens on a structural level:
1399 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1401 1: CURLYM[1] {1,32767}(18)
1412 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1413 and should turn into:
1415 1: CURLYM[1] {1,32767}(18)
1417 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1425 Cases where tail != last would be like /(?foo|bar)baz/:
1435 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1436 and would end up looking like:
1439 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1446 d = uvchr_to_utf8_flags(d, uv, 0);
1448 is the recommended Unicode-aware way of saying
1453 #define TRIE_STORE_REVCHAR(val) \
1456 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1457 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1458 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1459 SvCUR_set(zlopp, kapow - flrbbbbb); \
1462 av_push(revcharmap, zlopp); \
1464 char ooooff = (char)val; \
1465 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1469 /* This gets the next character from the input, folding it if not already
1471 #define TRIE_READ_CHAR STMT_START { \
1474 /* if it is UTF then it is either already folded, or does not need \
1476 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1478 else if (folder == PL_fold_latin1) { \
1479 /* This folder implies Unicode rules, which in the range expressible \
1480 * by not UTF is the lower case, with the two exceptions, one of \
1481 * which should have been taken care of before calling this */ \
1482 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1483 uvc = toLOWER_L1(*uc); \
1484 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1487 /* raw data, will be folded later if needed */ \
1495 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1496 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1497 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1498 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1500 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1501 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1502 TRIE_LIST_CUR( state )++; \
1505 #define TRIE_LIST_NEW(state) STMT_START { \
1506 Newxz( trie->states[ state ].trans.list, \
1507 4, reg_trie_trans_le ); \
1508 TRIE_LIST_CUR( state ) = 1; \
1509 TRIE_LIST_LEN( state ) = 4; \
1512 #define TRIE_HANDLE_WORD(state) STMT_START { \
1513 U16 dupe= trie->states[ state ].wordnum; \
1514 regnode * const noper_next = regnext( noper ); \
1517 /* store the word for dumping */ \
1519 if (OP(noper) != NOTHING) \
1520 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1522 tmp = newSVpvn_utf8( "", 0, UTF ); \
1523 av_push( trie_words, tmp ); \
1527 trie->wordinfo[curword].prev = 0; \
1528 trie->wordinfo[curword].len = wordlen; \
1529 trie->wordinfo[curword].accept = state; \
1531 if ( noper_next < tail ) { \
1533 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1534 trie->jump[curword] = (U16)(noper_next - convert); \
1536 jumper = noper_next; \
1538 nextbranch= regnext(cur); \
1542 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1543 /* chain, so that when the bits of chain are later */\
1544 /* linked together, the dups appear in the chain */\
1545 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1546 trie->wordinfo[dupe].prev = curword; \
1548 /* we haven't inserted this word yet. */ \
1549 trie->states[ state ].wordnum = curword; \
1554 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1555 ( ( base + charid >= ucharcount \
1556 && base + charid < ubound \
1557 && state == trie->trans[ base - ucharcount + charid ].check \
1558 && trie->trans[ base - ucharcount + charid ].next ) \
1559 ? trie->trans[ base - ucharcount + charid ].next \
1560 : ( state==1 ? special : 0 ) \
1564 #define MADE_JUMP_TRIE 2
1565 #define MADE_EXACT_TRIE 4
1568 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1571 /* first pass, loop through and scan words */
1572 reg_trie_data *trie;
1573 HV *widecharmap = NULL;
1574 AV *revcharmap = newAV();
1576 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1581 regnode *jumper = NULL;
1582 regnode *nextbranch = NULL;
1583 regnode *convert = NULL;
1584 U32 *prev_states; /* temp array mapping each state to previous one */
1585 /* we just use folder as a flag in utf8 */
1586 const U8 * folder = NULL;
1589 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1590 AV *trie_words = NULL;
1591 /* along with revcharmap, this only used during construction but both are
1592 * useful during debugging so we store them in the struct when debugging.
1595 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1596 STRLEN trie_charcount=0;
1598 SV *re_trie_maxbuff;
1599 GET_RE_DEBUG_FLAGS_DECL;
1601 PERL_ARGS_ASSERT_MAKE_TRIE;
1603 PERL_UNUSED_ARG(depth);
1610 case EXACTFU: folder = PL_fold_latin1; break;
1611 case EXACTF: folder = PL_fold; break;
1612 case EXACTFL: folder = PL_fold_locale; break;
1613 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1616 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1618 trie->startstate = 1;
1619 trie->wordcount = word_count;
1620 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1623 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1628 trie_words = newAV();
1631 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632 if (!SvIOK(re_trie_maxbuff)) {
1633 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1635 DEBUG_TRIE_COMPILE_r({
1636 PerlIO_printf( Perl_debug_log,
1637 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638 (int)depth * 2 + 2, "",
1639 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1644 /* Find the node we are going to overwrite */
1645 if ( first == startbranch && OP( last ) != BRANCH ) {
1646 /* whole branch chain */
1649 /* branch sub-chain */
1650 convert = NEXTOPER( first );
1653 /* -- First loop and Setup --
1655 We first traverse the branches and scan each word to determine if it
1656 contains widechars, and how many unique chars there are, this is
1657 important as we have to build a table with at least as many columns as we
1660 We use an array of integers to represent the character codes 0..255
1661 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662 native representation of the character value as the key and IV's for the
1665 *TODO* If we keep track of how many times each character is used we can
1666 remap the columns so that the table compression later on is more
1667 efficient in terms of memory by ensuring the most common value is in the
1668 middle and the least common are on the outside. IMO this would be better
1669 than a most to least common mapping as theres a decent chance the most
1670 common letter will share a node with the least common, meaning the node
1671 will not be compressible. With a middle is most common approach the worst
1672 case is when we have the least common nodes twice.
1676 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677 regnode *noper = NEXTOPER( cur );
1678 const U8 *uc = (U8*)STRING( noper );
1679 const U8 *e = uc + STR_LEN( noper );
1681 U32 wordlen = 0; /* required init */
1682 STRLEN minbytes = 0;
1683 STRLEN maxbytes = 0;
1684 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1686 if (OP(noper) == NOTHING) {
1687 regnode *noper_next= regnext(noper);
1688 if (noper_next != tail && OP(noper_next) == flags) {
1690 uc= (U8*)STRING(noper);
1691 e= uc + STR_LEN(noper);
1692 trie->minlen= STR_LEN(noper);
1699 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1700 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1701 regardless of encoding */
1702 if (OP( noper ) == EXACTFU_SS) {
1703 /* false positives are ok, so just set this */
1704 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1707 for ( ; uc < e ; uc += len ) {
1708 TRIE_CHARCOUNT(trie)++;
1711 /* Acummulate to the current values, the range in the number of
1712 * bytes that this character could match. The max is presumed to
1713 * be the same as the folded input (which TRIE_READ_CHAR returns),
1714 * except that when this is not in UTF-8, it could be matched
1715 * against a string which is UTF-8, and the variant characters
1716 * could be 2 bytes instead of the 1 here. Likewise, for the
1717 * minimum number of bytes when not folded. When folding, the min
1718 * is assumed to be 1 byte could fold to match the single character
1719 * here, or in the case of a multi-char fold, 1 byte can fold to
1720 * the whole sequence. 'foldlen' is used to denote whether we are
1721 * in such a sequence, skipping the min setting if so. XXX TODO
1722 * Use the exact list of what folds to each character, from
1723 * PL_utf8_foldclosures */
1725 maxbytes += UTF8SKIP(uc);
1727 /* A non-UTF-8 string could be 1 byte to match our 2 */
1728 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1734 foldlen -= UTF8SKIP(uc);
1737 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1743 maxbytes += (UNI_IS_INVARIANT(*uc))
1754 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1761 U8 folded= folder[ (U8) uvc ];
1762 if ( !trie->charmap[ folded ] ) {
1763 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1764 TRIE_STORE_REVCHAR( folded );
1767 if ( !trie->charmap[ uvc ] ) {
1768 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1769 TRIE_STORE_REVCHAR( uvc );
1772 /* store the codepoint in the bitmap, and its folded
1774 TRIE_BITMAP_SET(trie, uvc);
1776 /* store the folded codepoint */
1777 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1780 /* store first byte of utf8 representation of
1781 variant codepoints */
1782 if (! NATIVE_IS_INVARIANT(uvc)) {
1783 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1786 set_bit = 0; /* We've done our bit :-) */
1791 widecharmap = newHV();
1793 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1796 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1798 if ( !SvTRUE( *svpp ) ) {
1799 sv_setiv( *svpp, ++trie->uniquecharcount );
1800 TRIE_STORE_REVCHAR(uvc);
1804 if( cur == first ) {
1805 trie->minlen = minbytes;
1806 trie->maxlen = maxbytes;
1807 } else if (minbytes < trie->minlen) {
1808 trie->minlen = minbytes;
1809 } else if (maxbytes > trie->maxlen) {
1810 trie->maxlen = maxbytes;
1812 } /* end first pass */
1813 DEBUG_TRIE_COMPILE_r(
1814 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1815 (int)depth * 2 + 2,"",
1816 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1817 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1818 (int)trie->minlen, (int)trie->maxlen )
1822 We now know what we are dealing with in terms of unique chars and
1823 string sizes so we can calculate how much memory a naive
1824 representation using a flat table will take. If it's over a reasonable
1825 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1826 conservative but potentially much slower representation using an array
1829 At the end we convert both representations into the same compressed
1830 form that will be used in regexec.c for matching with. The latter
1831 is a form that cannot be used to construct with but has memory
1832 properties similar to the list form and access properties similar
1833 to the table form making it both suitable for fast searches and
1834 small enough that its feasable to store for the duration of a program.
1836 See the comment in the code where the compressed table is produced
1837 inplace from the flat tabe representation for an explanation of how
1838 the compression works.
1843 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1846 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1848 Second Pass -- Array Of Lists Representation
1850 Each state will be represented by a list of charid:state records
1851 (reg_trie_trans_le) the first such element holds the CUR and LEN
1852 points of the allocated array. (See defines above).
1854 We build the initial structure using the lists, and then convert
1855 it into the compressed table form which allows faster lookups
1856 (but cant be modified once converted).
1859 STRLEN transcount = 1;
1861 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1862 "%*sCompiling trie using list compiler\n",
1863 (int)depth * 2 + 2, ""));
1865 trie->states = (reg_trie_state *)
1866 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1867 sizeof(reg_trie_state) );
1871 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1873 regnode *noper = NEXTOPER( cur );
1874 U8 *uc = (U8*)STRING( noper );
1875 const U8 *e = uc + STR_LEN( noper );
1876 U32 state = 1; /* required init */
1877 U16 charid = 0; /* sanity init */
1878 U32 wordlen = 0; /* required init */
1880 if (OP(noper) == NOTHING) {
1881 regnode *noper_next= regnext(noper);
1882 if (noper_next != tail && OP(noper_next) == flags) {
1884 uc= (U8*)STRING(noper);
1885 e= uc + STR_LEN(noper);
1889 if (OP(noper) != NOTHING) {
1890 for ( ; uc < e ; uc += len ) {
1895 charid = trie->charmap[ uvc ];
1897 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1901 charid=(U16)SvIV( *svpp );
1904 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1911 if ( !trie->states[ state ].trans.list ) {
1912 TRIE_LIST_NEW( state );
1914 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1915 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1916 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1921 newstate = next_alloc++;
1922 prev_states[newstate] = state;
1923 TRIE_LIST_PUSH( state, charid, newstate );
1928 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1932 TRIE_HANDLE_WORD(state);
1934 } /* end second pass */
1936 /* next alloc is the NEXT state to be allocated */
1937 trie->statecount = next_alloc;
1938 trie->states = (reg_trie_state *)
1939 PerlMemShared_realloc( trie->states,
1941 * sizeof(reg_trie_state) );
1943 /* and now dump it out before we compress it */
1944 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1945 revcharmap, next_alloc,
1949 trie->trans = (reg_trie_trans *)
1950 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1957 for( state=1 ; state < next_alloc ; state ++ ) {
1961 DEBUG_TRIE_COMPILE_MORE_r(
1962 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1966 if (trie->states[state].trans.list) {
1967 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1971 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1972 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1973 if ( forid < minid ) {
1975 } else if ( forid > maxid ) {
1979 if ( transcount < tp + maxid - minid + 1) {
1981 trie->trans = (reg_trie_trans *)
1982 PerlMemShared_realloc( trie->trans,
1984 * sizeof(reg_trie_trans) );
1985 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1987 base = trie->uniquecharcount + tp - minid;
1988 if ( maxid == minid ) {
1990 for ( ; zp < tp ; zp++ ) {
1991 if ( ! trie->trans[ zp ].next ) {
1992 base = trie->uniquecharcount + zp - minid;
1993 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1994 trie->trans[ zp ].check = state;
2000 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2001 trie->trans[ tp ].check = state;
2006 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2007 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2008 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2009 trie->trans[ tid ].check = state;
2011 tp += ( maxid - minid + 1 );
2013 Safefree(trie->states[ state ].trans.list);
2016 DEBUG_TRIE_COMPILE_MORE_r(
2017 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2020 trie->states[ state ].trans.base=base;
2022 trie->lasttrans = tp + 1;
2026 Second Pass -- Flat Table Representation.
2028 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2029 We know that we will need Charcount+1 trans at most to store the data
2030 (one row per char at worst case) So we preallocate both structures
2031 assuming worst case.
2033 We then construct the trie using only the .next slots of the entry
2036 We use the .check field of the first entry of the node temporarily to
2037 make compression both faster and easier by keeping track of how many non
2038 zero fields are in the node.
2040 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2043 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2044 number representing the first entry of the node, and state as a
2045 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2046 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2047 are 2 entrys per node. eg:
2055 The table is internally in the right hand, idx form. However as we also
2056 have to deal with the states array which is indexed by nodenum we have to
2057 use TRIE_NODENUM() to convert.
2060 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2061 "%*sCompiling trie using table compiler\n",
2062 (int)depth * 2 + 2, ""));
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2066 * trie->uniquecharcount + 1,
2067 sizeof(reg_trie_trans) );
2068 trie->states = (reg_trie_state *)
2069 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2070 sizeof(reg_trie_state) );
2071 next_alloc = trie->uniquecharcount + 1;
2074 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2076 regnode *noper = NEXTOPER( cur );
2077 const U8 *uc = (U8*)STRING( noper );
2078 const U8 *e = uc + STR_LEN( noper );
2080 U32 state = 1; /* required init */
2082 U16 charid = 0; /* sanity init */
2083 U32 accept_state = 0; /* sanity init */
2085 U32 wordlen = 0; /* required init */
2087 if (OP(noper) == NOTHING) {
2088 regnode *noper_next= regnext(noper);
2089 if (noper_next != tail && OP(noper_next) == flags) {
2091 uc= (U8*)STRING(noper);
2092 e= uc + STR_LEN(noper);
2096 if ( OP(noper) != NOTHING ) {
2097 for ( ; uc < e ; uc += len ) {
2102 charid = trie->charmap[ uvc ];
2104 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2105 charid = svpp ? (U16)SvIV(*svpp) : 0;
2109 if ( !trie->trans[ state + charid ].next ) {
2110 trie->trans[ state + charid ].next = next_alloc;
2111 trie->trans[ state ].check++;
2112 prev_states[TRIE_NODENUM(next_alloc)]
2113 = TRIE_NODENUM(state);
2114 next_alloc += trie->uniquecharcount;
2116 state = trie->trans[ state + charid ].next;
2118 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2120 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2123 accept_state = TRIE_NODENUM( state );
2124 TRIE_HANDLE_WORD(accept_state);
2126 } /* end second pass */
2128 /* and now dump it out before we compress it */
2129 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2131 next_alloc, depth+1));
2135 * Inplace compress the table.*
2137 For sparse data sets the table constructed by the trie algorithm will
2138 be mostly 0/FAIL transitions or to put it another way mostly empty.
2139 (Note that leaf nodes will not contain any transitions.)
2141 This algorithm compresses the tables by eliminating most such
2142 transitions, at the cost of a modest bit of extra work during lookup:
2144 - Each states[] entry contains a .base field which indicates the
2145 index in the state[] array wheres its transition data is stored.
2147 - If .base is 0 there are no valid transitions from that node.
2149 - If .base is nonzero then charid is added to it to find an entry in
2152 -If trans[states[state].base+charid].check!=state then the
2153 transition is taken to be a 0/Fail transition. Thus if there are fail
2154 transitions at the front of the node then the .base offset will point
2155 somewhere inside the previous nodes data (or maybe even into a node
2156 even earlier), but the .check field determines if the transition is
2160 The following process inplace converts the table to the compressed
2161 table: We first do not compress the root node 1,and mark all its
2162 .check pointers as 1 and set its .base pointer as 1 as well. This
2163 allows us to do a DFA construction from the compressed table later,
2164 and ensures that any .base pointers we calculate later are greater
2167 - We set 'pos' to indicate the first entry of the second node.
2169 - We then iterate over the columns of the node, finding the first and
2170 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2171 and set the .check pointers accordingly, and advance pos
2172 appropriately and repreat for the next node. Note that when we copy
2173 the next pointers we have to convert them from the original
2174 NODEIDX form to NODENUM form as the former is not valid post
2177 - If a node has no transitions used we mark its base as 0 and do not
2178 advance the pos pointer.
2180 - If a node only has one transition we use a second pointer into the
2181 structure to fill in allocated fail transitions from other states.
2182 This pointer is independent of the main pointer and scans forward
2183 looking for null transitions that are allocated to a state. When it
2184 finds one it writes the single transition into the "hole". If the
2185 pointer doesnt find one the single transition is appended as normal.
2187 - Once compressed we can Renew/realloc the structures to release the
2190 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2191 specifically Fig 3.47 and the associated pseudocode.
2195 const U32 laststate = TRIE_NODENUM( next_alloc );
2198 trie->statecount = laststate;
2200 for ( state = 1 ; state < laststate ; state++ ) {
2202 const U32 stateidx = TRIE_NODEIDX( state );
2203 const U32 o_used = trie->trans[ stateidx ].check;
2204 U32 used = trie->trans[ stateidx ].check;
2205 trie->trans[ stateidx ].check = 0;
2207 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2208 if ( flag || trie->trans[ stateidx + charid ].next ) {
2209 if ( trie->trans[ stateidx + charid ].next ) {
2211 for ( ; zp < pos ; zp++ ) {
2212 if ( ! trie->trans[ zp ].next ) {
2216 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2217 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2218 trie->trans[ zp ].check = state;
2219 if ( ++zp > pos ) pos = zp;
2226 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2228 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2229 trie->trans[ pos ].check = state;
2234 trie->lasttrans = pos + 1;
2235 trie->states = (reg_trie_state *)
2236 PerlMemShared_realloc( trie->states, laststate
2237 * sizeof(reg_trie_state) );
2238 DEBUG_TRIE_COMPILE_MORE_r(
2239 PerlIO_printf( Perl_debug_log,
2240 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2241 (int)depth * 2 + 2,"",
2242 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2245 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2248 } /* end table compress */
2250 DEBUG_TRIE_COMPILE_MORE_r(
2251 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2252 (int)depth * 2 + 2, "",
2253 (UV)trie->statecount,
2254 (UV)trie->lasttrans)
2256 /* resize the trans array to remove unused space */
2257 trie->trans = (reg_trie_trans *)
2258 PerlMemShared_realloc( trie->trans, trie->lasttrans
2259 * sizeof(reg_trie_trans) );
2261 { /* Modify the program and insert the new TRIE node */
2262 U8 nodetype =(U8)(flags & 0xFF);
2266 regnode *optimize = NULL;
2267 #ifdef RE_TRACK_PATTERN_OFFSETS
2270 U32 mjd_nodelen = 0;
2271 #endif /* RE_TRACK_PATTERN_OFFSETS */
2272 #endif /* DEBUGGING */
2274 This means we convert either the first branch or the first Exact,
2275 depending on whether the thing following (in 'last') is a branch
2276 or not and whther first is the startbranch (ie is it a sub part of
2277 the alternation or is it the whole thing.)
2278 Assuming its a sub part we convert the EXACT otherwise we convert
2279 the whole branch sequence, including the first.
2281 /* Find the node we are going to overwrite */
2282 if ( first != startbranch || OP( last ) == BRANCH ) {
2283 /* branch sub-chain */
2284 NEXT_OFF( first ) = (U16)(last - first);
2285 #ifdef RE_TRACK_PATTERN_OFFSETS
2287 mjd_offset= Node_Offset((convert));
2288 mjd_nodelen= Node_Length((convert));
2291 /* whole branch chain */
2293 #ifdef RE_TRACK_PATTERN_OFFSETS
2296 const regnode *nop = NEXTOPER( convert );
2297 mjd_offset= Node_Offset((nop));
2298 mjd_nodelen= Node_Length((nop));
2302 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2303 (int)depth * 2 + 2, "",
2304 (UV)mjd_offset, (UV)mjd_nodelen)
2307 /* But first we check to see if there is a common prefix we can
2308 split out as an EXACT and put in front of the TRIE node. */
2309 trie->startstate= 1;
2310 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2312 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2316 const U32 base = trie->states[ state ].trans.base;
2318 if ( trie->states[state].wordnum )
2321 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2322 if ( ( base + ofs >= trie->uniquecharcount ) &&
2323 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2324 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2326 if ( ++count > 1 ) {
2327 SV **tmp = av_fetch( revcharmap, ofs, 0);
2328 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2329 if ( state == 1 ) break;
2331 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sNew Start State=%"UVuf" Class: [",
2335 (int)depth * 2 + 2, "",
2338 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2339 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2341 TRIE_BITMAP_SET(trie,*ch);
2343 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2345 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2349 TRIE_BITMAP_SET(trie,*ch);
2351 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2352 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2358 SV **tmp = av_fetch( revcharmap, idx, 0);
2360 char *ch = SvPV( *tmp, len );
2362 SV *sv=sv_newmortal();
2363 PerlIO_printf( Perl_debug_log,
2364 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2365 (int)depth * 2 + 2, "",
2367 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2368 PL_colors[0], PL_colors[1],
2369 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370 PERL_PV_ESCAPE_FIRSTCHAR
2375 OP( convert ) = nodetype;
2376 str=STRING(convert);
2379 STR_LEN(convert) += len;
2385 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2390 trie->prefixlen = (state-1);
2392 regnode *n = convert+NODE_SZ_STR(convert);
2393 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2394 trie->startstate = state;
2395 trie->minlen -= (state - 1);
2396 trie->maxlen -= (state - 1);
2398 /* At least the UNICOS C compiler choked on this
2399 * being argument to DEBUG_r(), so let's just have
2402 #ifdef PERL_EXT_RE_BUILD
2408 regnode *fix = convert;
2409 U32 word = trie->wordcount;
2411 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2412 while( ++fix < n ) {
2413 Set_Node_Offset_Length(fix, 0, 0);
2416 SV ** const tmp = av_fetch( trie_words, word, 0 );
2418 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2419 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2421 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2429 NEXT_OFF(convert) = (U16)(tail - convert);
2430 DEBUG_r(optimize= n);
2436 if ( trie->maxlen ) {
2437 NEXT_OFF( convert ) = (U16)(tail - convert);
2438 ARG_SET( convert, data_slot );
2439 /* Store the offset to the first unabsorbed branch in
2440 jump[0], which is otherwise unused by the jump logic.
2441 We use this when dumping a trie and during optimisation. */
2443 trie->jump[0] = (U16)(nextbranch - convert);
2445 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2446 * and there is a bitmap
2447 * and the first "jump target" node we found leaves enough room
2448 * then convert the TRIE node into a TRIEC node, with the bitmap
2449 * embedded inline in the opcode - this is hypothetically faster.
2451 if ( !trie->states[trie->startstate].wordnum
2453 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2455 OP( convert ) = TRIEC;
2456 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2457 PerlMemShared_free(trie->bitmap);
2460 OP( convert ) = TRIE;
2462 /* store the type in the flags */
2463 convert->flags = nodetype;
2467 + regarglen[ OP( convert ) ];
2469 /* XXX We really should free up the resource in trie now,
2470 as we won't use them - (which resources?) dmq */
2472 /* needed for dumping*/
2473 DEBUG_r(if (optimize) {
2474 regnode *opt = convert;
2476 while ( ++opt < optimize) {
2477 Set_Node_Offset_Length(opt,0,0);
2480 Try to clean up some of the debris left after the
2483 while( optimize < jumper ) {
2484 mjd_nodelen += Node_Length((optimize));
2485 OP( optimize ) = OPTIMIZED;
2486 Set_Node_Offset_Length(optimize,0,0);
2489 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2491 } /* end node insert */
2493 /* Finish populating the prev field of the wordinfo array. Walk back
2494 * from each accept state until we find another accept state, and if
2495 * so, point the first word's .prev field at the second word. If the
2496 * second already has a .prev field set, stop now. This will be the
2497 * case either if we've already processed that word's accept state,
2498 * or that state had multiple words, and the overspill words were
2499 * already linked up earlier.
2506 for (word=1; word <= trie->wordcount; word++) {
2508 if (trie->wordinfo[word].prev)
2510 state = trie->wordinfo[word].accept;
2512 state = prev_states[state];
2515 prev = trie->states[state].wordnum;
2519 trie->wordinfo[word].prev = prev;
2521 Safefree(prev_states);
2525 /* and now dump out the compressed format */
2526 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2528 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2530 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2531 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2533 SvREFCNT_dec_NN(revcharmap);
2537 : trie->startstate>1
2543 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2545 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2547 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2548 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2551 We find the fail state for each state in the trie, this state is the longest proper
2552 suffix of the current state's 'word' that is also a proper prefix of another word in our
2553 trie. State 1 represents the word '' and is thus the default fail state. This allows
2554 the DFA not to have to restart after its tried and failed a word at a given point, it
2555 simply continues as though it had been matching the other word in the first place.
2557 'abcdgu'=~/abcdefg|cdgu/
2558 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2559 fail, which would bring us to the state representing 'd' in the second word where we would
2560 try 'g' and succeed, proceeding to match 'cdgu'.
2562 /* add a fail transition */
2563 const U32 trie_offset = ARG(source);
2564 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2566 const U32 ucharcount = trie->uniquecharcount;
2567 const U32 numstates = trie->statecount;
2568 const U32 ubound = trie->lasttrans + ucharcount;
2572 U32 base = trie->states[ 1 ].trans.base;
2575 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2576 GET_RE_DEBUG_FLAGS_DECL;
2578 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2580 PERL_UNUSED_ARG(depth);
2584 ARG_SET( stclass, data_slot );
2585 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2586 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2587 aho->trie=trie_offset;
2588 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2589 Copy( trie->states, aho->states, numstates, reg_trie_state );
2590 Newxz( q, numstates, U32);
2591 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2594 /* initialize fail[0..1] to be 1 so that we always have
2595 a valid final fail state */
2596 fail[ 0 ] = fail[ 1 ] = 1;
2598 for ( charid = 0; charid < ucharcount ; charid++ ) {
2599 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2601 q[ q_write ] = newstate;
2602 /* set to point at the root */
2603 fail[ q[ q_write++ ] ]=1;
2606 while ( q_read < q_write) {
2607 const U32 cur = q[ q_read++ % numstates ];
2608 base = trie->states[ cur ].trans.base;
2610 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2611 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2613 U32 fail_state = cur;
2616 fail_state = fail[ fail_state ];
2617 fail_base = aho->states[ fail_state ].trans.base;
2618 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2620 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2621 fail[ ch_state ] = fail_state;
2622 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2624 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2626 q[ q_write++ % numstates] = ch_state;
2630 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2631 when we fail in state 1, this allows us to use the
2632 charclass scan to find a valid start char. This is based on the principle
2633 that theres a good chance the string being searched contains lots of stuff
2634 that cant be a start char.
2636 fail[ 0 ] = fail[ 1 ] = 0;
2637 DEBUG_TRIE_COMPILE_r({
2638 PerlIO_printf(Perl_debug_log,
2639 "%*sStclass Failtable (%"UVuf" states): 0",
2640 (int)(depth * 2), "", (UV)numstates
2642 for( q_read=1; q_read<numstates; q_read++ ) {
2643 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2645 PerlIO_printf(Perl_debug_log, "\n");
2648 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2653 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2654 * These need to be revisited when a newer toolchain becomes available.
2656 #if defined(__sparc64__) && defined(__GNUC__)
2657 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2658 # undef SPARC64_GCC_WORKAROUND
2659 # define SPARC64_GCC_WORKAROUND 1
2663 #define DEBUG_PEEP(str,scan,depth) \
2664 DEBUG_OPTIMISE_r({if (scan){ \
2665 SV * const mysv=sv_newmortal(); \
2666 regnode *Next = regnext(scan); \
2667 regprop(RExC_rx, mysv, scan); \
2668 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2669 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2670 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2674 /* The below joins as many adjacent EXACTish nodes as possible into a single
2675 * one. The regop may be changed if the node(s) contain certain sequences that
2676 * require special handling. The joining is only done if:
2677 * 1) there is room in the current conglomerated node to entirely contain the
2679 * 2) they are the exact same node type
2681 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2682 * these get optimized out
2684 * If a node is to match under /i (folded), the number of characters it matches
2685 * can be different than its character length if it contains a multi-character
2686 * fold. *min_subtract is set to the total delta of the input nodes.
2688 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2689 * and contains LATIN SMALL LETTER SHARP S
2691 * This is as good a place as any to discuss the design of handling these
2692 * multi-character fold sequences. It's been wrong in Perl for a very long
2693 * time. There are three code points in Unicode whose multi-character folds
2694 * were long ago discovered to mess things up. The previous designs for
2695 * dealing with these involved assigning a special node for them. This
2696 * approach doesn't work, as evidenced by this example:
2697 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2698 * Both these fold to "sss", but if the pattern is parsed to create a node that
2699 * would match just the \xDF, it won't be able to handle the case where a
2700 * successful match would have to cross the node's boundary. The new approach
2701 * that hopefully generally solves the problem generates an EXACTFU_SS node
2704 * It turns out that there are problems with all multi-character folds, and not
2705 * just these three. Now the code is general, for all such cases. The
2706 * approach taken is:
2707 * 1) This routine examines each EXACTFish node that could contain multi-
2708 * character fold sequences. It returns in *min_subtract how much to
2709 * subtract from the the actual length of the string to get a real minimum
2710 * match length; it is 0 if there are no multi-char folds. This delta is
2711 * used by the caller to adjust the min length of the match, and the delta
2712 * between min and max, so that the optimizer doesn't reject these
2713 * possibilities based on size constraints.
2714 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2715 * is used for an EXACTFU node that contains at least one "ss" sequence in
2716 * it. For non-UTF-8 patterns and strings, this is the only case where
2717 * there is a possible fold length change. That means that a regular
2718 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2719 * with length changes, and so can be processed faster. regexec.c takes
2720 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2721 * pre-folded by regcomp.c. This saves effort in regex matching.
2722 * However, the pre-folding isn't done for non-UTF8 patterns because the
2723 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2724 * down by forcing the pattern into UTF8 unless necessary. Also what
2725 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2726 * possibilities for the non-UTF8 patterns are quite simple, except for
2727 * the sharp s. All the ones that don't involve a UTF-8 target string are
2728 * members of a fold-pair, and arrays are set up for all of them so that
2729 * the other member of the pair can be found quickly. Code elsewhere in
2730 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2731 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2732 * described in the next item.
2733 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2734 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2735 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2736 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2737 * character in the pattern corresponds to at most a single character in
2738 * the target string. (And I do mean character, and not byte here, unlike
2739 * other parts of the documentation that have never been updated to
2740 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2741 * two character string 'ss'; in EXACTFA nodes it can match
2742 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2743 * instances where it is violated. I'm reluctant to try to change the
2744 * assumption, as the code involved is impenetrable to me (khw), so
2745 * instead the code here punts. This routine examines (when the pattern
2746 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2747 * boolean indicating whether or not the node contains a sharp s. When it
2748 * is true, the caller sets a flag that later causes the optimizer in this
2749 * file to not set values for the floating and fixed string lengths, and
2750 * thus avoids the optimizer code in regexec.c that makes the invalid
2751 * assumption. Thus, there is no optimization based on string lengths for
2752 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2753 * (The reason the assumption is wrong only in these two cases is that all
2754 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2755 * other folds to their expanded versions. We can't prefold sharp s to
2756 * 'ss' in EXACTF nodes because we don't know at compile time if it
2757 * actually matches 'ss' or not. It will match iff the target string is
2758 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2759 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2760 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2761 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2762 * require the pattern to be forced into UTF-8, the overhead of which we
2765 * Similarly, the code that generates tries doesn't currently handle
2766 * not-already-folded multi-char folds, and it looks like a pain to change
2767 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2768 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2769 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2770 * using /iaa matching will be doing so almost entirely with ASCII
2771 * strings, so this should rarely be encountered in practice */
2773 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2774 if (PL_regkind[OP(scan)] == EXACT) \
2775 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2778 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2779 /* Merge several consecutive EXACTish nodes into one. */
2780 regnode *n = regnext(scan);
2782 regnode *next = scan + NODE_SZ_STR(scan);
2786 regnode *stop = scan;
2787 GET_RE_DEBUG_FLAGS_DECL;
2789 PERL_UNUSED_ARG(depth);
2792 PERL_ARGS_ASSERT_JOIN_EXACT;
2793 #ifndef EXPERIMENTAL_INPLACESCAN
2794 PERL_UNUSED_ARG(flags);
2795 PERL_UNUSED_ARG(val);
2797 DEBUG_PEEP("join",scan,depth);
2799 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2800 * EXACT ones that are mergeable to the current one. */
2802 && (PL_regkind[OP(n)] == NOTHING
2803 || (stringok && OP(n) == OP(scan)))
2805 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2808 if (OP(n) == TAIL || n > next)
2810 if (PL_regkind[OP(n)] == NOTHING) {
2811 DEBUG_PEEP("skip:",n,depth);
2812 NEXT_OFF(scan) += NEXT_OFF(n);
2813 next = n + NODE_STEP_REGNODE;
2820 else if (stringok) {
2821 const unsigned int oldl = STR_LEN(scan);
2822 regnode * const nnext = regnext(n);
2824 /* XXX I (khw) kind of doubt that this works on platforms where
2825 * U8_MAX is above 255 because of lots of other assumptions */
2826 /* Don't join if the sum can't fit into a single node */
2827 if (oldl + STR_LEN(n) > U8_MAX)
2830 DEBUG_PEEP("merg",n,depth);
2833 NEXT_OFF(scan) += NEXT_OFF(n);
2834 STR_LEN(scan) += STR_LEN(n);
2835 next = n + NODE_SZ_STR(n);
2836 /* Now we can overwrite *n : */
2837 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2845 #ifdef EXPERIMENTAL_INPLACESCAN
2846 if (flags && !NEXT_OFF(n)) {
2847 DEBUG_PEEP("atch", val, depth);
2848 if (reg_off_by_arg[OP(n)]) {
2849 ARG_SET(n, val - n);
2852 NEXT_OFF(n) = val - n;
2860 *has_exactf_sharp_s = FALSE;
2862 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2863 * can now analyze for sequences of problematic code points. (Prior to
2864 * this final joining, sequences could have been split over boundaries, and
2865 * hence missed). The sequences only happen in folding, hence for any
2866 * non-EXACT EXACTish node */
2867 if (OP(scan) != EXACT) {
2868 const U8 * const s0 = (U8*) STRING(scan);
2870 const U8 * const s_end = s0 + STR_LEN(scan);
2872 /* One pass is made over the node's string looking for all the
2873 * possibilities. to avoid some tests in the loop, there are two main
2874 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2878 /* Examine the string for a multi-character fold sequence. UTF-8
2879 * patterns have all characters pre-folded by the time this code is
2881 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2882 length sequence we are looking for is 2 */
2885 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2886 if (! len) { /* Not a multi-char fold: get next char */
2891 /* Nodes with 'ss' require special handling, except for EXACTFL
2892 * and EXACTFA-ish for which there is no multi-char fold to
2894 if (len == 2 && *s == 's' && *(s+1) == 's'
2895 && OP(scan) != EXACTFL
2896 && OP(scan) != EXACTFA
2897 && OP(scan) != EXACTFA_NO_TRIE)
2900 OP(scan) = EXACTFU_SS;
2903 else { /* Here is a generic multi-char fold. */
2904 const U8* multi_end = s + len;
2906 /* Count how many characters in it. In the case of /l and
2907 * /aa, no folds which contain ASCII code points are
2908 * allowed, so check for those, and skip if found. (In
2909 * EXACTFL, no folds are allowed to any Latin1 code point,
2910 * not just ASCII. But there aren't any of these
2911 * currently, nor ever likely, so don't take the time to
2912 * test for them. The code that generates the
2913 * is_MULTI_foo() macros croaks should one actually get put
2914 * into Unicode .) */
2915 if (OP(scan) != EXACTFL
2916 && OP(scan) != EXACTFA
2917 && OP(scan) != EXACTFA_NO_TRIE)
2919 count = utf8_length(s, multi_end);
2923 while (s < multi_end) {
2926 goto next_iteration;
2936 /* The delta is how long the sequence is minus 1 (1 is how long
2937 * the character that folds to the sequence is) */
2938 *min_subtract += count - 1;
2942 else if (OP(scan) == EXACTFA) {
2944 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2945 * fold to the ASCII range (and there are no existing ones in the
2946 * upper latin1 range). But, as outlined in the comments preceding
2947 * this function, we need to flag any occurrences of the sharp s.
2948 * This character forbids trie formation (because of added
2951 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2952 OP(scan) = EXACTFA_NO_TRIE;
2953 *has_exactf_sharp_s = TRUE;
2960 else if (OP(scan) != EXACTFL) {
2962 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2963 * multi-char folds that are all Latin1. (This code knows that
2964 * there are no current multi-char folds possible with EXACTFL,
2965 * relying on fold_grind.t to catch any errors if the very unlikely
2966 * event happens that some get added in future Unicode versions.)
2967 * As explained in the comments preceding this function, we look
2968 * also for the sharp s in EXACTF nodes; it can be in the final
2969 * position. Otherwise we can stop looking 1 byte earlier because
2970 * have to find at least two characters for a multi-fold */
2971 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2974 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2975 if (! len) { /* Not a multi-char fold. */
2976 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2978 *has_exactf_sharp_s = TRUE;
2985 && isARG2_lower_or_UPPER_ARG1('s', *s)
2986 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2989 /* EXACTF nodes need to know that the minimum length
2990 * changed so that a sharp s in the string can match this
2991 * ss in the pattern, but they remain EXACTF nodes, as they
2992 * won't match this unless the target string is is UTF-8,
2993 * which we don't know until runtime */
2994 if (OP(scan) != EXACTF) {
2995 OP(scan) = EXACTFU_SS;
2999 *min_subtract += len - 1;
3006 /* Allow dumping but overwriting the collection of skipped
3007 * ops and/or strings with fake optimized ops */
3008 n = scan + NODE_SZ_STR(scan);
3016 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3020 /* REx optimizer. Converts nodes into quicker variants "in place".
3021 Finds fixed substrings. */
3023 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3024 to the position after last scanned or to NULL. */
3026 #define INIT_AND_WITHP \
3027 assert(!and_withp); \
3028 Newx(and_withp,1,struct regnode_charclass_class); \
3029 SAVEFREEPV(and_withp)
3031 /* this is a chain of data about sub patterns we are processing that
3032 need to be handled separately/specially in study_chunk. Its so
3033 we can simulate recursion without losing state. */
3035 typedef struct scan_frame {
3036 regnode *last; /* last node to process in this frame */
3037 regnode *next; /* next node to process when last is reached */
3038 struct scan_frame *prev; /*previous frame*/
3039 I32 stop; /* what stopparen do we use */
3043 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3046 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3047 SSize_t *minlenp, SSize_t *deltap,
3052 struct regnode_charclass_class *and_withp,
3053 U32 flags, U32 depth)
3054 /* scanp: Start here (read-write). */
3055 /* deltap: Write maxlen-minlen here. */
3056 /* last: Stop before this one. */
3057 /* data: string data about the pattern */
3058 /* stopparen: treat close N as END */
3059 /* recursed: which subroutines have we recursed into */
3060 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3063 /* There must be at least this number of characters to match */
3066 regnode *scan = *scanp, *next;
3068 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3069 int is_inf_internal = 0; /* The studied chunk is infinite */
3070 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3071 scan_data_t data_fake;
3072 SV *re_trie_maxbuff = NULL;
3073 regnode *first_non_open = scan;
3074 SSize_t stopmin = SSize_t_MAX;
3075 scan_frame *frame = NULL;
3076 GET_RE_DEBUG_FLAGS_DECL;
3078 PERL_ARGS_ASSERT_STUDY_CHUNK;
3081 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3085 while (first_non_open && OP(first_non_open) == OPEN)
3086 first_non_open=regnext(first_non_open);
3091 while ( scan && OP(scan) != END && scan < last ){
3092 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3093 node length to get a real minimum (because
3094 the folded version may be shorter) */
3095 bool has_exactf_sharp_s = FALSE;
3096 /* Peephole optimizer: */
3097 DEBUG_STUDYDATA("Peep:", data,depth);
3098 DEBUG_PEEP("Peep",scan,depth);
3100 /* Its not clear to khw or hv why this is done here, and not in the
3101 * clauses that deal with EXACT nodes. khw's guess is that it's
3102 * because of a previous design */
3103 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3105 /* Follow the next-chain of the current node and optimize
3106 away all the NOTHINGs from it. */
3107 if (OP(scan) != CURLYX) {
3108 const int max = (reg_off_by_arg[OP(scan)]
3110 /* I32 may be smaller than U16 on CRAYs! */
3111 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3112 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3116 /* Skip NOTHING and LONGJMP. */
3117 while ((n = regnext(n))
3118 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3119 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3120 && off + noff < max)
3122 if (reg_off_by_arg[OP(scan)])
3125 NEXT_OFF(scan) = off;
3130 /* The principal pseudo-switch. Cannot be a switch, since we
3131 look into several different things. */
3132 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3133 || OP(scan) == IFTHEN) {
3134 next = regnext(scan);
3136 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3138 if (OP(next) == code || code == IFTHEN) {
3139 /* NOTE - There is similar code to this block below for handling
3140 TRIE nodes on a re-study. If you change stuff here check there
3142 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3143 struct regnode_charclass_class accum;
3144 regnode * const startbranch=scan;
3146 if (flags & SCF_DO_SUBSTR)
3147 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3148 if (flags & SCF_DO_STCLASS)
3149 cl_init_zero(pRExC_state, &accum);
3151 while (OP(scan) == code) {
3152 SSize_t deltanext, minnext, fake;
3154 struct regnode_charclass_class this_class;
3157 data_fake.flags = 0;
3159 data_fake.whilem_c = data->whilem_c;
3160 data_fake.last_closep = data->last_closep;
3163 data_fake.last_closep = &fake;
3165 data_fake.pos_delta = delta;
3166 next = regnext(scan);
3167 scan = NEXTOPER(scan);
3169 scan = NEXTOPER(scan);
3170 if (flags & SCF_DO_STCLASS) {
3171 cl_init(pRExC_state, &this_class);
3172 data_fake.start_class = &this_class;
3173 f = SCF_DO_STCLASS_AND;
3175 if (flags & SCF_WHILEM_VISITED_POS)
3176 f |= SCF_WHILEM_VISITED_POS;
3178 /* we suppose the run is continuous, last=next...*/
3179 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3181 stopparen, recursed, NULL, f,depth+1);
3184 if (deltanext == SSize_t_MAX) {
3185 is_inf = is_inf_internal = 1;
3187 } else if (max1 < minnext + deltanext)
3188 max1 = minnext + deltanext;
3190 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3192 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3193 if ( stopmin > minnext)
3194 stopmin = min + min1;
3195 flags &= ~SCF_DO_SUBSTR;
3197 data->flags |= SCF_SEEN_ACCEPT;
3200 if (data_fake.flags & SF_HAS_EVAL)
3201 data->flags |= SF_HAS_EVAL;
3202 data->whilem_c = data_fake.whilem_c;
3204 if (flags & SCF_DO_STCLASS)
3205 cl_or(pRExC_state, &accum, &this_class);
3207 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3209 if (flags & SCF_DO_SUBSTR) {
3210 data->pos_min += min1;
3211 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3212 data->pos_delta = SSize_t_MAX;
3214 data->pos_delta += max1 - min1;
3215 if (max1 != min1 || is_inf)
3216 data->longest = &(data->longest_float);
3219 if (delta == SSize_t_MAX
3220 || SSize_t_MAX - delta - (max1 - min1) < 0)
3221 delta = SSize_t_MAX;
3223 delta += max1 - min1;
3224 if (flags & SCF_DO_STCLASS_OR) {
3225 cl_or(pRExC_state, data->start_class, &accum);
3227 cl_and(data->start_class, and_withp);
3228 flags &= ~SCF_DO_STCLASS;
3231 else if (flags & SCF_DO_STCLASS_AND) {
3233 cl_and(data->start_class, &accum);
3234 flags &= ~SCF_DO_STCLASS;
3237 /* Switch to OR mode: cache the old value of
3238 * data->start_class */
3240 StructCopy(data->start_class, and_withp,
3241 struct regnode_charclass_class);
3242 flags &= ~SCF_DO_STCLASS_AND;
3243 StructCopy(&accum, data->start_class,
3244 struct regnode_charclass_class);
3245 flags |= SCF_DO_STCLASS_OR;
3246 SET_SSC_EOS(data->start_class);
3250 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3253 Assuming this was/is a branch we are dealing with: 'scan' now
3254 points at the item that follows the branch sequence, whatever
3255 it is. We now start at the beginning of the sequence and look
3262 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3264 If we can find such a subsequence we need to turn the first
3265 element into a trie and then add the subsequent branch exact
3266 strings to the trie.
3270 1. patterns where the whole set of branches can be converted.
3272 2. patterns where only a subset can be converted.
3274 In case 1 we can replace the whole set with a single regop
3275 for the trie. In case 2 we need to keep the start and end
3278 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3279 becomes BRANCH TRIE; BRANCH X;
3281 There is an additional case, that being where there is a
3282 common prefix, which gets split out into an EXACT like node
3283 preceding the TRIE node.
3285 If x(1..n)==tail then we can do a simple trie, if not we make
3286 a "jump" trie, such that when we match the appropriate word
3287 we "jump" to the appropriate tail node. Essentially we turn
3288 a nested if into a case structure of sorts.
3293 if (!re_trie_maxbuff) {
3294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3295 if (!SvIOK(re_trie_maxbuff))
3296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3298 if ( SvIV(re_trie_maxbuff)>=0 ) {
3300 regnode *first = (regnode *)NULL;
3301 regnode *last = (regnode *)NULL;
3302 regnode *tail = scan;
3307 SV * const mysv = sv_newmortal(); /* for dumping */
3309 /* var tail is used because there may be a TAIL
3310 regop in the way. Ie, the exacts will point to the
3311 thing following the TAIL, but the last branch will
3312 point at the TAIL. So we advance tail. If we
3313 have nested (?:) we may have to move through several
3317 while ( OP( tail ) == TAIL ) {
3318 /* this is the TAIL generated by (?:) */
3319 tail = regnext( tail );
3323 DEBUG_TRIE_COMPILE_r({
3324 regprop(RExC_rx, mysv, tail );
3325 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3326 (int)depth * 2 + 2, "",
3327 "Looking for TRIE'able sequences. Tail node is: ",
3328 SvPV_nolen_const( mysv )
3334 Step through the branches
3335 cur represents each branch,
3336 noper is the first thing to be matched as part of that branch
3337 noper_next is the regnext() of that node.
3339 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3340 via a "jump trie" but we also support building with NOJUMPTRIE,
3341 which restricts the trie logic to structures like /FOO|BAR/.
3343 If noper is a trieable nodetype then the branch is a possible optimization
3344 target. If we are building under NOJUMPTRIE then we require that noper_next
3345 is the same as scan (our current position in the regex program).
3347 Once we have two or more consecutive such branches we can create a
3348 trie of the EXACT's contents and stitch it in place into the program.
3350 If the sequence represents all of the branches in the alternation we
3351 replace the entire thing with a single TRIE node.
3353 Otherwise when it is a subsequence we need to stitch it in place and
3354 replace only the relevant branches. This means the first branch has
3355 to remain as it is used by the alternation logic, and its next pointer,
3356 and needs to be repointed at the item on the branch chain following
3357 the last branch we have optimized away.
3359 This could be either a BRANCH, in which case the subsequence is internal,
3360 or it could be the item following the branch sequence in which case the
3361 subsequence is at the end (which does not necessarily mean the first node
3362 is the start of the alternation).
3364 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3367 ----------------+-----------
3371 EXACTFU_SS | EXACTFU
3376 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3377 ( EXACT == (X) ) ? EXACT : \
3378 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3379 ( EXACTFA == (X) ) ? EXACTFA : \
3382 /* dont use tail as the end marker for this traverse */
3383 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3384 regnode * const noper = NEXTOPER( cur );
3385 U8 noper_type = OP( noper );
3386 U8 noper_trietype = TRIE_TYPE( noper_type );
3387 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3388 regnode * const noper_next = regnext( noper );
3389 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3390 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3393 DEBUG_TRIE_COMPILE_r({
3394 regprop(RExC_rx, mysv, cur);
3395 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3396 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3398 regprop(RExC_rx, mysv, noper);
3399 PerlIO_printf( Perl_debug_log, " -> %s",
3400 SvPV_nolen_const(mysv));
3403 regprop(RExC_rx, mysv, noper_next );
3404 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3405 SvPV_nolen_const(mysv));
3407 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3408 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3409 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3413 /* Is noper a trieable nodetype that can be merged with the
3414 * current trie (if there is one)? */
3418 ( noper_trietype == NOTHING)
3419 || ( trietype == NOTHING )
3420 || ( trietype == noper_trietype )
3423 && noper_next == tail
3427 /* Handle mergable triable node
3428 * Either we are the first node in a new trieable sequence,
3429 * in which case we do some bookkeeping, otherwise we update
3430 * the end pointer. */
3433 if ( noper_trietype == NOTHING ) {
3434 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3435 regnode * const noper_next = regnext( noper );
3436 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3437 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3440 if ( noper_next_trietype ) {
3441 trietype = noper_next_trietype;
3442 } else if (noper_next_type) {
3443 /* a NOTHING regop is 1 regop wide. We need at least two
3444 * for a trie so we can't merge this in */
3448 trietype = noper_trietype;
3451 if ( trietype == NOTHING )
3452 trietype = noper_trietype;
3457 } /* end handle mergable triable node */
3459 /* handle unmergable node -
3460 * noper may either be a triable node which can not be tried
3461 * together with the current trie, or a non triable node */
3463 /* If last is set and trietype is not NOTHING then we have found
3464 * at least two triable branch sequences in a row of a similar
3465 * trietype so we can turn them into a trie. If/when we
3466 * allow NOTHING to start a trie sequence this condition will be
3467 * required, and it isn't expensive so we leave it in for now. */
3468 if ( trietype && trietype != NOTHING )
3469 make_trie( pRExC_state,
3470 startbranch, first, cur, tail, count,
3471 trietype, depth+1 );
3472 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3476 && noper_next == tail
3479 /* noper is triable, so we can start a new trie sequence */
3482 trietype = noper_trietype;
3484 /* if we already saw a first but the current node is not triable then we have
3485 * to reset the first information. */
3490 } /* end handle unmergable node */
3491 } /* loop over branches */
3492 DEBUG_TRIE_COMPILE_r({
3493 regprop(RExC_rx, mysv, cur);
3494 PerlIO_printf( Perl_debug_log,
3495 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3496 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3499 if ( last && trietype ) {
3500 if ( trietype != NOTHING ) {
3501 /* the last branch of the sequence was part of a trie,
3502 * so we have to construct it here outside of the loop
3504 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3505 #ifdef TRIE_STUDY_OPT
3506 if ( ((made == MADE_EXACT_TRIE &&
3507 startbranch == first)
3508 || ( first_non_open == first )) &&
3510 flags |= SCF_TRIE_RESTUDY;
3511 if ( startbranch == first
3514 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3519 /* at this point we know whatever we have is a NOTHING sequence/branch
3520 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3522 if ( startbranch == first ) {
3524 /* the entire thing is a NOTHING sequence, something like this:
3525 * (?:|) So we can turn it into a plain NOTHING op. */
3526 DEBUG_TRIE_COMPILE_r({
3527 regprop(RExC_rx, mysv, cur);
3528 PerlIO_printf( Perl_debug_log,
3529 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3530 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3533 OP(startbranch)= NOTHING;
3534 NEXT_OFF(startbranch)= tail - startbranch;
3535 for ( opt= startbranch + 1; opt < tail ; opt++ )
3539 } /* end if ( last) */
3540 } /* TRIE_MAXBUF is non zero */
3545 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3546 scan = NEXTOPER(NEXTOPER(scan));
3547 } else /* single branch is optimized. */
3548 scan = NEXTOPER(scan);
3550 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3551 scan_frame *newframe = NULL;
3556 if (OP(scan) != SUSPEND) {
3557 /* set the pointer */
3558 if (OP(scan) == GOSUB) {
3560 RExC_recurse[ARG2L(scan)] = scan;
3561 start = RExC_open_parens[paren-1];
3562 end = RExC_close_parens[paren-1];
3565 start = RExC_rxi->program + 1;
3569 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3570 SAVEFREEPV(recursed);
3572 if (!PAREN_TEST(recursed,paren+1)) {
3573 PAREN_SET(recursed,paren+1);
3574 Newx(newframe,1,scan_frame);
3576 if (flags & SCF_DO_SUBSTR) {
3577 SCAN_COMMIT(pRExC_state,data,minlenp);
3578 data->longest = &(data->longest_float);
3580 is_inf = is_inf_internal = 1;
3581 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3582 cl_anything(pRExC_state, data->start_class);
3583 flags &= ~SCF_DO_STCLASS;
3586 Newx(newframe,1,scan_frame);
3589 end = regnext(scan);
3594 SAVEFREEPV(newframe);
3595 newframe->next = regnext(scan);
3596 newframe->last = last;
3597 newframe->stop = stopparen;
3598 newframe->prev = frame;
3608 else if (OP(scan) == EXACT) {
3609 SSize_t l = STR_LEN(scan);
3612 const U8 * const s = (U8*)STRING(scan);
3613 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3614 l = utf8_length(s, s + l);
3616 uc = *((U8*)STRING(scan));
3619 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3620 /* The code below prefers earlier match for fixed
3621 offset, later match for variable offset. */
3622 if (data->last_end == -1) { /* Update the start info. */
3623 data->last_start_min = data->pos_min;
3624 data->last_start_max = is_inf
3625 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3627 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3629 SvUTF8_on(data->last_found);
3631 SV * const sv = data->last_found;
3632 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3633 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3634 if (mg && mg->mg_len >= 0)
3635 mg->mg_len += utf8_length((U8*)STRING(scan),
3636 (U8*)STRING(scan)+STR_LEN(scan));
3638 data->last_end = data->pos_min + l;
3639 data->pos_min += l; /* As in the first entry. */
3640 data->flags &= ~SF_BEFORE_EOL;
3642 if (flags & SCF_DO_STCLASS_AND) {
3643 /* Check whether it is compatible with what we know already! */
3647 /* If compatible, we or it in below. It is compatible if is
3648 * in the bitmp and either 1) its bit or its fold is set, or 2)
3649 * it's for a locale. Even if there isn't unicode semantics
3650 * here, at runtime there may be because of matching against a
3651 * utf8 string, so accept a possible false positive for
3652 * latin1-range folds */
3654 (!(data->start_class->flags & ANYOF_LOCALE)
3655 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3656 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3657 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3662 ANYOF_CLASS_ZERO(data->start_class);
3663 ANYOF_BITMAP_ZERO(data->start_class);
3665 ANYOF_BITMAP_SET(data->start_class, uc);
3666 else if (uc >= 0x100) {
3669 /* Some Unicode code points fold to the Latin1 range; as
3670 * XXX temporary code, instead of figuring out if this is
3671 * one, just assume it is and set all the start class bits
3672 * that could be some such above 255 code point's fold
3673 * which will generate fals positives. As the code
3674 * elsewhere that does compute the fold settles down, it
3675 * can be extracted out and re-used here */
3676 for (i = 0; i < 256; i++){
3677 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3678 ANYOF_BITMAP_SET(data->start_class, i);
3682 CLEAR_SSC_EOS(data->start_class);
3684 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3686 else if (flags & SCF_DO_STCLASS_OR) {
3687 /* false positive possible if the class is case-folded */
3689 ANYOF_BITMAP_SET(data->start_class, uc);
3691 data->start_class->flags |= ANYOF_UNICODE_ALL;
3692 CLEAR_SSC_EOS(data->start_class);
3693 cl_and(data->start_class, and_withp);
3695 flags &= ~SCF_DO_STCLASS;
3697 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3698 SSize_t l = STR_LEN(scan);
3699 UV uc = *((U8*)STRING(scan));
3701 /* Search for fixed substrings supports EXACT only. */
3702 if (flags & SCF_DO_SUBSTR) {
3704 SCAN_COMMIT(pRExC_state, data, minlenp);
3707 const U8 * const s = (U8 *)STRING(scan);
3708 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3709 l = utf8_length(s, s + l);
3711 if (has_exactf_sharp_s) {
3712 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3714 min += l - min_subtract;
3716 delta += min_subtract;
3717 if (flags & SCF_DO_SUBSTR) {
3718 data->pos_min += l - min_subtract;
3719 if (data->pos_min < 0) {
3722 data->pos_delta += min_subtract;
3724 data->longest = &(data->longest_float);
3727 if (flags & SCF_DO_STCLASS_AND) {
3728 /* Check whether it is compatible with what we know already! */
3731 (!(data->start_class->flags & ANYOF_LOCALE)
3732 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3733 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3737 ANYOF_CLASS_ZERO(data->start_class);
3738 ANYOF_BITMAP_ZERO(data->start_class);
3740 ANYOF_BITMAP_SET(data->start_class, uc);
3741 CLEAR_SSC_EOS(data->start_class);
3742 if (OP(scan) == EXACTFL) {
3743 /* XXX This set is probably no longer necessary, and
3744 * probably wrong as LOCALE now is on in the initial
3746 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3750 /* Also set the other member of the fold pair. In case
3751 * that unicode semantics is called for at runtime, use
3752 * the full latin1 fold. (Can't do this for locale,
3753 * because not known until runtime) */
3754 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3756 /* All other (EXACTFL handled above) folds except under
3757 * /iaa that include s, S, and sharp_s also may include
3759 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3761 if (uc == 's' || uc == 'S') {
3762 ANYOF_BITMAP_SET(data->start_class,
3763 LATIN_SMALL_LETTER_SHARP_S);
3765 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3766 ANYOF_BITMAP_SET(data->start_class, 's');
3767 ANYOF_BITMAP_SET(data->start_class, 'S');
3772 else if (uc >= 0x100) {
3774 for (i = 0; i < 256; i++){
3775 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3776 ANYOF_BITMAP_SET(data->start_class, i);
3781 else if (flags & SCF_DO_STCLASS_OR) {
3782 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3783 /* false positive possible if the class is case-folded.
3784 Assume that the locale settings are the same... */
3786 ANYOF_BITMAP_SET(data->start_class, uc);
3787 if (OP(scan) != EXACTFL) {
3789 /* And set the other member of the fold pair, but
3790 * can't do that in locale because not known until
3792 ANYOF_BITMAP_SET(data->start_class,
3793 PL_fold_latin1[uc]);
3795 /* All folds except under /iaa that include s, S,
3796 * and sharp_s also may include the others */
3797 if (OP(scan) != EXACTFA
3798 && OP(scan) != EXACTFA_NO_TRIE)
3800 if (uc == 's' || uc == 'S') {
3801 ANYOF_BITMAP_SET(data->start_class,
3802 LATIN_SMALL_LETTER_SHARP_S);
3804 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3805 ANYOF_BITMAP_SET(data->start_class, 's');
3806 ANYOF_BITMAP_SET(data->start_class, 'S');
3811 CLEAR_SSC_EOS(data->start_class);
3813 cl_and(data->start_class, and_withp);
3815 flags &= ~SCF_DO_STCLASS;
3817 else if (REGNODE_VARIES(OP(scan))) {
3818 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3819 I32 fl = 0, f = flags;
3820 regnode * const oscan = scan;
3821 struct regnode_charclass_class this_class;
3822 struct regnode_charclass_class *oclass = NULL;
3823 I32 next_is_eval = 0;
3825 switch (PL_regkind[OP(scan)]) {
3826 case WHILEM: /* End of (?:...)* . */
3827 scan = NEXTOPER(scan);
3830 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3831 next = NEXTOPER(scan);
3832 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3834 maxcount = REG_INFTY;
3835 next = regnext(scan);
3836 scan = NEXTOPER(scan);
3840 if (flags & SCF_DO_SUBSTR)
3845 if (flags & SCF_DO_STCLASS) {
3847 maxcount = REG_INFTY;
3848 next = regnext(scan);
3849 scan = NEXTOPER(scan);
3852 is_inf = is_inf_internal = 1;
3853 scan = regnext(scan);
3854 if (flags & SCF_DO_SUBSTR) {
3855 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3856 data->longest = &(data->longest_float);
3858 goto optimize_curly_tail;
3860 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3861 && (scan->flags == stopparen))
3866 mincount = ARG1(scan);
3867 maxcount = ARG2(scan);
3869 next = regnext(scan);
3870 if (OP(scan) == CURLYX) {
3871 I32 lp = (data ? *(data->last_closep) : 0);
3872 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3874 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3875 next_is_eval = (OP(scan) == EVAL);
3877 if (flags & SCF_DO_SUBSTR) {
3878 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3879 pos_before = data->pos_min;
3883 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3885 data->flags |= SF_IS_INF;
3887 if (flags & SCF_DO_STCLASS) {
3888 cl_init(pRExC_state, &this_class);
3889 oclass = data->start_class;
3890 data->start_class = &this_class;
3891 f |= SCF_DO_STCLASS_AND;
3892 f &= ~SCF_DO_STCLASS_OR;
3894 /* Exclude from super-linear cache processing any {n,m}
3895 regops for which the combination of input pos and regex
3896 pos is not enough information to determine if a match
3899 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3900 regex pos at the \s*, the prospects for a match depend not
3901 only on the input position but also on how many (bar\s*)
3902 repeats into the {4,8} we are. */
3903 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3904 f &= ~SCF_WHILEM_VISITED_POS;
3906 /* This will finish on WHILEM, setting scan, or on NULL: */
3907 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3908 last, data, stopparen, recursed, NULL,
3910 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3912 if (flags & SCF_DO_STCLASS)
3913 data->start_class = oclass;
3914 if (mincount == 0 || minnext == 0) {
3915 if (flags & SCF_DO_STCLASS_OR) {
3916 cl_or(pRExC_state, data->start_class, &this_class);
3918 else if (flags & SCF_DO_STCLASS_AND) {
3919 /* Switch to OR mode: cache the old value of
3920 * data->start_class */
3922 StructCopy(data->start_class, and_withp,
3923 struct regnode_charclass_class);
3924 flags &= ~SCF_DO_STCLASS_AND;
3925 StructCopy(&this_class, data->start_class,
3926 struct regnode_charclass_class);
3927 flags |= SCF_DO_STCLASS_OR;
3928 SET_SSC_EOS(data->start_class);
3930 } else { /* Non-zero len */
3931 if (flags & SCF_DO_STCLASS_OR) {
3932 cl_or(pRExC_state, data->start_class, &this_class);
3933 cl_and(data->start_class, and_withp);
3935 else if (flags & SCF_DO_STCLASS_AND)
3936 cl_and(data->start_class, &this_class);
3937 flags &= ~SCF_DO_STCLASS;
3939 if (!scan) /* It was not CURLYX, but CURLY. */
3941 if (!(flags & SCF_TRIE_DOING_RESTUDY)
3942 /* ? quantifier ok, except for (?{ ... }) */
3943 && (next_is_eval || !(mincount == 0 && maxcount == 1))
3944 && (minnext == 0) && (deltanext == 0)
3945 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3946 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3948 /* Fatal warnings may leak the regexp without this: */
3949 SAVEFREESV(RExC_rx_sv);
3950 ckWARNreg(RExC_parse,
3951 "Quantifier unexpected on zero-length expression");
3952 (void)ReREFCNT_inc(RExC_rx_sv);
3955 min += minnext * mincount;
3956 is_inf_internal |= deltanext == SSize_t_MAX
3957 || (maxcount == REG_INFTY && minnext + deltanext > 0);
3958 is_inf |= is_inf_internal;
3960 delta = SSize_t_MAX;
3962 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3964 /* Try powerful optimization CURLYX => CURLYN. */
3965 if ( OP(oscan) == CURLYX && data
3966 && data->flags & SF_IN_PAR
3967 && !(data->flags & SF_HAS_EVAL)
3968 && !deltanext && minnext == 1 ) {
3969 /* Try to optimize to CURLYN. */
3970 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3971 regnode * const nxt1 = nxt;
3978 if (!REGNODE_SIMPLE(OP(nxt))
3979 && !(PL_regkind[OP(nxt)] == EXACT
3980 && STR_LEN(nxt) == 1))
3986 if (OP(nxt) != CLOSE)
3988 if (RExC_open_parens) {
3989 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3990 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3992 /* Now we know that nxt2 is the only contents: */
3993 oscan->flags = (U8)ARG(nxt);
3995 OP(nxt1) = NOTHING; /* was OPEN. */
3998 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3999 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4000 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4001 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4002 OP(nxt + 1) = OPTIMIZED; /* was count. */
4003 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4008 /* Try optimization CURLYX => CURLYM. */
4009 if ( OP(oscan) == CURLYX && data
4010 && !(data->flags & SF_HAS_PAR)
4011 && !(data->flags & SF_HAS_EVAL)
4012 && !deltanext /* atom is fixed width */
4013 && minnext != 0 /* CURLYM can't handle zero width */
4014 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4016 /* XXXX How to optimize if data == 0? */
4017 /* Optimize to a simpler form. */
4018 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4022 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4023 && (OP(nxt2) != WHILEM))
4025 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4026 /* Need to optimize away parenths. */
4027 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4028 /* Set the parenth number. */
4029 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4031 oscan->flags = (U8)ARG(nxt);
4032 if (RExC_open_parens) {
4033 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4034 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4036 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4037 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4040 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4041 OP(nxt + 1) = OPTIMIZED; /* was count. */
4042 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4043 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4046 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4047 regnode *nnxt = regnext(nxt1);
4049 if (reg_off_by_arg[OP(nxt1)])
4050 ARG_SET(nxt1, nxt2 - nxt1);
4051 else if (nxt2 - nxt1 < U16_MAX)
4052 NEXT_OFF(nxt1) = nxt2 - nxt1;
4054 OP(nxt) = NOTHING; /* Cannot beautify */
4059 /* Optimize again: */
4060 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4061 NULL, stopparen, recursed, NULL, 0,depth+1);
4066 else if ((OP(oscan) == CURLYX)
4067 && (flags & SCF_WHILEM_VISITED_POS)
4068 /* See the comment on a similar expression above.
4069 However, this time it's not a subexpression
4070 we care about, but the expression itself. */
4071 && (maxcount == REG_INFTY)
4072 && data && ++data->whilem_c < 16) {
4073 /* This stays as CURLYX, we can put the count/of pair. */
4074 /* Find WHILEM (as in regexec.c) */
4075 regnode *nxt = oscan + NEXT_OFF(oscan);
4077 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4079 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4080 | (RExC_whilem_seen << 4)); /* On WHILEM */
4082 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4084 if (flags & SCF_DO_SUBSTR) {
4085 SV *last_str = NULL;
4086 int counted = mincount != 0;
4088 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4089 #if defined(SPARC64_GCC_WORKAROUND)
4092 const char *s = NULL;
4095 if (pos_before >= data->last_start_min)
4098 b = data->last_start_min;
4101 s = SvPV_const(data->last_found, l);
4102 old = b - data->last_start_min;
4105 SSize_t b = pos_before >= data->last_start_min
4106 ? pos_before : data->last_start_min;
4108 const char * const s = SvPV_const(data->last_found, l);
4109 SSize_t old = b - data->last_start_min;
4113 old = utf8_hop((U8*)s, old) - (U8*)s;
4115 /* Get the added string: */
4116 last_str = newSVpvn_utf8(s + old, l, UTF);
4117 if (deltanext == 0 && pos_before == b) {
4118 /* What was added is a constant string */
4120 SvGROW(last_str, (mincount * l) + 1);
4121 repeatcpy(SvPVX(last_str) + l,
4122 SvPVX_const(last_str), l, mincount - 1);
4123 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4124 /* Add additional parts. */
4125 SvCUR_set(data->last_found,
4126 SvCUR(data->last_found) - l);
4127 sv_catsv(data->last_found, last_str);
4129 SV * sv = data->last_found;
4131 SvUTF8(sv) && SvMAGICAL(sv) ?
4132 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4133 if (mg && mg->mg_len >= 0)
4134 mg->mg_len += CHR_SVLEN(last_str) - l;
4136 data->last_end += l * (mincount - 1);
4139 /* start offset must point into the last copy */
4140 data->last_start_min += minnext * (mincount - 1);
4141 data->last_start_max += is_inf ? SSize_t_MAX
4142 : (maxcount - 1) * (minnext + data->pos_delta);
4145 /* It is counted once already... */
4146 data->pos_min += minnext * (mincount - counted);
4148 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4149 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4150 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4151 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4153 if (deltanext != SSize_t_MAX)
4154 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4155 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4156 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4158 if (deltanext == SSize_t_MAX ||
4159 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4160 data->pos_delta = SSize_t_MAX;
4162 data->pos_delta += - counted * deltanext +
4163 (minnext + deltanext) * maxcount - minnext * mincount;
4164 if (mincount != maxcount) {
4165 /* Cannot extend fixed substrings found inside
4167 SCAN_COMMIT(pRExC_state,data,minlenp);
4168 if (mincount && last_str) {
4169 SV * const sv = data->last_found;
4170 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4171 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4175 sv_setsv(sv, last_str);
4176 data->last_end = data->pos_min;
4177 data->last_start_min =
4178 data->pos_min - CHR_SVLEN(last_str);
4179 data->last_start_max = is_inf
4181 : data->pos_min + data->pos_delta
4182 - CHR_SVLEN(last_str);
4184 data->longest = &(data->longest_float);
4186 SvREFCNT_dec(last_str);
4188 if (data && (fl & SF_HAS_EVAL))
4189 data->flags |= SF_HAS_EVAL;
4190 optimize_curly_tail:
4191 if (OP(oscan) != CURLYX) {
4192 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4194 NEXT_OFF(oscan) += NEXT_OFF(next);
4197 default: /* REF, and CLUMP only? */
4198 if (flags & SCF_DO_SUBSTR) {
4199 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4200 data->longest = &(data->longest_float);
4202 is_inf = is_inf_internal = 1;
4203 if (flags & SCF_DO_STCLASS_OR)
4204 cl_anything(pRExC_state, data->start_class);
4205 flags &= ~SCF_DO_STCLASS;
4209 else if (OP(scan) == LNBREAK) {
4210 if (flags & SCF_DO_STCLASS) {
4212 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4213 if (flags & SCF_DO_STCLASS_AND) {
4214 for (value = 0; value < 256; value++)
4215 if (!is_VERTWS_cp(value))
4216 ANYOF_BITMAP_CLEAR(data->start_class, value);
4219 for (value = 0; value < 256; value++)
4220 if (is_VERTWS_cp(value))
4221 ANYOF_BITMAP_SET(data->start_class, value);
4223 if (flags & SCF_DO_STCLASS_OR)
4224 cl_and(data->start_class, and_withp);
4225 flags &= ~SCF_DO_STCLASS;
4228 delta++; /* Because of the 2 char string cr-lf */
4229 if (flags & SCF_DO_SUBSTR) {
4230 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4232 data->pos_delta += 1;
4233 data->longest = &(data->longest_float);
4236 else if (REGNODE_SIMPLE(OP(scan))) {
4239 if (flags & SCF_DO_SUBSTR) {
4240 SCAN_COMMIT(pRExC_state,data,minlenp);
4244 if (flags & SCF_DO_STCLASS) {
4246 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4248 /* Some of the logic below assumes that switching
4249 locale on will only add false positives. */
4250 switch (PL_regkind[OP(scan)]) {
4256 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4259 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4260 cl_anything(pRExC_state, data->start_class);
4263 if (OP(scan) == SANY)
4265 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4266 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4267 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4268 cl_anything(pRExC_state, data->start_class);
4270 if (flags & SCF_DO_STCLASS_AND || !value)
4271 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4274 if (flags & SCF_DO_STCLASS_AND)
4275 cl_and(data->start_class,
4276 (struct regnode_charclass_class*)scan);
4278 cl_or(pRExC_state, data->start_class,
4279 (struct regnode_charclass_class*)scan);
4287 classnum = FLAGS(scan);
4288 if (flags & SCF_DO_STCLASS_AND) {
4289 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4290 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4291 for (value = 0; value < loop_max; value++) {
4292 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4293 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4299 if (data->start_class->flags & ANYOF_LOCALE) {
4300 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4304 /* Even if under locale, set the bits for non-locale
4305 * in case it isn't a true locale-node. This will
4306 * create false positives if it truly is locale */
4307 for (value = 0; value < loop_max; value++) {
4308 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4309 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4321 classnum = FLAGS(scan);
4322 if (flags & SCF_DO_STCLASS_AND) {
4323 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4324 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4325 for (value = 0; value < loop_max; value++) {
4326 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4327 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4333 if (data->start_class->flags & ANYOF_LOCALE) {
4334 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4338 /* Even if under locale, set the bits for non-locale in
4339 * case it isn't a true locale-node. This will create
4340 * false positives if it truly is locale */
4341 for (value = 0; value < loop_max; value++) {
4342 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4343 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4346 if (PL_regkind[OP(scan)] == NPOSIXD) {
4347 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4353 if (flags & SCF_DO_STCLASS_OR)
4354 cl_and(data->start_class, and_withp);
4355 flags &= ~SCF_DO_STCLASS;
4358 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4359 data->flags |= (OP(scan) == MEOL
4362 SCAN_COMMIT(pRExC_state, data, minlenp);
4365 else if ( PL_regkind[OP(scan)] == BRANCHJ
4366 /* Lookbehind, or need to calculate parens/evals/stclass: */
4367 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4368 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4369 if ( OP(scan) == UNLESSM &&
4371 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4372 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4375 regnode *upto= regnext(scan);
4377 SV * const mysv_val=sv_newmortal();
4378 DEBUG_STUDYDATA("OPFAIL",data,depth);
4380 /*DEBUG_PARSE_MSG("opfail");*/
4381 regprop(RExC_rx, mysv_val, upto);
4382 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4383 SvPV_nolen_const(mysv_val),
4384 (IV)REG_NODE_NUM(upto),
4389 NEXT_OFF(scan) = upto - scan;
4390 for (opt= scan + 1; opt < upto ; opt++)
4391 OP(opt) = OPTIMIZED;
4395 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4396 || OP(scan) == UNLESSM )
4398 /* Negative Lookahead/lookbehind
4399 In this case we can't do fixed string optimisation.
4402 SSize_t deltanext, minnext, fake = 0;
4404 struct regnode_charclass_class intrnl;
4407 data_fake.flags = 0;
4409 data_fake.whilem_c = data->whilem_c;
4410 data_fake.last_closep = data->last_closep;
4413 data_fake.last_closep = &fake;
4414 data_fake.pos_delta = delta;
4415 if ( flags & SCF_DO_STCLASS && !scan->flags
4416 && OP(scan) == IFMATCH ) { /* Lookahead */
4417 cl_init(pRExC_state, &intrnl);
4418 data_fake.start_class = &intrnl;
4419 f |= SCF_DO_STCLASS_AND;
4421 if (flags & SCF_WHILEM_VISITED_POS)
4422 f |= SCF_WHILEM_VISITED_POS;
4423 next = regnext(scan);
4424 nscan = NEXTOPER(NEXTOPER(scan));
4425 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4426 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4429 FAIL("Variable length lookbehind not implemented");
4431 else if (minnext > (I32)U8_MAX) {
4432 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4434 scan->flags = (U8)minnext;
4437 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4439 if (data_fake.flags & SF_HAS_EVAL)
4440 data->flags |= SF_HAS_EVAL;
4441 data->whilem_c = data_fake.whilem_c;
4443 if (f & SCF_DO_STCLASS_AND) {
4444 if (flags & SCF_DO_STCLASS_OR) {
4445 /* OR before, AND after: ideally we would recurse with
4446 * data_fake to get the AND applied by study of the
4447 * remainder of the pattern, and then derecurse;
4448 * *** HACK *** for now just treat as "no information".
4449 * See [perl #56690].
4451 cl_init(pRExC_state, data->start_class);
4453 /* AND before and after: combine and continue */
4454 const int was = TEST_SSC_EOS(data->start_class);
4456 cl_and(data->start_class, &intrnl);
4458 SET_SSC_EOS(data->start_class);
4462 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4464 /* Positive Lookahead/lookbehind
4465 In this case we can do fixed string optimisation,
4466 but we must be careful about it. Note in the case of
4467 lookbehind the positions will be offset by the minimum
4468 length of the pattern, something we won't know about
4469 until after the recurse.
4474 struct regnode_charclass_class intrnl;
4476 /* We use SAVEFREEPV so that when the full compile
4477 is finished perl will clean up the allocated
4478 minlens when it's all done. This way we don't
4479 have to worry about freeing them when we know
4480 they wont be used, which would be a pain.
4483 Newx( minnextp, 1, SSize_t );
4484 SAVEFREEPV(minnextp);
4487 StructCopy(data, &data_fake, scan_data_t);
4488 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4491 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4492 data_fake.last_found=newSVsv(data->last_found);
4496 data_fake.last_closep = &fake;
4497 data_fake.flags = 0;
4498 data_fake.pos_delta = delta;
4500 data_fake.flags |= SF_IS_INF;
4501 if ( flags & SCF_DO_STCLASS && !scan->flags
4502 && OP(scan) == IFMATCH ) { /* Lookahead */
4503 cl_init(pRExC_state, &intrnl);
4504 data_fake.start_class = &intrnl;
4505 f |= SCF_DO_STCLASS_AND;
4507 if (flags & SCF_WHILEM_VISITED_POS)
4508 f |= SCF_WHILEM_VISITED_POS;
4509 next = regnext(scan);
4510 nscan = NEXTOPER(NEXTOPER(scan));
4512 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4513 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4516 FAIL("Variable length lookbehind not implemented");
4518 else if (*minnextp > (I32)U8_MAX) {
4519 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4521 scan->flags = (U8)*minnextp;
4526 if (f & SCF_DO_STCLASS_AND) {
4527 const int was = TEST_SSC_EOS(data.start_class);
4529 cl_and(data->start_class, &intrnl);
4531 SET_SSC_EOS(data->start_class);
4534 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4536 if (data_fake.flags & SF_HAS_EVAL)
4537 data->flags |= SF_HAS_EVAL;
4538 data->whilem_c = data_fake.whilem_c;
4539 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4540 if (RExC_rx->minlen<*minnextp)
4541 RExC_rx->minlen=*minnextp;
4542 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4543 SvREFCNT_dec_NN(data_fake.last_found);
4545 if ( data_fake.minlen_fixed != minlenp )
4547 data->offset_fixed= data_fake.offset_fixed;
4548 data->minlen_fixed= data_fake.minlen_fixed;
4549 data->lookbehind_fixed+= scan->flags;
4551 if ( data_fake.minlen_float != minlenp )
4553 data->minlen_float= data_fake.minlen_float;
4554 data->offset_float_min=data_fake.offset_float_min;
4555 data->offset_float_max=data_fake.offset_float_max;
4556 data->lookbehind_float+= scan->flags;
4563 else if (OP(scan) == OPEN) {
4564 if (stopparen != (I32)ARG(scan))
4567 else if (OP(scan) == CLOSE) {
4568 if (stopparen == (I32)ARG(scan)) {
4571 if ((I32)ARG(scan) == is_par) {
4572 next = regnext(scan);
4574 if ( next && (OP(next) != WHILEM) && next < last)
4575 is_par = 0; /* Disable optimization */
4578 *(data->last_closep) = ARG(scan);
4580 else if (OP(scan) == EVAL) {
4582 data->flags |= SF_HAS_EVAL;
4584 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4585 if (flags & SCF_DO_SUBSTR) {
4586 SCAN_COMMIT(pRExC_state,data,minlenp);
4587 flags &= ~SCF_DO_SUBSTR;
4589 if (data && OP(scan)==ACCEPT) {
4590 data->flags |= SCF_SEEN_ACCEPT;
4595 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4597 if (flags & SCF_DO_SUBSTR) {
4598 SCAN_COMMIT(pRExC_state,data,minlenp);
4599 data->longest = &(data->longest_float);
4601 is_inf = is_inf_internal = 1;
4602 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4603 cl_anything(pRExC_state, data->start_class);
4604 flags &= ~SCF_DO_STCLASS;
4606 else if (OP(scan) == GPOS) {
4607 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4608 !(delta || is_inf || (data && data->pos_delta)))
4610 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4611 RExC_rx->extflags |= RXf_ANCH_GPOS;
4612 if (RExC_rx->gofs < (STRLEN)min)
4613 RExC_rx->gofs = min;
4615 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4619 #ifdef TRIE_STUDY_OPT
4620 #ifdef FULL_TRIE_STUDY
4621 else if (PL_regkind[OP(scan)] == TRIE) {
4622 /* NOTE - There is similar code to this block above for handling
4623 BRANCH nodes on the initial study. If you change stuff here
4625 regnode *trie_node= scan;
4626 regnode *tail= regnext(scan);
4627 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4628 SSize_t max1 = 0, min1 = SSize_t_MAX;
4629 struct regnode_charclass_class accum;
4631 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4632 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4633 if (flags & SCF_DO_STCLASS)
4634 cl_init_zero(pRExC_state, &accum);
4640 const regnode *nextbranch= NULL;
4643 for ( word=1 ; word <= trie->wordcount ; word++)
4645 SSize_t deltanext=0, minnext=0, f = 0, fake;
4646 struct regnode_charclass_class this_class;
4648 data_fake.flags = 0;
4650 data_fake.whilem_c = data->whilem_c;
4651 data_fake.last_closep = data->last_closep;
4654 data_fake.last_closep = &fake;
4655 data_fake.pos_delta = delta;
4656 if (flags & SCF_DO_STCLASS) {
4657 cl_init(pRExC_state, &this_class);
4658 data_fake.start_class = &this_class;
4659 f = SCF_DO_STCLASS_AND;
4661 if (flags & SCF_WHILEM_VISITED_POS)
4662 f |= SCF_WHILEM_VISITED_POS;
4664 if (trie->jump[word]) {
4666 nextbranch = trie_node + trie->jump[0];
4667 scan= trie_node + trie->jump[word];
4668 /* We go from the jump point to the branch that follows
4669 it. Note this means we need the vestigal unused branches
4670 even though they arent otherwise used.
4672 minnext = study_chunk(pRExC_state, &scan, minlenp,
4673 &deltanext, (regnode *)nextbranch, &data_fake,
4674 stopparen, recursed, NULL, f,depth+1);
4676 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4677 nextbranch= regnext((regnode*)nextbranch);
4679 if (min1 > (SSize_t)(minnext + trie->minlen))
4680 min1 = minnext + trie->minlen;
4681 if (deltanext == SSize_t_MAX) {
4682 is_inf = is_inf_internal = 1;
4684 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4685 max1 = minnext + deltanext + trie->maxlen;
4687 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4689 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4690 if ( stopmin > min + min1)
4691 stopmin = min + min1;
4692 flags &= ~SCF_DO_SUBSTR;
4694 data->flags |= SCF_SEEN_ACCEPT;
4697 if (data_fake.flags & SF_HAS_EVAL)
4698 data->flags |= SF_HAS_EVAL;
4699 data->whilem_c = data_fake.whilem_c;
4701 if (flags & SCF_DO_STCLASS)
4702 cl_or(pRExC_state, &accum, &this_class);
4705 if (flags & SCF_DO_SUBSTR) {
4706 data->pos_min += min1;
4707 data->pos_delta += max1 - min1;
4708 if (max1 != min1 || is_inf)
4709 data->longest = &(data->longest_float);
4712 delta += max1 - min1;
4713 if (flags & SCF_DO_STCLASS_OR) {
4714 cl_or(pRExC_state, data->start_class, &accum);
4716 cl_and(data->start_class, and_withp);
4717 flags &= ~SCF_DO_STCLASS;
4720 else if (flags & SCF_DO_STCLASS_AND) {
4722 cl_and(data->start_class, &accum);
4723 flags &= ~SCF_DO_STCLASS;
4726 /* Switch to OR mode: cache the old value of
4727 * data->start_class */
4729 StructCopy(data->start_class, and_withp,
4730 struct regnode_charclass_class);
4731 flags &= ~SCF_DO_STCLASS_AND;
4732 StructCopy(&accum, data->start_class,
4733 struct regnode_charclass_class);
4734 flags |= SCF_DO_STCLASS_OR;
4735 SET_SSC_EOS(data->start_class);
4742 else if (PL_regkind[OP(scan)] == TRIE) {
4743 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4746 min += trie->minlen;
4747 delta += (trie->maxlen - trie->minlen);
4748 flags &= ~SCF_DO_STCLASS; /* xxx */
4749 if (flags & SCF_DO_SUBSTR) {
4750 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4751 data->pos_min += trie->minlen;
4752 data->pos_delta += (trie->maxlen - trie->minlen);
4753 if (trie->maxlen != trie->minlen)
4754 data->longest = &(data->longest_float);
4756 if (trie->jump) /* no more substrings -- for now /grr*/
4757 flags &= ~SCF_DO_SUBSTR;
4759 #endif /* old or new */
4760 #endif /* TRIE_STUDY_OPT */
4762 /* Else: zero-length, ignore. */
4763 scan = regnext(scan);
4768 stopparen = frame->stop;
4769 frame = frame->prev;
4770 goto fake_study_recurse;
4775 DEBUG_STUDYDATA("pre-fin:",data,depth);
4778 *deltap = is_inf_internal ? SSize_t_MAX : delta;
4779 if (flags & SCF_DO_SUBSTR && is_inf)
4780 data->pos_delta = SSize_t_MAX - data->pos_min;
4781 if (is_par > (I32)U8_MAX)
4783 if (is_par && pars==1 && data) {
4784 data->flags |= SF_IN_PAR;
4785 data->flags &= ~SF_HAS_PAR;
4787 else if (pars && data) {
4788 data->flags |= SF_HAS_PAR;
4789 data->flags &= ~SF_IN_PAR;
4791 if (flags & SCF_DO_STCLASS_OR)
4792 cl_and(data->start_class, and_withp);
4793 if (flags & SCF_TRIE_RESTUDY)
4794 data->flags |= SCF_TRIE_RESTUDY;
4796 DEBUG_STUDYDATA("post-fin:",data,depth);
4798 return min < stopmin ? min : stopmin;
4802 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4804 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4806 PERL_ARGS_ASSERT_ADD_DATA;
4808 Renewc(RExC_rxi->data,
4809 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4810 char, struct reg_data);
4812 Renew(RExC_rxi->data->what, count + n, U8);
4814 Newx(RExC_rxi->data->what, n, U8);
4815 RExC_rxi->data->count = count + n;
4816 Copy(s, RExC_rxi->data->what + count, n, U8);
4820 /*XXX: todo make this not included in a non debugging perl */
4821 #ifndef PERL_IN_XSUB_RE
4823 Perl_reginitcolors(pTHX)
4826 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4828 char *t = savepv(s);
4832 t = strchr(t, '\t');
4838 PL_colors[i] = t = (char *)"";
4843 PL_colors[i++] = (char *)"";
4850 #ifdef TRIE_STUDY_OPT
4851 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4854 (data.flags & SCF_TRIE_RESTUDY) \
4862 #define CHECK_RESTUDY_GOTO_butfirst
4866 * pregcomp - compile a regular expression into internal code
4868 * Decides which engine's compiler to call based on the hint currently in
4872 #ifndef PERL_IN_XSUB_RE
4874 /* return the currently in-scope regex engine (or the default if none) */
4876 regexp_engine const *
4877 Perl_current_re_engine(pTHX)
4881 if (IN_PERL_COMPILETIME) {
4882 HV * const table = GvHV(PL_hintgv);
4886 return &PL_core_reg_engine;
4887 ptr = hv_fetchs(table, "regcomp", FALSE);
4888 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4889 return &PL_core_reg_engine;
4890 return INT2PTR(regexp_engine*,SvIV(*ptr));
4894 if (!PL_curcop->cop_hints_hash)
4895 return &PL_core_reg_engine;
4896 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4897 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4898 return &PL_core_reg_engine;
4899 return INT2PTR(regexp_engine*,SvIV(ptr));
4905 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4908 regexp_engine const *eng = current_re_engine();
4909 GET_RE_DEBUG_FLAGS_DECL;
4911 PERL_ARGS_ASSERT_PREGCOMP;
4913 /* Dispatch a request to compile a regexp to correct regexp engine. */
4915 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4918 return CALLREGCOMP_ENG(eng, pattern, flags);
4922 /* public(ish) entry point for the perl core's own regex compiling code.
4923 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4924 * pattern rather than a list of OPs, and uses the internal engine rather
4925 * than the current one */
4928 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4930 SV *pat = pattern; /* defeat constness! */
4931 PERL_ARGS_ASSERT_RE_COMPILE;
4932 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4933 #ifdef PERL_IN_XSUB_RE
4936 &PL_core_reg_engine,
4938 NULL, NULL, rx_flags, 0);
4942 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4943 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4944 * point to the realloced string and length.
4946 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4950 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4951 char **pat_p, STRLEN *plen_p, int num_code_blocks)
4953 U8 *const src = (U8*)*pat_p;
4956 STRLEN s = 0, d = 0;
4958 GET_RE_DEBUG_FLAGS_DECL;
4960 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4961 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4963 Newx(dst, *plen_p * 2 + 1, U8);
4965 while (s < *plen_p) {
4966 if (NATIVE_IS_INVARIANT(src[s]))
4969 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
4970 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
4972 if (n < num_code_blocks) {
4973 if (!do_end && pRExC_state->code_blocks[n].start == s) {
4974 pRExC_state->code_blocks[n].start = d;
4975 assert(dst[d] == '(');
4978 else if (do_end && pRExC_state->code_blocks[n].end == s) {
4979 pRExC_state->code_blocks[n].end = d;
4980 assert(dst[d] == ')');
4990 *pat_p = (char*) dst;
4992 RExC_orig_utf8 = RExC_utf8 = 1;
4997 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
4998 * while recording any code block indices, and handling overloading,
4999 * nested qr// objects etc. If pat is null, it will allocate a new
5000 * string, or just return the first arg, if there's only one.
5002 * Returns the malloced/updated pat.
5003 * patternp and pat_count is the array of SVs to be concatted;
5004 * oplist is the optional list of ops that generated the SVs;
5005 * recompile_p is a pointer to a boolean that will be set if
5006 * the regex will need to be recompiled.
5007 * delim, if non-null is an SV that will be inserted between each element
5011 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5012 SV *pat, SV ** const patternp, int pat_count,
5013 OP *oplist, bool *recompile_p, SV *delim)
5017 bool use_delim = FALSE;
5018 bool alloced = FALSE;
5020 /* if we know we have at least two args, create an empty string,
5021 * then concatenate args to that. For no args, return an empty string */
5022 if (!pat && pat_count != 1) {
5023 pat = newSVpvn("", 0);
5028 for (svp = patternp; svp < patternp + pat_count; svp++) {
5031 STRLEN orig_patlen = 0;
5033 SV *msv = use_delim ? delim : *svp;
5035 /* if we've got a delimiter, we go round the loop twice for each
5036 * svp slot (except the last), using the delimiter the second
5045 if (SvTYPE(msv) == SVt_PVAV) {
5046 /* we've encountered an interpolated array within
5047 * the pattern, e.g. /...@a..../. Expand the list of elements,
5048 * then recursively append elements.
5049 * The code in this block is based on S_pushav() */
5051 AV *const av = (AV*)msv;
5052 const I32 maxarg = AvFILL(av) + 1;
5056 assert(oplist->op_type == OP_PADAV
5057 || oplist->op_type == OP_RV2AV);
5058 oplist = oplist->op_sibling;;
5061 if (SvRMAGICAL(av)) {
5064 Newx(array, maxarg, SV*);
5066 for (i=0; i < (U32)maxarg; i++) {
5067 SV ** const svp = av_fetch(av, i, FALSE);
5068 array[i] = svp ? *svp : &PL_sv_undef;
5072 array = AvARRAY(av);
5074 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5075 array, maxarg, NULL, recompile_p,
5077 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5083 /* we make the assumption here that each op in the list of
5084 * op_siblings maps to one SV pushed onto the stack,
5085 * except for code blocks, with have both an OP_NULL and
5087 * This allows us to match up the list of SVs against the
5088 * list of OPs to find the next code block.
5090 * Note that PUSHMARK PADSV PADSV ..
5092 * PADRANGE PADSV PADSV ..
5093 * so the alignment still works. */
5096 if (oplist->op_type == OP_NULL
5097 && (oplist->op_flags & OPf_SPECIAL))
5099 assert(n < pRExC_state->num_code_blocks);
5100 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5101 pRExC_state->code_blocks[n].block = oplist;
5102 pRExC_state->code_blocks[n].src_regex = NULL;
5105 oplist = oplist->op_sibling; /* skip CONST */
5108 oplist = oplist->op_sibling;;
5111 /* apply magic and QR overloading to arg */
5114 if (SvROK(msv) && SvAMAGIC(msv)) {
5115 SV *sv = AMG_CALLunary(msv, regexp_amg);
5119 if (SvTYPE(sv) != SVt_REGEXP)
5120 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5125 /* try concatenation overload ... */
5126 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5127 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5130 /* overloading involved: all bets are off over literal
5131 * code. Pretend we haven't seen it */
5132 pRExC_state->num_code_blocks -= n;
5136 /* ... or failing that, try "" overload */
5137 while (SvAMAGIC(msv)
5138 && (sv = AMG_CALLunary(msv, string_amg))
5142 && SvRV(msv) == SvRV(sv))
5147 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5151 /* this is a partially unrolled
5152 * sv_catsv_nomg(pat, msv);
5153 * that allows us to adjust code block indices if
5156 char *dst = SvPV_force_nomg(pat, dlen);
5158 if (SvUTF8(msv) && !SvUTF8(pat)) {
5159 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5160 sv_setpvn(pat, dst, dlen);
5163 sv_catsv_nomg(pat, msv);
5170 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5173 /* extract any code blocks within any embedded qr//'s */
5174 if (rx && SvTYPE(rx) == SVt_REGEXP
5175 && RX_ENGINE((REGEXP*)rx)->op_comp)
5178 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5179 if (ri->num_code_blocks) {
5181 /* the presence of an embedded qr// with code means
5182 * we should always recompile: the text of the
5183 * qr// may not have changed, but it may be a
5184 * different closure than last time */
5186 Renew(pRExC_state->code_blocks,
5187 pRExC_state->num_code_blocks + ri->num_code_blocks,
5188 struct reg_code_block);
5189 pRExC_state->num_code_blocks += ri->num_code_blocks;
5191 for (i=0; i < ri->num_code_blocks; i++) {
5192 struct reg_code_block *src, *dst;
5193 STRLEN offset = orig_patlen
5194 + ReANY((REGEXP *)rx)->pre_prefix;
5195 assert(n < pRExC_state->num_code_blocks);
5196 src = &ri->code_blocks[i];
5197 dst = &pRExC_state->code_blocks[n];
5198 dst->start = src->start + offset;
5199 dst->end = src->end + offset;
5200 dst->block = src->block;
5201 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5210 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5219 /* see if there are any run-time code blocks in the pattern.
5220 * False positives are allowed */
5223 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5224 char *pat, STRLEN plen)
5229 for (s = 0; s < plen; s++) {
5230 if (n < pRExC_state->num_code_blocks
5231 && s == pRExC_state->code_blocks[n].start)
5233 s = pRExC_state->code_blocks[n].end;
5237 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5239 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5241 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5248 /* Handle run-time code blocks. We will already have compiled any direct
5249 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5250 * copy of it, but with any literal code blocks blanked out and
5251 * appropriate chars escaped; then feed it into
5253 * eval "qr'modified_pattern'"
5257 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5261 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5263 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5264 * and merge them with any code blocks of the original regexp.
5266 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5267 * instead, just save the qr and return FALSE; this tells our caller that
5268 * the original pattern needs upgrading to utf8.
5272 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5273 char *pat, STRLEN plen)
5277 GET_RE_DEBUG_FLAGS_DECL;
5279 if (pRExC_state->runtime_code_qr) {
5280 /* this is the second time we've been called; this should
5281 * only happen if the main pattern got upgraded to utf8
5282 * during compilation; re-use the qr we compiled first time
5283 * round (which should be utf8 too)
5285 qr = pRExC_state->runtime_code_qr;
5286 pRExC_state->runtime_code_qr = NULL;
5287 assert(RExC_utf8 && SvUTF8(qr));
5293 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5297 /* determine how many extra chars we need for ' and \ escaping */
5298 for (s = 0; s < plen; s++) {
5299 if (pat[s] == '\'' || pat[s] == '\\')
5303 Newx(newpat, newlen, char);
5305 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5307 for (s = 0; s < plen; s++) {
5308 if (n < pRExC_state->num_code_blocks
5309 && s == pRExC_state->code_blocks[n].start)
5311 /* blank out literal code block */
5312 assert(pat[s] == '(');
5313 while (s <= pRExC_state->code_blocks[n].end) {
5321 if (pat[s] == '\'' || pat[s] == '\\')
5326 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5330 PerlIO_printf(Perl_debug_log,
5331 "%sre-parsing pattern for runtime code:%s %s\n",
5332 PL_colors[4],PL_colors[5],newpat);
5335 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5341 PUSHSTACKi(PERLSI_REQUIRE);
5342 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5343 * parsing qr''; normally only q'' does this. It also alters
5345 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5346 SvREFCNT_dec_NN(sv);
5351 SV * const errsv = ERRSV;
5352 if (SvTRUE_NN(errsv))
5354 Safefree(pRExC_state->code_blocks);
5355 /* use croak_sv ? */
5356 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5359 assert(SvROK(qr_ref));
5361 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5362 /* the leaving below frees the tmp qr_ref.
5363 * Give qr a life of its own */
5371 if (!RExC_utf8 && SvUTF8(qr)) {
5372 /* first time through; the pattern got upgraded; save the
5373 * qr for the next time through */
5374 assert(!pRExC_state->runtime_code_qr);
5375 pRExC_state->runtime_code_qr = qr;
5380 /* extract any code blocks within the returned qr// */
5383 /* merge the main (r1) and run-time (r2) code blocks into one */
5385 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5386 struct reg_code_block *new_block, *dst;
5387 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5390 if (!r2->num_code_blocks) /* we guessed wrong */
5392 SvREFCNT_dec_NN(qr);
5397 r1->num_code_blocks + r2->num_code_blocks,
5398 struct reg_code_block);
5401 while ( i1 < r1->num_code_blocks
5402 || i2 < r2->num_code_blocks)
5404 struct reg_code_block *src;
5407 if (i1 == r1->num_code_blocks) {
5408 src = &r2->code_blocks[i2++];
5411 else if (i2 == r2->num_code_blocks)
5412 src = &r1->code_blocks[i1++];
5413 else if ( r1->code_blocks[i1].start
5414 < r2->code_blocks[i2].start)
5416 src = &r1->code_blocks[i1++];
5417 assert(src->end < r2->code_blocks[i2].start);
5420 assert( r1->code_blocks[i1].start
5421 > r2->code_blocks[i2].start);
5422 src = &r2->code_blocks[i2++];
5424 assert(src->end < r1->code_blocks[i1].start);
5427 assert(pat[src->start] == '(');
5428 assert(pat[src->end] == ')');
5429 dst->start = src->start;
5430 dst->end = src->end;
5431 dst->block = src->block;
5432 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5436 r1->num_code_blocks += r2->num_code_blocks;
5437 Safefree(r1->code_blocks);
5438 r1->code_blocks = new_block;
5441 SvREFCNT_dec_NN(qr);
5447 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5448 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5450 /* This is the common code for setting up the floating and fixed length
5451 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5452 * as to whether succeeded or not */
5457 if (! (longest_length
5458 || (eol /* Can't have SEOL and MULTI */
5459 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5461 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5462 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5467 /* copy the information about the longest from the reg_scan_data
5468 over to the program. */
5469 if (SvUTF8(sv_longest)) {
5470 *rx_utf8 = sv_longest;
5473 *rx_substr = sv_longest;
5476 /* end_shift is how many chars that must be matched that
5477 follow this item. We calculate it ahead of time as once the
5478 lookbehind offset is added in we lose the ability to correctly
5480 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5481 *rx_end_shift = ml - offset
5482 - longest_length + (SvTAIL(sv_longest) != 0)
5485 t = (eol/* Can't have SEOL and MULTI */
5486 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5487 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5493 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5494 * regular expression into internal code.
5495 * The pattern may be passed either as:
5496 * a list of SVs (patternp plus pat_count)
5497 * a list of OPs (expr)
5498 * If both are passed, the SV list is used, but the OP list indicates
5499 * which SVs are actually pre-compiled code blocks
5501 * The SVs in the list have magic and qr overloading applied to them (and
5502 * the list may be modified in-place with replacement SVs in the latter
5505 * If the pattern hasn't changed from old_re, then old_re will be
5508 * eng is the current engine. If that engine has an op_comp method, then
5509 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5510 * do the initial concatenation of arguments and pass on to the external
5513 * If is_bare_re is not null, set it to a boolean indicating whether the
5514 * arg list reduced (after overloading) to a single bare regex which has
5515 * been returned (i.e. /$qr/).
5517 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5519 * pm_flags contains the PMf_* flags, typically based on those from the
5520 * pm_flags field of the related PMOP. Currently we're only interested in
5521 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5523 * We can't allocate space until we know how big the compiled form will be,
5524 * but we can't compile it (and thus know how big it is) until we've got a
5525 * place to put the code. So we cheat: we compile it twice, once with code
5526 * generation turned off and size counting turned on, and once "for real".
5527 * This also means that we don't allocate space until we are sure that the
5528 * thing really will compile successfully, and we never have to move the
5529 * code and thus invalidate pointers into it. (Note that it has to be in
5530 * one piece because free() must be able to free it all.) [NB: not true in perl]
5532 * Beware that the optimization-preparation code in here knows about some
5533 * of the structure of the compiled regexp. [I'll say.]
5537 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5538 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5539 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5544 regexp_internal *ri;
5552 SV *code_blocksv = NULL;
5553 SV** new_patternp = patternp;
5555 /* these are all flags - maybe they should be turned
5556 * into a single int with different bit masks */
5557 I32 sawlookahead = 0;
5562 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5564 bool runtime_code = 0;
5566 RExC_state_t RExC_state;
5567 RExC_state_t * const pRExC_state = &RExC_state;
5568 #ifdef TRIE_STUDY_OPT
5570 RExC_state_t copyRExC_state;
5572 GET_RE_DEBUG_FLAGS_DECL;
5574 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5576 DEBUG_r(if (!PL_colorset) reginitcolors());
5578 #ifndef PERL_IN_XSUB_RE
5579 /* Initialize these here instead of as-needed, as is quick and avoids
5580 * having to test them each time otherwise */
5581 if (! PL_AboveLatin1) {
5582 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5583 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5584 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5586 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5587 = _new_invlist_C_array(L1PosixAlnum_invlist);
5588 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5589 = _new_invlist_C_array(PosixAlnum_invlist);
5591 PL_L1Posix_ptrs[_CC_ALPHA]
5592 = _new_invlist_C_array(L1PosixAlpha_invlist);
5593 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5595 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5596 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5598 /* Cased is the same as Alpha in the ASCII range */
5599 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5600 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5602 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5603 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5605 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5606 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5608 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5609 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5611 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5612 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5614 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5615 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5617 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5618 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5620 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5621 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5622 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5623 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5625 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5626 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5628 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5630 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5631 PL_L1Posix_ptrs[_CC_WORDCHAR]
5632 = _new_invlist_C_array(L1PosixWord_invlist);
5634 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5635 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5637 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5641 pRExC_state->code_blocks = NULL;
5642 pRExC_state->num_code_blocks = 0;
5645 *is_bare_re = FALSE;
5647 if (expr && (expr->op_type == OP_LIST ||
5648 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5649 /* allocate code_blocks if needed */
5653 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5654 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5655 ncode++; /* count of DO blocks */
5657 pRExC_state->num_code_blocks = ncode;
5658 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5663 /* compile-time pattern with just OP_CONSTs and DO blocks */
5668 /* find how many CONSTs there are */
5671 if (expr->op_type == OP_CONST)
5674 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5675 if (o->op_type == OP_CONST)
5679 /* fake up an SV array */
5681 assert(!new_patternp);
5682 Newx(new_patternp, n, SV*);
5683 SAVEFREEPV(new_patternp);
5687 if (expr->op_type == OP_CONST)
5688 new_patternp[n] = cSVOPx_sv(expr);
5690 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5691 if (o->op_type == OP_CONST)
5692 new_patternp[n++] = cSVOPo_sv;
5697 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5698 "Assembling pattern from %d elements%s\n", pat_count,
5699 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5701 /* set expr to the first arg op */
5703 if (pRExC_state->num_code_blocks
5704 && expr->op_type != OP_CONST)
5706 expr = cLISTOPx(expr)->op_first;
5707 assert( expr->op_type == OP_PUSHMARK
5708 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5709 || expr->op_type == OP_PADRANGE);
5710 expr = expr->op_sibling;
5713 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5714 expr, &recompile, NULL);
5716 /* handle bare (possibly after overloading) regex: foo =~ $re */
5721 if (SvTYPE(re) == SVt_REGEXP) {
5725 Safefree(pRExC_state->code_blocks);
5726 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5727 "Precompiled pattern%s\n",
5728 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5734 exp = SvPV_nomg(pat, plen);
5736 if (!eng->op_comp) {
5737 if ((SvUTF8(pat) && IN_BYTES)
5738 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5740 /* make a temporary copy; either to convert to bytes,
5741 * or to avoid repeating get-magic / overloaded stringify */
5742 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5743 (IN_BYTES ? 0 : SvUTF8(pat)));
5745 Safefree(pRExC_state->code_blocks);
5746 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5749 /* ignore the utf8ness if the pattern is 0 length */
5750 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5751 RExC_uni_semantics = 0;
5752 RExC_contains_locale = 0;
5753 pRExC_state->runtime_code_qr = NULL;
5756 SV *dsv= sv_newmortal();
5757 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5758 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5759 PL_colors[4],PL_colors[5],s);
5763 /* we jump here if we upgrade the pattern to utf8 and have to
5766 if ((pm_flags & PMf_USE_RE_EVAL)
5767 /* this second condition covers the non-regex literal case,
5768 * i.e. $foo =~ '(?{})'. */
5769 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5771 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5773 /* return old regex if pattern hasn't changed */
5774 /* XXX: note in the below we have to check the flags as well as the pattern.
5776 * Things get a touch tricky as we have to compare the utf8 flag independently
5777 * from the compile flags.
5782 && !!RX_UTF8(old_re) == !!RExC_utf8
5783 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5784 && RX_PRECOMP(old_re)
5785 && RX_PRELEN(old_re) == plen
5786 && memEQ(RX_PRECOMP(old_re), exp, plen)
5787 && !runtime_code /* with runtime code, always recompile */ )
5789 Safefree(pRExC_state->code_blocks);
5793 rx_flags = orig_rx_flags;
5795 if (initial_charset == REGEX_LOCALE_CHARSET) {
5796 RExC_contains_locale = 1;
5798 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5800 /* Set to use unicode semantics if the pattern is in utf8 and has the
5801 * 'depends' charset specified, as it means unicode when utf8 */
5802 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5806 RExC_flags = rx_flags;
5807 RExC_pm_flags = pm_flags;
5810 if (TAINTING_get && TAINT_get)
5811 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5813 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5814 /* whoops, we have a non-utf8 pattern, whilst run-time code
5815 * got compiled as utf8. Try again with a utf8 pattern */
5816 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5817 pRExC_state->num_code_blocks);
5818 goto redo_first_pass;
5821 assert(!pRExC_state->runtime_code_qr);
5826 RExC_in_lookbehind = 0;
5827 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5829 RExC_override_recoding = 0;
5830 RExC_in_multi_char_class = 0;
5832 /* First pass: determine size, legality. */
5835 RExC_end = exp + plen;
5840 RExC_emit = &RExC_emit_dummy;
5841 RExC_whilem_seen = 0;
5842 RExC_open_parens = NULL;
5843 RExC_close_parens = NULL;
5845 RExC_paren_names = NULL;
5847 RExC_paren_name_list = NULL;
5849 RExC_recurse = NULL;
5850 RExC_recurse_count = 0;
5851 pRExC_state->code_index = 0;
5853 #if 0 /* REGC() is (currently) a NOP at the first pass.
5854 * Clever compilers notice this and complain. --jhi */
5855 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5858 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5860 RExC_lastparse=NULL;
5862 /* reg may croak on us, not giving us a chance to free
5863 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5864 need it to survive as long as the regexp (qr/(?{})/).
5865 We must check that code_blocksv is not already set, because we may
5866 have jumped back to restart the sizing pass. */
5867 if (pRExC_state->code_blocks && !code_blocksv) {
5868 code_blocksv = newSV_type(SVt_PV);
5869 SAVEFREESV(code_blocksv);
5870 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5871 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5873 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5874 /* It's possible to write a regexp in ascii that represents Unicode
5875 codepoints outside of the byte range, such as via \x{100}. If we
5876 detect such a sequence we have to convert the entire pattern to utf8
5877 and then recompile, as our sizing calculation will have been based
5878 on 1 byte == 1 character, but we will need to use utf8 to encode
5879 at least some part of the pattern, and therefore must convert the whole
5882 if (flags & RESTART_UTF8) {
5883 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5884 pRExC_state->num_code_blocks);
5885 goto redo_first_pass;
5887 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5890 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5893 PerlIO_printf(Perl_debug_log,
5894 "Required size %"IVdf" nodes\n"
5895 "Starting second pass (creation)\n",
5898 RExC_lastparse=NULL;
5901 /* The first pass could have found things that force Unicode semantics */
5902 if ((RExC_utf8 || RExC_uni_semantics)
5903 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5905 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5908 /* Small enough for pointer-storage convention?
5909 If extralen==0, this means that we will not need long jumps. */
5910 if (RExC_size >= 0x10000L && RExC_extralen)
5911 RExC_size += RExC_extralen;
5914 if (RExC_whilem_seen > 15)
5915 RExC_whilem_seen = 15;
5917 /* Allocate space and zero-initialize. Note, the two step process
5918 of zeroing when in debug mode, thus anything assigned has to
5919 happen after that */
5920 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5922 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5923 char, regexp_internal);
5924 if ( r == NULL || ri == NULL )
5925 FAIL("Regexp out of space");
5927 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5928 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5930 /* bulk initialize base fields with 0. */
5931 Zero(ri, sizeof(regexp_internal), char);
5934 /* non-zero initialization begins here */
5937 r->extflags = rx_flags;
5938 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5940 if (pm_flags & PMf_IS_QR) {
5941 ri->code_blocks = pRExC_state->code_blocks;
5942 ri->num_code_blocks = pRExC_state->num_code_blocks;
5947 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5948 if (pRExC_state->code_blocks[n].src_regex)
5949 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5950 SAVEFREEPV(pRExC_state->code_blocks);
5954 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5955 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5957 /* The caret is output if there are any defaults: if not all the STD
5958 * flags are set, or if no character set specifier is needed */
5960 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5962 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5963 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5964 >> RXf_PMf_STD_PMMOD_SHIFT);
5965 const char *fptr = STD_PAT_MODS; /*"msix"*/
5967 /* Allocate for the worst case, which is all the std flags are turned
5968 * on. If more precision is desired, we could do a population count of
5969 * the flags set. This could be done with a small lookup table, or by
5970 * shifting, masking and adding, or even, when available, assembly
5971 * language for a machine-language population count.
5972 * We never output a minus, as all those are defaults, so are
5973 * covered by the caret */
5974 const STRLEN wraplen = plen + has_p + has_runon
5975 + has_default /* If needs a caret */
5977 /* If needs a character set specifier */
5978 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5979 + (sizeof(STD_PAT_MODS) - 1)
5980 + (sizeof("(?:)") - 1);
5982 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5983 r->xpv_len_u.xpvlenu_pv = p;
5985 SvFLAGS(rx) |= SVf_UTF8;
5988 /* If a default, cover it using the caret */
5990 *p++= DEFAULT_PAT_MOD;
5994 const char* const name = get_regex_charset_name(r->extflags, &len);
5995 Copy(name, p, len, char);
5999 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6002 while((ch = *fptr++)) {
6010 Copy(RExC_precomp, p, plen, char);
6011 assert ((RX_WRAPPED(rx) - p) < 16);
6012 r->pre_prefix = p - RX_WRAPPED(rx);
6018 SvCUR_set(rx, p - RX_WRAPPED(rx));
6022 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6024 if (RExC_seen & REG_SEEN_RECURSE) {
6025 Newxz(RExC_open_parens, RExC_npar,regnode *);
6026 SAVEFREEPV(RExC_open_parens);
6027 Newxz(RExC_close_parens,RExC_npar,regnode *);
6028 SAVEFREEPV(RExC_close_parens);
6031 /* Useful during FAIL. */
6032 #ifdef RE_TRACK_PATTERN_OFFSETS
6033 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6034 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6035 "%s %"UVuf" bytes for offset annotations.\n",
6036 ri->u.offsets ? "Got" : "Couldn't get",
6037 (UV)((2*RExC_size+1) * sizeof(U32))));
6039 SetProgLen(ri,RExC_size);
6044 /* Second pass: emit code. */
6045 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6046 RExC_pm_flags = pm_flags;
6048 RExC_end = exp + plen;
6051 RExC_emit_start = ri->program;
6052 RExC_emit = ri->program;
6053 RExC_emit_bound = ri->program + RExC_size + 1;
6054 pRExC_state->code_index = 0;
6056 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6057 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6059 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6061 /* XXXX To minimize changes to RE engine we always allocate
6062 3-units-long substrs field. */
6063 Newx(r->substrs, 1, struct reg_substr_data);
6064 if (RExC_recurse_count) {
6065 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6066 SAVEFREEPV(RExC_recurse);
6070 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6071 Zero(r->substrs, 1, struct reg_substr_data);
6073 #ifdef TRIE_STUDY_OPT
6075 StructCopy(&zero_scan_data, &data, scan_data_t);
6076 copyRExC_state = RExC_state;
6079 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6081 RExC_state = copyRExC_state;
6082 if (seen & REG_TOP_LEVEL_BRANCHES)
6083 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6085 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6086 StructCopy(&zero_scan_data, &data, scan_data_t);
6089 StructCopy(&zero_scan_data, &data, scan_data_t);
6092 /* Dig out information for optimizations. */
6093 r->extflags = RExC_flags; /* was pm_op */
6094 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6097 SvUTF8_on(rx); /* Unicode in it? */
6098 ri->regstclass = NULL;
6099 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6100 r->intflags |= PREGf_NAUGHTY;
6101 scan = ri->program + 1; /* First BRANCH. */
6103 /* testing for BRANCH here tells us whether there is "must appear"
6104 data in the pattern. If there is then we can use it for optimisations */
6105 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6107 STRLEN longest_float_length, longest_fixed_length;
6108 struct regnode_charclass_class ch_class; /* pointed to by data */
6110 SSize_t last_close = 0; /* pointed to by data */
6111 regnode *first= scan;
6112 regnode *first_next= regnext(first);
6114 * Skip introductions and multiplicators >= 1
6115 * so that we can extract the 'meat' of the pattern that must
6116 * match in the large if() sequence following.
6117 * NOTE that EXACT is NOT covered here, as it is normally
6118 * picked up by the optimiser separately.
6120 * This is unfortunate as the optimiser isnt handling lookahead
6121 * properly currently.
6124 while ((OP(first) == OPEN && (sawopen = 1)) ||
6125 /* An OR of *one* alternative - should not happen now. */
6126 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6127 /* for now we can't handle lookbehind IFMATCH*/
6128 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6129 (OP(first) == PLUS) ||
6130 (OP(first) == MINMOD) ||
6131 /* An {n,m} with n>0 */
6132 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6133 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6136 * the only op that could be a regnode is PLUS, all the rest
6137 * will be regnode_1 or regnode_2.
6139 * (yves doesn't think this is true)
6141 if (OP(first) == PLUS)
6144 if (OP(first) == MINMOD)
6146 first += regarglen[OP(first)];
6148 first = NEXTOPER(first);
6149 first_next= regnext(first);
6152 /* Starting-point info. */
6154 DEBUG_PEEP("first:",first,0);
6155 /* Ignore EXACT as we deal with it later. */
6156 if (PL_regkind[OP(first)] == EXACT) {
6157 if (OP(first) == EXACT)
6158 NOOP; /* Empty, get anchored substr later. */
6160 ri->regstclass = first;
6163 else if (PL_regkind[OP(first)] == TRIE &&
6164 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6167 /* this can happen only on restudy */
6168 if ( OP(first) == TRIE ) {
6169 struct regnode_1 *trieop = (struct regnode_1 *)
6170 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6171 StructCopy(first,trieop,struct regnode_1);
6172 trie_op=(regnode *)trieop;
6174 struct regnode_charclass *trieop = (struct regnode_charclass *)
6175 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6176 StructCopy(first,trieop,struct regnode_charclass);
6177 trie_op=(regnode *)trieop;
6180 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6181 ri->regstclass = trie_op;
6184 else if (REGNODE_SIMPLE(OP(first)))
6185 ri->regstclass = first;
6186 else if (PL_regkind[OP(first)] == BOUND ||
6187 PL_regkind[OP(first)] == NBOUND)
6188 ri->regstclass = first;
6189 else if (PL_regkind[OP(first)] == BOL) {
6190 r->extflags |= (OP(first) == MBOL
6192 : (OP(first) == SBOL
6195 first = NEXTOPER(first);
6198 else if (OP(first) == GPOS) {
6199 r->extflags |= RXf_ANCH_GPOS;
6200 first = NEXTOPER(first);
6203 else if ((!sawopen || !RExC_sawback) &&
6204 (OP(first) == STAR &&
6205 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6206 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6208 /* turn .* into ^.* with an implied $*=1 */
6210 (OP(NEXTOPER(first)) == REG_ANY)
6213 r->extflags |= type;
6214 r->intflags |= PREGf_IMPLICIT;
6215 first = NEXTOPER(first);
6218 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6219 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6220 /* x+ must match at the 1st pos of run of x's */
6221 r->intflags |= PREGf_SKIP;
6223 /* Scan is after the zeroth branch, first is atomic matcher. */
6224 #ifdef TRIE_STUDY_OPT
6227 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6228 (IV)(first - scan + 1))
6232 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6233 (IV)(first - scan + 1))
6239 * If there's something expensive in the r.e., find the
6240 * longest literal string that must appear and make it the
6241 * regmust. Resolve ties in favor of later strings, since
6242 * the regstart check works with the beginning of the r.e.
6243 * and avoiding duplication strengthens checking. Not a
6244 * strong reason, but sufficient in the absence of others.
6245 * [Now we resolve ties in favor of the earlier string if
6246 * it happens that c_offset_min has been invalidated, since the
6247 * earlier string may buy us something the later one won't.]
6250 data.longest_fixed = newSVpvs("");
6251 data.longest_float = newSVpvs("");
6252 data.last_found = newSVpvs("");
6253 data.longest = &(data.longest_fixed);
6254 ENTER_with_name("study_chunk");
6255 SAVEFREESV(data.longest_fixed);
6256 SAVEFREESV(data.longest_float);
6257 SAVEFREESV(data.last_found);
6259 if (!ri->regstclass) {
6260 cl_init(pRExC_state, &ch_class);
6261 data.start_class = &ch_class;
6262 stclass_flag = SCF_DO_STCLASS_AND;
6263 } else /* XXXX Check for BOUND? */
6265 data.last_closep = &last_close;
6267 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6268 &data, -1, NULL, NULL,
6269 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6270 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6274 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6277 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6278 && data.last_start_min == 0 && data.last_end > 0
6279 && !RExC_seen_zerolen
6280 && !(RExC_seen & REG_SEEN_VERBARG)
6281 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6282 r->extflags |= RXf_CHECK_ALL;
6283 scan_commit(pRExC_state, &data,&minlen,0);
6285 longest_float_length = CHR_SVLEN(data.longest_float);
6287 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6288 && data.offset_fixed == data.offset_float_min
6289 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6290 && S_setup_longest (aTHX_ pRExC_state,
6294 &(r->float_end_shift),
6295 data.lookbehind_float,
6296 data.offset_float_min,
6298 longest_float_length,
6299 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6300 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6302 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6303 r->float_max_offset = data.offset_float_max;
6304 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6305 r->float_max_offset -= data.lookbehind_float;
6306 SvREFCNT_inc_simple_void_NN(data.longest_float);
6309 r->float_substr = r->float_utf8 = NULL;
6310 longest_float_length = 0;
6313 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6315 if (S_setup_longest (aTHX_ pRExC_state,
6317 &(r->anchored_utf8),
6318 &(r->anchored_substr),
6319 &(r->anchored_end_shift),
6320 data.lookbehind_fixed,
6323 longest_fixed_length,
6324 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6325 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6327 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6328 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6331 r->anchored_substr = r->anchored_utf8 = NULL;
6332 longest_fixed_length = 0;
6334 LEAVE_with_name("study_chunk");
6337 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6338 ri->regstclass = NULL;
6340 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6342 && ! TEST_SSC_EOS(data.start_class)
6343 && !cl_is_anything(data.start_class))
6345 const U32 n = add_data(pRExC_state, 1, "f");
6346 OP(data.start_class) = ANYOF_SYNTHETIC;
6348 Newx(RExC_rxi->data->data[n], 1,
6349 struct regnode_charclass_class);
6350 StructCopy(data.start_class,
6351 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6352 struct regnode_charclass_class);
6353 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6354 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6355 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6356 regprop(r, sv, (regnode*)data.start_class);
6357 PerlIO_printf(Perl_debug_log,
6358 "synthetic stclass \"%s\".\n",
6359 SvPVX_const(sv));});
6362 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6363 if (longest_fixed_length > longest_float_length) {
6364 r->check_end_shift = r->anchored_end_shift;
6365 r->check_substr = r->anchored_substr;
6366 r->check_utf8 = r->anchored_utf8;
6367 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6368 if (r->extflags & RXf_ANCH_SINGLE)
6369 r->extflags |= RXf_NOSCAN;
6372 r->check_end_shift = r->float_end_shift;
6373 r->check_substr = r->float_substr;
6374 r->check_utf8 = r->float_utf8;
6375 r->check_offset_min = r->float_min_offset;
6376 r->check_offset_max = r->float_max_offset;
6378 if ((r->check_substr || r->check_utf8) ) {
6379 r->extflags |= RXf_USE_INTUIT;
6380 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6381 r->extflags |= RXf_INTUIT_TAIL;
6383 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6384 if ( (STRLEN)minlen < longest_float_length )
6385 minlen= longest_float_length;
6386 if ( (STRLEN)minlen < longest_fixed_length )
6387 minlen= longest_fixed_length;
6391 /* Several toplevels. Best we can is to set minlen. */
6393 struct regnode_charclass_class ch_class;
6394 SSize_t last_close = 0;
6396 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6398 scan = ri->program + 1;
6399 cl_init(pRExC_state, &ch_class);
6400 data.start_class = &ch_class;
6401 data.last_closep = &last_close;
6404 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6405 &data, -1, NULL, NULL,
6406 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6407 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6410 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6412 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6413 = r->float_substr = r->float_utf8 = NULL;
6415 if (! TEST_SSC_EOS(data.start_class)
6416 && !cl_is_anything(data.start_class))
6418 const U32 n = add_data(pRExC_state, 1, "f");
6419 OP(data.start_class) = ANYOF_SYNTHETIC;
6421 Newx(RExC_rxi->data->data[n], 1,
6422 struct regnode_charclass_class);
6423 StructCopy(data.start_class,
6424 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6425 struct regnode_charclass_class);
6426 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6427 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6428 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6429 regprop(r, sv, (regnode*)data.start_class);
6430 PerlIO_printf(Perl_debug_log,
6431 "synthetic stclass \"%s\".\n",
6432 SvPVX_const(sv));});
6436 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6437 the "real" pattern. */
6439 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6440 (IV)minlen, (IV)r->minlen);
6442 r->minlenret = minlen;
6443 if (r->minlen < minlen)
6446 if (RExC_seen & REG_SEEN_GPOS)
6447 r->extflags |= RXf_GPOS_SEEN;
6448 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6449 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6450 if (pRExC_state->num_code_blocks)
6451 r->extflags |= RXf_EVAL_SEEN;
6452 if (RExC_seen & REG_SEEN_CANY)
6453 r->extflags |= RXf_CANY_SEEN;
6454 if (RExC_seen & REG_SEEN_VERBARG)
6456 r->intflags |= PREGf_VERBARG_SEEN;
6457 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6459 if (RExC_seen & REG_SEEN_CUTGROUP)
6460 r->intflags |= PREGf_CUTGROUP_SEEN;
6461 if (pm_flags & PMf_USE_RE_EVAL)
6462 r->intflags |= PREGf_USE_RE_EVAL;
6463 if (RExC_paren_names)
6464 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6466 RXp_PAREN_NAMES(r) = NULL;
6469 regnode *first = ri->program + 1;
6471 regnode *next = NEXTOPER(first);
6474 if (PL_regkind[fop] == NOTHING && nop == END)
6475 r->extflags |= RXf_NULL;
6476 else if (PL_regkind[fop] == BOL && nop == END)
6477 r->extflags |= RXf_START_ONLY;
6478 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6479 r->extflags |= RXf_WHITE;
6480 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6481 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6485 if (RExC_paren_names) {
6486 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6487 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6490 ri->name_list_idx = 0;
6492 if (RExC_recurse_count) {
6493 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6494 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6495 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6498 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6499 /* assume we don't need to swap parens around before we match */
6502 PerlIO_printf(Perl_debug_log,"Final program:\n");
6505 #ifdef RE_TRACK_PATTERN_OFFSETS
6506 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6507 const STRLEN len = ri->u.offsets[0];
6509 GET_RE_DEBUG_FLAGS_DECL;
6510 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6511 for (i = 1; i <= len; i++) {
6512 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6513 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6514 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6516 PerlIO_printf(Perl_debug_log, "\n");
6521 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6522 * by setting the regexp SV to readonly-only instead. If the
6523 * pattern's been recompiled, the USEDness should remain. */
6524 if (old_re && SvREADONLY(old_re))
6532 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6535 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6537 PERL_UNUSED_ARG(value);
6539 if (flags & RXapif_FETCH) {
6540 return reg_named_buff_fetch(rx, key, flags);
6541 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6542 Perl_croak_no_modify();
6544 } else if (flags & RXapif_EXISTS) {
6545 return reg_named_buff_exists(rx, key, flags)
6548 } else if (flags & RXapif_REGNAMES) {
6549 return reg_named_buff_all(rx, flags);
6550 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6551 return reg_named_buff_scalar(rx, flags);
6553 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6559 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6562 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6563 PERL_UNUSED_ARG(lastkey);
6565 if (flags & RXapif_FIRSTKEY)
6566 return reg_named_buff_firstkey(rx, flags);
6567 else if (flags & RXapif_NEXTKEY)
6568 return reg_named_buff_nextkey(rx, flags);
6570 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6576 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6579 AV *retarray = NULL;
6581 struct regexp *const rx = ReANY(r);
6583 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6585 if (flags & RXapif_ALL)
6588 if (rx && RXp_PAREN_NAMES(rx)) {
6589 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6592 SV* sv_dat=HeVAL(he_str);
6593 I32 *nums=(I32*)SvPVX(sv_dat);
6594 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6595 if ((I32)(rx->nparens) >= nums[i]
6596 && rx->offs[nums[i]].start != -1
6597 && rx->offs[nums[i]].end != -1)
6600 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6605 ret = newSVsv(&PL_sv_undef);
6608 av_push(retarray, ret);
6611 return newRV_noinc(MUTABLE_SV(retarray));
6618 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6621 struct regexp *const rx = ReANY(r);
6623 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6625 if (rx && RXp_PAREN_NAMES(rx)) {
6626 if (flags & RXapif_ALL) {
6627 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6629 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6631 SvREFCNT_dec_NN(sv);
6643 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6645 struct regexp *const rx = ReANY(r);
6647 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6649 if ( rx && RXp_PAREN_NAMES(rx) ) {
6650 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6652 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6659 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6661 struct regexp *const rx = ReANY(r);
6662 GET_RE_DEBUG_FLAGS_DECL;
6664 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6666 if (rx && RXp_PAREN_NAMES(rx)) {
6667 HV *hv = RXp_PAREN_NAMES(rx);
6669 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6672 SV* sv_dat = HeVAL(temphe);
6673 I32 *nums = (I32*)SvPVX(sv_dat);
6674 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6675 if ((I32)(rx->lastparen) >= nums[i] &&
6676 rx->offs[nums[i]].start != -1 &&
6677 rx->offs[nums[i]].end != -1)
6683 if (parno || flags & RXapif_ALL) {
6684 return newSVhek(HeKEY_hek(temphe));
6692 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6697 struct regexp *const rx = ReANY(r);
6699 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6701 if (rx && RXp_PAREN_NAMES(rx)) {
6702 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6703 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6704 } else if (flags & RXapif_ONE) {
6705 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6706 av = MUTABLE_AV(SvRV(ret));
6707 length = av_len(av);
6708 SvREFCNT_dec_NN(ret);
6709 return newSViv(length + 1);
6711 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6715 return &PL_sv_undef;
6719 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6721 struct regexp *const rx = ReANY(r);
6724 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6726 if (rx && RXp_PAREN_NAMES(rx)) {
6727 HV *hv= RXp_PAREN_NAMES(rx);
6729 (void)hv_iterinit(hv);
6730 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6733 SV* sv_dat = HeVAL(temphe);
6734 I32 *nums = (I32*)SvPVX(sv_dat);
6735 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6736 if ((I32)(rx->lastparen) >= nums[i] &&
6737 rx->offs[nums[i]].start != -1 &&
6738 rx->offs[nums[i]].end != -1)
6744 if (parno || flags & RXapif_ALL) {
6745 av_push(av, newSVhek(HeKEY_hek(temphe)));
6750 return newRV_noinc(MUTABLE_SV(av));
6754 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6757 struct regexp *const rx = ReANY(r);
6763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6765 if ( n == RX_BUFF_IDX_CARET_PREMATCH
6766 || n == RX_BUFF_IDX_CARET_FULLMATCH
6767 || n == RX_BUFF_IDX_CARET_POSTMATCH
6770 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6772 /* on something like
6775 * the KEEPCOPY is set on the PMOP rather than the regex */
6776 if (PL_curpm && r == PM_GETRE(PL_curpm))
6777 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6786 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6787 /* no need to distinguish between them any more */
6788 n = RX_BUFF_IDX_FULLMATCH;
6790 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6791 && rx->offs[0].start != -1)
6793 /* $`, ${^PREMATCH} */
6794 i = rx->offs[0].start;
6798 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6799 && rx->offs[0].end != -1)
6801 /* $', ${^POSTMATCH} */
6802 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6803 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6806 if ( 0 <= n && n <= (I32)rx->nparens &&
6807 (s1 = rx->offs[n].start) != -1 &&
6808 (t1 = rx->offs[n].end) != -1)
6810 /* $&, ${^MATCH}, $1 ... */
6812 s = rx->subbeg + s1 - rx->suboffset;
6817 assert(s >= rx->subbeg);
6818 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
6820 #if NO_TAINT_SUPPORT
6821 sv_setpvn(sv, s, i);
6823 const int oldtainted = TAINT_get;
6825 sv_setpvn(sv, s, i);
6826 TAINT_set(oldtainted);
6828 if ( (rx->extflags & RXf_CANY_SEEN)
6829 ? (RXp_MATCH_UTF8(rx)
6830 && (!i || is_utf8_string((U8*)s, i)))
6831 : (RXp_MATCH_UTF8(rx)) )
6838 if (RXp_MATCH_TAINTED(rx)) {
6839 if (SvTYPE(sv) >= SVt_PVMG) {
6840 MAGIC* const mg = SvMAGIC(sv);
6843 SvMAGIC_set(sv, mg->mg_moremagic);
6845 if ((mgt = SvMAGIC(sv))) {
6846 mg->mg_moremagic = mgt;
6847 SvMAGIC_set(sv, mg);
6858 sv_setsv(sv,&PL_sv_undef);
6864 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6865 SV const * const value)
6867 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6869 PERL_UNUSED_ARG(rx);
6870 PERL_UNUSED_ARG(paren);
6871 PERL_UNUSED_ARG(value);
6874 Perl_croak_no_modify();
6878 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6881 struct regexp *const rx = ReANY(r);
6885 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6887 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
6888 || paren == RX_BUFF_IDX_CARET_FULLMATCH
6889 || paren == RX_BUFF_IDX_CARET_POSTMATCH
6892 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6894 /* on something like
6897 * the KEEPCOPY is set on the PMOP rather than the regex */
6898 if (PL_curpm && r == PM_GETRE(PL_curpm))
6899 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6905 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6907 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6908 case RX_BUFF_IDX_PREMATCH: /* $` */
6909 if (rx->offs[0].start != -1) {
6910 i = rx->offs[0].start;
6919 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6920 case RX_BUFF_IDX_POSTMATCH: /* $' */
6921 if (rx->offs[0].end != -1) {
6922 i = rx->sublen - rx->offs[0].end;
6924 s1 = rx->offs[0].end;
6931 default: /* $& / ${^MATCH}, $1, $2, ... */
6932 if (paren <= (I32)rx->nparens &&
6933 (s1 = rx->offs[paren].start) != -1 &&
6934 (t1 = rx->offs[paren].end) != -1)
6940 if (ckWARN(WARN_UNINITIALIZED))
6941 report_uninit((const SV *)sv);
6946 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6947 const char * const s = rx->subbeg - rx->suboffset + s1;
6952 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6959 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6961 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6962 PERL_UNUSED_ARG(rx);
6966 return newSVpvs("Regexp");
6969 /* Scans the name of a named buffer from the pattern.
6970 * If flags is REG_RSN_RETURN_NULL returns null.
6971 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6972 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6973 * to the parsed name as looked up in the RExC_paren_names hash.
6974 * If there is an error throws a vFAIL().. type exception.
6977 #define REG_RSN_RETURN_NULL 0
6978 #define REG_RSN_RETURN_NAME 1
6979 #define REG_RSN_RETURN_DATA 2
6982 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6984 char *name_start = RExC_parse;
6986 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6988 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6989 /* skip IDFIRST by using do...while */
6992 RExC_parse += UTF8SKIP(RExC_parse);
6993 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6997 } while (isWORDCHAR(*RExC_parse));
6999 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7000 vFAIL("Group name must start with a non-digit word character");
7004 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7005 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7006 if ( flags == REG_RSN_RETURN_NAME)
7008 else if (flags==REG_RSN_RETURN_DATA) {
7011 if ( ! sv_name ) /* should not happen*/
7012 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7013 if (RExC_paren_names)
7014 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7016 sv_dat = HeVAL(he_str);
7018 vFAIL("Reference to nonexistent named group");
7022 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7023 (unsigned long) flags);
7025 assert(0); /* NOT REACHED */
7030 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7031 int rem=(int)(RExC_end - RExC_parse); \
7040 if (RExC_lastparse!=RExC_parse) \
7041 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7044 iscut ? "..." : "<" \
7047 PerlIO_printf(Perl_debug_log,"%16s",""); \
7050 num = RExC_size + 1; \
7052 num=REG_NODE_NUM(RExC_emit); \
7053 if (RExC_lastnum!=num) \
7054 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7056 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7057 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7058 (int)((depth*2)), "", \
7062 RExC_lastparse=RExC_parse; \
7067 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7068 DEBUG_PARSE_MSG((funcname)); \
7069 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7071 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7072 DEBUG_PARSE_MSG((funcname)); \
7073 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7076 /* This section of code defines the inversion list object and its methods. The
7077 * interfaces are highly subject to change, so as much as possible is static to
7078 * this file. An inversion list is here implemented as a malloc'd C UV array
7079 * as an SVt_INVLIST scalar.
7081 * An inversion list for Unicode is an array of code points, sorted by ordinal
7082 * number. The zeroth element is the first code point in the list. The 1th
7083 * element is the first element beyond that not in the list. In other words,
7084 * the first range is
7085 * invlist[0]..(invlist[1]-1)
7086 * The other ranges follow. Thus every element whose index is divisible by two
7087 * marks the beginning of a range that is in the list, and every element not
7088 * divisible by two marks the beginning of a range not in the list. A single
7089 * element inversion list that contains the single code point N generally
7090 * consists of two elements
7093 * (The exception is when N is the highest representable value on the
7094 * machine, in which case the list containing just it would be a single
7095 * element, itself. By extension, if the last range in the list extends to
7096 * infinity, then the first element of that range will be in the inversion list
7097 * at a position that is divisible by two, and is the final element in the
7099 * Taking the complement (inverting) an inversion list is quite simple, if the
7100 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7101 * This implementation reserves an element at the beginning of each inversion
7102 * list to always contain 0; there is an additional flag in the header which
7103 * indicates if the list begins at the 0, or is offset to begin at the next
7106 * More about inversion lists can be found in "Unicode Demystified"
7107 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7108 * More will be coming when functionality is added later.
7110 * The inversion list data structure is currently implemented as an SV pointing
7111 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7112 * array of UV whose memory management is automatically handled by the existing
7113 * facilities for SV's.
7115 * Some of the methods should always be private to the implementation, and some
7116 * should eventually be made public */
7118 /* The header definitions are in F<inline_invlist.c> */
7120 PERL_STATIC_INLINE UV*
7121 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7123 /* Returns a pointer to the first element in the inversion list's array.
7124 * This is called upon initialization of an inversion list. Where the
7125 * array begins depends on whether the list has the code point U+0000 in it
7126 * or not. The other parameter tells it whether the code that follows this
7127 * call is about to put a 0 in the inversion list or not. The first
7128 * element is either the element reserved for 0, if TRUE, or the element
7129 * after it, if FALSE */
7131 bool* offset = get_invlist_offset_addr(invlist);
7132 UV* zero_addr = (UV *) SvPVX(invlist);
7134 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7137 assert(! _invlist_len(invlist));
7141 /* 1^1 = 0; 1^0 = 1 */
7142 *offset = 1 ^ will_have_0;
7143 return zero_addr + *offset;
7146 PERL_STATIC_INLINE UV*
7147 S_invlist_array(pTHX_ SV* const invlist)
7149 /* Returns the pointer to the inversion list's array. Every time the
7150 * length changes, this needs to be called in case malloc or realloc moved
7153 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7155 /* Must not be empty. If these fail, you probably didn't check for <len>
7156 * being non-zero before trying to get the array */
7157 assert(_invlist_len(invlist));
7159 /* The very first element always contains zero, The array begins either
7160 * there, or if the inversion list is offset, at the element after it.
7161 * The offset header field determines which; it contains 0 or 1 to indicate
7162 * how much additionally to add */
7163 assert(0 == *(SvPVX(invlist)));
7164 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7167 PERL_STATIC_INLINE void
7168 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7170 /* Sets the current number of elements stored in the inversion list.
7171 * Updates SvCUR correspondingly */
7173 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7175 assert(SvTYPE(invlist) == SVt_INVLIST);
7180 : TO_INTERNAL_SIZE(len + offset));
7181 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7184 PERL_STATIC_INLINE IV*
7185 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7187 /* Return the address of the IV that is reserved to hold the cached index
7190 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7192 assert(SvTYPE(invlist) == SVt_INVLIST);
7194 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7197 PERL_STATIC_INLINE IV
7198 S_invlist_previous_index(pTHX_ SV* const invlist)
7200 /* Returns cached index of previous search */
7202 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7204 return *get_invlist_previous_index_addr(invlist);
7207 PERL_STATIC_INLINE void
7208 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7210 /* Caches <index> for later retrieval */
7212 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7214 assert(index == 0 || index < (int) _invlist_len(invlist));
7216 *get_invlist_previous_index_addr(invlist) = index;
7219 PERL_STATIC_INLINE UV
7220 S_invlist_max(pTHX_ SV* const invlist)
7222 /* Returns the maximum number of elements storable in the inversion list's
7223 * array, without having to realloc() */
7225 PERL_ARGS_ASSERT_INVLIST_MAX;
7227 assert(SvTYPE(invlist) == SVt_INVLIST);
7229 /* Assumes worst case, in which the 0 element is not counted in the
7230 * inversion list, so subtracts 1 for that */
7231 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7232 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7233 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7236 #ifndef PERL_IN_XSUB_RE
7238 Perl__new_invlist(pTHX_ IV initial_size)
7241 /* Return a pointer to a newly constructed inversion list, with enough
7242 * space to store 'initial_size' elements. If that number is negative, a
7243 * system default is used instead */
7247 if (initial_size < 0) {
7251 /* Allocate the initial space */
7252 new_list = newSV_type(SVt_INVLIST);
7254 /* First 1 is in case the zero element isn't in the list; second 1 is for
7256 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7257 invlist_set_len(new_list, 0, 0);
7259 /* Force iterinit() to be used to get iteration to work */
7260 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7262 *get_invlist_previous_index_addr(new_list) = 0;
7269 S__new_invlist_C_array(pTHX_ const UV* const list)
7271 /* Return a pointer to a newly constructed inversion list, initialized to
7272 * point to <list>, which has to be in the exact correct inversion list
7273 * form, including internal fields. Thus this is a dangerous routine that
7274 * should not be used in the wrong hands. The passed in 'list' contains
7275 * several header fields at the beginning that are not part of the
7276 * inversion list body proper */
7278 const STRLEN length = (STRLEN) list[0];
7279 const UV version_id = list[1];
7280 const bool offset = cBOOL(list[2]);
7281 #define HEADER_LENGTH 3
7282 /* If any of the above changes in any way, you must change HEADER_LENGTH
7283 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7284 * perl -E 'say int(rand 2**31-1)'
7286 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7287 data structure type, so that one being
7288 passed in can be validated to be an
7289 inversion list of the correct vintage.
7292 SV* invlist = newSV_type(SVt_INVLIST);
7294 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7296 if (version_id != INVLIST_VERSION_ID) {
7297 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7300 /* The generated array passed in includes header elements that aren't part
7301 * of the list proper, so start it just after them */
7302 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7304 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7305 shouldn't touch it */
7307 *(get_invlist_offset_addr(invlist)) = offset;
7309 /* The 'length' passed to us is the physical number of elements in the
7310 * inversion list. But if there is an offset the logical number is one
7312 invlist_set_len(invlist, length - offset, offset);
7314 invlist_set_previous_index(invlist, 0);
7316 /* Initialize the iteration pointer. */
7317 invlist_iterfinish(invlist);
7323 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7325 /* Grow the maximum size of an inversion list */
7327 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7329 assert(SvTYPE(invlist) == SVt_INVLIST);
7331 /* Add one to account for the zero element at the beginning which may not
7332 * be counted by the calling parameters */
7333 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7336 PERL_STATIC_INLINE void
7337 S_invlist_trim(pTHX_ SV* const invlist)
7339 PERL_ARGS_ASSERT_INVLIST_TRIM;
7341 assert(SvTYPE(invlist) == SVt_INVLIST);
7343 /* Change the length of the inversion list to how many entries it currently
7345 SvPV_shrink_to_cur((SV *) invlist);
7348 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7351 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7353 /* Subject to change or removal. Append the range from 'start' to 'end' at
7354 * the end of the inversion list. The range must be above any existing
7358 UV max = invlist_max(invlist);
7359 UV len = _invlist_len(invlist);
7362 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7364 if (len == 0) { /* Empty lists must be initialized */
7365 offset = start != 0;
7366 array = _invlist_array_init(invlist, ! offset);
7369 /* Here, the existing list is non-empty. The current max entry in the
7370 * list is generally the first value not in the set, except when the
7371 * set extends to the end of permissible values, in which case it is
7372 * the first entry in that final set, and so this call is an attempt to
7373 * append out-of-order */
7375 UV final_element = len - 1;
7376 array = invlist_array(invlist);
7377 if (array[final_element] > start
7378 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7380 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7381 array[final_element], start,
7382 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7385 /* Here, it is a legal append. If the new range begins with the first
7386 * value not in the set, it is extending the set, so the new first
7387 * value not in the set is one greater than the newly extended range.
7389 offset = *get_invlist_offset_addr(invlist);
7390 if (array[final_element] == start) {
7391 if (end != UV_MAX) {
7392 array[final_element] = end + 1;
7395 /* But if the end is the maximum representable on the machine,
7396 * just let the range that this would extend to have no end */
7397 invlist_set_len(invlist, len - 1, offset);
7403 /* Here the new range doesn't extend any existing set. Add it */
7405 len += 2; /* Includes an element each for the start and end of range */
7407 /* If wll overflow the existing space, extend, which may cause the array to
7410 invlist_extend(invlist, len);
7412 /* Have to set len here to avoid assert failure in invlist_array() */
7413 invlist_set_len(invlist, len, offset);
7415 array = invlist_array(invlist);
7418 invlist_set_len(invlist, len, offset);
7421 /* The next item on the list starts the range, the one after that is
7422 * one past the new range. */
7423 array[len - 2] = start;
7424 if (end != UV_MAX) {
7425 array[len - 1] = end + 1;
7428 /* But if the end is the maximum representable on the machine, just let
7429 * the range have no end */
7430 invlist_set_len(invlist, len - 1, offset);
7434 #ifndef PERL_IN_XSUB_RE
7437 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7439 /* Searches the inversion list for the entry that contains the input code
7440 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7441 * return value is the index into the list's array of the range that
7446 IV high = _invlist_len(invlist);
7447 const IV highest_element = high - 1;
7450 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7452 /* If list is empty, return failure. */
7457 /* (We can't get the array unless we know the list is non-empty) */
7458 array = invlist_array(invlist);
7460 mid = invlist_previous_index(invlist);
7461 assert(mid >=0 && mid <= highest_element);
7463 /* <mid> contains the cache of the result of the previous call to this
7464 * function (0 the first time). See if this call is for the same result,
7465 * or if it is for mid-1. This is under the theory that calls to this
7466 * function will often be for related code points that are near each other.
7467 * And benchmarks show that caching gives better results. We also test
7468 * here if the code point is within the bounds of the list. These tests
7469 * replace others that would have had to be made anyway to make sure that
7470 * the array bounds were not exceeded, and these give us extra information
7471 * at the same time */
7472 if (cp >= array[mid]) {
7473 if (cp >= array[highest_element]) {
7474 return highest_element;
7477 /* Here, array[mid] <= cp < array[highest_element]. This means that
7478 * the final element is not the answer, so can exclude it; it also
7479 * means that <mid> is not the final element, so can refer to 'mid + 1'
7481 if (cp < array[mid + 1]) {
7487 else { /* cp < aray[mid] */
7488 if (cp < array[0]) { /* Fail if outside the array */
7492 if (cp >= array[mid - 1]) {
7497 /* Binary search. What we are looking for is <i> such that
7498 * array[i] <= cp < array[i+1]
7499 * The loop below converges on the i+1. Note that there may not be an
7500 * (i+1)th element in the array, and things work nonetheless */
7501 while (low < high) {
7502 mid = (low + high) / 2;
7503 assert(mid <= highest_element);
7504 if (array[mid] <= cp) { /* cp >= array[mid] */
7507 /* We could do this extra test to exit the loop early.
7508 if (cp < array[low]) {
7513 else { /* cp < array[mid] */
7520 invlist_set_previous_index(invlist, high);
7525 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7527 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7528 * but is used when the swash has an inversion list. This makes this much
7529 * faster, as it uses a binary search instead of a linear one. This is
7530 * intimately tied to that function, and perhaps should be in utf8.c,
7531 * except it is intimately tied to inversion lists as well. It assumes
7532 * that <swatch> is all 0's on input */
7535 const IV len = _invlist_len(invlist);
7539 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7541 if (len == 0) { /* Empty inversion list */
7545 array = invlist_array(invlist);
7547 /* Find which element it is */
7548 i = _invlist_search(invlist, start);
7550 /* We populate from <start> to <end> */
7551 while (current < end) {
7554 /* The inversion list gives the results for every possible code point
7555 * after the first one in the list. Only those ranges whose index is
7556 * even are ones that the inversion list matches. For the odd ones,
7557 * and if the initial code point is not in the list, we have to skip
7558 * forward to the next element */
7559 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7561 if (i >= len) { /* Finished if beyond the end of the array */
7565 if (current >= end) { /* Finished if beyond the end of what we
7567 if (LIKELY(end < UV_MAX)) {
7571 /* We get here when the upper bound is the maximum
7572 * representable on the machine, and we are looking for just
7573 * that code point. Have to special case it */
7575 goto join_end_of_list;
7578 assert(current >= start);
7580 /* The current range ends one below the next one, except don't go past
7583 upper = (i < len && array[i] < end) ? array[i] : end;
7585 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7586 * for each code point in it */
7587 for (; current < upper; current++) {
7588 const STRLEN offset = (STRLEN)(current - start);
7589 swatch[offset >> 3] |= 1 << (offset & 7);
7594 /* Quit if at the end of the list */
7597 /* But first, have to deal with the highest possible code point on
7598 * the platform. The previous code assumes that <end> is one
7599 * beyond where we want to populate, but that is impossible at the
7600 * platform's infinity, so have to handle it specially */
7601 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7603 const STRLEN offset = (STRLEN)(end - start);
7604 swatch[offset >> 3] |= 1 << (offset & 7);
7609 /* Advance to the next range, which will be for code points not in the
7618 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7620 /* Take the union of two inversion lists and point <output> to it. *output
7621 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7622 * the reference count to that list will be decremented. The first list,
7623 * <a>, may be NULL, in which case a copy of the second list is returned.
7624 * If <complement_b> is TRUE, the union is taken of the complement
7625 * (inversion) of <b> instead of b itself.
7627 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7628 * Richard Gillam, published by Addison-Wesley, and explained at some
7629 * length there. The preface says to incorporate its examples into your
7630 * code at your own risk.
7632 * The algorithm is like a merge sort.
7634 * XXX A potential performance improvement is to keep track as we go along
7635 * if only one of the inputs contributes to the result, meaning the other
7636 * is a subset of that one. In that case, we can skip the final copy and
7637 * return the larger of the input lists, but then outside code might need
7638 * to keep track of whether to free the input list or not */
7640 const UV* array_a; /* a's array */
7642 UV len_a; /* length of a's array */
7645 SV* u; /* the resulting union */
7649 UV i_a = 0; /* current index into a's array */
7653 /* running count, as explained in the algorithm source book; items are
7654 * stopped accumulating and are output when the count changes to/from 0.
7655 * The count is incremented when we start a range that's in the set, and
7656 * decremented when we start a range that's not in the set. So its range
7657 * is 0 to 2. Only when the count is zero is something not in the set.
7661 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7664 /* If either one is empty, the union is the other one */
7665 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7672 *output = invlist_clone(b);
7674 _invlist_invert(*output);
7676 } /* else *output already = b; */
7679 else if ((len_b = _invlist_len(b)) == 0) {
7684 /* The complement of an empty list is a list that has everything in it,
7685 * so the union with <a> includes everything too */
7690 *output = _new_invlist(1);
7691 _append_range_to_invlist(*output, 0, UV_MAX);
7693 else if (*output != a) {
7694 *output = invlist_clone(a);
7696 /* else *output already = a; */
7700 /* Here both lists exist and are non-empty */
7701 array_a = invlist_array(a);
7702 array_b = invlist_array(b);
7704 /* If are to take the union of 'a' with the complement of b, set it
7705 * up so are looking at b's complement. */
7708 /* To complement, we invert: if the first element is 0, remove it. To
7709 * do this, we just pretend the array starts one later */
7710 if (array_b[0] == 0) {
7716 /* But if the first element is not zero, we pretend the list starts
7717 * at the 0 that is always stored immediately before the array. */
7723 /* Size the union for the worst case: that the sets are completely
7725 u = _new_invlist(len_a + len_b);
7727 /* Will contain U+0000 if either component does */
7728 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7729 || (len_b > 0 && array_b[0] == 0));
7731 /* Go through each list item by item, stopping when exhausted one of
7733 while (i_a < len_a && i_b < len_b) {
7734 UV cp; /* The element to potentially add to the union's array */
7735 bool cp_in_set; /* is it in the the input list's set or not */
7737 /* We need to take one or the other of the two inputs for the union.
7738 * Since we are merging two sorted lists, we take the smaller of the
7739 * next items. In case of a tie, we take the one that is in its set
7740 * first. If we took one not in the set first, it would decrement the
7741 * count, possibly to 0 which would cause it to be output as ending the
7742 * range, and the next time through we would take the same number, and
7743 * output it again as beginning the next range. By doing it the
7744 * opposite way, there is no possibility that the count will be
7745 * momentarily decremented to 0, and thus the two adjoining ranges will
7746 * be seamlessly merged. (In a tie and both are in the set or both not
7747 * in the set, it doesn't matter which we take first.) */
7748 if (array_a[i_a] < array_b[i_b]
7749 || (array_a[i_a] == array_b[i_b]
7750 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7752 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7756 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7757 cp = array_b[i_b++];
7760 /* Here, have chosen which of the two inputs to look at. Only output
7761 * if the running count changes to/from 0, which marks the
7762 * beginning/end of a range in that's in the set */
7765 array_u[i_u++] = cp;
7772 array_u[i_u++] = cp;
7777 /* Here, we are finished going through at least one of the lists, which
7778 * means there is something remaining in at most one. We check if the list
7779 * that hasn't been exhausted is positioned such that we are in the middle
7780 * of a range in its set or not. (i_a and i_b point to the element beyond
7781 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7782 * is potentially more to output.
7783 * There are four cases:
7784 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7785 * in the union is entirely from the non-exhausted set.
7786 * 2) Both were in their sets, count is 2. Nothing further should
7787 * be output, as everything that remains will be in the exhausted
7788 * list's set, hence in the union; decrementing to 1 but not 0 insures
7790 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7791 * Nothing further should be output because the union includes
7792 * everything from the exhausted set. Not decrementing ensures that.
7793 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7794 * decrementing to 0 insures that we look at the remainder of the
7795 * non-exhausted set */
7796 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7797 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7802 /* The final length is what we've output so far, plus what else is about to
7803 * be output. (If 'count' is non-zero, then the input list we exhausted
7804 * has everything remaining up to the machine's limit in its set, and hence
7805 * in the union, so there will be no further output. */
7808 /* At most one of the subexpressions will be non-zero */
7809 len_u += (len_a - i_a) + (len_b - i_b);
7812 /* Set result to final length, which can change the pointer to array_u, so
7814 if (len_u != _invlist_len(u)) {
7815 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7817 array_u = invlist_array(u);
7820 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7821 * the other) ended with everything above it not in its set. That means
7822 * that the remaining part of the union is precisely the same as the
7823 * non-exhausted list, so can just copy it unchanged. (If both list were
7824 * exhausted at the same time, then the operations below will be both 0.)
7827 IV copy_count; /* At most one will have a non-zero copy count */
7828 if ((copy_count = len_a - i_a) > 0) {
7829 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7831 else if ((copy_count = len_b - i_b) > 0) {
7832 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7836 /* We may be removing a reference to one of the inputs */
7837 if (a == *output || b == *output) {
7838 assert(! invlist_is_iterating(*output));
7839 SvREFCNT_dec_NN(*output);
7847 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7849 /* Take the intersection of two inversion lists and point <i> to it. *i
7850 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7851 * the reference count to that list will be decremented.
7852 * If <complement_b> is TRUE, the result will be the intersection of <a>
7853 * and the complement (or inversion) of <b> instead of <b> directly.
7855 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7856 * Richard Gillam, published by Addison-Wesley, and explained at some
7857 * length there. The preface says to incorporate its examples into your
7858 * code at your own risk. In fact, it had bugs
7860 * The algorithm is like a merge sort, and is essentially the same as the
7864 const UV* array_a; /* a's array */
7866 UV len_a; /* length of a's array */
7869 SV* r; /* the resulting intersection */
7873 UV i_a = 0; /* current index into a's array */
7877 /* running count, as explained in the algorithm source book; items are
7878 * stopped accumulating and are output when the count changes to/from 2.
7879 * The count is incremented when we start a range that's in the set, and
7880 * decremented when we start a range that's not in the set. So its range
7881 * is 0 to 2. Only when the count is 2 is something in the intersection.
7885 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7888 /* Special case if either one is empty */
7889 len_a = (a == NULL) ? 0 : _invlist_len(a);
7890 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7892 if (len_a != 0 && complement_b) {
7894 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7895 * be empty. Here, also we are using 'b's complement, which hence
7896 * must be every possible code point. Thus the intersection is
7903 *i = invlist_clone(a);
7905 /* else *i is already 'a' */
7909 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7910 * intersection must be empty */
7917 *i = _new_invlist(0);
7921 /* Here both lists exist and are non-empty */
7922 array_a = invlist_array(a);
7923 array_b = invlist_array(b);
7925 /* If are to take the intersection of 'a' with the complement of b, set it
7926 * up so are looking at b's complement. */
7929 /* To complement, we invert: if the first element is 0, remove it. To
7930 * do this, we just pretend the array starts one later */
7931 if (array_b[0] == 0) {
7937 /* But if the first element is not zero, we pretend the list starts
7938 * at the 0 that is always stored immediately before the array. */
7944 /* Size the intersection for the worst case: that the intersection ends up
7945 * fragmenting everything to be completely disjoint */
7946 r= _new_invlist(len_a + len_b);
7948 /* Will contain U+0000 iff both components do */
7949 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7950 && len_b > 0 && array_b[0] == 0);
7952 /* Go through each list item by item, stopping when exhausted one of
7954 while (i_a < len_a && i_b < len_b) {
7955 UV cp; /* The element to potentially add to the intersection's
7957 bool cp_in_set; /* Is it in the input list's set or not */
7959 /* We need to take one or the other of the two inputs for the
7960 * intersection. Since we are merging two sorted lists, we take the
7961 * smaller of the next items. In case of a tie, we take the one that
7962 * is not in its set first (a difference from the union algorithm). If
7963 * we took one in the set first, it would increment the count, possibly
7964 * to 2 which would cause it to be output as starting a range in the
7965 * intersection, and the next time through we would take that same
7966 * number, and output it again as ending the set. By doing it the
7967 * opposite of this, there is no possibility that the count will be
7968 * momentarily incremented to 2. (In a tie and both are in the set or
7969 * both not in the set, it doesn't matter which we take first.) */
7970 if (array_a[i_a] < array_b[i_b]
7971 || (array_a[i_a] == array_b[i_b]
7972 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7974 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7978 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7982 /* Here, have chosen which of the two inputs to look at. Only output
7983 * if the running count changes to/from 2, which marks the
7984 * beginning/end of a range that's in the intersection */
7988 array_r[i_r++] = cp;
7993 array_r[i_r++] = cp;
7999 /* Here, we are finished going through at least one of the lists, which
8000 * means there is something remaining in at most one. We check if the list
8001 * that has been exhausted is positioned such that we are in the middle
8002 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8003 * the ones we care about.) There are four cases:
8004 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8005 * nothing left in the intersection.
8006 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8007 * above 2. What should be output is exactly that which is in the
8008 * non-exhausted set, as everything it has is also in the intersection
8009 * set, and everything it doesn't have can't be in the intersection
8010 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8011 * gets incremented to 2. Like the previous case, the intersection is
8012 * everything that remains in the non-exhausted set.
8013 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8014 * remains 1. And the intersection has nothing more. */
8015 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8016 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8021 /* The final length is what we've output so far plus what else is in the
8022 * intersection. At most one of the subexpressions below will be non-zero */
8025 len_r += (len_a - i_a) + (len_b - i_b);
8028 /* Set result to final length, which can change the pointer to array_r, so
8030 if (len_r != _invlist_len(r)) {
8031 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8033 array_r = invlist_array(r);
8036 /* Finish outputting any remaining */
8037 if (count >= 2) { /* At most one will have a non-zero copy count */
8039 if ((copy_count = len_a - i_a) > 0) {
8040 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8042 else if ((copy_count = len_b - i_b) > 0) {
8043 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8047 /* We may be removing a reference to one of the inputs */
8048 if (a == *i || b == *i) {
8049 assert(! invlist_is_iterating(*i));
8050 SvREFCNT_dec_NN(*i);
8058 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8060 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8061 * set. A pointer to the inversion list is returned. This may actually be
8062 * a new list, in which case the passed in one has been destroyed. The
8063 * passed in inversion list can be NULL, in which case a new one is created
8064 * with just the one range in it */
8069 if (invlist == NULL) {
8070 invlist = _new_invlist(2);
8074 len = _invlist_len(invlist);
8077 /* If comes after the final entry actually in the list, can just append it
8080 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8081 && start >= invlist_array(invlist)[len - 1]))
8083 _append_range_to_invlist(invlist, start, end);
8087 /* Here, can't just append things, create and return a new inversion list
8088 * which is the union of this range and the existing inversion list */
8089 range_invlist = _new_invlist(2);
8090 _append_range_to_invlist(range_invlist, start, end);
8092 _invlist_union(invlist, range_invlist, &invlist);
8094 /* The temporary can be freed */
8095 SvREFCNT_dec_NN(range_invlist);
8102 PERL_STATIC_INLINE SV*
8103 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8104 return _add_range_to_invlist(invlist, cp, cp);
8107 #ifndef PERL_IN_XSUB_RE
8109 Perl__invlist_invert(pTHX_ SV* const invlist)
8111 /* Complement the input inversion list. This adds a 0 if the list didn't
8112 * have a zero; removes it otherwise. As described above, the data
8113 * structure is set up so that this is very efficient */
8115 PERL_ARGS_ASSERT__INVLIST_INVERT;
8117 assert(! invlist_is_iterating(invlist));
8119 /* The inverse of matching nothing is matching everything */
8120 if (_invlist_len(invlist) == 0) {
8121 _append_range_to_invlist(invlist, 0, UV_MAX);
8125 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8129 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8131 /* Complement the input inversion list (which must be a Unicode property,
8132 * all of which don't match above the Unicode maximum code point.) And
8133 * Perl has chosen to not have the inversion match above that either. This
8134 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8140 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8142 _invlist_invert(invlist);
8144 len = _invlist_len(invlist);
8146 if (len != 0) { /* If empty do nothing */
8147 array = invlist_array(invlist);
8148 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8149 /* Add 0x110000. First, grow if necessary */
8151 if (invlist_max(invlist) < len) {
8152 invlist_extend(invlist, len);
8153 array = invlist_array(invlist);
8155 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8156 array[len - 1] = PERL_UNICODE_MAX + 1;
8158 else { /* Remove the 0x110000 */
8159 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8167 PERL_STATIC_INLINE SV*
8168 S_invlist_clone(pTHX_ SV* const invlist)
8171 /* Return a new inversion list that is a copy of the input one, which is
8174 /* Need to allocate extra space to accommodate Perl's addition of a
8175 * trailing NUL to SvPV's, since it thinks they are always strings */
8176 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8177 STRLEN physical_length = SvCUR(invlist);
8178 bool offset = *(get_invlist_offset_addr(invlist));
8180 PERL_ARGS_ASSERT_INVLIST_CLONE;
8182 *(get_invlist_offset_addr(new_invlist)) = offset;
8183 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8184 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8189 PERL_STATIC_INLINE STRLEN*
8190 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8192 /* Return the address of the UV that contains the current iteration
8195 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8197 assert(SvTYPE(invlist) == SVt_INVLIST);
8199 return &(((XINVLIST*) SvANY(invlist))->iterator);
8202 PERL_STATIC_INLINE void
8203 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8205 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8207 *get_invlist_iter_addr(invlist) = 0;
8210 PERL_STATIC_INLINE void
8211 S_invlist_iterfinish(pTHX_ SV* invlist)
8213 /* Terminate iterator for invlist. This is to catch development errors.
8214 * Any iteration that is interrupted before completed should call this
8215 * function. Functions that add code points anywhere else but to the end
8216 * of an inversion list assert that they are not in the middle of an
8217 * iteration. If they were, the addition would make the iteration
8218 * problematical: if the iteration hadn't reached the place where things
8219 * were being added, it would be ok */
8221 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8223 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8227 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8229 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8230 * This call sets in <*start> and <*end>, the next range in <invlist>.
8231 * Returns <TRUE> if successful and the next call will return the next
8232 * range; <FALSE> if was already at the end of the list. If the latter,
8233 * <*start> and <*end> are unchanged, and the next call to this function
8234 * will start over at the beginning of the list */
8236 STRLEN* pos = get_invlist_iter_addr(invlist);
8237 UV len = _invlist_len(invlist);
8240 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8243 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8247 array = invlist_array(invlist);
8249 *start = array[(*pos)++];
8255 *end = array[(*pos)++] - 1;
8261 PERL_STATIC_INLINE bool
8262 S_invlist_is_iterating(pTHX_ SV* const invlist)
8264 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8266 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8269 PERL_STATIC_INLINE UV
8270 S_invlist_highest(pTHX_ SV* const invlist)
8272 /* Returns the highest code point that matches an inversion list. This API
8273 * has an ambiguity, as it returns 0 under either the highest is actually
8274 * 0, or if the list is empty. If this distinction matters to you, check
8275 * for emptiness before calling this function */
8277 UV len = _invlist_len(invlist);
8280 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8286 array = invlist_array(invlist);
8288 /* The last element in the array in the inversion list always starts a
8289 * range that goes to infinity. That range may be for code points that are
8290 * matched in the inversion list, or it may be for ones that aren't
8291 * matched. In the latter case, the highest code point in the set is one
8292 * less than the beginning of this range; otherwise it is the final element
8293 * of this range: infinity */
8294 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8296 : array[len - 1] - 1;
8299 #ifndef PERL_IN_XSUB_RE
8301 Perl__invlist_contents(pTHX_ SV* const invlist)
8303 /* Get the contents of an inversion list into a string SV so that they can
8304 * be printed out. It uses the format traditionally done for debug tracing
8308 SV* output = newSVpvs("\n");
8310 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8312 assert(! invlist_is_iterating(invlist));
8314 invlist_iterinit(invlist);
8315 while (invlist_iternext(invlist, &start, &end)) {
8316 if (end == UV_MAX) {
8317 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8319 else if (end != start) {
8320 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8324 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8332 #ifndef PERL_IN_XSUB_RE
8334 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8336 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8337 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8338 * the string 'indent'. The output looks like this:
8339 [0] 0x000A .. 0x000D
8341 [4] 0x2028 .. 0x2029
8342 [6] 0x3104 .. INFINITY
8343 * This means that the first range of code points matched by the list are
8344 * 0xA through 0xD; the second range contains only the single code point
8345 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8346 * are used to define each range (except if the final range extends to
8347 * infinity, only a single element is needed). The array index of the
8348 * first element for the corresponding range is given in brackets. */
8353 PERL_ARGS_ASSERT__INVLIST_DUMP;
8355 if (invlist_is_iterating(invlist)) {
8356 Perl_dump_indent(aTHX_ level, file,
8357 "%sCan't dump inversion list because is in middle of iterating\n",
8362 invlist_iterinit(invlist);
8363 while (invlist_iternext(invlist, &start, &end)) {
8364 if (end == UV_MAX) {
8365 Perl_dump_indent(aTHX_ level, file,
8366 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8367 indent, (UV)count, start);
8369 else if (end != start) {
8370 Perl_dump_indent(aTHX_ level, file,
8371 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8372 indent, (UV)count, start, end);
8375 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8376 indent, (UV)count, start);
8383 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
8385 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8387 /* Return a boolean as to if the two passed in inversion lists are
8388 * identical. The final argument, if TRUE, says to take the complement of
8389 * the second inversion list before doing the comparison */
8391 const UV* array_a = invlist_array(a);
8392 const UV* array_b = invlist_array(b);
8393 UV len_a = _invlist_len(a);
8394 UV len_b = _invlist_len(b);
8396 UV i = 0; /* current index into the arrays */
8397 bool retval = TRUE; /* Assume are identical until proven otherwise */
8399 PERL_ARGS_ASSERT__INVLISTEQ;
8401 /* If are to compare 'a' with the complement of b, set it
8402 * up so are looking at b's complement. */
8405 /* The complement of nothing is everything, so <a> would have to have
8406 * just one element, starting at zero (ending at infinity) */
8408 return (len_a == 1 && array_a[0] == 0);
8410 else if (array_b[0] == 0) {
8412 /* Otherwise, to complement, we invert. Here, the first element is
8413 * 0, just remove it. To do this, we just pretend the array starts
8421 /* But if the first element is not zero, we pretend the list starts
8422 * at the 0 that is always stored immediately before the array. */
8428 /* Make sure that the lengths are the same, as well as the final element
8429 * before looping through the remainder. (Thus we test the length, final,
8430 * and first elements right off the bat) */
8431 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8434 else for (i = 0; i < len_a - 1; i++) {
8435 if (array_a[i] != array_b[i]) {
8445 #undef HEADER_LENGTH
8446 #undef TO_INTERNAL_SIZE
8447 #undef FROM_INTERNAL_SIZE
8448 #undef INVLIST_VERSION_ID
8450 /* End of inversion list object */
8453 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8455 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8456 * constructs, and updates RExC_flags with them. On input, RExC_parse
8457 * should point to the first flag; it is updated on output to point to the
8458 * final ')' or ':'. There needs to be at least one flag, or this will
8461 /* for (?g), (?gc), and (?o) warnings; warning
8462 about (?c) will warn about (?g) -- japhy */
8464 #define WASTED_O 0x01
8465 #define WASTED_G 0x02
8466 #define WASTED_C 0x04
8467 #define WASTED_GC (WASTED_G|WASTED_C)
8468 I32 wastedflags = 0x00;
8469 U32 posflags = 0, negflags = 0;
8470 U32 *flagsp = &posflags;
8471 char has_charset_modifier = '\0';
8473 bool has_use_defaults = FALSE;
8474 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8476 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8478 /* '^' as an initial flag sets certain defaults */
8479 if (UCHARAT(RExC_parse) == '^') {
8481 has_use_defaults = TRUE;
8482 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8483 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8484 ? REGEX_UNICODE_CHARSET
8485 : REGEX_DEPENDS_CHARSET);
8488 cs = get_regex_charset(RExC_flags);
8489 if (cs == REGEX_DEPENDS_CHARSET
8490 && (RExC_utf8 || RExC_uni_semantics))
8492 cs = REGEX_UNICODE_CHARSET;
8495 while (*RExC_parse) {
8496 /* && strchr("iogcmsx", *RExC_parse) */
8497 /* (?g), (?gc) and (?o) are useless here
8498 and must be globally applied -- japhy */
8499 switch (*RExC_parse) {
8501 /* Code for the imsx flags */
8502 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8504 case LOCALE_PAT_MOD:
8505 if (has_charset_modifier) {
8506 goto excess_modifier;
8508 else if (flagsp == &negflags) {
8511 cs = REGEX_LOCALE_CHARSET;
8512 has_charset_modifier = LOCALE_PAT_MOD;
8513 RExC_contains_locale = 1;
8515 case UNICODE_PAT_MOD:
8516 if (has_charset_modifier) {
8517 goto excess_modifier;
8519 else if (flagsp == &negflags) {
8522 cs = REGEX_UNICODE_CHARSET;
8523 has_charset_modifier = UNICODE_PAT_MOD;
8525 case ASCII_RESTRICT_PAT_MOD:
8526 if (flagsp == &negflags) {
8529 if (has_charset_modifier) {
8530 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8531 goto excess_modifier;
8533 /* Doubled modifier implies more restricted */
8534 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8537 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8539 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8541 case DEPENDS_PAT_MOD:
8542 if (has_use_defaults) {
8543 goto fail_modifiers;
8545 else if (flagsp == &negflags) {
8548 else if (has_charset_modifier) {
8549 goto excess_modifier;
8552 /* The dual charset means unicode semantics if the
8553 * pattern (or target, not known until runtime) are
8554 * utf8, or something in the pattern indicates unicode
8556 cs = (RExC_utf8 || RExC_uni_semantics)
8557 ? REGEX_UNICODE_CHARSET
8558 : REGEX_DEPENDS_CHARSET;
8559 has_charset_modifier = DEPENDS_PAT_MOD;
8563 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8564 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8566 else if (has_charset_modifier == *(RExC_parse - 1)) {
8567 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8570 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8575 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8577 case ONCE_PAT_MOD: /* 'o' */
8578 case GLOBAL_PAT_MOD: /* 'g' */
8579 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8580 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8581 if (! (wastedflags & wflagbit) ) {
8582 wastedflags |= wflagbit;
8583 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8586 "Useless (%s%c) - %suse /%c modifier",
8587 flagsp == &negflags ? "?-" : "?",
8589 flagsp == &negflags ? "don't " : "",
8596 case CONTINUE_PAT_MOD: /* 'c' */
8597 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8598 if (! (wastedflags & WASTED_C) ) {
8599 wastedflags |= WASTED_GC;
8600 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8603 "Useless (%sc) - %suse /gc modifier",
8604 flagsp == &negflags ? "?-" : "?",
8605 flagsp == &negflags ? "don't " : ""
8610 case KEEPCOPY_PAT_MOD: /* 'p' */
8611 if (flagsp == &negflags) {
8613 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8615 *flagsp |= RXf_PMf_KEEPCOPY;
8619 /* A flag is a default iff it is following a minus, so
8620 * if there is a minus, it means will be trying to
8621 * re-specify a default which is an error */
8622 if (has_use_defaults || flagsp == &negflags) {
8623 goto fail_modifiers;
8626 wastedflags = 0; /* reset so (?g-c) warns twice */
8630 RExC_flags |= posflags;
8631 RExC_flags &= ~negflags;
8632 set_regex_charset(&RExC_flags, cs);
8638 vFAIL3("Sequence (%.*s...) not recognized",
8639 RExC_parse-seqstart, seqstart);
8648 - reg - regular expression, i.e. main body or parenthesized thing
8650 * Caller must absorb opening parenthesis.
8652 * Combining parenthesis handling with the base level of regular expression
8653 * is a trifle forced, but the need to tie the tails of the branches to what
8654 * follows makes it hard to avoid.
8656 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8658 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8660 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8663 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8664 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8665 needs to be restarted.
8666 Otherwise would only return NULL if regbranch() returns NULL, which
8669 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8670 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8671 * 2 is like 1, but indicates that nextchar() has been called to advance
8672 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
8673 * this flag alerts us to the need to check for that */
8676 regnode *ret; /* Will be the head of the group. */
8679 regnode *ender = NULL;
8682 U32 oregflags = RExC_flags;
8683 bool have_branch = 0;
8685 I32 freeze_paren = 0;
8686 I32 after_freeze = 0;
8688 char * parse_start = RExC_parse; /* MJD */
8689 char * const oregcomp_parse = RExC_parse;
8691 GET_RE_DEBUG_FLAGS_DECL;
8693 PERL_ARGS_ASSERT_REG;
8694 DEBUG_PARSE("reg ");
8696 *flagp = 0; /* Tentatively. */
8699 /* Make an OPEN node, if parenthesized. */
8702 /* Under /x, space and comments can be gobbled up between the '(' and
8703 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
8704 * intervening space, as the sequence is a token, and a token should be
8706 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8708 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8709 char *start_verb = RExC_parse;
8710 STRLEN verb_len = 0;
8711 char *start_arg = NULL;
8712 unsigned char op = 0;
8714 int internal_argval = 0; /* internal_argval is only useful if !argok */
8716 if (has_intervening_patws && SIZE_ONLY) {
8717 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8719 while ( *RExC_parse && *RExC_parse != ')' ) {
8720 if ( *RExC_parse == ':' ) {
8721 start_arg = RExC_parse + 1;
8727 verb_len = RExC_parse - start_verb;
8730 while ( *RExC_parse && *RExC_parse != ')' )
8732 if ( *RExC_parse != ')' )
8733 vFAIL("Unterminated verb pattern argument");
8734 if ( RExC_parse == start_arg )
8737 if ( *RExC_parse != ')' )
8738 vFAIL("Unterminated verb pattern");
8741 switch ( *start_verb ) {
8742 case 'A': /* (*ACCEPT) */
8743 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8745 internal_argval = RExC_nestroot;
8748 case 'C': /* (*COMMIT) */
8749 if ( memEQs(start_verb,verb_len,"COMMIT") )
8752 case 'F': /* (*FAIL) */
8753 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8758 case ':': /* (*:NAME) */
8759 case 'M': /* (*MARK:NAME) */
8760 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8765 case 'P': /* (*PRUNE) */
8766 if ( memEQs(start_verb,verb_len,"PRUNE") )
8769 case 'S': /* (*SKIP) */
8770 if ( memEQs(start_verb,verb_len,"SKIP") )
8773 case 'T': /* (*THEN) */
8774 /* [19:06] <TimToady> :: is then */
8775 if ( memEQs(start_verb,verb_len,"THEN") ) {
8777 RExC_seen |= REG_SEEN_CUTGROUP;
8783 vFAIL3("Unknown verb pattern '%.*s'",
8784 verb_len, start_verb);
8787 if ( start_arg && internal_argval ) {
8788 vFAIL3("Verb pattern '%.*s' may not have an argument",
8789 verb_len, start_verb);
8790 } else if ( argok < 0 && !start_arg ) {
8791 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8792 verb_len, start_verb);
8794 ret = reganode(pRExC_state, op, internal_argval);
8795 if ( ! internal_argval && ! SIZE_ONLY ) {
8797 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8798 ARG(ret) = add_data( pRExC_state, 1, "S" );
8799 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8806 if (!internal_argval)
8807 RExC_seen |= REG_SEEN_VERBARG;
8808 } else if ( start_arg ) {
8809 vFAIL3("Verb pattern '%.*s' may not have an argument",
8810 verb_len, start_verb);
8812 ret = reg_node(pRExC_state, op);
8814 nextchar(pRExC_state);
8817 else if (*RExC_parse == '?') { /* (?...) */
8818 bool is_logical = 0;
8819 const char * const seqstart = RExC_parse;
8820 if (has_intervening_patws && SIZE_ONLY) {
8821 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8825 paren = *RExC_parse++;
8826 ret = NULL; /* For look-ahead/behind. */
8829 case 'P': /* (?P...) variants for those used to PCRE/Python */
8830 paren = *RExC_parse++;
8831 if ( paren == '<') /* (?P<...>) named capture */
8833 else if (paren == '>') { /* (?P>name) named recursion */
8834 goto named_recursion;
8836 else if (paren == '=') { /* (?P=...) named backref */
8837 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8838 you change this make sure you change that */
8839 char* name_start = RExC_parse;
8841 SV *sv_dat = reg_scan_name(pRExC_state,
8842 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8843 if (RExC_parse == name_start || *RExC_parse != ')')
8844 vFAIL2("Sequence %.3s... not terminated",parse_start);
8847 num = add_data( pRExC_state, 1, "S" );
8848 RExC_rxi->data->data[num]=(void*)sv_dat;
8849 SvREFCNT_inc_simple_void(sv_dat);
8852 ret = reganode(pRExC_state,
8855 : (ASCII_FOLD_RESTRICTED)
8857 : (AT_LEAST_UNI_SEMANTICS)
8865 Set_Node_Offset(ret, parse_start+1);
8866 Set_Node_Cur_Length(ret, parse_start);
8868 nextchar(pRExC_state);
8872 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8874 case '<': /* (?<...) */
8875 if (*RExC_parse == '!')
8877 else if (*RExC_parse != '=')
8883 case '\'': /* (?'...') */
8884 name_start= RExC_parse;
8885 svname = reg_scan_name(pRExC_state,
8886 SIZE_ONLY ? /* reverse test from the others */
8887 REG_RSN_RETURN_NAME :
8888 REG_RSN_RETURN_NULL);
8889 if (RExC_parse == name_start) {
8891 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8894 if (*RExC_parse != paren)
8895 vFAIL2("Sequence (?%c... not terminated",
8896 paren=='>' ? '<' : paren);
8900 if (!svname) /* shouldn't happen */
8902 "panic: reg_scan_name returned NULL");
8903 if (!RExC_paren_names) {
8904 RExC_paren_names= newHV();
8905 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8907 RExC_paren_name_list= newAV();
8908 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8911 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8913 sv_dat = HeVAL(he_str);
8915 /* croak baby croak */
8917 "panic: paren_name hash element allocation failed");
8918 } else if ( SvPOK(sv_dat) ) {
8919 /* (?|...) can mean we have dupes so scan to check
8920 its already been stored. Maybe a flag indicating
8921 we are inside such a construct would be useful,
8922 but the arrays are likely to be quite small, so
8923 for now we punt -- dmq */
8924 IV count = SvIV(sv_dat);
8925 I32 *pv = (I32*)SvPVX(sv_dat);
8927 for ( i = 0 ; i < count ; i++ ) {
8928 if ( pv[i] == RExC_npar ) {
8934 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8935 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8936 pv[count] = RExC_npar;
8937 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8940 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8941 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8943 SvIV_set(sv_dat, 1);
8946 /* Yes this does cause a memory leak in debugging Perls */
8947 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8948 SvREFCNT_dec_NN(svname);
8951 /*sv_dump(sv_dat);*/
8953 nextchar(pRExC_state);
8955 goto capturing_parens;
8957 RExC_seen |= REG_SEEN_LOOKBEHIND;
8958 RExC_in_lookbehind++;
8960 case '=': /* (?=...) */
8961 RExC_seen_zerolen++;
8963 case '!': /* (?!...) */
8964 RExC_seen_zerolen++;
8965 if (*RExC_parse == ')') {
8966 ret=reg_node(pRExC_state, OPFAIL);
8967 nextchar(pRExC_state);
8971 case '|': /* (?|...) */
8972 /* branch reset, behave like a (?:...) except that
8973 buffers in alternations share the same numbers */
8975 after_freeze = freeze_paren = RExC_npar;
8977 case ':': /* (?:...) */
8978 case '>': /* (?>...) */
8980 case '$': /* (?$...) */
8981 case '@': /* (?@...) */
8982 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8984 case '#': /* (?#...) */
8985 /* XXX As soon as we disallow separating the '?' and '*' (by
8986 * spaces or (?#...) comment), it is believed that this case
8987 * will be unreachable and can be removed. See
8989 while (*RExC_parse && *RExC_parse != ')')
8991 if (*RExC_parse != ')')
8992 FAIL("Sequence (?#... not terminated");
8993 nextchar(pRExC_state);
8996 case '0' : /* (?0) */
8997 case 'R' : /* (?R) */
8998 if (*RExC_parse != ')')
8999 FAIL("Sequence (?R) not terminated");
9000 ret = reg_node(pRExC_state, GOSTART);
9001 *flagp |= POSTPONED;
9002 nextchar(pRExC_state);
9005 { /* named and numeric backreferences */
9007 case '&': /* (?&NAME) */
9008 parse_start = RExC_parse - 1;
9011 SV *sv_dat = reg_scan_name(pRExC_state,
9012 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9013 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9015 goto gen_recurse_regop;
9016 assert(0); /* NOT REACHED */
9018 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9020 vFAIL("Illegal pattern");
9022 goto parse_recursion;
9024 case '-': /* (?-1) */
9025 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9026 RExC_parse--; /* rewind to let it be handled later */
9030 case '1': case '2': case '3': case '4': /* (?1) */
9031 case '5': case '6': case '7': case '8': case '9':
9034 num = atoi(RExC_parse);
9035 parse_start = RExC_parse - 1; /* MJD */
9036 if (*RExC_parse == '-')
9038 while (isDIGIT(*RExC_parse))
9040 if (*RExC_parse!=')')
9041 vFAIL("Expecting close bracket");
9044 if ( paren == '-' ) {
9046 Diagram of capture buffer numbering.
9047 Top line is the normal capture buffer numbers
9048 Bottom line is the negative indexing as from
9052 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9056 num = RExC_npar + num;
9059 vFAIL("Reference to nonexistent group");
9061 } else if ( paren == '+' ) {
9062 num = RExC_npar + num - 1;
9065 ret = reganode(pRExC_state, GOSUB, num);
9067 if (num > (I32)RExC_rx->nparens) {
9069 vFAIL("Reference to nonexistent group");
9071 ARG2L_SET( ret, RExC_recurse_count++);
9073 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9074 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9078 RExC_seen |= REG_SEEN_RECURSE;
9079 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9080 Set_Node_Offset(ret, parse_start); /* MJD */
9082 *flagp |= POSTPONED;
9083 nextchar(pRExC_state);
9085 } /* named and numeric backreferences */
9086 assert(0); /* NOT REACHED */
9088 case '?': /* (??...) */
9090 if (*RExC_parse != '{') {
9092 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9095 *flagp |= POSTPONED;
9096 paren = *RExC_parse++;
9098 case '{': /* (?{...}) */
9101 struct reg_code_block *cb;
9103 RExC_seen_zerolen++;
9105 if ( !pRExC_state->num_code_blocks
9106 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9107 || pRExC_state->code_blocks[pRExC_state->code_index].start
9108 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9111 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9112 FAIL("panic: Sequence (?{...}): no code block found\n");
9113 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9115 /* this is a pre-compiled code block (?{...}) */
9116 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9117 RExC_parse = RExC_start + cb->end;
9120 if (cb->src_regex) {
9121 n = add_data(pRExC_state, 2, "rl");
9122 RExC_rxi->data->data[n] =
9123 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9124 RExC_rxi->data->data[n+1] = (void*)o;
9127 n = add_data(pRExC_state, 1,
9128 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9129 RExC_rxi->data->data[n] = (void*)o;
9132 pRExC_state->code_index++;
9133 nextchar(pRExC_state);
9137 ret = reg_node(pRExC_state, LOGICAL);
9138 eval = reganode(pRExC_state, EVAL, n);
9141 /* for later propagation into (??{}) return value */
9142 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9144 REGTAIL(pRExC_state, ret, eval);
9145 /* deal with the length of this later - MJD */
9148 ret = reganode(pRExC_state, EVAL, n);
9149 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9150 Set_Node_Offset(ret, parse_start);
9153 case '(': /* (?(?{...})...) and (?(?=...)...) */
9156 if (RExC_parse[0] == '?') { /* (?(?...)) */
9157 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9158 || RExC_parse[1] == '<'
9159 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9163 ret = reg_node(pRExC_state, LOGICAL);
9167 tail = reg(pRExC_state, 1, &flag, depth+1);
9168 if (flag & RESTART_UTF8) {
9169 *flagp = RESTART_UTF8;
9172 REGTAIL(pRExC_state, ret, tail);
9176 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9177 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9179 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9180 char *name_start= RExC_parse++;
9182 SV *sv_dat=reg_scan_name(pRExC_state,
9183 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9184 if (RExC_parse == name_start || *RExC_parse != ch)
9185 vFAIL2("Sequence (?(%c... not terminated",
9186 (ch == '>' ? '<' : ch));
9189 num = add_data( pRExC_state, 1, "S" );
9190 RExC_rxi->data->data[num]=(void*)sv_dat;
9191 SvREFCNT_inc_simple_void(sv_dat);
9193 ret = reganode(pRExC_state,NGROUPP,num);
9194 goto insert_if_check_paren;
9196 else if (RExC_parse[0] == 'D' &&
9197 RExC_parse[1] == 'E' &&
9198 RExC_parse[2] == 'F' &&
9199 RExC_parse[3] == 'I' &&
9200 RExC_parse[4] == 'N' &&
9201 RExC_parse[5] == 'E')
9203 ret = reganode(pRExC_state,DEFINEP,0);
9206 goto insert_if_check_paren;
9208 else if (RExC_parse[0] == 'R') {
9211 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9212 parno = atoi(RExC_parse++);
9213 while (isDIGIT(*RExC_parse))
9215 } else if (RExC_parse[0] == '&') {
9218 sv_dat = reg_scan_name(pRExC_state,
9219 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9220 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9222 ret = reganode(pRExC_state,INSUBP,parno);
9223 goto insert_if_check_paren;
9225 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9228 parno = atoi(RExC_parse++);
9230 while (isDIGIT(*RExC_parse))
9232 ret = reganode(pRExC_state, GROUPP, parno);
9234 insert_if_check_paren:
9235 if ((c = *nextchar(pRExC_state)) != ')')
9236 vFAIL("Switch condition not recognized");
9238 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9239 br = regbranch(pRExC_state, &flags, 1,depth+1);
9241 if (flags & RESTART_UTF8) {
9242 *flagp = RESTART_UTF8;
9245 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9248 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9249 c = *nextchar(pRExC_state);
9254 vFAIL("(?(DEFINE)....) does not allow branches");
9255 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9256 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9257 if (flags & RESTART_UTF8) {
9258 *flagp = RESTART_UTF8;
9261 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9264 REGTAIL(pRExC_state, ret, lastbr);
9267 c = *nextchar(pRExC_state);
9272 vFAIL("Switch (?(condition)... contains too many branches");
9273 ender = reg_node(pRExC_state, TAIL);
9274 REGTAIL(pRExC_state, br, ender);
9276 REGTAIL(pRExC_state, lastbr, ender);
9277 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9280 REGTAIL(pRExC_state, ret, ender);
9281 RExC_size++; /* XXX WHY do we need this?!!
9282 For large programs it seems to be required
9283 but I can't figure out why. -- dmq*/
9287 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9290 case '[': /* (?[ ... ]) */
9291 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9294 RExC_parse--; /* for vFAIL to print correctly */
9295 vFAIL("Sequence (? incomplete");
9297 default: /* e.g., (?i) */
9300 parse_lparen_question_flags(pRExC_state);
9301 if (UCHARAT(RExC_parse) != ':') {
9302 nextchar(pRExC_state);
9307 nextchar(pRExC_state);
9317 ret = reganode(pRExC_state, OPEN, parno);
9320 RExC_nestroot = parno;
9321 if (RExC_seen & REG_SEEN_RECURSE
9322 && !RExC_open_parens[parno-1])
9324 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9325 "Setting open paren #%"IVdf" to %d\n",
9326 (IV)parno, REG_NODE_NUM(ret)));
9327 RExC_open_parens[parno-1]= ret;
9330 Set_Node_Length(ret, 1); /* MJD */
9331 Set_Node_Offset(ret, RExC_parse); /* MJD */
9339 /* Pick up the branches, linking them together. */
9340 parse_start = RExC_parse; /* MJD */
9341 br = regbranch(pRExC_state, &flags, 1,depth+1);
9343 /* branch_len = (paren != 0); */
9346 if (flags & RESTART_UTF8) {
9347 *flagp = RESTART_UTF8;
9350 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9352 if (*RExC_parse == '|') {
9353 if (!SIZE_ONLY && RExC_extralen) {
9354 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9357 reginsert(pRExC_state, BRANCH, br, depth+1);
9358 Set_Node_Length(br, paren != 0);
9359 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9363 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9365 else if (paren == ':') {
9366 *flagp |= flags&SIMPLE;
9368 if (is_open) { /* Starts with OPEN. */
9369 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9371 else if (paren != '?') /* Not Conditional */
9373 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9375 while (*RExC_parse == '|') {
9376 if (!SIZE_ONLY && RExC_extralen) {
9377 ender = reganode(pRExC_state, LONGJMP,0);
9378 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9381 RExC_extralen += 2; /* Account for LONGJMP. */
9382 nextchar(pRExC_state);
9384 if (RExC_npar > after_freeze)
9385 after_freeze = RExC_npar;
9386 RExC_npar = freeze_paren;
9388 br = regbranch(pRExC_state, &flags, 0, depth+1);
9391 if (flags & RESTART_UTF8) {
9392 *flagp = RESTART_UTF8;
9395 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9397 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9399 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9402 if (have_branch || paren != ':') {
9403 /* Make a closing node, and hook it on the end. */
9406 ender = reg_node(pRExC_state, TAIL);
9409 ender = reganode(pRExC_state, CLOSE, parno);
9410 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9411 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9412 "Setting close paren #%"IVdf" to %d\n",
9413 (IV)parno, REG_NODE_NUM(ender)));
9414 RExC_close_parens[parno-1]= ender;
9415 if (RExC_nestroot == parno)
9418 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9419 Set_Node_Length(ender,1); /* MJD */
9425 *flagp &= ~HASWIDTH;
9428 ender = reg_node(pRExC_state, SUCCEED);
9431 ender = reg_node(pRExC_state, END);
9433 assert(!RExC_opend); /* there can only be one! */
9438 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9439 SV * const mysv_val1=sv_newmortal();
9440 SV * const mysv_val2=sv_newmortal();
9441 DEBUG_PARSE_MSG("lsbr");
9442 regprop(RExC_rx, mysv_val1, lastbr);
9443 regprop(RExC_rx, mysv_val2, ender);
9444 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9445 SvPV_nolen_const(mysv_val1),
9446 (IV)REG_NODE_NUM(lastbr),
9447 SvPV_nolen_const(mysv_val2),
9448 (IV)REG_NODE_NUM(ender),
9449 (IV)(ender - lastbr)
9452 REGTAIL(pRExC_state, lastbr, ender);
9454 if (have_branch && !SIZE_ONLY) {
9457 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9459 /* Hook the tails of the branches to the closing node. */
9460 for (br = ret; br; br = regnext(br)) {
9461 const U8 op = PL_regkind[OP(br)];
9463 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9464 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9467 else if (op == BRANCHJ) {
9468 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9469 /* for now we always disable this optimisation * /
9470 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9476 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9477 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9478 SV * const mysv_val1=sv_newmortal();
9479 SV * const mysv_val2=sv_newmortal();
9480 DEBUG_PARSE_MSG("NADA");
9481 regprop(RExC_rx, mysv_val1, ret);
9482 regprop(RExC_rx, mysv_val2, ender);
9483 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9484 SvPV_nolen_const(mysv_val1),
9485 (IV)REG_NODE_NUM(ret),
9486 SvPV_nolen_const(mysv_val2),
9487 (IV)REG_NODE_NUM(ender),
9492 if (OP(ender) == TAIL) {
9497 for ( opt= br + 1; opt < ender ; opt++ )
9499 NEXT_OFF(br)= ender - br;
9507 static const char parens[] = "=!<,>";
9509 if (paren && (p = strchr(parens, paren))) {
9510 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9511 int flag = (p - parens) > 1;
9514 node = SUSPEND, flag = 0;
9515 reginsert(pRExC_state, node,ret, depth+1);
9516 Set_Node_Cur_Length(ret, parse_start);
9517 Set_Node_Offset(ret, parse_start + 1);
9519 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9523 /* Check for proper termination. */
9525 /* restore original flags, but keep (?p) */
9526 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9527 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9528 RExC_parse = oregcomp_parse;
9529 vFAIL("Unmatched (");
9532 else if (!paren && RExC_parse < RExC_end) {
9533 if (*RExC_parse == ')') {
9535 vFAIL("Unmatched )");
9538 FAIL("Junk on end of regexp"); /* "Can't happen". */
9539 assert(0); /* NOTREACHED */
9542 if (RExC_in_lookbehind) {
9543 RExC_in_lookbehind--;
9545 if (after_freeze > RExC_npar)
9546 RExC_npar = after_freeze;
9551 - regbranch - one alternative of an | operator
9553 * Implements the concatenation operator.
9555 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9559 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9563 regnode *chain = NULL;
9565 I32 flags = 0, c = 0;
9566 GET_RE_DEBUG_FLAGS_DECL;
9568 PERL_ARGS_ASSERT_REGBRANCH;
9570 DEBUG_PARSE("brnc");
9575 if (!SIZE_ONLY && RExC_extralen)
9576 ret = reganode(pRExC_state, BRANCHJ,0);
9578 ret = reg_node(pRExC_state, BRANCH);
9579 Set_Node_Length(ret, 1);
9583 if (!first && SIZE_ONLY)
9584 RExC_extralen += 1; /* BRANCHJ */
9586 *flagp = WORST; /* Tentatively. */
9589 nextchar(pRExC_state);
9590 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9592 latest = regpiece(pRExC_state, &flags,depth+1);
9593 if (latest == NULL) {
9594 if (flags & TRYAGAIN)
9596 if (flags & RESTART_UTF8) {
9597 *flagp = RESTART_UTF8;
9600 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9602 else if (ret == NULL)
9604 *flagp |= flags&(HASWIDTH|POSTPONED);
9605 if (chain == NULL) /* First piece. */
9606 *flagp |= flags&SPSTART;
9609 REGTAIL(pRExC_state, chain, latest);
9614 if (chain == NULL) { /* Loop ran zero times. */
9615 chain = reg_node(pRExC_state, NOTHING);
9620 *flagp |= flags&SIMPLE;
9627 - regpiece - something followed by possible [*+?]
9629 * Note that the branching code sequences used for ? and the general cases
9630 * of * and + are somewhat optimized: they use the same NOTHING node as
9631 * both the endmarker for their branch list and the body of the last branch.
9632 * It might seem that this node could be dispensed with entirely, but the
9633 * endmarker role is not redundant.
9635 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9637 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9641 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9648 const char * const origparse = RExC_parse;
9650 I32 max = REG_INFTY;
9651 #ifdef RE_TRACK_PATTERN_OFFSETS
9654 const char *maxpos = NULL;
9656 /* Save the original in case we change the emitted regop to a FAIL. */
9657 regnode * const orig_emit = RExC_emit;
9659 GET_RE_DEBUG_FLAGS_DECL;
9661 PERL_ARGS_ASSERT_REGPIECE;
9663 DEBUG_PARSE("piec");
9665 ret = regatom(pRExC_state, &flags,depth+1);
9667 if (flags & (TRYAGAIN|RESTART_UTF8))
9668 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9670 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9676 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9678 #ifdef RE_TRACK_PATTERN_OFFSETS
9679 parse_start = RExC_parse; /* MJD */
9681 next = RExC_parse + 1;
9682 while (isDIGIT(*next) || *next == ',') {
9691 if (*next == '}') { /* got one */
9695 min = atoi(RExC_parse);
9699 maxpos = RExC_parse;
9701 if (!max && *maxpos != '0')
9702 max = REG_INFTY; /* meaning "infinity" */
9703 else if (max >= REG_INFTY)
9704 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9706 nextchar(pRExC_state);
9707 if (max < min) { /* If can't match, warn and optimize to fail
9710 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9712 /* We can't back off the size because we have to reserve
9713 * enough space for all the things we are about to throw
9714 * away, but we can shrink it by the ammount we are about
9716 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9719 RExC_emit = orig_emit;
9721 ret = reg_node(pRExC_state, OPFAIL);
9726 if ((flags&SIMPLE)) {
9727 RExC_naughty += 2 + RExC_naughty / 2;
9728 reginsert(pRExC_state, CURLY, ret, depth+1);
9729 Set_Node_Offset(ret, parse_start+1); /* MJD */
9730 Set_Node_Cur_Length(ret, parse_start);
9733 regnode * const w = reg_node(pRExC_state, WHILEM);
9736 REGTAIL(pRExC_state, ret, w);
9737 if (!SIZE_ONLY && RExC_extralen) {
9738 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9739 reginsert(pRExC_state, NOTHING,ret, depth+1);
9740 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9742 reginsert(pRExC_state, CURLYX,ret, depth+1);
9744 Set_Node_Offset(ret, parse_start+1);
9745 Set_Node_Length(ret,
9746 op == '{' ? (RExC_parse - parse_start) : 1);
9748 if (!SIZE_ONLY && RExC_extralen)
9749 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9750 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9752 RExC_whilem_seen++, RExC_extralen += 3;
9753 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9762 ARG1_SET(ret, (U16)min);
9763 ARG2_SET(ret, (U16)max);
9775 #if 0 /* Now runtime fix should be reliable. */
9777 /* if this is reinstated, don't forget to put this back into perldiag:
9779 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9781 (F) The part of the regexp subject to either the * or + quantifier
9782 could match an empty string. The {#} shows in the regular
9783 expression about where the problem was discovered.
9787 if (!(flags&HASWIDTH) && op != '?')
9788 vFAIL("Regexp *+ operand could be empty");
9791 #ifdef RE_TRACK_PATTERN_OFFSETS
9792 parse_start = RExC_parse;
9794 nextchar(pRExC_state);
9796 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9798 if (op == '*' && (flags&SIMPLE)) {
9799 reginsert(pRExC_state, STAR, ret, depth+1);
9803 else if (op == '*') {
9807 else if (op == '+' && (flags&SIMPLE)) {
9808 reginsert(pRExC_state, PLUS, ret, depth+1);
9812 else if (op == '+') {
9816 else if (op == '?') {
9821 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9822 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9823 ckWARN3reg(RExC_parse,
9824 "%.*s matches null string many times",
9825 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9827 (void)ReREFCNT_inc(RExC_rx_sv);
9830 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9831 nextchar(pRExC_state);
9832 reginsert(pRExC_state, MINMOD, ret, depth+1);
9833 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9836 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9838 nextchar(pRExC_state);
9839 ender = reg_node(pRExC_state, SUCCEED);
9840 REGTAIL(pRExC_state, ret, ender);
9841 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9843 ender = reg_node(pRExC_state, TAIL);
9844 REGTAIL(pRExC_state, ret, ender);
9847 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9849 vFAIL("Nested quantifiers");
9856 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9857 const bool strict /* Apply stricter parsing rules? */
9861 /* This is expected to be called by a parser routine that has recognized '\N'
9862 and needs to handle the rest. RExC_parse is expected to point at the first
9863 char following the N at the time of the call. On successful return,
9864 RExC_parse has been updated to point to just after the sequence identified
9865 by this routine, and <*flagp> has been updated.
9867 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9870 \N may begin either a named sequence, or if outside a character class, mean
9871 to match a non-newline. For non single-quoted regexes, the tokenizer has
9872 attempted to decide which, and in the case of a named sequence, converted it
9873 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9874 where c1... are the characters in the sequence. For single-quoted regexes,
9875 the tokenizer passes the \N sequence through unchanged; this code will not
9876 attempt to determine this nor expand those, instead raising a syntax error.
9877 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9878 or there is no '}', it signals that this \N occurrence means to match a
9881 Only the \N{U+...} form should occur in a character class, for the same
9882 reason that '.' inside a character class means to just match a period: it
9883 just doesn't make sense.
9885 The function raises an error (via vFAIL), and doesn't return for various
9886 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9887 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9888 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9889 only possible if node_p is non-NULL.
9892 If <valuep> is non-null, it means the caller can accept an input sequence
9893 consisting of a just a single code point; <*valuep> is set to that value
9894 if the input is such.
9896 If <node_p> is non-null it signifies that the caller can accept any other
9897 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9899 1) \N means not-a-NL: points to a newly created REG_ANY node;
9900 2) \N{}: points to a new NOTHING node;
9901 3) otherwise: points to a new EXACT node containing the resolved
9903 Note that FALSE is returned for single code point sequences if <valuep> is
9907 char * endbrace; /* '}' following the name */
9909 char *endchar; /* Points to '.' or '}' ending cur char in the input
9911 bool has_multiple_chars; /* true if the input stream contains a sequence of
9912 more than one character */
9914 GET_RE_DEBUG_FLAGS_DECL;
9916 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9920 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9922 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9923 * modifier. The other meaning does not */
9924 p = (RExC_flags & RXf_PMf_EXTENDED)
9925 ? regwhite( pRExC_state, RExC_parse )
9928 /* Disambiguate between \N meaning a named character versus \N meaning
9929 * [^\n]. The former is assumed when it can't be the latter. */
9930 if (*p != '{' || regcurly(p, FALSE)) {
9933 /* no bare \N in a charclass */
9934 if (in_char_class) {
9935 vFAIL("\\N in a character class must be a named character: \\N{...}");
9939 nextchar(pRExC_state);
9940 *node_p = reg_node(pRExC_state, REG_ANY);
9941 *flagp |= HASWIDTH|SIMPLE;
9944 Set_Node_Length(*node_p, 1); /* MJD */
9948 /* Here, we have decided it should be a named character or sequence */
9950 /* The test above made sure that the next real character is a '{', but
9951 * under the /x modifier, it could be separated by space (or a comment and
9952 * \n) and this is not allowed (for consistency with \x{...} and the
9953 * tokenizer handling of \N{NAME}). */
9954 if (*RExC_parse != '{') {
9955 vFAIL("Missing braces on \\N{}");
9958 RExC_parse++; /* Skip past the '{' */
9960 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9961 || ! (endbrace == RExC_parse /* nothing between the {} */
9962 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9963 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9965 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9966 vFAIL("\\N{NAME} must be resolved by the lexer");
9969 if (endbrace == RExC_parse) { /* empty: \N{} */
9972 *node_p = reg_node(pRExC_state,NOTHING);
9974 else if (in_char_class) {
9975 if (SIZE_ONLY && in_char_class) {
9977 RExC_parse++; /* Position after the "}" */
9978 vFAIL("Zero length \\N{}");
9981 ckWARNreg(RExC_parse,
9982 "Ignoring zero length \\N{} in character class");
9990 nextchar(pRExC_state);
9994 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9995 RExC_parse += 2; /* Skip past the 'U+' */
9997 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9999 /* Code points are separated by dots. If none, there is only one code
10000 * point, and is terminated by the brace */
10001 has_multiple_chars = (endchar < endbrace);
10003 if (valuep && (! has_multiple_chars || in_char_class)) {
10004 /* We only pay attention to the first char of
10005 multichar strings being returned in char classes. I kinda wonder
10006 if this makes sense as it does change the behaviour
10007 from earlier versions, OTOH that behaviour was broken
10008 as well. XXX Solution is to recharacterize as
10009 [rest-of-class]|multi1|multi2... */
10011 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10012 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10013 | PERL_SCAN_DISALLOW_PREFIX
10014 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10016 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10018 /* The tokenizer should have guaranteed validity, but it's possible to
10019 * bypass it by using single quoting, so check */
10020 if (length_of_hex == 0
10021 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10023 RExC_parse += length_of_hex; /* Includes all the valid */
10024 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10025 ? UTF8SKIP(RExC_parse)
10027 /* Guard against malformed utf8 */
10028 if (RExC_parse >= endchar) {
10029 RExC_parse = endchar;
10031 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10034 if (in_char_class && has_multiple_chars) {
10036 RExC_parse = endbrace;
10037 vFAIL("\\N{} in character class restricted to one character");
10040 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10044 RExC_parse = endbrace + 1;
10046 else if (! node_p || ! has_multiple_chars) {
10048 /* Here, the input is legal, but not according to the caller's
10049 * options. We fail without advancing the parse, so that the
10050 * caller can try again */
10056 /* What is done here is to convert this to a sub-pattern of the form
10057 * (?:\x{char1}\x{char2}...)
10058 * and then call reg recursively. That way, it retains its atomicness,
10059 * while not having to worry about special handling that some code
10060 * points may have. toke.c has converted the original Unicode values
10061 * to native, so that we can just pass on the hex values unchanged. We
10062 * do have to set a flag to keep recoding from happening in the
10065 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10067 char *orig_end = RExC_end;
10070 while (RExC_parse < endbrace) {
10072 /* Convert to notation the rest of the code understands */
10073 sv_catpv(substitute_parse, "\\x{");
10074 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10075 sv_catpv(substitute_parse, "}");
10077 /* Point to the beginning of the next character in the sequence. */
10078 RExC_parse = endchar + 1;
10079 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10081 sv_catpv(substitute_parse, ")");
10083 RExC_parse = SvPV(substitute_parse, len);
10085 /* Don't allow empty number */
10087 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10089 RExC_end = RExC_parse + len;
10091 /* The values are Unicode, and therefore not subject to recoding */
10092 RExC_override_recoding = 1;
10094 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10095 if (flags & RESTART_UTF8) {
10096 *flagp = RESTART_UTF8;
10099 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10102 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10104 RExC_parse = endbrace;
10105 RExC_end = orig_end;
10106 RExC_override_recoding = 0;
10108 nextchar(pRExC_state);
10118 * It returns the code point in utf8 for the value in *encp.
10119 * value: a code value in the source encoding
10120 * encp: a pointer to an Encode object
10122 * If the result from Encode is not a single character,
10123 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10126 S_reg_recode(pTHX_ const char value, SV **encp)
10129 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10130 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10131 const STRLEN newlen = SvCUR(sv);
10132 UV uv = UNICODE_REPLACEMENT;
10134 PERL_ARGS_ASSERT_REG_RECODE;
10138 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10141 if (!newlen || numlen != newlen) {
10142 uv = UNICODE_REPLACEMENT;
10148 PERL_STATIC_INLINE U8
10149 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10153 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10159 op = get_regex_charset(RExC_flags);
10160 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10161 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10162 been, so there is no hole */
10165 return op + EXACTF;
10168 PERL_STATIC_INLINE void
10169 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10171 /* This knows the details about sizing an EXACTish node, setting flags for
10172 * it (by setting <*flagp>, and potentially populating it with a single
10175 * If <len> (the length in bytes) is non-zero, this function assumes that
10176 * the node has already been populated, and just does the sizing. In this
10177 * case <code_point> should be the final code point that has already been
10178 * placed into the node. This value will be ignored except that under some
10179 * circumstances <*flagp> is set based on it.
10181 * If <len> is zero, the function assumes that the node is to contain only
10182 * the single character given by <code_point> and calculates what <len>
10183 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10184 * additionally will populate the node's STRING with <code_point>, if <len>
10185 * is 0. In both cases <*flagp> is appropriately set
10187 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10188 * 255, must be folded (the former only when the rules indicate it can
10191 bool len_passed_in = cBOOL(len != 0);
10192 U8 character[UTF8_MAXBYTES_CASE+1];
10194 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10196 if (! len_passed_in) {
10198 if (FOLD && (! LOC || code_point > 255)) {
10199 _to_uni_fold_flags(code_point,
10202 FOLD_FLAGS_FULL | ((LOC)
10203 ? FOLD_FLAGS_LOCALE
10204 : (ASCII_FOLD_RESTRICTED)
10205 ? FOLD_FLAGS_NOMIX_ASCII
10209 uvchr_to_utf8( character, code_point);
10210 len = UTF8SKIP(character);
10214 || code_point != LATIN_SMALL_LETTER_SHARP_S
10215 || ASCII_FOLD_RESTRICTED
10216 || ! AT_LEAST_UNI_SEMANTICS)
10218 *character = (U8) code_point;
10223 *(character + 1) = 's';
10229 RExC_size += STR_SZ(len);
10232 RExC_emit += STR_SZ(len);
10233 STR_LEN(node) = len;
10234 if (! len_passed_in) {
10235 Copy((char *) character, STRING(node), len, char);
10239 *flagp |= HASWIDTH;
10241 /* A single character node is SIMPLE, except for the special-cased SHARP S
10243 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10244 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10245 || ! FOLD || ! DEPENDS_SEMANTICS))
10252 - regatom - the lowest level
10254 Try to identify anything special at the start of the pattern. If there
10255 is, then handle it as required. This may involve generating a single regop,
10256 such as for an assertion; or it may involve recursing, such as to
10257 handle a () structure.
10259 If the string doesn't start with something special then we gobble up
10260 as much literal text as we can.
10262 Once we have been able to handle whatever type of thing started the
10263 sequence, we return.
10265 Note: we have to be careful with escapes, as they can be both literal
10266 and special, and in the case of \10 and friends, context determines which.
10268 A summary of the code structure is:
10270 switch (first_byte) {
10271 cases for each special:
10272 handle this special;
10275 switch (2nd byte) {
10276 cases for each unambiguous special:
10277 handle this special;
10279 cases for each ambigous special/literal:
10281 if (special) handle here
10283 default: // unambiguously literal:
10286 default: // is a literal char
10289 create EXACTish node for literal;
10290 while (more input and node isn't full) {
10291 switch (input_byte) {
10292 cases for each special;
10293 make sure parse pointer is set so that the next call to
10294 regatom will see this special first
10295 goto loopdone; // EXACTish node terminated by prev. char
10297 append char to EXACTISH node;
10299 get next input byte;
10303 return the generated node;
10305 Specifically there are two separate switches for handling
10306 escape sequences, with the one for handling literal escapes requiring
10307 a dummy entry for all of the special escapes that are actually handled
10310 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10312 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10314 Otherwise does not return NULL.
10318 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10321 regnode *ret = NULL;
10323 char *parse_start = RExC_parse;
10327 GET_RE_DEBUG_FLAGS_DECL;
10329 *flagp = WORST; /* Tentatively. */
10331 DEBUG_PARSE("atom");
10333 PERL_ARGS_ASSERT_REGATOM;
10336 switch ((U8)*RExC_parse) {
10338 RExC_seen_zerolen++;
10339 nextchar(pRExC_state);
10340 if (RExC_flags & RXf_PMf_MULTILINE)
10341 ret = reg_node(pRExC_state, MBOL);
10342 else if (RExC_flags & RXf_PMf_SINGLELINE)
10343 ret = reg_node(pRExC_state, SBOL);
10345 ret = reg_node(pRExC_state, BOL);
10346 Set_Node_Length(ret, 1); /* MJD */
10349 nextchar(pRExC_state);
10351 RExC_seen_zerolen++;
10352 if (RExC_flags & RXf_PMf_MULTILINE)
10353 ret = reg_node(pRExC_state, MEOL);
10354 else if (RExC_flags & RXf_PMf_SINGLELINE)
10355 ret = reg_node(pRExC_state, SEOL);
10357 ret = reg_node(pRExC_state, EOL);
10358 Set_Node_Length(ret, 1); /* MJD */
10361 nextchar(pRExC_state);
10362 if (RExC_flags & RXf_PMf_SINGLELINE)
10363 ret = reg_node(pRExC_state, SANY);
10365 ret = reg_node(pRExC_state, REG_ANY);
10366 *flagp |= HASWIDTH|SIMPLE;
10368 Set_Node_Length(ret, 1); /* MJD */
10372 char * const oregcomp_parse = ++RExC_parse;
10373 ret = regclass(pRExC_state, flagp,depth+1,
10374 FALSE, /* means parse the whole char class */
10375 TRUE, /* allow multi-char folds */
10376 FALSE, /* don't silence non-portable warnings. */
10378 if (*RExC_parse != ']') {
10379 RExC_parse = oregcomp_parse;
10380 vFAIL("Unmatched [");
10383 if (*flagp & RESTART_UTF8)
10385 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10388 nextchar(pRExC_state);
10389 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10393 nextchar(pRExC_state);
10394 ret = reg(pRExC_state, 2, &flags,depth+1);
10396 if (flags & TRYAGAIN) {
10397 if (RExC_parse == RExC_end) {
10398 /* Make parent create an empty node if needed. */
10399 *flagp |= TRYAGAIN;
10404 if (flags & RESTART_UTF8) {
10405 *flagp = RESTART_UTF8;
10408 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10410 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10414 if (flags & TRYAGAIN) {
10415 *flagp |= TRYAGAIN;
10418 vFAIL("Internal urp");
10419 /* Supposed to be caught earlier. */
10422 if (!regcurly(RExC_parse, FALSE)) {
10431 vFAIL("Quantifier follows nothing");
10436 This switch handles escape sequences that resolve to some kind
10437 of special regop and not to literal text. Escape sequnces that
10438 resolve to literal text are handled below in the switch marked
10441 Every entry in this switch *must* have a corresponding entry
10442 in the literal escape switch. However, the opposite is not
10443 required, as the default for this switch is to jump to the
10444 literal text handling code.
10446 switch ((U8)*++RExC_parse) {
10448 /* Special Escapes */
10450 RExC_seen_zerolen++;
10451 ret = reg_node(pRExC_state, SBOL);
10453 goto finish_meta_pat;
10455 ret = reg_node(pRExC_state, GPOS);
10456 RExC_seen |= REG_SEEN_GPOS;
10458 goto finish_meta_pat;
10460 RExC_seen_zerolen++;
10461 ret = reg_node(pRExC_state, KEEPS);
10463 /* XXX:dmq : disabling in-place substitution seems to
10464 * be necessary here to avoid cases of memory corruption, as
10465 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10467 RExC_seen |= REG_SEEN_LOOKBEHIND;
10468 goto finish_meta_pat;
10470 ret = reg_node(pRExC_state, SEOL);
10472 RExC_seen_zerolen++; /* Do not optimize RE away */
10473 goto finish_meta_pat;
10475 ret = reg_node(pRExC_state, EOS);
10477 RExC_seen_zerolen++; /* Do not optimize RE away */
10478 goto finish_meta_pat;
10480 ret = reg_node(pRExC_state, CANY);
10481 RExC_seen |= REG_SEEN_CANY;
10482 *flagp |= HASWIDTH|SIMPLE;
10483 goto finish_meta_pat;
10485 ret = reg_node(pRExC_state, CLUMP);
10486 *flagp |= HASWIDTH;
10487 goto finish_meta_pat;
10493 arg = ANYOF_WORDCHAR;
10497 RExC_seen_zerolen++;
10498 RExC_seen |= REG_SEEN_LOOKBEHIND;
10499 op = BOUND + get_regex_charset(RExC_flags);
10500 if (op > BOUNDA) { /* /aa is same as /a */
10503 ret = reg_node(pRExC_state, op);
10504 FLAGS(ret) = get_regex_charset(RExC_flags);
10506 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10507 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10509 goto finish_meta_pat;
10511 RExC_seen_zerolen++;
10512 RExC_seen |= REG_SEEN_LOOKBEHIND;
10513 op = NBOUND + get_regex_charset(RExC_flags);
10514 if (op > NBOUNDA) { /* /aa is same as /a */
10517 ret = reg_node(pRExC_state, op);
10518 FLAGS(ret) = get_regex_charset(RExC_flags);
10520 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10521 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10523 goto finish_meta_pat;
10533 ret = reg_node(pRExC_state, LNBREAK);
10534 *flagp |= HASWIDTH|SIMPLE;
10535 goto finish_meta_pat;
10543 goto join_posix_op_known;
10549 arg = ANYOF_VERTWS;
10551 goto join_posix_op_known;
10561 op = POSIXD + get_regex_charset(RExC_flags);
10562 if (op > POSIXA) { /* /aa is same as /a */
10566 join_posix_op_known:
10569 op += NPOSIXD - POSIXD;
10572 ret = reg_node(pRExC_state, op);
10574 FLAGS(ret) = namedclass_to_classnum(arg);
10577 *flagp |= HASWIDTH|SIMPLE;
10581 nextchar(pRExC_state);
10582 Set_Node_Length(ret, 2); /* MJD */
10588 char* parse_start = RExC_parse - 2;
10593 ret = regclass(pRExC_state, flagp,depth+1,
10594 TRUE, /* means just parse this element */
10595 FALSE, /* don't allow multi-char folds */
10596 FALSE, /* don't silence non-portable warnings.
10597 It would be a bug if these returned
10600 /* regclass() can only return RESTART_UTF8 if multi-char folds
10603 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10608 Set_Node_Offset(ret, parse_start + 2);
10609 Set_Node_Cur_Length(ret, parse_start);
10610 nextchar(pRExC_state);
10614 /* Handle \N and \N{NAME} with multiple code points here and not
10615 * below because it can be multicharacter. join_exact() will join
10616 * them up later on. Also this makes sure that things like
10617 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10618 * The options to the grok function call causes it to fail if the
10619 * sequence is just a single code point. We then go treat it as
10620 * just another character in the current EXACT node, and hence it
10621 * gets uniform treatment with all the other characters. The
10622 * special treatment for quantifiers is not needed for such single
10623 * character sequences */
10625 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10626 FALSE /* not strict */ )) {
10627 if (*flagp & RESTART_UTF8)
10633 case 'k': /* Handle \k<NAME> and \k'NAME' */
10636 char ch= RExC_parse[1];
10637 if (ch != '<' && ch != '\'' && ch != '{') {
10639 vFAIL2("Sequence %.2s... not terminated",parse_start);
10641 /* this pretty much dupes the code for (?P=...) in reg(), if
10642 you change this make sure you change that */
10643 char* name_start = (RExC_parse += 2);
10645 SV *sv_dat = reg_scan_name(pRExC_state,
10646 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10647 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10648 if (RExC_parse == name_start || *RExC_parse != ch)
10649 vFAIL2("Sequence %.3s... not terminated",parse_start);
10652 num = add_data( pRExC_state, 1, "S" );
10653 RExC_rxi->data->data[num]=(void*)sv_dat;
10654 SvREFCNT_inc_simple_void(sv_dat);
10658 ret = reganode(pRExC_state,
10661 : (ASCII_FOLD_RESTRICTED)
10663 : (AT_LEAST_UNI_SEMANTICS)
10669 *flagp |= HASWIDTH;
10671 /* override incorrect value set in reganode MJD */
10672 Set_Node_Offset(ret, parse_start+1);
10673 Set_Node_Cur_Length(ret, parse_start);
10674 nextchar(pRExC_state);
10680 case '1': case '2': case '3': case '4':
10681 case '5': case '6': case '7': case '8': case '9':
10684 bool isg = *RExC_parse == 'g';
10689 if (*RExC_parse == '{') {
10693 if (*RExC_parse == '-') {
10697 if (hasbrace && !isDIGIT(*RExC_parse)) {
10698 if (isrel) RExC_parse--;
10700 goto parse_named_seq;
10702 num = atoi(RExC_parse);
10703 if (isg && num == 0) {
10704 if (*RExC_parse == '0') {
10705 vFAIL("Reference to invalid group 0");
10708 vFAIL("Unterminated \\g... pattern");
10712 num = RExC_npar - num;
10714 vFAIL("Reference to nonexistent or unclosed group");
10716 if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10717 /* Probably a character specified in octal, e.g. \35 */
10720 #ifdef RE_TRACK_PATTERN_OFFSETS
10721 char * const parse_start = RExC_parse - 1; /* MJD */
10723 while (isDIGIT(*RExC_parse))
10726 if (*RExC_parse != '}')
10727 vFAIL("Unterminated \\g{...} pattern");
10731 if (num > (I32)RExC_rx->nparens)
10732 vFAIL("Reference to nonexistent group");
10735 ret = reganode(pRExC_state,
10738 : (ASCII_FOLD_RESTRICTED)
10740 : (AT_LEAST_UNI_SEMANTICS)
10746 *flagp |= HASWIDTH;
10748 /* override incorrect value set in reganode MJD */
10749 Set_Node_Offset(ret, parse_start+1);
10750 Set_Node_Cur_Length(ret, parse_start);
10752 nextchar(pRExC_state);
10757 if (RExC_parse >= RExC_end)
10758 FAIL("Trailing \\");
10761 /* Do not generate "unrecognized" warnings here, we fall
10762 back into the quick-grab loop below */
10769 if (RExC_flags & RXf_PMf_EXTENDED) {
10770 if ( reg_skipcomment( pRExC_state ) )
10777 parse_start = RExC_parse - 1;
10786 #define MAX_NODE_STRING_SIZE 127
10787 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10789 U8 upper_parse = MAX_NODE_STRING_SIZE;
10791 U8 node_type = compute_EXACTish(pRExC_state);
10792 bool next_is_quantifier;
10793 char * oldp = NULL;
10795 /* We can convert EXACTF nodes to EXACTFU if they contain only
10796 * characters that match identically regardless of the target
10797 * string's UTF8ness. The reason to do this is that EXACTF is not
10798 * trie-able, EXACTFU is. (We don't need to figure this out until
10800 bool maybe_exactfu = node_type == EXACTF && PASS2;
10802 /* If a folding node contains only code points that don't
10803 * participate in folds, it can be changed into an EXACT node,
10804 * which allows the optimizer more things to look for */
10807 ret = reg_node(pRExC_state, node_type);
10809 /* In pass1, folded, we use a temporary buffer instead of the
10810 * actual node, as the node doesn't exist yet */
10811 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10817 /* We do the EXACTFish to EXACT node only if folding, and not if in
10818 * locale, as whether a character folds or not isn't known until
10819 * runtime. (And we don't need to figure this out until pass 2) */
10820 maybe_exact = FOLD && ! LOC && PASS2;
10822 /* XXX The node can hold up to 255 bytes, yet this only goes to
10823 * 127. I (khw) do not know why. Keeping it somewhat less than
10824 * 255 allows us to not have to worry about overflow due to
10825 * converting to utf8 and fold expansion, but that value is
10826 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10827 * split up by this limit into a single one using the real max of
10828 * 255. Even at 127, this breaks under rare circumstances. If
10829 * folding, we do not want to split a node at a character that is a
10830 * non-final in a multi-char fold, as an input string could just
10831 * happen to want to match across the node boundary. The join
10832 * would solve that problem if the join actually happens. But a
10833 * series of more than two nodes in a row each of 127 would cause
10834 * the first join to succeed to get to 254, but then there wouldn't
10835 * be room for the next one, which could at be one of those split
10836 * multi-char folds. I don't know of any fool-proof solution. One
10837 * could back off to end with only a code point that isn't such a
10838 * non-final, but it is possible for there not to be any in the
10840 for (p = RExC_parse - 1;
10841 len < upper_parse && p < RExC_end;
10846 if (RExC_flags & RXf_PMf_EXTENDED)
10847 p = regwhite( pRExC_state, p );
10858 /* Literal Escapes Switch
10860 This switch is meant to handle escape sequences that
10861 resolve to a literal character.
10863 Every escape sequence that represents something
10864 else, like an assertion or a char class, is handled
10865 in the switch marked 'Special Escapes' above in this
10866 routine, but also has an entry here as anything that
10867 isn't explicitly mentioned here will be treated as
10868 an unescaped equivalent literal.
10871 switch ((U8)*++p) {
10872 /* These are all the special escapes. */
10873 case 'A': /* Start assertion */
10874 case 'b': case 'B': /* Word-boundary assertion*/
10875 case 'C': /* Single char !DANGEROUS! */
10876 case 'd': case 'D': /* digit class */
10877 case 'g': case 'G': /* generic-backref, pos assertion */
10878 case 'h': case 'H': /* HORIZWS */
10879 case 'k': case 'K': /* named backref, keep marker */
10880 case 'p': case 'P': /* Unicode property */
10881 case 'R': /* LNBREAK */
10882 case 's': case 'S': /* space class */
10883 case 'v': case 'V': /* VERTWS */
10884 case 'w': case 'W': /* word class */
10885 case 'X': /* eXtended Unicode "combining character sequence" */
10886 case 'z': case 'Z': /* End of line/string assertion */
10890 /* Anything after here is an escape that resolves to a
10891 literal. (Except digits, which may or may not)
10897 case 'N': /* Handle a single-code point named character. */
10898 /* The options cause it to fail if a multiple code
10899 * point sequence. Handle those in the switch() above
10901 RExC_parse = p + 1;
10902 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10903 flagp, depth, FALSE,
10904 FALSE /* not strict */ ))
10906 if (*flagp & RESTART_UTF8)
10907 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10908 RExC_parse = p = oldp;
10912 if (ender > 0xff) {
10929 ender = ASCII_TO_NATIVE('\033');
10939 const char* error_msg;
10941 bool valid = grok_bslash_o(&p,
10944 TRUE, /* out warnings */
10945 FALSE, /* not strict */
10946 TRUE, /* Output warnings
10951 RExC_parse = p; /* going to die anyway; point
10952 to exact spot of failure */
10956 if (PL_encoding && ender < 0x100) {
10957 goto recode_encoding;
10959 if (ender > 0xff) {
10966 UV result = UV_MAX; /* initialize to erroneous
10968 const char* error_msg;
10970 bool valid = grok_bslash_x(&p,
10973 TRUE, /* out warnings */
10974 FALSE, /* not strict */
10975 TRUE, /* Output warnings
10980 RExC_parse = p; /* going to die anyway; point
10981 to exact spot of failure */
10986 if (PL_encoding && ender < 0x100) {
10987 goto recode_encoding;
10989 if (ender > 0xff) {
10996 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10998 case '8': case '9': /* must be a backreference */
11001 case '1': case '2': case '3':case '4':
11002 case '5': case '6': case '7':
11003 /* When we parse backslash escapes there is ambiguity
11004 * between backreferences and octal escapes. Any escape
11005 * from \1 - \9 is a backreference, any multi-digit
11006 * escape which does not start with 0 and which when
11007 * evaluated as decimal could refer to an already
11008 * parsed capture buffer is a backslash. Anything else
11011 * Note this implies that \118 could be interpreted as
11012 * 118 OR as "\11" . "8" depending on whether there
11013 * were 118 capture buffers defined already in the
11015 if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
11016 { /* Not to be treated as an octal constant, go
11023 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11025 ender = grok_oct(p, &numlen, &flags, NULL);
11026 if (ender > 0xff) {
11030 if (SIZE_ONLY /* like \08, \178 */
11033 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11035 reg_warn_non_literal_string(
11037 form_short_octal_warning(p, numlen));
11040 if (PL_encoding && ender < 0x100)
11041 goto recode_encoding;
11044 if (! RExC_override_recoding) {
11045 SV* enc = PL_encoding;
11046 ender = reg_recode((const char)(U8)ender, &enc);
11047 if (!enc && SIZE_ONLY)
11048 ckWARNreg(p, "Invalid escape in the specified encoding");
11054 FAIL("Trailing \\");
11057 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11058 /* Include any { following the alpha to emphasize
11059 * that it could be part of an escape at some point
11061 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11062 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11064 goto normal_default;
11065 } /* End of switch on '\' */
11067 default: /* A literal character */
11070 && RExC_flags & RXf_PMf_EXTENDED
11071 && ckWARN_d(WARN_DEPRECATED)
11072 && is_PATWS_non_low(p, UTF))
11074 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11075 "Escape literal pattern white space under /x");
11079 if (UTF8_IS_START(*p) && UTF) {
11081 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11082 &numlen, UTF8_ALLOW_DEFAULT);
11088 } /* End of switch on the literal */
11090 /* Here, have looked at the literal character and <ender>
11091 * contains its ordinal, <p> points to the character after it
11094 if ( RExC_flags & RXf_PMf_EXTENDED)
11095 p = regwhite( pRExC_state, p );
11097 /* If the next thing is a quantifier, it applies to this
11098 * character only, which means that this character has to be in
11099 * its own node and can't just be appended to the string in an
11100 * existing node, so if there are already other characters in
11101 * the node, close the node with just them, and set up to do
11102 * this character again next time through, when it will be the
11103 * only thing in its new node */
11104 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11112 const STRLEN unilen = reguni(pRExC_state, ender, s);
11118 /* The loop increments <len> each time, as all but this
11119 * path (and one other) through it add a single byte to
11120 * the EXACTish node. But this one has changed len to
11121 * be the correct final value, so subtract one to
11122 * cancel out the increment that follows */
11126 REGC((char)ender, s++);
11129 else /* FOLD */ if (! ( UTF
11130 /* See comments for join_exact() as to why we fold this
11131 * non-UTF at compile time */
11132 || (node_type == EXACTFU
11133 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11135 if (IS_IN_SOME_FOLD_L1(ender)) {
11136 maybe_exact = FALSE;
11138 /* See if the character's fold differs between /d and
11139 * /u. This includes the multi-char fold SHARP S to
11142 && (PL_fold[ender] != PL_fold_latin1[ender]
11143 || ender == LATIN_SMALL_LETTER_SHARP_S
11145 && isARG2_lower_or_UPPER_ARG1('s', ender)
11146 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11148 maybe_exactfu = FALSE;
11151 *(s++) = (char) ender;
11155 /* Prime the casefolded buffer. Locale rules, which apply
11156 * only to code points < 256, aren't known until execution,
11157 * so for them, just output the original character using
11158 * utf8. If we start to fold non-UTF patterns, be sure to
11159 * update join_exact() */
11160 if (LOC && ender < 256) {
11161 if (NATIVE_IS_INVARIANT(ender)) {
11165 *s = UTF8_TWO_BYTE_HI(ender);
11166 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11171 UV folded = _to_uni_fold_flags(
11176 | ((LOC) ? FOLD_FLAGS_LOCALE
11177 : (ASCII_FOLD_RESTRICTED)
11178 ? FOLD_FLAGS_NOMIX_ASCII
11182 /* If this node only contains non-folding code points
11183 * so far, see if this new one is also non-folding */
11185 if (folded != ender) {
11186 maybe_exact = FALSE;
11189 /* Here the fold is the original; we have
11190 * to check further to see if anything
11192 if (! PL_utf8_foldable) {
11193 SV* swash = swash_init("utf8",
11195 &PL_sv_undef, 1, 0);
11197 _get_swash_invlist(swash);
11198 SvREFCNT_dec_NN(swash);
11200 if (_invlist_contains_cp(PL_utf8_foldable,
11203 maybe_exact = FALSE;
11211 /* The loop increments <len> each time, as all but this
11212 * path (and one other) through it add a single byte to the
11213 * EXACTish node. But this one has changed len to be the
11214 * correct final value, so subtract one to cancel out the
11215 * increment that follows */
11216 len += foldlen - 1;
11219 if (next_is_quantifier) {
11221 /* Here, the next input is a quantifier, and to get here,
11222 * the current character is the only one in the node.
11223 * Also, here <len> doesn't include the final byte for this
11229 } /* End of loop through literal characters */
11231 /* Here we have either exhausted the input or ran out of room in
11232 * the node. (If we encountered a character that can't be in the
11233 * node, transfer is made directly to <loopdone>, and so we
11234 * wouldn't have fallen off the end of the loop.) In the latter
11235 * case, we artificially have to split the node into two, because
11236 * we just don't have enough space to hold everything. This
11237 * creates a problem if the final character participates in a
11238 * multi-character fold in the non-final position, as a match that
11239 * should have occurred won't, due to the way nodes are matched,
11240 * and our artificial boundary. So back off until we find a non-
11241 * problematic character -- one that isn't at the beginning or
11242 * middle of such a fold. (Either it doesn't participate in any
11243 * folds, or appears only in the final position of all the folds it
11244 * does participate in.) A better solution with far fewer false
11245 * positives, and that would fill the nodes more completely, would
11246 * be to actually have available all the multi-character folds to
11247 * test against, and to back-off only far enough to be sure that
11248 * this node isn't ending with a partial one. <upper_parse> is set
11249 * further below (if we need to reparse the node) to include just
11250 * up through that final non-problematic character that this code
11251 * identifies, so when it is set to less than the full node, we can
11252 * skip the rest of this */
11253 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11255 const STRLEN full_len = len;
11257 assert(len >= MAX_NODE_STRING_SIZE);
11259 /* Here, <s> points to the final byte of the final character.
11260 * Look backwards through the string until find a non-
11261 * problematic character */
11265 /* These two have no multi-char folds to non-UTF characters
11267 if (ASCII_FOLD_RESTRICTED || LOC) {
11271 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11275 if (! PL_NonL1NonFinalFold) {
11276 PL_NonL1NonFinalFold = _new_invlist_C_array(
11277 NonL1_Perl_Non_Final_Folds_invlist);
11280 /* Point to the first byte of the final character */
11281 s = (char *) utf8_hop((U8 *) s, -1);
11283 while (s >= s0) { /* Search backwards until find
11284 non-problematic char */
11285 if (UTF8_IS_INVARIANT(*s)) {
11287 /* There are no ascii characters that participate
11288 * in multi-char folds under /aa. In EBCDIC, the
11289 * non-ascii invariants are all control characters,
11290 * so don't ever participate in any folds. */
11291 if (ASCII_FOLD_RESTRICTED
11292 || ! IS_NON_FINAL_FOLD(*s))
11297 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11299 /* No Latin1 characters participate in multi-char
11300 * folds under /l */
11302 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11308 else if (! _invlist_contains_cp(
11309 PL_NonL1NonFinalFold,
11310 valid_utf8_to_uvchr((U8 *) s, NULL)))
11315 /* Here, the current character is problematic in that
11316 * it does occur in the non-final position of some
11317 * fold, so try the character before it, but have to
11318 * special case the very first byte in the string, so
11319 * we don't read outside the string */
11320 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11321 } /* End of loop backwards through the string */
11323 /* If there were only problematic characters in the string,
11324 * <s> will point to before s0, in which case the length
11325 * should be 0, otherwise include the length of the
11326 * non-problematic character just found */
11327 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11330 /* Here, have found the final character, if any, that is
11331 * non-problematic as far as ending the node without splitting
11332 * it across a potential multi-char fold. <len> contains the
11333 * number of bytes in the node up-to and including that
11334 * character, or is 0 if there is no such character, meaning
11335 * the whole node contains only problematic characters. In
11336 * this case, give up and just take the node as-is. We can't
11341 /* If the node ends in an 's' we make sure it stays EXACTF,
11342 * as if it turns into an EXACTFU, it could later get
11343 * joined with another 's' that would then wrongly match
11345 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11347 maybe_exactfu = FALSE;
11351 /* Here, the node does contain some characters that aren't
11352 * problematic. If one such is the final character in the
11353 * node, we are done */
11354 if (len == full_len) {
11357 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11359 /* If the final character is problematic, but the
11360 * penultimate is not, back-off that last character to
11361 * later start a new node with it */
11366 /* Here, the final non-problematic character is earlier
11367 * in the input than the penultimate character. What we do
11368 * is reparse from the beginning, going up only as far as
11369 * this final ok one, thus guaranteeing that the node ends
11370 * in an acceptable character. The reason we reparse is
11371 * that we know how far in the character is, but we don't
11372 * know how to correlate its position with the input parse.
11373 * An alternate implementation would be to build that
11374 * correlation as we go along during the original parse,
11375 * but that would entail extra work for every node, whereas
11376 * this code gets executed only when the string is too
11377 * large for the node, and the final two characters are
11378 * problematic, an infrequent occurrence. Yet another
11379 * possible strategy would be to save the tail of the
11380 * string, and the next time regatom is called, initialize
11381 * with that. The problem with this is that unless you
11382 * back off one more character, you won't be guaranteed
11383 * regatom will get called again, unless regbranch,
11384 * regpiece ... are also changed. If you do back off that
11385 * extra character, so that there is input guaranteed to
11386 * force calling regatom, you can't handle the case where
11387 * just the first character in the node is acceptable. I
11388 * (khw) decided to try this method which doesn't have that
11389 * pitfall; if performance issues are found, we can do a
11390 * combination of the current approach plus that one */
11396 } /* End of verifying node ends with an appropriate char */
11398 loopdone: /* Jumped to when encounters something that shouldn't be in
11401 /* I (khw) don't know if you can get here with zero length, but the
11402 * old code handled this situation by creating a zero-length EXACT
11403 * node. Might as well be NOTHING instead */
11409 /* If 'maybe_exact' is still set here, means there are no
11410 * code points in the node that participate in folds;
11411 * similarly for 'maybe_exactfu' and code points that match
11412 * differently depending on UTF8ness of the target string
11417 else if (maybe_exactfu) {
11421 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11424 RExC_parse = p - 1;
11425 Set_Node_Cur_Length(ret, parse_start);
11426 nextchar(pRExC_state);
11428 /* len is STRLEN which is unsigned, need to copy to signed */
11431 vFAIL("Internal disaster");
11434 } /* End of label 'defchar:' */
11436 } /* End of giant switch on input character */
11442 S_regwhite( RExC_state_t *pRExC_state, char *p )
11444 const char *e = RExC_end;
11446 PERL_ARGS_ASSERT_REGWHITE;
11451 else if (*p == '#') {
11454 if (*p++ == '\n') {
11460 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11469 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11471 /* Returns the next non-pattern-white space, non-comment character (the
11472 * latter only if 'recognize_comment is true) in the string p, which is
11473 * ended by RExC_end. If there is no line break ending a comment,
11474 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11475 const char *e = RExC_end;
11477 PERL_ARGS_ASSERT_REGPATWS;
11481 if ((len = is_PATWS_safe(p, e, UTF))) {
11484 else if (recognize_comment && *p == '#') {
11488 if (is_LNBREAK_safe(p, e, UTF)) {
11494 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11502 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11503 Character classes ([:foo:]) can also be negated ([:^foo:]).
11504 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11505 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11506 but trigger failures because they are currently unimplemented. */
11508 #define POSIXCC_DONE(c) ((c) == ':')
11509 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11510 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11512 PERL_STATIC_INLINE I32
11513 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11516 I32 namedclass = OOB_NAMEDCLASS;
11518 PERL_ARGS_ASSERT_REGPPOSIXCC;
11520 if (value == '[' && RExC_parse + 1 < RExC_end &&
11521 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11522 POSIXCC(UCHARAT(RExC_parse)))
11524 const char c = UCHARAT(RExC_parse);
11525 char* const s = RExC_parse++;
11527 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11529 if (RExC_parse == RExC_end) {
11532 /* Try to give a better location for the error (than the end of
11533 * the string) by looking for the matching ']' */
11535 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11538 vFAIL2("Unmatched '%c' in POSIX class", c);
11540 /* Grandfather lone [:, [=, [. */
11544 const char* const t = RExC_parse++; /* skip over the c */
11547 if (UCHARAT(RExC_parse) == ']') {
11548 const char *posixcc = s + 1;
11549 RExC_parse++; /* skip over the ending ] */
11552 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11553 const I32 skip = t - posixcc;
11555 /* Initially switch on the length of the name. */
11558 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11559 this is the Perl \w
11561 namedclass = ANYOF_WORDCHAR;
11564 /* Names all of length 5. */
11565 /* alnum alpha ascii blank cntrl digit graph lower
11566 print punct space upper */
11567 /* Offset 4 gives the best switch position. */
11568 switch (posixcc[4]) {
11570 if (memEQ(posixcc, "alph", 4)) /* alpha */
11571 namedclass = ANYOF_ALPHA;
11574 if (memEQ(posixcc, "spac", 4)) /* space */
11575 namedclass = ANYOF_PSXSPC;
11578 if (memEQ(posixcc, "grap", 4)) /* graph */
11579 namedclass = ANYOF_GRAPH;
11582 if (memEQ(posixcc, "asci", 4)) /* ascii */
11583 namedclass = ANYOF_ASCII;
11586 if (memEQ(posixcc, "blan", 4)) /* blank */
11587 namedclass = ANYOF_BLANK;
11590 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11591 namedclass = ANYOF_CNTRL;
11594 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11595 namedclass = ANYOF_ALPHANUMERIC;
11598 if (memEQ(posixcc, "lowe", 4)) /* lower */
11599 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11600 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11601 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11604 if (memEQ(posixcc, "digi", 4)) /* digit */
11605 namedclass = ANYOF_DIGIT;
11606 else if (memEQ(posixcc, "prin", 4)) /* print */
11607 namedclass = ANYOF_PRINT;
11608 else if (memEQ(posixcc, "punc", 4)) /* punct */
11609 namedclass = ANYOF_PUNCT;
11614 if (memEQ(posixcc, "xdigit", 6))
11615 namedclass = ANYOF_XDIGIT;
11619 if (namedclass == OOB_NAMEDCLASS)
11620 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11623 /* The #defines are structured so each complement is +1 to
11624 * the normal one */
11628 assert (posixcc[skip] == ':');
11629 assert (posixcc[skip+1] == ']');
11630 } else if (!SIZE_ONLY) {
11631 /* [[=foo=]] and [[.foo.]] are still future. */
11633 /* adjust RExC_parse so the warning shows after
11634 the class closes */
11635 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11637 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11640 /* Maternal grandfather:
11641 * "[:" ending in ":" but not in ":]" */
11643 vFAIL("Unmatched '[' in POSIX class");
11646 /* Grandfather lone [:, [=, [. */
11656 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11658 /* This applies some heuristics at the current parse position (which should
11659 * be at a '[') to see if what follows might be intended to be a [:posix:]
11660 * class. It returns true if it really is a posix class, of course, but it
11661 * also can return true if it thinks that what was intended was a posix
11662 * class that didn't quite make it.
11664 * It will return true for
11666 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11667 * ')' indicating the end of the (?[
11668 * [:any garbage including %^&$ punctuation:]
11670 * This is designed to be called only from S_handle_regex_sets; it could be
11671 * easily adapted to be called from the spot at the beginning of regclass()
11672 * that checks to see in a normal bracketed class if the surrounding []
11673 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11674 * change long-standing behavior, so I (khw) didn't do that */
11675 char* p = RExC_parse + 1;
11676 char first_char = *p;
11678 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11680 assert(*(p - 1) == '[');
11682 if (! POSIXCC(first_char)) {
11687 while (p < RExC_end && isWORDCHAR(*p)) p++;
11689 if (p >= RExC_end) {
11693 if (p - RExC_parse > 2 /* Got at least 1 word character */
11694 && (*p == first_char
11695 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11700 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11703 && p - RExC_parse > 2 /* [:] evaluates to colon;
11704 [::] is a bad posix class. */
11705 && first_char == *(p - 1));
11709 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11710 char * const oregcomp_parse)
11712 /* Handle the (?[...]) construct to do set operations */
11715 UV start, end; /* End points of code point ranges */
11717 char *save_end, *save_parse;
11722 const bool save_fold = FOLD;
11724 GET_RE_DEBUG_FLAGS_DECL;
11726 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11729 vFAIL("(?[...]) not valid in locale");
11731 RExC_uni_semantics = 1;
11733 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11734 * (such as EXACT). Thus we can skip most everything if just sizing. We
11735 * call regclass to handle '[]' so as to not have to reinvent its parsing
11736 * rules here (throwing away the size it computes each time). And, we exit
11737 * upon an unescaped ']' that isn't one ending a regclass. To do both
11738 * these things, we need to realize that something preceded by a backslash
11739 * is escaped, so we have to keep track of backslashes */
11741 UV depth = 0; /* how many nested (?[...]) constructs */
11743 Perl_ck_warner_d(aTHX_
11744 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11745 "The regex_sets feature is experimental" REPORT_LOCATION,
11746 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11748 while (RExC_parse < RExC_end) {
11749 SV* current = NULL;
11750 RExC_parse = regpatws(pRExC_state, RExC_parse,
11751 TRUE); /* means recognize comments */
11752 switch (*RExC_parse) {
11754 if (RExC_parse[1] == '[') depth++, RExC_parse++;
11759 /* Skip the next byte (which could cause us to end up in
11760 * the middle of a UTF-8 character, but since none of those
11761 * are confusable with anything we currently handle in this
11762 * switch (invariants all), it's safe. We'll just hit the
11763 * default: case next time and keep on incrementing until
11764 * we find one of the invariants we do handle. */
11769 /* If this looks like it is a [:posix:] class, leave the
11770 * parse pointer at the '[' to fool regclass() into
11771 * thinking it is part of a '[[:posix:]]'. That function
11772 * will use strict checking to force a syntax error if it
11773 * doesn't work out to a legitimate class */
11774 bool is_posix_class
11775 = could_it_be_a_POSIX_class(pRExC_state);
11776 if (! is_posix_class) {
11780 /* regclass() can only return RESTART_UTF8 if multi-char
11781 folds are allowed. */
11782 if (!regclass(pRExC_state, flagp,depth+1,
11783 is_posix_class, /* parse the whole char
11784 class only if not a
11786 FALSE, /* don't allow multi-char folds */
11787 TRUE, /* silence non-portable warnings. */
11789 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11792 /* function call leaves parse pointing to the ']', except
11793 * if we faked it */
11794 if (is_posix_class) {
11798 SvREFCNT_dec(current); /* In case it returned something */
11803 if (depth--) break;
11805 if (RExC_parse < RExC_end
11806 && *RExC_parse == ')')
11808 node = reganode(pRExC_state, ANYOF, 0);
11809 RExC_size += ANYOF_SKIP;
11810 nextchar(pRExC_state);
11811 Set_Node_Length(node,
11812 RExC_parse - oregcomp_parse + 1); /* MJD */
11821 FAIL("Syntax error in (?[...])");
11824 /* Pass 2 only after this. Everything in this construct is a
11825 * metacharacter. Operands begin with either a '\' (for an escape
11826 * sequence), or a '[' for a bracketed character class. Any other
11827 * character should be an operator, or parenthesis for grouping. Both
11828 * types of operands are handled by calling regclass() to parse them. It
11829 * is called with a parameter to indicate to return the computed inversion
11830 * list. The parsing here is implemented via a stack. Each entry on the
11831 * stack is a single character representing one of the operators, or the
11832 * '('; or else a pointer to an operand inversion list. */
11834 #define IS_OPERAND(a) (! SvIOK(a))
11836 /* The stack starts empty. It is a syntax error if the first thing parsed
11837 * is a binary operator; everything else is pushed on the stack. When an
11838 * operand is parsed, the top of the stack is examined. If it is a binary
11839 * operator, the item before it should be an operand, and both are replaced
11840 * by the result of doing that operation on the new operand and the one on
11841 * the stack. Thus a sequence of binary operands is reduced to a single
11842 * one before the next one is parsed.
11844 * A unary operator may immediately follow a binary in the input, for
11847 * When an operand is parsed and the top of the stack is a unary operator,
11848 * the operation is performed, and then the stack is rechecked to see if
11849 * this new operand is part of a binary operation; if so, it is handled as
11852 * A '(' is simply pushed on the stack; it is valid only if the stack is
11853 * empty, or the top element of the stack is an operator or another '('
11854 * (for which the parenthesized expression will become an operand). By the
11855 * time the corresponding ')' is parsed everything in between should have
11856 * been parsed and evaluated to a single operand (or else is a syntax
11857 * error), and is handled as a regular operand */
11859 sv_2mortal((SV *)(stack = newAV()));
11861 while (RExC_parse < RExC_end) {
11862 I32 top_index = av_tindex(stack);
11864 SV* current = NULL;
11866 /* Skip white space */
11867 RExC_parse = regpatws(pRExC_state, RExC_parse,
11868 TRUE); /* means recognize comments */
11869 if (RExC_parse >= RExC_end) {
11870 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11872 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11879 if (av_tindex(stack) >= 0 /* This makes sure that we can
11880 safely subtract 1 from
11881 RExC_parse in the next clause.
11882 If we have something on the
11883 stack, we have parsed something
11885 && UCHARAT(RExC_parse - 1) == '('
11886 && RExC_parse < RExC_end)
11888 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11889 * This happens when we have some thing like
11891 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11893 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11895 * Here we would be handling the interpolated
11896 * '$thai_or_lao'. We handle this by a recursive call to
11897 * ourselves which returns the inversion list the
11898 * interpolated expression evaluates to. We use the flags
11899 * from the interpolated pattern. */
11900 U32 save_flags = RExC_flags;
11901 const char * const save_parse = ++RExC_parse;
11903 parse_lparen_question_flags(pRExC_state);
11905 if (RExC_parse == save_parse /* Makes sure there was at
11906 least one flag (or this
11907 embedding wasn't compiled)
11909 || RExC_parse >= RExC_end - 4
11910 || UCHARAT(RExC_parse) != ':'
11911 || UCHARAT(++RExC_parse) != '('
11912 || UCHARAT(++RExC_parse) != '?'
11913 || UCHARAT(++RExC_parse) != '[')
11916 /* In combination with the above, this moves the
11917 * pointer to the point just after the first erroneous
11918 * character (or if there are no flags, to where they
11919 * should have been) */
11920 if (RExC_parse >= RExC_end - 4) {
11921 RExC_parse = RExC_end;
11923 else if (RExC_parse != save_parse) {
11924 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11926 vFAIL("Expecting '(?flags:(?[...'");
11929 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11930 depth+1, oregcomp_parse);
11932 /* Here, 'current' contains the embedded expression's
11933 * inversion list, and RExC_parse points to the trailing
11934 * ']'; the next character should be the ')' which will be
11935 * paired with the '(' that has been put on the stack, so
11936 * the whole embedded expression reduces to '(operand)' */
11939 RExC_flags = save_flags;
11940 goto handle_operand;
11945 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11946 vFAIL("Unexpected character");
11949 /* regclass() can only return RESTART_UTF8 if multi-char
11950 folds are allowed. */
11951 if (!regclass(pRExC_state, flagp,depth+1,
11952 TRUE, /* means parse just the next thing */
11953 FALSE, /* don't allow multi-char folds */
11954 FALSE, /* don't silence non-portable warnings. */
11956 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11958 /* regclass() will return with parsing just the \ sequence,
11959 * leaving the parse pointer at the next thing to parse */
11961 goto handle_operand;
11963 case '[': /* Is a bracketed character class */
11965 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11967 if (! is_posix_class) {
11971 /* regclass() can only return RESTART_UTF8 if multi-char
11972 folds are allowed. */
11973 if(!regclass(pRExC_state, flagp,depth+1,
11974 is_posix_class, /* parse the whole char class
11975 only if not a posix class */
11976 FALSE, /* don't allow multi-char folds */
11977 FALSE, /* don't silence non-portable warnings. */
11979 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11981 /* function call leaves parse pointing to the ']', except if we
11983 if (is_posix_class) {
11987 goto handle_operand;
11996 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11997 || ! IS_OPERAND(*top_ptr))
12000 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12002 av_push(stack, newSVuv(curchar));
12006 av_push(stack, newSVuv(curchar));
12010 if (top_index >= 0) {
12011 top_ptr = av_fetch(stack, top_index, FALSE);
12013 if (IS_OPERAND(*top_ptr)) {
12015 vFAIL("Unexpected '(' with no preceding operator");
12018 av_push(stack, newSVuv(curchar));
12025 || ! (current = av_pop(stack))
12026 || ! IS_OPERAND(current)
12027 || ! (lparen = av_pop(stack))
12028 || IS_OPERAND(lparen)
12029 || SvUV(lparen) != '(')
12031 SvREFCNT_dec(current);
12033 vFAIL("Unexpected ')'");
12036 SvREFCNT_dec_NN(lparen);
12043 /* Here, we have an operand to process, in 'current' */
12045 if (top_index < 0) { /* Just push if stack is empty */
12046 av_push(stack, current);
12049 SV* top = av_pop(stack);
12051 char current_operator;
12053 if (IS_OPERAND(top)) {
12054 SvREFCNT_dec_NN(top);
12055 SvREFCNT_dec_NN(current);
12056 vFAIL("Operand with no preceding operator");
12058 current_operator = (char) SvUV(top);
12059 switch (current_operator) {
12060 case '(': /* Push the '(' back on followed by the new
12062 av_push(stack, top);
12063 av_push(stack, current);
12064 SvREFCNT_inc(top); /* Counters the '_dec' done
12065 just after the 'break', so
12066 it doesn't get wrongly freed
12071 _invlist_invert(current);
12073 /* Unlike binary operators, the top of the stack,
12074 * now that this unary one has been popped off, may
12075 * legally be an operator, and we now have operand
12078 SvREFCNT_dec_NN(top);
12079 goto handle_operand;
12082 prev = av_pop(stack);
12083 _invlist_intersection(prev,
12086 av_push(stack, current);
12091 prev = av_pop(stack);
12092 _invlist_union(prev, current, ¤t);
12093 av_push(stack, current);
12097 prev = av_pop(stack);;
12098 _invlist_subtract(prev, current, ¤t);
12099 av_push(stack, current);
12102 case '^': /* The union minus the intersection */
12108 prev = av_pop(stack);
12109 _invlist_union(prev, current, &u);
12110 _invlist_intersection(prev, current, &i);
12111 /* _invlist_subtract will overwrite current
12112 without freeing what it already contains */
12114 _invlist_subtract(u, i, ¤t);
12115 av_push(stack, current);
12116 SvREFCNT_dec_NN(i);
12117 SvREFCNT_dec_NN(u);
12118 SvREFCNT_dec_NN(element);
12123 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12125 SvREFCNT_dec_NN(top);
12126 SvREFCNT_dec(prev);
12130 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12133 if (av_tindex(stack) < 0 /* Was empty */
12134 || ((final = av_pop(stack)) == NULL)
12135 || ! IS_OPERAND(final)
12136 || av_tindex(stack) >= 0) /* More left on stack */
12138 vFAIL("Incomplete expression within '(?[ ])'");
12141 /* Here, 'final' is the resultant inversion list from evaluating the
12142 * expression. Return it if so requested */
12143 if (return_invlist) {
12144 *return_invlist = final;
12148 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12149 * expecting a string of ranges and individual code points */
12150 invlist_iterinit(final);
12151 result_string = newSVpvs("");
12152 while (invlist_iternext(final, &start, &end)) {
12153 if (start == end) {
12154 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12157 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12162 save_parse = RExC_parse;
12163 RExC_parse = SvPV(result_string, len);
12164 save_end = RExC_end;
12165 RExC_end = RExC_parse + len;
12167 /* We turn off folding around the call, as the class we have constructed
12168 * already has all folding taken into consideration, and we don't want
12169 * regclass() to add to that */
12170 RExC_flags &= ~RXf_PMf_FOLD;
12171 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12173 node = regclass(pRExC_state, flagp,depth+1,
12174 FALSE, /* means parse the whole char class */
12175 FALSE, /* don't allow multi-char folds */
12176 TRUE, /* silence non-portable warnings. The above may very
12177 well have generated non-portable code points, but
12178 they're valid on this machine */
12181 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12184 RExC_flags |= RXf_PMf_FOLD;
12186 RExC_parse = save_parse + 1;
12187 RExC_end = save_end;
12188 SvREFCNT_dec_NN(final);
12189 SvREFCNT_dec_NN(result_string);
12191 nextchar(pRExC_state);
12192 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12197 /* The names of properties whose definitions are not known at compile time are
12198 * stored in this SV, after a constant heading. So if the length has been
12199 * changed since initialization, then there is a run-time definition. */
12200 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12203 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12204 const bool stop_at_1, /* Just parse the next thing, don't
12205 look for a full character class */
12206 bool allow_multi_folds,
12207 const bool silence_non_portable, /* Don't output warnings
12210 SV** ret_invlist) /* Return an inversion list, not a node */
12212 /* parse a bracketed class specification. Most of these will produce an
12213 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12214 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12215 * under /i with multi-character folds: it will be rewritten following the
12216 * paradigm of this example, where the <multi-fold>s are characters which
12217 * fold to multiple character sequences:
12218 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12219 * gets effectively rewritten as:
12220 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12221 * reg() gets called (recursively) on the rewritten version, and this
12222 * function will return what it constructs. (Actually the <multi-fold>s
12223 * aren't physically removed from the [abcdefghi], it's just that they are
12224 * ignored in the recursion by means of a flag:
12225 * <RExC_in_multi_char_class>.)
12227 * ANYOF nodes contain a bit map for the first 256 characters, with the
12228 * corresponding bit set if that character is in the list. For characters
12229 * above 255, a range list or swash is used. There are extra bits for \w,
12230 * etc. in locale ANYOFs, as what these match is not determinable at
12233 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12234 * to be restarted. This can only happen if ret_invlist is non-NULL.
12238 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12240 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12243 IV namedclass = OOB_NAMEDCLASS;
12244 char *rangebegin = NULL;
12245 bool need_class = 0;
12247 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12248 than just initialized. */
12249 SV* properties = NULL; /* Code points that match \p{} \P{} */
12250 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12251 extended beyond the Latin1 range */
12252 UV element_count = 0; /* Number of distinct elements in the class.
12253 Optimizations may be possible if this is tiny */
12254 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12255 character; used under /i */
12257 char * stop_ptr = RExC_end; /* where to stop parsing */
12258 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12260 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12262 /* Unicode properties are stored in a swash; this holds the current one
12263 * being parsed. If this swash is the only above-latin1 component of the
12264 * character class, an optimization is to pass it directly on to the
12265 * execution engine. Otherwise, it is set to NULL to indicate that there
12266 * are other things in the class that have to be dealt with at execution
12268 SV* swash = NULL; /* Code points that match \p{} \P{} */
12270 /* Set if a component of this character class is user-defined; just passed
12271 * on to the engine */
12272 bool has_user_defined_property = FALSE;
12274 /* inversion list of code points this node matches only when the target
12275 * string is in UTF-8. (Because is under /d) */
12276 SV* depends_list = NULL;
12278 /* inversion list of code points this node matches. For much of the
12279 * function, it includes only those that match regardless of the utf8ness
12280 * of the target string */
12281 SV* cp_list = NULL;
12284 /* In a range, counts how many 0-2 of the ends of it came from literals,
12285 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12286 UV literal_endpoint = 0;
12288 bool invert = FALSE; /* Is this class to be complemented */
12290 /* Is there any thing like \W or [:^digit:] that matches above the legal
12291 * Unicode range? */
12292 bool runtime_posix_matches_above_Unicode = FALSE;
12294 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12295 case we need to change the emitted regop to an EXACT. */
12296 const char * orig_parse = RExC_parse;
12297 const SSize_t orig_size = RExC_size;
12298 GET_RE_DEBUG_FLAGS_DECL;
12300 PERL_ARGS_ASSERT_REGCLASS;
12302 PERL_UNUSED_ARG(depth);
12305 DEBUG_PARSE("clas");
12307 /* Assume we are going to generate an ANYOF node. */
12308 ret = reganode(pRExC_state, ANYOF, 0);
12311 RExC_size += ANYOF_SKIP;
12312 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12315 ANYOF_FLAGS(ret) = 0;
12317 RExC_emit += ANYOF_SKIP;
12319 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12321 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12322 initial_listsv_len = SvCUR(listsv);
12323 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12327 RExC_parse = regpatws(pRExC_state, RExC_parse,
12328 FALSE /* means don't recognize comments */);
12331 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12334 allow_multi_folds = FALSE;
12337 RExC_parse = regpatws(pRExC_state, RExC_parse,
12338 FALSE /* means don't recognize comments */);
12342 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12343 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12344 const char *s = RExC_parse;
12345 const char c = *s++;
12347 while (isWORDCHAR(*s))
12349 if (*s && c == *s && s[1] == ']') {
12350 SAVEFREESV(RExC_rx_sv);
12352 "POSIX syntax [%c %c] belongs inside character classes",
12354 (void)ReREFCNT_inc(RExC_rx_sv);
12358 /* If the caller wants us to just parse a single element, accomplish this
12359 * by faking the loop ending condition */
12360 if (stop_at_1 && RExC_end > RExC_parse) {
12361 stop_ptr = RExC_parse + 1;
12364 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12365 if (UCHARAT(RExC_parse) == ']')
12366 goto charclassloop;
12370 if (RExC_parse >= stop_ptr) {
12375 RExC_parse = regpatws(pRExC_state, RExC_parse,
12376 FALSE /* means don't recognize comments */);
12379 if (UCHARAT(RExC_parse) == ']') {
12385 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12386 save_value = value;
12387 save_prevvalue = prevvalue;
12390 rangebegin = RExC_parse;
12394 value = utf8n_to_uvchr((U8*)RExC_parse,
12395 RExC_end - RExC_parse,
12396 &numlen, UTF8_ALLOW_DEFAULT);
12397 RExC_parse += numlen;
12400 value = UCHARAT(RExC_parse++);
12403 && RExC_parse < RExC_end
12404 && POSIXCC(UCHARAT(RExC_parse)))
12406 namedclass = regpposixcc(pRExC_state, value, strict);
12408 else if (value == '\\') {
12410 value = utf8n_to_uvchr((U8*)RExC_parse,
12411 RExC_end - RExC_parse,
12412 &numlen, UTF8_ALLOW_DEFAULT);
12413 RExC_parse += numlen;
12416 value = UCHARAT(RExC_parse++);
12418 /* Some compilers cannot handle switching on 64-bit integer
12419 * values, therefore value cannot be an UV. Yes, this will
12420 * be a problem later if we want switch on Unicode.
12421 * A similar issue a little bit later when switching on
12422 * namedclass. --jhi */
12424 /* If the \ is escaping white space when white space is being
12425 * skipped, it means that that white space is wanted literally, and
12426 * is already in 'value'. Otherwise, need to translate the escape
12427 * into what it signifies. */
12428 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12430 case 'w': namedclass = ANYOF_WORDCHAR; break;
12431 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12432 case 's': namedclass = ANYOF_SPACE; break;
12433 case 'S': namedclass = ANYOF_NSPACE; break;
12434 case 'd': namedclass = ANYOF_DIGIT; break;
12435 case 'D': namedclass = ANYOF_NDIGIT; break;
12436 case 'v': namedclass = ANYOF_VERTWS; break;
12437 case 'V': namedclass = ANYOF_NVERTWS; break;
12438 case 'h': namedclass = ANYOF_HORIZWS; break;
12439 case 'H': namedclass = ANYOF_NHORIZWS; break;
12440 case 'N': /* Handle \N{NAME} in class */
12442 /* We only pay attention to the first char of
12443 multichar strings being returned. I kinda wonder
12444 if this makes sense as it does change the behaviour
12445 from earlier versions, OTOH that behaviour was broken
12447 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12448 TRUE, /* => charclass */
12451 if (*flagp & RESTART_UTF8)
12452 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12462 /* We will handle any undefined properties ourselves */
12463 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12465 if (RExC_parse >= RExC_end)
12466 vFAIL2("Empty \\%c{}", (U8)value);
12467 if (*RExC_parse == '{') {
12468 const U8 c = (U8)value;
12469 e = strchr(RExC_parse++, '}');
12471 vFAIL2("Missing right brace on \\%c{}", c);
12472 while (isSPACE(UCHARAT(RExC_parse)))
12474 if (e == RExC_parse)
12475 vFAIL2("Empty \\%c{}", c);
12476 n = e - RExC_parse;
12477 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12488 if (UCHARAT(RExC_parse) == '^') {
12491 /* toggle. (The rhs xor gets the single bit that
12492 * differs between P and p; the other xor inverts just
12494 value ^= 'P' ^ 'p';
12496 while (isSPACE(UCHARAT(RExC_parse))) {
12501 /* Try to get the definition of the property into
12502 * <invlist>. If /i is in effect, the effective property
12503 * will have its name be <__NAME_i>. The design is
12504 * discussed in commit
12505 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12506 Newx(name, n + sizeof("_i__\n"), char);
12508 sprintf(name, "%s%.*s%s\n",
12509 (FOLD) ? "__" : "",
12515 /* Look up the property name, and get its swash and
12516 * inversion list, if the property is found */
12518 SvREFCNT_dec_NN(swash);
12520 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12523 NULL, /* No inversion list */
12526 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12528 SvREFCNT_dec_NN(swash);
12532 /* Here didn't find it. It could be a user-defined
12533 * property that will be available at run-time. If we
12534 * accept only compile-time properties, is an error;
12535 * otherwise add it to the list for run-time look up */
12537 RExC_parse = e + 1;
12538 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12540 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12541 (value == 'p' ? '+' : '!'),
12543 has_user_defined_property = TRUE;
12545 /* We don't know yet, so have to assume that the
12546 * property could match something in the Latin1 range,
12547 * hence something that isn't utf8. Note that this
12548 * would cause things in <depends_list> to match
12549 * inappropriately, except that any \p{}, including
12550 * this one forces Unicode semantics, which means there
12551 * is <no depends_list> */
12552 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12556 /* Here, did get the swash and its inversion list. If
12557 * the swash is from a user-defined property, then this
12558 * whole character class should be regarded as such */
12559 has_user_defined_property =
12561 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12563 /* Invert if asking for the complement */
12564 if (value == 'P') {
12565 _invlist_union_complement_2nd(properties,
12569 /* The swash can't be used as-is, because we've
12570 * inverted things; delay removing it to here after
12571 * have copied its invlist above */
12572 SvREFCNT_dec_NN(swash);
12576 _invlist_union(properties, invlist, &properties);
12581 RExC_parse = e + 1;
12582 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12585 /* \p means they want Unicode semantics */
12586 RExC_uni_semantics = 1;
12589 case 'n': value = '\n'; break;
12590 case 'r': value = '\r'; break;
12591 case 't': value = '\t'; break;
12592 case 'f': value = '\f'; break;
12593 case 'b': value = '\b'; break;
12594 case 'e': value = ASCII_TO_NATIVE('\033');break;
12595 case 'a': value = '\a'; break;
12597 RExC_parse--; /* function expects to be pointed at the 'o' */
12599 const char* error_msg;
12600 bool valid = grok_bslash_o(&RExC_parse,
12603 SIZE_ONLY, /* warnings in pass
12606 silence_non_portable,
12612 if (PL_encoding && value < 0x100) {
12613 goto recode_encoding;
12617 RExC_parse--; /* function expects to be pointed at the 'x' */
12619 const char* error_msg;
12620 bool valid = grok_bslash_x(&RExC_parse,
12623 TRUE, /* Output warnings */
12625 silence_non_portable,
12631 if (PL_encoding && value < 0x100)
12632 goto recode_encoding;
12635 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12637 case '0': case '1': case '2': case '3': case '4':
12638 case '5': case '6': case '7':
12640 /* Take 1-3 octal digits */
12641 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12642 numlen = (strict) ? 4 : 3;
12643 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12644 RExC_parse += numlen;
12647 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12648 vFAIL("Need exactly 3 octal digits");
12650 else if (! SIZE_ONLY /* like \08, \178 */
12652 && RExC_parse < RExC_end
12653 && isDIGIT(*RExC_parse)
12654 && ckWARN(WARN_REGEXP))
12656 SAVEFREESV(RExC_rx_sv);
12657 reg_warn_non_literal_string(
12659 form_short_octal_warning(RExC_parse, numlen));
12660 (void)ReREFCNT_inc(RExC_rx_sv);
12663 if (PL_encoding && value < 0x100)
12664 goto recode_encoding;
12668 if (! RExC_override_recoding) {
12669 SV* enc = PL_encoding;
12670 value = reg_recode((const char)(U8)value, &enc);
12673 vFAIL("Invalid escape in the specified encoding");
12675 else if (SIZE_ONLY) {
12676 ckWARNreg(RExC_parse,
12677 "Invalid escape in the specified encoding");
12683 /* Allow \_ to not give an error */
12684 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12686 vFAIL2("Unrecognized escape \\%c in character class",
12690 SAVEFREESV(RExC_rx_sv);
12691 ckWARN2reg(RExC_parse,
12692 "Unrecognized escape \\%c in character class passed through",
12694 (void)ReREFCNT_inc(RExC_rx_sv);
12698 } /* End of switch on char following backslash */
12699 } /* end of handling backslash escape sequences */
12702 literal_endpoint++;
12705 /* Here, we have the current token in 'value' */
12707 /* What matches in a locale is not known until runtime. This includes
12708 * what the Posix classes (like \w, [:space:]) match. Room must be
12709 * reserved (one time per class) to store such classes, either if Perl
12710 * is compiled so that locale nodes always should have this space, or
12711 * if there is such class info to be stored. The space will contain a
12712 * bit for each named class that is to be matched against. This isn't
12713 * needed for \p{} and pseudo-classes, as they are not affected by
12714 * locale, and hence are dealt with separately */
12717 && (ANYOF_LOCALE == ANYOF_CLASS
12718 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12722 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12725 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12726 ANYOF_CLASS_ZERO(ret);
12728 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12731 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12733 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12734 * literal, as is the character that began the false range, i.e.
12735 * the 'a' in the examples */
12738 const int w = (RExC_parse >= rangebegin)
12739 ? RExC_parse - rangebegin
12742 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12745 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12746 ckWARN4reg(RExC_parse,
12747 "False [] range \"%*.*s\"",
12749 (void)ReREFCNT_inc(RExC_rx_sv);
12750 cp_list = add_cp_to_invlist(cp_list, '-');
12751 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12755 range = 0; /* this was not a true range */
12756 element_count += 2; /* So counts for three values */
12760 U8 classnum = namedclass_to_classnum(namedclass);
12761 if (namedclass >= ANYOF_MAX) { /* If a special class */
12762 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12764 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12765 * /l make a difference in what these match. There
12766 * would be problems if these characters had folds
12767 * other than themselves, as cp_list is subject to
12769 if (classnum != _CC_VERTSPACE) {
12770 assert( namedclass == ANYOF_HORIZWS
12771 || namedclass == ANYOF_NHORIZWS);
12773 /* It turns out that \h is just a synonym for
12775 classnum = _CC_BLANK;
12778 _invlist_union_maybe_complement_2nd(
12780 PL_XPosix_ptrs[classnum],
12781 cBOOL(namedclass % 2), /* Complement if odd
12782 (NHORIZWS, NVERTWS)
12787 else if (classnum == _CC_ASCII) {
12790 ANYOF_CLASS_SET(ret, namedclass);
12793 #endif /* Not isascii(); just use the hard-coded definition for it */
12794 _invlist_union_maybe_complement_2nd(
12797 cBOOL(namedclass % 2), /* Complement if odd
12801 else { /* Garden variety class */
12803 /* The ascii range inversion list */
12804 SV* ascii_source = PL_Posix_ptrs[classnum];
12806 /* The full Latin1 range inversion list */
12807 SV* l1_source = PL_L1Posix_ptrs[classnum];
12809 /* This code is structured into two major clauses. The
12810 * first is for classes whose complete definitions may not
12811 * already be known. It not, the Latin1 definition
12812 * (guaranteed to already known) is used plus code is
12813 * generated to load the rest at run-time (only if needed).
12814 * If the complete definition is known, it drops down to
12815 * the second clause, where the complete definition is
12818 if (classnum < _FIRST_NON_SWASH_CC) {
12820 /* Here, the class has a swash, which may or not
12821 * already be loaded */
12823 /* The name of the property to use to match the full
12824 * eXtended Unicode range swash for this character
12826 const char *Xname = swash_property_names[classnum];
12828 /* If returning the inversion list, we can't defer
12829 * getting this until runtime */
12830 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12831 PL_utf8_swash_ptrs[classnum] =
12832 _core_swash_init("utf8", Xname, &PL_sv_undef,
12835 NULL, /* No inversion list */
12836 NULL /* No flags */
12838 assert(PL_utf8_swash_ptrs[classnum]);
12840 if ( ! PL_utf8_swash_ptrs[classnum]) {
12841 if (namedclass % 2 == 0) { /* A non-complemented
12843 /* If not /a matching, there are code points we
12844 * don't know at compile time. Arrange for the
12845 * unknown matches to be loaded at run-time, if
12847 if (! AT_LEAST_ASCII_RESTRICTED) {
12848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12851 if (LOC) { /* Under locale, set run-time
12853 ANYOF_CLASS_SET(ret, namedclass);
12856 /* Add the current class's code points to
12857 * the running total */
12858 _invlist_union(posixes,
12859 (AT_LEAST_ASCII_RESTRICTED)
12865 else { /* A complemented class */
12866 if (AT_LEAST_ASCII_RESTRICTED) {
12867 /* Under /a should match everything above
12868 * ASCII, plus the complement of the set's
12870 _invlist_union_complement_2nd(posixes,
12875 /* Arrange for the unknown matches to be
12876 * loaded at run-time, if needed */
12877 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12879 runtime_posix_matches_above_Unicode = TRUE;
12881 ANYOF_CLASS_SET(ret, namedclass);
12885 /* We want to match everything in
12886 * Latin1, except those things that
12887 * l1_source matches */
12888 SV* scratch_list = NULL;
12889 _invlist_subtract(PL_Latin1, l1_source,
12892 /* Add the list from this class to the
12895 posixes = scratch_list;
12898 _invlist_union(posixes,
12901 SvREFCNT_dec_NN(scratch_list);
12903 if (DEPENDS_SEMANTICS) {
12905 |= ANYOF_NON_UTF8_LATIN1_ALL;
12910 goto namedclass_done;
12913 /* Here, there is a swash loaded for the class. If no
12914 * inversion list for it yet, get it */
12915 if (! PL_XPosix_ptrs[classnum]) {
12916 PL_XPosix_ptrs[classnum]
12917 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12921 /* Here there is an inversion list already loaded for the
12924 if (namedclass % 2 == 0) { /* A non-complemented class,
12925 like ANYOF_PUNCT */
12927 /* For non-locale, just add it to any existing list
12929 _invlist_union(posixes,
12930 (AT_LEAST_ASCII_RESTRICTED)
12932 : PL_XPosix_ptrs[classnum],
12935 else { /* Locale */
12936 SV* scratch_list = NULL;
12938 /* For above Latin1 code points, we use the full
12940 _invlist_intersection(PL_AboveLatin1,
12941 PL_XPosix_ptrs[classnum],
12943 /* And set the output to it, adding instead if
12944 * there already is an output. Checking if
12945 * 'posixes' is NULL first saves an extra clone.
12946 * Its reference count will be decremented at the
12947 * next union, etc, or if this is the only
12948 * instance, at the end of the routine */
12950 posixes = scratch_list;
12953 _invlist_union(posixes, scratch_list, &posixes);
12954 SvREFCNT_dec_NN(scratch_list);
12957 #ifndef HAS_ISBLANK
12958 if (namedclass != ANYOF_BLANK) {
12960 /* Set this class in the node for runtime
12962 ANYOF_CLASS_SET(ret, namedclass);
12963 #ifndef HAS_ISBLANK
12966 /* No isblank(), use the hard-coded ASCII-range
12967 * blanks, adding them to the running total. */
12969 _invlist_union(posixes, ascii_source, &posixes);
12974 else { /* A complemented class, like ANYOF_NPUNCT */
12976 _invlist_union_complement_2nd(
12978 (AT_LEAST_ASCII_RESTRICTED)
12980 : PL_XPosix_ptrs[classnum],
12982 /* Under /d, everything in the upper half of the
12983 * Latin1 range matches this complement */
12984 if (DEPENDS_SEMANTICS) {
12985 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12988 else { /* Locale */
12989 SV* scratch_list = NULL;
12990 _invlist_subtract(PL_AboveLatin1,
12991 PL_XPosix_ptrs[classnum],
12994 posixes = scratch_list;
12997 _invlist_union(posixes, scratch_list, &posixes);
12998 SvREFCNT_dec_NN(scratch_list);
13000 #ifndef HAS_ISBLANK
13001 if (namedclass != ANYOF_NBLANK) {
13003 ANYOF_CLASS_SET(ret, namedclass);
13004 #ifndef HAS_ISBLANK
13007 /* Get the list of all code points in Latin1
13008 * that are not ASCII blanks, and add them to
13009 * the running total */
13010 _invlist_subtract(PL_Latin1, ascii_source,
13012 _invlist_union(posixes, scratch_list, &posixes);
13013 SvREFCNT_dec_NN(scratch_list);
13020 continue; /* Go get next character */
13022 } /* end of namedclass \blah */
13024 /* Here, we have a single value. If 'range' is set, it is the ending
13025 * of a range--check its validity. Later, we will handle each
13026 * individual code point in the range. If 'range' isn't set, this
13027 * could be the beginning of a range, so check for that by looking
13028 * ahead to see if the next real character to be processed is the range
13029 * indicator--the minus sign */
13032 RExC_parse = regpatws(pRExC_state, RExC_parse,
13033 FALSE /* means don't recognize comments */);
13037 if (prevvalue > value) /* b-a */ {
13038 const int w = RExC_parse - rangebegin;
13039 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13040 range = 0; /* not a valid range */
13044 prevvalue = value; /* save the beginning of the potential range */
13045 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13046 && *RExC_parse == '-')
13048 char* next_char_ptr = RExC_parse + 1;
13049 if (skip_white) { /* Get the next real char after the '-' */
13050 next_char_ptr = regpatws(pRExC_state,
13052 FALSE); /* means don't recognize
13056 /* If the '-' is at the end of the class (just before the ']',
13057 * it is a literal minus; otherwise it is a range */
13058 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13059 RExC_parse = next_char_ptr;
13061 /* a bad range like \w-, [:word:]- ? */
13062 if (namedclass > OOB_NAMEDCLASS) {
13063 if (strict || ckWARN(WARN_REGEXP)) {
13065 RExC_parse >= rangebegin ?
13066 RExC_parse - rangebegin : 0;
13068 vFAIL4("False [] range \"%*.*s\"",
13073 "False [] range \"%*.*s\"",
13078 cp_list = add_cp_to_invlist(cp_list, '-');
13082 range = 1; /* yeah, it's a range! */
13083 continue; /* but do it the next time */
13088 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13091 /* non-Latin1 code point implies unicode semantics. Must be set in
13092 * pass1 so is there for the whole of pass 2 */
13094 RExC_uni_semantics = 1;
13097 /* Ready to process either the single value, or the completed range.
13098 * For single-valued non-inverted ranges, we consider the possibility
13099 * of multi-char folds. (We made a conscious decision to not do this
13100 * for the other cases because it can often lead to non-intuitive
13101 * results. For example, you have the peculiar case that:
13102 * "s s" =~ /^[^\xDF]+$/i => Y
13103 * "ss" =~ /^[^\xDF]+$/i => N
13105 * See [perl #89750] */
13106 if (FOLD && allow_multi_folds && value == prevvalue) {
13107 if (value == LATIN_SMALL_LETTER_SHARP_S
13108 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13111 /* Here <value> is indeed a multi-char fold. Get what it is */
13113 U8 foldbuf[UTF8_MAXBYTES_CASE];
13116 UV folded = _to_uni_fold_flags(
13121 | ((LOC) ? FOLD_FLAGS_LOCALE
13122 : (ASCII_FOLD_RESTRICTED)
13123 ? FOLD_FLAGS_NOMIX_ASCII
13127 /* Here, <folded> should be the first character of the
13128 * multi-char fold of <value>, with <foldbuf> containing the
13129 * whole thing. But, if this fold is not allowed (because of
13130 * the flags), <fold> will be the same as <value>, and should
13131 * be processed like any other character, so skip the special
13133 if (folded != value) {
13135 /* Skip if we are recursed, currently parsing the class
13136 * again. Otherwise add this character to the list of
13137 * multi-char folds. */
13138 if (! RExC_in_multi_char_class) {
13139 AV** this_array_ptr;
13141 STRLEN cp_count = utf8_length(foldbuf,
13142 foldbuf + foldlen);
13143 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13145 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13148 if (! multi_char_matches) {
13149 multi_char_matches = newAV();
13152 /* <multi_char_matches> is actually an array of arrays.
13153 * There will be one or two top-level elements: [2],
13154 * and/or [3]. The [2] element is an array, each
13155 * element thereof is a character which folds to TWO
13156 * characters; [3] is for folds to THREE characters.
13157 * (Unicode guarantees a maximum of 3 characters in any
13158 * fold.) When we rewrite the character class below,
13159 * we will do so such that the longest folds are
13160 * written first, so that it prefers the longest
13161 * matching strings first. This is done even if it
13162 * turns out that any quantifier is non-greedy, out of
13163 * programmer laziness. Tom Christiansen has agreed
13164 * that this is ok. This makes the test for the
13165 * ligature 'ffi' come before the test for 'ff' */
13166 if (av_exists(multi_char_matches, cp_count)) {
13167 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13169 this_array = *this_array_ptr;
13172 this_array = newAV();
13173 av_store(multi_char_matches, cp_count,
13176 av_push(this_array, multi_fold);
13179 /* This element should not be processed further in this
13182 value = save_value;
13183 prevvalue = save_prevvalue;
13189 /* Deal with this element of the class */
13192 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13194 SV* this_range = _new_invlist(1);
13195 _append_range_to_invlist(this_range, prevvalue, value);
13197 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13198 * If this range was specified using something like 'i-j', we want
13199 * to include only the 'i' and the 'j', and not anything in
13200 * between, so exclude non-ASCII, non-alphabetics from it.
13201 * However, if the range was specified with something like
13202 * [\x89-\x91] or [\x89-j], all code points within it should be
13203 * included. literal_endpoint==2 means both ends of the range used
13204 * a literal character, not \x{foo} */
13205 if (literal_endpoint == 2
13206 && ((prevvalue >= 'a' && value <= 'z')
13207 || (prevvalue >= 'A' && value <= 'Z')))
13209 _invlist_intersection(this_range, PL_ASCII,
13211 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13214 _invlist_union(cp_list, this_range, &cp_list);
13215 literal_endpoint = 0;
13219 range = 0; /* this range (if it was one) is done now */
13220 } /* End of loop through all the text within the brackets */
13222 /* If anything in the class expands to more than one character, we have to
13223 * deal with them by building up a substitute parse string, and recursively
13224 * calling reg() on it, instead of proceeding */
13225 if (multi_char_matches) {
13226 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13229 char *save_end = RExC_end;
13230 char *save_parse = RExC_parse;
13231 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13236 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13237 because too confusing */
13239 sv_catpv(substitute_parse, "(?:");
13243 /* Look at the longest folds first */
13244 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13246 if (av_exists(multi_char_matches, cp_count)) {
13247 AV** this_array_ptr;
13250 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13252 while ((this_sequence = av_pop(*this_array_ptr)) !=
13255 if (! first_time) {
13256 sv_catpv(substitute_parse, "|");
13258 first_time = FALSE;
13260 sv_catpv(substitute_parse, SvPVX(this_sequence));
13265 /* If the character class contains anything else besides these
13266 * multi-character folds, have to include it in recursive parsing */
13267 if (element_count) {
13268 sv_catpv(substitute_parse, "|[");
13269 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13270 sv_catpv(substitute_parse, "]");
13273 sv_catpv(substitute_parse, ")");
13276 /* This is a way to get the parse to skip forward a whole named
13277 * sequence instead of matching the 2nd character when it fails the
13279 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13283 RExC_parse = SvPV(substitute_parse, len);
13284 RExC_end = RExC_parse + len;
13285 RExC_in_multi_char_class = 1;
13286 RExC_emit = (regnode *)orig_emit;
13288 ret = reg(pRExC_state, 1, ®_flags, depth+1);
13290 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13292 RExC_parse = save_parse;
13293 RExC_end = save_end;
13294 RExC_in_multi_char_class = 0;
13295 SvREFCNT_dec_NN(multi_char_matches);
13299 /* If the character class contains only a single element, it may be
13300 * optimizable into another node type which is smaller and runs faster.
13301 * Check if this is the case for this class */
13302 if (element_count == 1 && ! ret_invlist) {
13306 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13307 [:digit:] or \p{foo} */
13309 /* All named classes are mapped into POSIXish nodes, with its FLAG
13310 * argument giving which class it is */
13311 switch ((I32)namedclass) {
13312 case ANYOF_UNIPROP:
13315 /* These don't depend on the charset modifiers. They always
13316 * match under /u rules */
13317 case ANYOF_NHORIZWS:
13318 case ANYOF_HORIZWS:
13319 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13322 case ANYOF_NVERTWS:
13327 /* The actual POSIXish node for all the rest depends on the
13328 * charset modifier. The ones in the first set depend only on
13329 * ASCII or, if available on this platform, locale */
13333 op = (LOC) ? POSIXL : POSIXA;
13344 /* under /a could be alpha */
13346 if (ASCII_RESTRICTED) {
13347 namedclass = ANYOF_ALPHA + (namedclass % 2);
13355 /* The rest have more possibilities depending on the charset.
13356 * We take advantage of the enum ordering of the charset
13357 * modifiers to get the exact node type, */
13359 op = POSIXD + get_regex_charset(RExC_flags);
13360 if (op > POSIXA) { /* /aa is same as /a */
13363 #ifndef HAS_ISBLANK
13365 && (namedclass == ANYOF_BLANK
13366 || namedclass == ANYOF_NBLANK))
13373 /* The odd numbered ones are the complements of the
13374 * next-lower even number one */
13375 if (namedclass % 2 == 1) {
13379 arg = namedclass_to_classnum(namedclass);
13383 else if (value == prevvalue) {
13385 /* Here, the class consists of just a single code point */
13388 if (! LOC && value == '\n') {
13389 op = REG_ANY; /* Optimize [^\n] */
13390 *flagp |= HASWIDTH|SIMPLE;
13394 else if (value < 256 || UTF) {
13396 /* Optimize a single value into an EXACTish node, but not if it
13397 * would require converting the pattern to UTF-8. */
13398 op = compute_EXACTish(pRExC_state);
13400 } /* Otherwise is a range */
13401 else if (! LOC) { /* locale could vary these */
13402 if (prevvalue == '0') {
13403 if (value == '9') {
13410 /* Here, we have changed <op> away from its initial value iff we found
13411 * an optimization */
13414 /* Throw away this ANYOF regnode, and emit the calculated one,
13415 * which should correspond to the beginning, not current, state of
13417 const char * cur_parse = RExC_parse;
13418 RExC_parse = (char *)orig_parse;
13422 /* To get locale nodes to not use the full ANYOF size would
13423 * require moving the code above that writes the portions
13424 * of it that aren't in other nodes to after this point.
13425 * e.g. ANYOF_CLASS_SET */
13426 RExC_size = orig_size;
13430 RExC_emit = (regnode *)orig_emit;
13431 if (PL_regkind[op] == POSIXD) {
13433 op += NPOSIXD - POSIXD;
13438 ret = reg_node(pRExC_state, op);
13440 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13444 *flagp |= HASWIDTH|SIMPLE;
13446 else if (PL_regkind[op] == EXACT) {
13447 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13450 RExC_parse = (char *) cur_parse;
13452 SvREFCNT_dec(posixes);
13453 SvREFCNT_dec(cp_list);
13460 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13462 /* If folding, we calculate all characters that could fold to or from the
13463 * ones already on the list */
13464 if (FOLD && cp_list) {
13465 UV start, end; /* End points of code point ranges */
13467 SV* fold_intersection = NULL;
13469 /* If the highest code point is within Latin1, we can use the
13470 * compiled-in Alphas list, and not have to go out to disk. This
13471 * yields two false positives, the masculine and feminine ordinal
13472 * indicators, which are weeded out below using the
13473 * IS_IN_SOME_FOLD_L1() macro */
13474 if (invlist_highest(cp_list) < 256) {
13475 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13476 &fold_intersection);
13480 /* Here, there are non-Latin1 code points, so we will have to go
13481 * fetch the list of all the characters that participate in folds
13483 if (! PL_utf8_foldable) {
13484 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13485 &PL_sv_undef, 1, 0);
13486 PL_utf8_foldable = _get_swash_invlist(swash);
13487 SvREFCNT_dec_NN(swash);
13490 /* This is a hash that for a particular fold gives all characters
13491 * that are involved in it */
13492 if (! PL_utf8_foldclosures) {
13494 /* If we were unable to find any folds, then we likely won't be
13495 * able to find the closures. So just create an empty list.
13496 * Folding will effectively be restricted to the non-Unicode
13497 * rules hard-coded into Perl. (This case happens legitimately
13498 * during compilation of Perl itself before the Unicode tables
13499 * are generated) */
13500 if (_invlist_len(PL_utf8_foldable) == 0) {
13501 PL_utf8_foldclosures = newHV();
13504 /* If the folds haven't been read in, call a fold function
13506 if (! PL_utf8_tofold) {
13507 U8 dummy[UTF8_MAXBYTES_CASE+1];
13509 /* This string is just a short named one above \xff */
13510 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13511 assert(PL_utf8_tofold); /* Verify that worked */
13513 PL_utf8_foldclosures =
13514 _swash_inversion_hash(PL_utf8_tofold);
13518 /* Only the characters in this class that participate in folds need
13519 * be checked. Get the intersection of this class and all the
13520 * possible characters that are foldable. This can quickly narrow
13521 * down a large class */
13522 _invlist_intersection(PL_utf8_foldable, cp_list,
13523 &fold_intersection);
13526 /* Now look at the foldable characters in this class individually */
13527 invlist_iterinit(fold_intersection);
13528 while (invlist_iternext(fold_intersection, &start, &end)) {
13531 /* Locale folding for Latin1 characters is deferred until runtime */
13532 if (LOC && start < 256) {
13536 /* Look at every character in the range */
13537 for (j = start; j <= end; j++) {
13539 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13545 /* We have the latin1 folding rules hard-coded here so that
13546 * an innocent-looking character class, like /[ks]/i won't
13547 * have to go out to disk to find the possible matches.
13548 * XXX It would be better to generate these via regen, in
13549 * case a new version of the Unicode standard adds new
13550 * mappings, though that is not really likely, and may be
13551 * caught by the default: case of the switch below. */
13553 if (IS_IN_SOME_FOLD_L1(j)) {
13555 /* ASCII is always matched; non-ASCII is matched only
13556 * under Unicode rules */
13557 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13559 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13563 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13567 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13568 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13570 /* Certain Latin1 characters have matches outside
13571 * Latin1. To get here, <j> is one of those
13572 * characters. None of these matches is valid for
13573 * ASCII characters under /aa, which is why the 'if'
13574 * just above excludes those. These matches only
13575 * happen when the target string is utf8. The code
13576 * below adds the single fold closures for <j> to the
13577 * inversion list. */
13582 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13586 cp_list = add_cp_to_invlist(cp_list,
13587 LATIN_SMALL_LETTER_LONG_S);
13590 cp_list = add_cp_to_invlist(cp_list,
13591 GREEK_CAPITAL_LETTER_MU);
13592 cp_list = add_cp_to_invlist(cp_list,
13593 GREEK_SMALL_LETTER_MU);
13595 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13596 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13598 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13600 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13601 cp_list = add_cp_to_invlist(cp_list,
13602 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13604 case LATIN_SMALL_LETTER_SHARP_S:
13605 cp_list = add_cp_to_invlist(cp_list,
13606 LATIN_CAPITAL_LETTER_SHARP_S);
13608 case 'F': case 'f':
13609 case 'I': case 'i':
13610 case 'L': case 'l':
13611 case 'T': case 't':
13612 case 'A': case 'a':
13613 case 'H': case 'h':
13614 case 'J': case 'j':
13615 case 'N': case 'n':
13616 case 'W': case 'w':
13617 case 'Y': case 'y':
13618 /* These all are targets of multi-character
13619 * folds from code points that require UTF8 to
13620 * express, so they can't match unless the
13621 * target string is in UTF-8, so no action here
13622 * is necessary, as regexec.c properly handles
13623 * the general case for UTF-8 matching and
13624 * multi-char folds */
13627 /* Use deprecated warning to increase the
13628 * chances of this being output */
13629 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13636 /* Here is an above Latin1 character. We don't have the rules
13637 * hard-coded for it. First, get its fold. This is the simple
13638 * fold, as the multi-character folds have been handled earlier
13639 * and separated out */
13640 _to_uni_fold_flags(j, foldbuf, &foldlen,
13642 ? FOLD_FLAGS_LOCALE
13643 : (ASCII_FOLD_RESTRICTED)
13644 ? FOLD_FLAGS_NOMIX_ASCII
13647 /* Single character fold of above Latin1. Add everything in
13648 * its fold closure to the list that this node should match.
13649 * The fold closures data structure is a hash with the keys
13650 * being the UTF-8 of every character that is folded to, like
13651 * 'k', and the values each an array of all code points that
13652 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13653 * Multi-character folds are not included */
13654 if ((listp = hv_fetch(PL_utf8_foldclosures,
13655 (char *) foldbuf, foldlen, FALSE)))
13657 AV* list = (AV*) *listp;
13659 for (k = 0; k <= av_len(list); k++) {
13660 SV** c_p = av_fetch(list, k, FALSE);
13663 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13667 /* /aa doesn't allow folds between ASCII and non-; /l
13668 * doesn't allow them between above and below 256 */
13669 if ((ASCII_FOLD_RESTRICTED
13670 && (isASCII(c) != isASCII(j)))
13671 || (LOC && c < 256)) {
13675 /* Folds involving non-ascii Latin1 characters
13676 * under /d are added to a separate list */
13677 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13679 cp_list = add_cp_to_invlist(cp_list, c);
13682 depends_list = add_cp_to_invlist(depends_list, c);
13688 SvREFCNT_dec_NN(fold_intersection);
13691 /* And combine the result (if any) with any inversion list from posix
13692 * classes. The lists are kept separate up to now because we don't want to
13693 * fold the classes (folding of those is automatically handled by the swash
13694 * fetching code) */
13696 if (! DEPENDS_SEMANTICS) {
13698 _invlist_union(cp_list, posixes, &cp_list);
13699 SvREFCNT_dec_NN(posixes);
13706 /* Under /d, we put into a separate list the Latin1 things that
13707 * match only when the target string is utf8 */
13708 SV* nonascii_but_latin1_properties = NULL;
13709 _invlist_intersection(posixes, PL_Latin1,
13710 &nonascii_but_latin1_properties);
13711 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13712 &nonascii_but_latin1_properties);
13713 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13716 _invlist_union(cp_list, posixes, &cp_list);
13717 SvREFCNT_dec_NN(posixes);
13723 if (depends_list) {
13724 _invlist_union(depends_list, nonascii_but_latin1_properties,
13726 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13729 depends_list = nonascii_but_latin1_properties;
13734 /* And combine the result (if any) with any inversion list from properties.
13735 * The lists are kept separate up to now so that we can distinguish the two
13736 * in regards to matching above-Unicode. A run-time warning is generated
13737 * if a Unicode property is matched against a non-Unicode code point. But,
13738 * we allow user-defined properties to match anything, without any warning,
13739 * and we also suppress the warning if there is a portion of the character
13740 * class that isn't a Unicode property, and which matches above Unicode, \W
13741 * or [\x{110000}] for example.
13742 * (Note that in this case, unlike the Posix one above, there is no
13743 * <depends_list>, because having a Unicode property forces Unicode
13746 bool warn_super = ! has_user_defined_property;
13749 /* If it matters to the final outcome, see if a non-property
13750 * component of the class matches above Unicode. If so, the
13751 * warning gets suppressed. This is true even if just a single
13752 * such code point is specified, as though not strictly correct if
13753 * another such code point is matched against, the fact that they
13754 * are using above-Unicode code points indicates they should know
13755 * the issues involved */
13757 bool non_prop_matches_above_Unicode =
13758 runtime_posix_matches_above_Unicode
13759 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13761 non_prop_matches_above_Unicode =
13762 ! non_prop_matches_above_Unicode;
13764 warn_super = ! non_prop_matches_above_Unicode;
13767 _invlist_union(properties, cp_list, &cp_list);
13768 SvREFCNT_dec_NN(properties);
13771 cp_list = properties;
13775 OP(ret) = ANYOF_WARN_SUPER;
13779 /* Here, we have calculated what code points should be in the character
13782 * Now we can see about various optimizations. Fold calculation (which we
13783 * did above) needs to take place before inversion. Otherwise /[^k]/i
13784 * would invert to include K, which under /i would match k, which it
13785 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13786 * folded until runtime */
13788 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13789 * at compile time. Besides not inverting folded locale now, we can't
13790 * invert if there are things such as \w, which aren't known until runtime
13793 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13795 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13797 _invlist_invert(cp_list);
13799 /* Any swash can't be used as-is, because we've inverted things */
13801 SvREFCNT_dec_NN(swash);
13805 /* Clear the invert flag since have just done it here */
13810 *ret_invlist = cp_list;
13811 SvREFCNT_dec(swash);
13813 /* Discard the generated node */
13815 RExC_size = orig_size;
13818 RExC_emit = orig_emit;
13823 /* If we didn't do folding, it's because some information isn't available
13824 * until runtime; set the run-time fold flag for these. (We don't have to
13825 * worry about properties folding, as that is taken care of by the swash
13829 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13832 /* Some character classes are equivalent to other nodes. Such nodes take
13833 * up less room and generally fewer operations to execute than ANYOF nodes.
13834 * Above, we checked for and optimized into some such equivalents for
13835 * certain common classes that are easy to test. Getting to this point in
13836 * the code means that the class didn't get optimized there. Since this
13837 * code is only executed in Pass 2, it is too late to save space--it has
13838 * been allocated in Pass 1, and currently isn't given back. But turning
13839 * things into an EXACTish node can allow the optimizer to join it to any
13840 * adjacent such nodes. And if the class is equivalent to things like /./,
13841 * expensive run-time swashes can be avoided. Now that we have more
13842 * complete information, we can find things necessarily missed by the
13843 * earlier code. I (khw) am not sure how much to look for here. It would
13844 * be easy, but perhaps too slow, to check any candidates against all the
13845 * node types they could possibly match using _invlistEQ(). */
13850 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13851 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13854 U8 op = END; /* The optimzation node-type */
13855 const char * cur_parse= RExC_parse;
13857 invlist_iterinit(cp_list);
13858 if (! invlist_iternext(cp_list, &start, &end)) {
13860 /* Here, the list is empty. This happens, for example, when a
13861 * Unicode property is the only thing in the character class, and
13862 * it doesn't match anything. (perluniprops.pod notes such
13865 *flagp |= HASWIDTH|SIMPLE;
13867 else if (start == end) { /* The range is a single code point */
13868 if (! invlist_iternext(cp_list, &start, &end)
13870 /* Don't do this optimization if it would require changing
13871 * the pattern to UTF-8 */
13872 && (start < 256 || UTF))
13874 /* Here, the list contains a single code point. Can optimize
13875 * into an EXACT node */
13884 /* A locale node under folding with one code point can be
13885 * an EXACTFL, as its fold won't be calculated until
13891 /* Here, we are generally folding, but there is only one
13892 * code point to match. If we have to, we use an EXACT
13893 * node, but it would be better for joining with adjacent
13894 * nodes in the optimization pass if we used the same
13895 * EXACTFish node that any such are likely to be. We can
13896 * do this iff the code point doesn't participate in any
13897 * folds. For example, an EXACTF of a colon is the same as
13898 * an EXACT one, since nothing folds to or from a colon. */
13900 if (IS_IN_SOME_FOLD_L1(value)) {
13905 if (! PL_utf8_foldable) {
13906 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13907 &PL_sv_undef, 1, 0);
13908 PL_utf8_foldable = _get_swash_invlist(swash);
13909 SvREFCNT_dec_NN(swash);
13911 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13916 /* If we haven't found the node type, above, it means we
13917 * can use the prevailing one */
13919 op = compute_EXACTish(pRExC_state);
13924 else if (start == 0) {
13925 if (end == UV_MAX) {
13927 *flagp |= HASWIDTH|SIMPLE;
13930 else if (end == '\n' - 1
13931 && invlist_iternext(cp_list, &start, &end)
13932 && start == '\n' + 1 && end == UV_MAX)
13935 *flagp |= HASWIDTH|SIMPLE;
13939 invlist_iterfinish(cp_list);
13942 RExC_parse = (char *)orig_parse;
13943 RExC_emit = (regnode *)orig_emit;
13945 ret = reg_node(pRExC_state, op);
13947 RExC_parse = (char *)cur_parse;
13949 if (PL_regkind[op] == EXACT) {
13950 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13953 SvREFCNT_dec_NN(cp_list);
13958 /* Here, <cp_list> contains all the code points we can determine at
13959 * compile time that match under all conditions. Go through it, and
13960 * for things that belong in the bitmap, put them there, and delete from
13961 * <cp_list>. While we are at it, see if everything above 255 is in the
13962 * list, and if so, set a flag to speed up execution */
13963 ANYOF_BITMAP_ZERO(ret);
13966 /* This gets set if we actually need to modify things */
13967 bool change_invlist = FALSE;
13971 /* Start looking through <cp_list> */
13972 invlist_iterinit(cp_list);
13973 while (invlist_iternext(cp_list, &start, &end)) {
13977 if (end == UV_MAX && start <= 256) {
13978 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13981 /* Quit if are above what we should change */
13986 change_invlist = TRUE;
13988 /* Set all the bits in the range, up to the max that we are doing */
13989 high = (end < 255) ? end : 255;
13990 for (i = start; i <= (int) high; i++) {
13991 if (! ANYOF_BITMAP_TEST(ret, i)) {
13992 ANYOF_BITMAP_SET(ret, i);
13996 invlist_iterfinish(cp_list);
13998 /* Done with loop; remove any code points that are in the bitmap from
14000 if (change_invlist) {
14001 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
14004 /* If have completely emptied it, remove it completely */
14005 if (_invlist_len(cp_list) == 0) {
14006 SvREFCNT_dec_NN(cp_list);
14012 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14015 /* Here, the bitmap has been populated with all the Latin1 code points that
14016 * always match. Can now add to the overall list those that match only
14017 * when the target string is UTF-8 (<depends_list>). */
14018 if (depends_list) {
14020 _invlist_union(cp_list, depends_list, &cp_list);
14021 SvREFCNT_dec_NN(depends_list);
14024 cp_list = depends_list;
14028 /* If there is a swash and more than one element, we can't use the swash in
14029 * the optimization below. */
14030 if (swash && element_count > 1) {
14031 SvREFCNT_dec_NN(swash);
14036 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14038 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14041 /* av[0] stores the character class description in its textual form:
14042 * used later (regexec.c:Perl_regclass_swash()) to initialize the
14043 * appropriate swash, and is also useful for dumping the regnode.
14044 * av[1] if NULL, is a placeholder to later contain the swash computed
14045 * from av[0]. But if no further computation need be done, the
14046 * swash is stored there now.
14047 * av[2] stores the cp_list inversion list for use in addition or
14048 * instead of av[0]; used only if av[1] is NULL
14049 * av[3] is set if any component of the class is from a user-defined
14050 * property; used only if av[1] is NULL */
14051 AV * const av = newAV();
14054 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14055 ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14057 av_store(av, 1, swash);
14058 SvREFCNT_dec_NN(cp_list);
14061 av_store(av, 1, NULL);
14063 av_store(av, 2, cp_list);
14064 av_store(av, 3, newSVuv(has_user_defined_property));
14068 rv = newRV_noinc(MUTABLE_SV(av));
14069 n = add_data(pRExC_state, 1, "s");
14070 RExC_rxi->data->data[n] = (void*)rv;
14074 *flagp |= HASWIDTH|SIMPLE;
14077 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14080 /* reg_skipcomment()
14082 Absorbs an /x style # comments from the input stream.
14083 Returns true if there is more text remaining in the stream.
14084 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14085 terminates the pattern without including a newline.
14087 Note its the callers responsibility to ensure that we are
14088 actually in /x mode
14093 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14097 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14099 while (RExC_parse < RExC_end)
14100 if (*RExC_parse++ == '\n') {
14105 /* we ran off the end of the pattern without ending
14106 the comment, so we have to add an \n when wrapping */
14107 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14115 Advances the parse position, and optionally absorbs
14116 "whitespace" from the inputstream.
14118 Without /x "whitespace" means (?#...) style comments only,
14119 with /x this means (?#...) and # comments and whitespace proper.
14121 Returns the RExC_parse point from BEFORE the scan occurs.
14123 This is the /x friendly way of saying RExC_parse++.
14127 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14129 char* const retval = RExC_parse++;
14131 PERL_ARGS_ASSERT_NEXTCHAR;
14134 if (RExC_end - RExC_parse >= 3
14135 && *RExC_parse == '('
14136 && RExC_parse[1] == '?'
14137 && RExC_parse[2] == '#')
14139 while (*RExC_parse != ')') {
14140 if (RExC_parse == RExC_end)
14141 FAIL("Sequence (?#... not terminated");
14147 if (RExC_flags & RXf_PMf_EXTENDED) {
14148 if (isSPACE(*RExC_parse)) {
14152 else if (*RExC_parse == '#') {
14153 if ( reg_skipcomment( pRExC_state ) )
14162 - reg_node - emit a node
14164 STATIC regnode * /* Location. */
14165 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14169 regnode * const ret = RExC_emit;
14170 GET_RE_DEBUG_FLAGS_DECL;
14172 PERL_ARGS_ASSERT_REG_NODE;
14175 SIZE_ALIGN(RExC_size);
14179 if (RExC_emit >= RExC_emit_bound)
14180 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14181 op, RExC_emit, RExC_emit_bound);
14183 NODE_ALIGN_FILL(ret);
14185 FILL_ADVANCE_NODE(ptr, op);
14186 #ifdef RE_TRACK_PATTERN_OFFSETS
14187 if (RExC_offsets) { /* MJD */
14188 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14189 "reg_node", __LINE__,
14191 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14192 ? "Overwriting end of array!\n" : "OK",
14193 (UV)(RExC_emit - RExC_emit_start),
14194 (UV)(RExC_parse - RExC_start),
14195 (UV)RExC_offsets[0]));
14196 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14204 - reganode - emit a node with an argument
14206 STATIC regnode * /* Location. */
14207 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14211 regnode * const ret = RExC_emit;
14212 GET_RE_DEBUG_FLAGS_DECL;
14214 PERL_ARGS_ASSERT_REGANODE;
14217 SIZE_ALIGN(RExC_size);
14222 assert(2==regarglen[op]+1);
14224 Anything larger than this has to allocate the extra amount.
14225 If we changed this to be:
14227 RExC_size += (1 + regarglen[op]);
14229 then it wouldn't matter. Its not clear what side effect
14230 might come from that so its not done so far.
14235 if (RExC_emit >= RExC_emit_bound)
14236 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14237 op, RExC_emit, RExC_emit_bound);
14239 NODE_ALIGN_FILL(ret);
14241 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14242 #ifdef RE_TRACK_PATTERN_OFFSETS
14243 if (RExC_offsets) { /* MJD */
14244 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14248 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14249 "Overwriting end of array!\n" : "OK",
14250 (UV)(RExC_emit - RExC_emit_start),
14251 (UV)(RExC_parse - RExC_start),
14252 (UV)RExC_offsets[0]));
14253 Set_Cur_Node_Offset;
14261 - reguni - emit (if appropriate) a Unicode character
14264 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14268 PERL_ARGS_ASSERT_REGUNI;
14270 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14274 - reginsert - insert an operator in front of already-emitted operand
14276 * Means relocating the operand.
14279 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14285 const int offset = regarglen[(U8)op];
14286 const int size = NODE_STEP_REGNODE + offset;
14287 GET_RE_DEBUG_FLAGS_DECL;
14289 PERL_ARGS_ASSERT_REGINSERT;
14290 PERL_UNUSED_ARG(depth);
14291 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14292 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14301 if (RExC_open_parens) {
14303 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14304 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14305 if ( RExC_open_parens[paren] >= opnd ) {
14306 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14307 RExC_open_parens[paren] += size;
14309 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14311 if ( RExC_close_parens[paren] >= opnd ) {
14312 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14313 RExC_close_parens[paren] += size;
14315 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14320 while (src > opnd) {
14321 StructCopy(--src, --dst, regnode);
14322 #ifdef RE_TRACK_PATTERN_OFFSETS
14323 if (RExC_offsets) { /* MJD 20010112 */
14324 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14328 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14329 ? "Overwriting end of array!\n" : "OK",
14330 (UV)(src - RExC_emit_start),
14331 (UV)(dst - RExC_emit_start),
14332 (UV)RExC_offsets[0]));
14333 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14334 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14340 place = opnd; /* Op node, where operand used to be. */
14341 #ifdef RE_TRACK_PATTERN_OFFSETS
14342 if (RExC_offsets) { /* MJD */
14343 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14347 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14348 ? "Overwriting end of array!\n" : "OK",
14349 (UV)(place - RExC_emit_start),
14350 (UV)(RExC_parse - RExC_start),
14351 (UV)RExC_offsets[0]));
14352 Set_Node_Offset(place, RExC_parse);
14353 Set_Node_Length(place, 1);
14356 src = NEXTOPER(place);
14357 FILL_ADVANCE_NODE(place, op);
14358 Zero(src, offset, regnode);
14362 - regtail - set the next-pointer at the end of a node chain of p to val.
14363 - SEE ALSO: regtail_study
14365 /* TODO: All three parms should be const */
14367 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14371 GET_RE_DEBUG_FLAGS_DECL;
14373 PERL_ARGS_ASSERT_REGTAIL;
14375 PERL_UNUSED_ARG(depth);
14381 /* Find last node. */
14384 regnode * const temp = regnext(scan);
14386 SV * const mysv=sv_newmortal();
14387 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14388 regprop(RExC_rx, mysv, scan);
14389 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14390 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14391 (temp == NULL ? "->" : ""),
14392 (temp == NULL ? PL_reg_name[OP(val)] : "")
14400 if (reg_off_by_arg[OP(scan)]) {
14401 ARG_SET(scan, val - scan);
14404 NEXT_OFF(scan) = val - scan;
14410 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14411 - Look for optimizable sequences at the same time.
14412 - currently only looks for EXACT chains.
14414 This is experimental code. The idea is to use this routine to perform
14415 in place optimizations on branches and groups as they are constructed,
14416 with the long term intention of removing optimization from study_chunk so
14417 that it is purely analytical.
14419 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14420 to control which is which.
14423 /* TODO: All four parms should be const */
14426 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14431 #ifdef EXPERIMENTAL_INPLACESCAN
14434 GET_RE_DEBUG_FLAGS_DECL;
14436 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14442 /* Find last node. */
14446 regnode * const temp = regnext(scan);
14447 #ifdef EXPERIMENTAL_INPLACESCAN
14448 if (PL_regkind[OP(scan)] == EXACT) {
14449 bool has_exactf_sharp_s; /* Unexamined in this routine */
14450 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14455 switch (OP(scan)) {
14458 case EXACTFA_NO_TRIE:
14463 if( exact == PSEUDO )
14465 else if ( exact != OP(scan) )
14474 SV * const mysv=sv_newmortal();
14475 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14476 regprop(RExC_rx, mysv, scan);
14477 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14478 SvPV_nolen_const(mysv),
14479 REG_NODE_NUM(scan),
14480 PL_reg_name[exact]);
14487 SV * const mysv_val=sv_newmortal();
14488 DEBUG_PARSE_MSG("");
14489 regprop(RExC_rx, mysv_val, val);
14490 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14491 SvPV_nolen_const(mysv_val),
14492 (IV)REG_NODE_NUM(val),
14496 if (reg_off_by_arg[OP(scan)]) {
14497 ARG_SET(scan, val - scan);
14500 NEXT_OFF(scan) = val - scan;
14508 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14513 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14518 for (bit=0; bit<32; bit++) {
14519 if (flags & (1<<bit)) {
14520 if (!set++ && lead)
14521 PerlIO_printf(Perl_debug_log, "%s",lead);
14522 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14527 PerlIO_printf(Perl_debug_log, "\n");
14529 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14534 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14540 for (bit=0; bit<32; bit++) {
14541 if (flags & (1<<bit)) {
14542 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14545 if (!set++ && lead)
14546 PerlIO_printf(Perl_debug_log, "%s",lead);
14547 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14550 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14551 if (!set++ && lead) {
14552 PerlIO_printf(Perl_debug_log, "%s",lead);
14555 case REGEX_UNICODE_CHARSET:
14556 PerlIO_printf(Perl_debug_log, "UNICODE");
14558 case REGEX_LOCALE_CHARSET:
14559 PerlIO_printf(Perl_debug_log, "LOCALE");
14561 case REGEX_ASCII_RESTRICTED_CHARSET:
14562 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14564 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14565 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14568 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14574 PerlIO_printf(Perl_debug_log, "\n");
14576 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14582 Perl_regdump(pTHX_ const regexp *r)
14586 SV * const sv = sv_newmortal();
14587 SV *dsv= sv_newmortal();
14588 RXi_GET_DECL(r,ri);
14589 GET_RE_DEBUG_FLAGS_DECL;
14591 PERL_ARGS_ASSERT_REGDUMP;
14593 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14595 /* Header fields of interest. */
14596 if (r->anchored_substr) {
14597 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14598 RE_SV_DUMPLEN(r->anchored_substr), 30);
14599 PerlIO_printf(Perl_debug_log,
14600 "anchored %s%s at %"IVdf" ",
14601 s, RE_SV_TAIL(r->anchored_substr),
14602 (IV)r->anchored_offset);
14603 } else if (r->anchored_utf8) {
14604 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14605 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14606 PerlIO_printf(Perl_debug_log,
14607 "anchored utf8 %s%s at %"IVdf" ",
14608 s, RE_SV_TAIL(r->anchored_utf8),
14609 (IV)r->anchored_offset);
14611 if (r->float_substr) {
14612 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14613 RE_SV_DUMPLEN(r->float_substr), 30);
14614 PerlIO_printf(Perl_debug_log,
14615 "floating %s%s at %"IVdf"..%"UVuf" ",
14616 s, RE_SV_TAIL(r->float_substr),
14617 (IV)r->float_min_offset, (UV)r->float_max_offset);
14618 } else if (r->float_utf8) {
14619 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14620 RE_SV_DUMPLEN(r->float_utf8), 30);
14621 PerlIO_printf(Perl_debug_log,
14622 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14623 s, RE_SV_TAIL(r->float_utf8),
14624 (IV)r->float_min_offset, (UV)r->float_max_offset);
14626 if (r->check_substr || r->check_utf8)
14627 PerlIO_printf(Perl_debug_log,
14629 (r->check_substr == r->float_substr
14630 && r->check_utf8 == r->float_utf8
14631 ? "(checking floating" : "(checking anchored"));
14632 if (r->extflags & RXf_NOSCAN)
14633 PerlIO_printf(Perl_debug_log, " noscan");
14634 if (r->extflags & RXf_CHECK_ALL)
14635 PerlIO_printf(Perl_debug_log, " isall");
14636 if (r->check_substr || r->check_utf8)
14637 PerlIO_printf(Perl_debug_log, ") ");
14639 if (ri->regstclass) {
14640 regprop(r, sv, ri->regstclass);
14641 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14643 if (r->extflags & RXf_ANCH) {
14644 PerlIO_printf(Perl_debug_log, "anchored");
14645 if (r->extflags & RXf_ANCH_BOL)
14646 PerlIO_printf(Perl_debug_log, "(BOL)");
14647 if (r->extflags & RXf_ANCH_MBOL)
14648 PerlIO_printf(Perl_debug_log, "(MBOL)");
14649 if (r->extflags & RXf_ANCH_SBOL)
14650 PerlIO_printf(Perl_debug_log, "(SBOL)");
14651 if (r->extflags & RXf_ANCH_GPOS)
14652 PerlIO_printf(Perl_debug_log, "(GPOS)");
14653 PerlIO_putc(Perl_debug_log, ' ');
14655 if (r->extflags & RXf_GPOS_SEEN)
14656 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14657 if (r->intflags & PREGf_SKIP)
14658 PerlIO_printf(Perl_debug_log, "plus ");
14659 if (r->intflags & PREGf_IMPLICIT)
14660 PerlIO_printf(Perl_debug_log, "implicit ");
14661 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14662 if (r->extflags & RXf_EVAL_SEEN)
14663 PerlIO_printf(Perl_debug_log, "with eval ");
14664 PerlIO_printf(Perl_debug_log, "\n");
14666 regdump_extflags("r->extflags: ",r->extflags);
14667 regdump_intflags("r->intflags: ",r->intflags);
14670 PERL_ARGS_ASSERT_REGDUMP;
14671 PERL_UNUSED_CONTEXT;
14672 PERL_UNUSED_ARG(r);
14673 #endif /* DEBUGGING */
14677 - regprop - printable representation of opcode
14679 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14682 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14683 if (flags & ANYOF_INVERT) \
14684 /*make sure the invert info is in each */ \
14685 sv_catpvs(sv, "^"); \
14691 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14697 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14698 static const char * const anyofs[] = {
14699 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14700 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14701 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14702 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14703 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14704 || _CC_VERTSPACE != 16
14705 #error Need to adjust order of anyofs[]
14742 RXi_GET_DECL(prog,progi);
14743 GET_RE_DEBUG_FLAGS_DECL;
14745 PERL_ARGS_ASSERT_REGPROP;
14749 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14750 /* It would be nice to FAIL() here, but this may be called from
14751 regexec.c, and it would be hard to supply pRExC_state. */
14752 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14753 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14755 k = PL_regkind[OP(o)];
14758 sv_catpvs(sv, " ");
14759 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14760 * is a crude hack but it may be the best for now since
14761 * we have no flag "this EXACTish node was UTF-8"
14763 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14764 PERL_PV_ESCAPE_UNI_DETECT |
14765 PERL_PV_ESCAPE_NONASCII |
14766 PERL_PV_PRETTY_ELLIPSES |
14767 PERL_PV_PRETTY_LTGT |
14768 PERL_PV_PRETTY_NOCLEAR
14770 } else if (k == TRIE) {
14771 /* print the details of the trie in dumpuntil instead, as
14772 * progi->data isn't available here */
14773 const char op = OP(o);
14774 const U32 n = ARG(o);
14775 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14776 (reg_ac_data *)progi->data->data[n] :
14778 const reg_trie_data * const trie
14779 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14781 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14782 DEBUG_TRIE_COMPILE_r(
14783 Perl_sv_catpvf(aTHX_ sv,
14784 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14785 (UV)trie->startstate,
14786 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14787 (UV)trie->wordcount,
14790 (UV)TRIE_CHARCOUNT(trie),
14791 (UV)trie->uniquecharcount
14794 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14795 sv_catpvs(sv, "[");
14796 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14798 : TRIE_BITMAP(trie));
14799 sv_catpvs(sv, "]");
14802 } else if (k == CURLY) {
14803 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14804 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14805 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14807 else if (k == WHILEM && o->flags) /* Ordinal/of */
14808 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14809 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14810 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14811 if ( RXp_PAREN_NAMES(prog) ) {
14812 if ( k != REF || (OP(o) < NREF)) {
14813 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14814 SV **name= av_fetch(list, ARG(o), 0 );
14816 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14819 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14820 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14821 I32 *nums=(I32*)SvPVX(sv_dat);
14822 SV **name= av_fetch(list, nums[0], 0 );
14825 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14826 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14827 (n ? "," : ""), (IV)nums[n]);
14829 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14833 } else if (k == GOSUB)
14834 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14835 else if (k == VERB) {
14837 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14838 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14839 } else if (k == LOGICAL)
14840 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14841 else if (k == ANYOF) {
14842 const U8 flags = ANYOF_FLAGS(o);
14846 if (flags & ANYOF_LOCALE)
14847 sv_catpvs(sv, "{loc}");
14848 if (flags & ANYOF_LOC_FOLD)
14849 sv_catpvs(sv, "{i}");
14850 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14851 if (flags & ANYOF_INVERT)
14852 sv_catpvs(sv, "^");
14854 /* output what the standard cp 0-255 bitmap matches */
14855 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14857 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14858 /* output any special charclass tests (used entirely under use locale) */
14859 if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14861 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14862 if (ANYOF_CLASS_TEST(o,i)) {
14863 sv_catpv(sv, anyofs[i]);
14869 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14871 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14872 sv_catpvs(sv, "{non-utf8-latin1-all}");
14875 /* output information about the unicode matching */
14876 if (flags & ANYOF_UNICODE_ALL)
14877 sv_catpvs(sv, "{unicode_all}");
14878 else if (ANYOF_NONBITMAP(o)) {
14879 SV *lv; /* Set if there is something outside the bit map. */
14880 bool byte_output = FALSE; /* If something in the bitmap has been
14883 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14884 sv_catpvs(sv, "{outside bitmap}");
14887 sv_catpvs(sv, "{utf8}");
14890 /* Get the stuff that wasn't in the bitmap */
14891 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
14892 if (lv && lv != &PL_sv_undef) {
14893 char *s = savesvpv(lv);
14894 char * const origs = s;
14896 while (*s && *s != '\n')
14900 const char * const t = ++s;
14903 sv_catpvs(sv, " ");
14909 /* Truncate very long output */
14910 if (s - origs > 256) {
14911 Perl_sv_catpvf(aTHX_ sv,
14913 (int) (s - origs - 1),
14919 else if (*s == '\t') {
14933 SvREFCNT_dec_NN(lv);
14937 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14939 else if (k == POSIXD || k == NPOSIXD) {
14940 U8 index = FLAGS(o) * 2;
14941 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14942 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14945 sv_catpv(sv, anyofs[index]);
14948 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14949 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14951 PERL_UNUSED_CONTEXT;
14952 PERL_UNUSED_ARG(sv);
14953 PERL_UNUSED_ARG(o);
14954 PERL_UNUSED_ARG(prog);
14955 #endif /* DEBUGGING */
14959 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14960 { /* Assume that RE_INTUIT is set */
14962 struct regexp *const prog = ReANY(r);
14963 GET_RE_DEBUG_FLAGS_DECL;
14965 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14966 PERL_UNUSED_CONTEXT;
14970 const char * const s = SvPV_nolen_const(prog->check_substr
14971 ? prog->check_substr : prog->check_utf8);
14973 if (!PL_colorset) reginitcolors();
14974 PerlIO_printf(Perl_debug_log,
14975 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14977 prog->check_substr ? "" : "utf8 ",
14978 PL_colors[5],PL_colors[0],
14981 (strlen(s) > 60 ? "..." : ""));
14984 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14990 handles refcounting and freeing the perl core regexp structure. When
14991 it is necessary to actually free the structure the first thing it
14992 does is call the 'free' method of the regexp_engine associated to
14993 the regexp, allowing the handling of the void *pprivate; member
14994 first. (This routine is not overridable by extensions, which is why
14995 the extensions free is called first.)
14997 See regdupe and regdupe_internal if you change anything here.
14999 #ifndef PERL_IN_XSUB_RE
15001 Perl_pregfree(pTHX_ REGEXP *r)
15007 Perl_pregfree2(pTHX_ REGEXP *rx)
15010 struct regexp *const r = ReANY(rx);
15011 GET_RE_DEBUG_FLAGS_DECL;
15013 PERL_ARGS_ASSERT_PREGFREE2;
15015 if (r->mother_re) {
15016 ReREFCNT_dec(r->mother_re);
15018 CALLREGFREE_PVT(rx); /* free the private data */
15019 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15020 Safefree(r->xpv_len_u.xpvlenu_pv);
15023 SvREFCNT_dec(r->anchored_substr);
15024 SvREFCNT_dec(r->anchored_utf8);
15025 SvREFCNT_dec(r->float_substr);
15026 SvREFCNT_dec(r->float_utf8);
15027 Safefree(r->substrs);
15029 RX_MATCH_COPY_FREE(rx);
15030 #ifdef PERL_ANY_COW
15031 SvREFCNT_dec(r->saved_copy);
15034 SvREFCNT_dec(r->qr_anoncv);
15035 rx->sv_u.svu_rx = 0;
15040 This is a hacky workaround to the structural issue of match results
15041 being stored in the regexp structure which is in turn stored in
15042 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15043 could be PL_curpm in multiple contexts, and could require multiple
15044 result sets being associated with the pattern simultaneously, such
15045 as when doing a recursive match with (??{$qr})
15047 The solution is to make a lightweight copy of the regexp structure
15048 when a qr// is returned from the code executed by (??{$qr}) this
15049 lightweight copy doesn't actually own any of its data except for
15050 the starp/end and the actual regexp structure itself.
15056 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15058 struct regexp *ret;
15059 struct regexp *const r = ReANY(rx);
15060 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15062 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15065 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15067 SvOK_off((SV *)ret_x);
15069 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15070 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15071 made both spots point to the same regexp body.) */
15072 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15073 assert(!SvPVX(ret_x));
15074 ret_x->sv_u.svu_rx = temp->sv_any;
15075 temp->sv_any = NULL;
15076 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15077 SvREFCNT_dec_NN(temp);
15078 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15079 ing below will not set it. */
15080 SvCUR_set(ret_x, SvCUR(rx));
15083 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15084 sv_force_normal(sv) is called. */
15086 ret = ReANY(ret_x);
15088 SvFLAGS(ret_x) |= SvUTF8(rx);
15089 /* We share the same string buffer as the original regexp, on which we
15090 hold a reference count, incremented when mother_re is set below.
15091 The string pointer is copied here, being part of the regexp struct.
15093 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15094 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15096 const I32 npar = r->nparens+1;
15097 Newx(ret->offs, npar, regexp_paren_pair);
15098 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15101 Newx(ret->substrs, 1, struct reg_substr_data);
15102 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15104 SvREFCNT_inc_void(ret->anchored_substr);
15105 SvREFCNT_inc_void(ret->anchored_utf8);
15106 SvREFCNT_inc_void(ret->float_substr);
15107 SvREFCNT_inc_void(ret->float_utf8);
15109 /* check_substr and check_utf8, if non-NULL, point to either their
15110 anchored or float namesakes, and don't hold a second reference. */
15112 RX_MATCH_COPIED_off(ret_x);
15113 #ifdef PERL_ANY_COW
15114 ret->saved_copy = NULL;
15116 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15117 SvREFCNT_inc_void(ret->qr_anoncv);
15123 /* regfree_internal()
15125 Free the private data in a regexp. This is overloadable by
15126 extensions. Perl takes care of the regexp structure in pregfree(),
15127 this covers the *pprivate pointer which technically perl doesn't
15128 know about, however of course we have to handle the
15129 regexp_internal structure when no extension is in use.
15131 Note this is called before freeing anything in the regexp
15136 Perl_regfree_internal(pTHX_ REGEXP * const rx)
15139 struct regexp *const r = ReANY(rx);
15140 RXi_GET_DECL(r,ri);
15141 GET_RE_DEBUG_FLAGS_DECL;
15143 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15149 SV *dsv= sv_newmortal();
15150 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15151 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15152 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15153 PL_colors[4],PL_colors[5],s);
15156 #ifdef RE_TRACK_PATTERN_OFFSETS
15158 Safefree(ri->u.offsets); /* 20010421 MJD */
15160 if (ri->code_blocks) {
15162 for (n = 0; n < ri->num_code_blocks; n++)
15163 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15164 Safefree(ri->code_blocks);
15168 int n = ri->data->count;
15171 /* If you add a ->what type here, update the comment in regcomp.h */
15172 switch (ri->data->what[n]) {
15178 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15181 Safefree(ri->data->data[n]);
15187 { /* Aho Corasick add-on structure for a trie node.
15188 Used in stclass optimization only */
15190 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15192 refcount = --aho->refcount;
15195 PerlMemShared_free(aho->states);
15196 PerlMemShared_free(aho->fail);
15197 /* do this last!!!! */
15198 PerlMemShared_free(ri->data->data[n]);
15199 PerlMemShared_free(ri->regstclass);
15205 /* trie structure. */
15207 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15209 refcount = --trie->refcount;
15212 PerlMemShared_free(trie->charmap);
15213 PerlMemShared_free(trie->states);
15214 PerlMemShared_free(trie->trans);
15216 PerlMemShared_free(trie->bitmap);
15218 PerlMemShared_free(trie->jump);
15219 PerlMemShared_free(trie->wordinfo);
15220 /* do this last!!!! */
15221 PerlMemShared_free(ri->data->data[n]);
15226 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15229 Safefree(ri->data->what);
15230 Safefree(ri->data);
15236 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15237 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15238 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15241 re_dup - duplicate a regexp.
15243 This routine is expected to clone a given regexp structure. It is only
15244 compiled under USE_ITHREADS.
15246 After all of the core data stored in struct regexp is duplicated
15247 the regexp_engine.dupe method is used to copy any private data
15248 stored in the *pprivate pointer. This allows extensions to handle
15249 any duplication it needs to do.
15251 See pregfree() and regfree_internal() if you change anything here.
15253 #if defined(USE_ITHREADS)
15254 #ifndef PERL_IN_XSUB_RE
15256 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15260 const struct regexp *r = ReANY(sstr);
15261 struct regexp *ret = ReANY(dstr);
15263 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15265 npar = r->nparens+1;
15266 Newx(ret->offs, npar, regexp_paren_pair);
15267 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15269 if (ret->substrs) {
15270 /* Do it this way to avoid reading from *r after the StructCopy().
15271 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15272 cache, it doesn't matter. */
15273 const bool anchored = r->check_substr
15274 ? r->check_substr == r->anchored_substr
15275 : r->check_utf8 == r->anchored_utf8;
15276 Newx(ret->substrs, 1, struct reg_substr_data);
15277 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15279 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15280 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15281 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15282 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15284 /* check_substr and check_utf8, if non-NULL, point to either their
15285 anchored or float namesakes, and don't hold a second reference. */
15287 if (ret->check_substr) {
15289 assert(r->check_utf8 == r->anchored_utf8);
15290 ret->check_substr = ret->anchored_substr;
15291 ret->check_utf8 = ret->anchored_utf8;
15293 assert(r->check_substr == r->float_substr);
15294 assert(r->check_utf8 == r->float_utf8);
15295 ret->check_substr = ret->float_substr;
15296 ret->check_utf8 = ret->float_utf8;
15298 } else if (ret->check_utf8) {
15300 ret->check_utf8 = ret->anchored_utf8;
15302 ret->check_utf8 = ret->float_utf8;
15307 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15308 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15311 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15313 if (RX_MATCH_COPIED(dstr))
15314 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15316 ret->subbeg = NULL;
15317 #ifdef PERL_ANY_COW
15318 ret->saved_copy = NULL;
15321 /* Whether mother_re be set or no, we need to copy the string. We
15322 cannot refrain from copying it when the storage points directly to
15323 our mother regexp, because that's
15324 1: a buffer in a different thread
15325 2: something we no longer hold a reference on
15326 so we need to copy it locally. */
15327 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15328 ret->mother_re = NULL;
15330 #endif /* PERL_IN_XSUB_RE */
15335 This is the internal complement to regdupe() which is used to copy
15336 the structure pointed to by the *pprivate pointer in the regexp.
15337 This is the core version of the extension overridable cloning hook.
15338 The regexp structure being duplicated will be copied by perl prior
15339 to this and will be provided as the regexp *r argument, however
15340 with the /old/ structures pprivate pointer value. Thus this routine
15341 may override any copying normally done by perl.
15343 It returns a pointer to the new regexp_internal structure.
15347 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15350 struct regexp *const r = ReANY(rx);
15351 regexp_internal *reti;
15353 RXi_GET_DECL(r,ri);
15355 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15359 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15360 Copy(ri->program, reti->program, len+1, regnode);
15362 reti->num_code_blocks = ri->num_code_blocks;
15363 if (ri->code_blocks) {
15365 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15366 struct reg_code_block);
15367 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15368 struct reg_code_block);
15369 for (n = 0; n < ri->num_code_blocks; n++)
15370 reti->code_blocks[n].src_regex = (REGEXP*)
15371 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15374 reti->code_blocks = NULL;
15376 reti->regstclass = NULL;
15379 struct reg_data *d;
15380 const int count = ri->data->count;
15383 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15384 char, struct reg_data);
15385 Newx(d->what, count, U8);
15388 for (i = 0; i < count; i++) {
15389 d->what[i] = ri->data->what[i];
15390 switch (d->what[i]) {
15391 /* see also regcomp.h and regfree_internal() */
15392 case 'a': /* actually an AV, but the dup function is identical. */
15396 case 'u': /* actually an HV, but the dup function is identical. */
15397 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15400 /* This is cheating. */
15401 Newx(d->data[i], 1, struct regnode_charclass_class);
15402 StructCopy(ri->data->data[i], d->data[i],
15403 struct regnode_charclass_class);
15404 reti->regstclass = (regnode*)d->data[i];
15407 /* Trie stclasses are readonly and can thus be shared
15408 * without duplication. We free the stclass in pregfree
15409 * when the corresponding reg_ac_data struct is freed.
15411 reti->regstclass= ri->regstclass;
15415 ((reg_trie_data*)ri->data->data[i])->refcount++;
15420 d->data[i] = ri->data->data[i];
15423 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15432 reti->name_list_idx = ri->name_list_idx;
15434 #ifdef RE_TRACK_PATTERN_OFFSETS
15435 if (ri->u.offsets) {
15436 Newx(reti->u.offsets, 2*len+1, U32);
15437 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15440 SetProgLen(reti,len);
15443 return (void*)reti;
15446 #endif /* USE_ITHREADS */
15448 #ifndef PERL_IN_XSUB_RE
15451 - regnext - dig the "next" pointer out of a node
15454 Perl_regnext(pTHX_ regnode *p)
15462 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15463 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15466 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15475 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15478 STRLEN l1 = strlen(pat1);
15479 STRLEN l2 = strlen(pat2);
15482 const char *message;
15484 PERL_ARGS_ASSERT_RE_CROAK2;
15490 Copy(pat1, buf, l1 , char);
15491 Copy(pat2, buf + l1, l2 , char);
15492 buf[l1 + l2] = '\n';
15493 buf[l1 + l2 + 1] = '\0';
15495 /* ANSI variant takes additional second argument */
15496 va_start(args, pat2);
15500 msv = vmess(buf, &args);
15502 message = SvPV_const(msv,l1);
15505 Copy(message, buf, l1 , char);
15506 buf[l1-1] = '\0'; /* Overwrite \n */
15507 Perl_croak(aTHX_ "%s", buf);
15510 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15512 #ifndef PERL_IN_XSUB_RE
15514 Perl_save_re_context(pTHX)
15518 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15520 const REGEXP * const rx = PM_GETRE(PL_curpm);
15523 for (i = 1; i <= RX_NPARENS(rx); i++) {
15524 char digits[TYPE_CHARS(long)];
15525 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15526 GV *const *const gvp
15527 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15530 GV * const gv = *gvp;
15531 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15543 S_put_byte(pTHX_ SV *sv, int c)
15545 PERL_ARGS_ASSERT_PUT_BYTE;
15547 /* Our definition of isPRINT() ignores locales, so only bytes that are
15548 not part of UTF-8 are considered printable. I assume that the same
15549 holds for UTF-EBCDIC.
15550 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15551 which Wikipedia says:
15553 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15554 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15555 identical, to the ASCII delete (DEL) or rubout control character. ...
15556 it is typically mapped to hexadecimal code 9F, in order to provide a
15557 unique character mapping in both directions)
15559 So the old condition can be simplified to !isPRINT(c) */
15562 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15563 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15564 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15565 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15566 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15569 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15574 const char string = c;
15575 if (c == '-' || c == ']' || c == '\\' || c == '^')
15576 sv_catpvs(sv, "\\");
15577 sv_catpvn(sv, &string, 1);
15582 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15584 /* Appends to 'sv' a displayable version of the innards of the bracketed
15585 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
15586 * output anything */
15589 int rangestart = -1;
15590 bool has_output_anything = FALSE;
15592 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15594 for (i = 0; i <= 256; i++) {
15595 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15596 if (rangestart == -1)
15598 } else if (rangestart != -1) {
15600 if (i <= rangestart + 3) { /* Individual chars in short ranges */
15601 for (; rangestart < i; rangestart++)
15602 put_byte(sv, rangestart);
15605 || ! isALPHANUMERIC(rangestart)
15606 || ! isALPHANUMERIC(j)
15607 || isDIGIT(rangestart) != isDIGIT(j)
15608 || isUPPER(rangestart) != isUPPER(j)
15609 || isLOWER(rangestart) != isLOWER(j)
15611 /* This final test should get optimized out except
15612 * on EBCDIC platforms, where it causes ranges that
15613 * cross discontinuities like i/j to be shown as hex
15614 * instead of the misleading, e.g. H-K (since that
15615 * range includes more than H, I, J, K). */
15616 || (j - rangestart)
15617 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15619 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15621 (j < 256) ? j : 255);
15623 else { /* Here, the ends of the range are both digits, or both
15624 uppercase, or both lowercase; and there's no
15625 discontinuity in the range (which could happen on EBCDIC
15627 put_byte(sv, rangestart);
15628 sv_catpvs(sv, "-");
15632 has_output_anything = TRUE;
15636 return has_output_anything;
15639 #define CLEAR_OPTSTART \
15640 if (optstart) STMT_START { \
15641 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15645 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15647 STATIC const regnode *
15648 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15649 const regnode *last, const regnode *plast,
15650 SV* sv, I32 indent, U32 depth)
15653 U8 op = PSEUDO; /* Arbitrary non-END op. */
15654 const regnode *next;
15655 const regnode *optstart= NULL;
15657 RXi_GET_DECL(r,ri);
15658 GET_RE_DEBUG_FLAGS_DECL;
15660 PERL_ARGS_ASSERT_DUMPUNTIL;
15662 #ifdef DEBUG_DUMPUNTIL
15663 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15664 last ? last-start : 0,plast ? plast-start : 0);
15667 if (plast && plast < last)
15670 while (PL_regkind[op] != END && (!last || node < last)) {
15671 /* While that wasn't END last time... */
15674 if (op == CLOSE || op == WHILEM)
15676 next = regnext((regnode *)node);
15679 if (OP(node) == OPTIMIZED) {
15680 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15687 regprop(r, sv, node);
15688 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15689 (int)(2*indent + 1), "", SvPVX_const(sv));
15691 if (OP(node) != OPTIMIZED) {
15692 if (next == NULL) /* Next ptr. */
15693 PerlIO_printf(Perl_debug_log, " (0)");
15694 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15695 PerlIO_printf(Perl_debug_log, " (FAIL)");
15697 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15698 (void)PerlIO_putc(Perl_debug_log, '\n');
15702 if (PL_regkind[(U8)op] == BRANCHJ) {
15705 const regnode *nnode = (OP(next) == LONGJMP
15706 ? regnext((regnode *)next)
15708 if (last && nnode > last)
15710 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15713 else if (PL_regkind[(U8)op] == BRANCH) {
15715 DUMPUNTIL(NEXTOPER(node), next);
15717 else if ( PL_regkind[(U8)op] == TRIE ) {
15718 const regnode *this_trie = node;
15719 const char op = OP(node);
15720 const U32 n = ARG(node);
15721 const reg_ac_data * const ac = op>=AHOCORASICK ?
15722 (reg_ac_data *)ri->data->data[n] :
15724 const reg_trie_data * const trie =
15725 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15727 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15729 const regnode *nextbranch= NULL;
15732 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15733 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15735 PerlIO_printf(Perl_debug_log, "%*s%s ",
15736 (int)(2*(indent+3)), "",
15737 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15738 PL_colors[0], PL_colors[1],
15739 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15740 PERL_PV_PRETTY_ELLIPSES |
15741 PERL_PV_PRETTY_LTGT
15746 U16 dist= trie->jump[word_idx+1];
15747 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15748 (UV)((dist ? this_trie + dist : next) - start));
15751 nextbranch= this_trie + trie->jump[0];
15752 DUMPUNTIL(this_trie + dist, nextbranch);
15754 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15755 nextbranch= regnext((regnode *)nextbranch);
15757 PerlIO_printf(Perl_debug_log, "\n");
15760 if (last && next > last)
15765 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15766 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15767 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15769 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15771 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15773 else if ( op == PLUS || op == STAR) {
15774 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15776 else if (PL_regkind[(U8)op] == ANYOF) {
15777 /* arglen 1 + class block */
15778 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15779 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15780 node = NEXTOPER(node);
15782 else if (PL_regkind[(U8)op] == EXACT) {
15783 /* Literal string, where present. */
15784 node += NODE_SZ_STR(node) - 1;
15785 node = NEXTOPER(node);
15788 node = NEXTOPER(node);
15789 node += regarglen[(U8)op];
15791 if (op == CURLYX || op == OPEN)
15795 #ifdef DEBUG_DUMPUNTIL
15796 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15801 #endif /* DEBUGGING */
15805 * c-indentation-style: bsd
15806 * c-basic-offset: 4
15807 * indent-tabs-mode: nil
15810 * ex: set ts=8 sts=4 sw=4 et: