5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 I32 whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN). */
135 I32 cpar; /* Capture buffer count, (CLOSE). */
136 I32 nestroot; /* root parens we are in - used by accept */
139 regnode **open_parens; /* pointers to open parens */
140 regnode **close_parens; /* pointers to close parens */
141 regnode *opend; /* END node in program */
142 I32 utf8; /* whether the pattern is utf8 or not */
143 I32 orig_utf8; /* whether the pattern was originally in utf8 */
144 /* XXX use this for future optimisation of case
145 * where pattern must be upgraded to utf8. */
146 I32 uni_semantics; /* If a d charset modifier should use unicode
147 rules, even if the pattern is not in
149 HV *paren_names; /* Paren names */
151 regnode **recurse; /* Recurse regops */
152 I32 recurse_count; /* Number of recurse regops */
155 I32 override_recoding;
156 I32 in_multi_char_class;
157 struct reg_code_block *code_blocks; /* positions of literal (?{})
159 int num_code_blocks; /* size of code_blocks[] */
160 int code_index; /* next code_blocks[] slot */
162 char *starttry; /* -Dr: where regtry was called. */
163 #define RExC_starttry (pRExC_state->starttry)
165 SV *runtime_code_qr; /* qr with the runtime code blocks */
167 const char *lastparse;
169 AV *paren_name_list; /* idx -> name */
170 #define RExC_lastparse (pRExC_state->lastparse)
171 #define RExC_lastnum (pRExC_state->lastnum)
172 #define RExC_paren_name_list (pRExC_state->paren_name_list)
176 #define RExC_flags (pRExC_state->flags)
177 #define RExC_pm_flags (pRExC_state->pm_flags)
178 #define RExC_precomp (pRExC_state->precomp)
179 #define RExC_rx_sv (pRExC_state->rx_sv)
180 #define RExC_rx (pRExC_state->rx)
181 #define RExC_rxi (pRExC_state->rxi)
182 #define RExC_start (pRExC_state->start)
183 #define RExC_end (pRExC_state->end)
184 #define RExC_parse (pRExC_state->parse)
185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
186 #ifdef RE_TRACK_PATTERN_OFFSETS
187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
189 #define RExC_emit (pRExC_state->emit)
190 #define RExC_emit_start (pRExC_state->emit_start)
191 #define RExC_emit_bound (pRExC_state->emit_bound)
192 #define RExC_naughty (pRExC_state->naughty)
193 #define RExC_sawback (pRExC_state->sawback)
194 #define RExC_seen (pRExC_state->seen)
195 #define RExC_size (pRExC_state->size)
196 #define RExC_npar (pRExC_state->npar)
197 #define RExC_nestroot (pRExC_state->nestroot)
198 #define RExC_extralen (pRExC_state->extralen)
199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
200 #define RExC_utf8 (pRExC_state->utf8)
201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
203 #define RExC_open_parens (pRExC_state->open_parens)
204 #define RExC_close_parens (pRExC_state->close_parens)
205 #define RExC_opend (pRExC_state->opend)
206 #define RExC_paren_names (pRExC_state->paren_names)
207 #define RExC_recurse (pRExC_state->recurse)
208 #define RExC_recurse_count (pRExC_state->recurse_count)
209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
210 #define RExC_contains_locale (pRExC_state->contains_locale)
211 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
217 ((*s) == '{' && regcurly(s, FALSE)))
220 #undef SPSTART /* dratted cpp namespace... */
223 * Flags to be passed up and down.
225 #define WORST 0 /* Worst case. */
226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
229 * character. (There needs to be a case: in the switch statement in regexec.c
230 * for any node marked SIMPLE.) Note that this is not the same thing as
233 #define SPSTART 0x04 /* Starts with * or + */
234 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
235 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
239 /* whether trie related optimizations are enabled */
240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
241 #define TRIE_STUDY_OPT
242 #define FULL_TRIE_STUDY
248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
249 #define PBITVAL(paren) (1 << ((paren) & 7))
250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
254 /* If not already in utf8, do a longjmp back to the beginning */
255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
256 #define REQUIRE_UTF8 STMT_START { \
257 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
260 /* This converts the named class defined in regcomp.h to its equivalent class
261 * number defined in handy.h. */
262 #define namedclass_to_classnum(class) ((int) ((class) / 2))
263 #define classnum_to_namedclass(classnum) ((classnum) * 2)
265 /* About scan_data_t.
267 During optimisation we recurse through the regexp program performing
268 various inplace (keyhole style) optimisations. In addition study_chunk
269 and scan_commit populate this data structure with information about
270 what strings MUST appear in the pattern. We look for the longest
271 string that must appear at a fixed location, and we look for the
272 longest string that may appear at a floating location. So for instance
277 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
278 strings (because they follow a .* construct). study_chunk will identify
279 both FOO and BAR as being the longest fixed and floating strings respectively.
281 The strings can be composites, for instance
285 will result in a composite fixed substring 'foo'.
287 For each string some basic information is maintained:
289 - offset or min_offset
290 This is the position the string must appear at, or not before.
291 It also implicitly (when combined with minlenp) tells us how many
292 characters must match before the string we are searching for.
293 Likewise when combined with minlenp and the length of the string it
294 tells us how many characters must appear after the string we have
298 Only used for floating strings. This is the rightmost point that
299 the string can appear at. If set to I32 max it indicates that the
300 string can occur infinitely far to the right.
303 A pointer to the minimum number of characters of the pattern that the
304 string was found inside. This is important as in the case of positive
305 lookahead or positive lookbehind we can have multiple patterns
310 The minimum length of the pattern overall is 3, the minimum length
311 of the lookahead part is 3, but the minimum length of the part that
312 will actually match is 1. So 'FOO's minimum length is 3, but the
313 minimum length for the F is 1. This is important as the minimum length
314 is used to determine offsets in front of and behind the string being
315 looked for. Since strings can be composites this is the length of the
316 pattern at the time it was committed with a scan_commit. Note that
317 the length is calculated by study_chunk, so that the minimum lengths
318 are not known until the full pattern has been compiled, thus the
319 pointer to the value.
323 In the case of lookbehind the string being searched for can be
324 offset past the start point of the final matching string.
325 If this value was just blithely removed from the min_offset it would
326 invalidate some of the calculations for how many chars must match
327 before or after (as they are derived from min_offset and minlen and
328 the length of the string being searched for).
329 When the final pattern is compiled and the data is moved from the
330 scan_data_t structure into the regexp structure the information
331 about lookbehind is factored in, with the information that would
332 have been lost precalculated in the end_shift field for the
335 The fields pos_min and pos_delta are used to store the minimum offset
336 and the delta to the maximum offset at the current point in the pattern.
340 typedef struct scan_data_t {
341 /*I32 len_min; unused */
342 /*I32 len_delta; unused */
346 I32 last_end; /* min value, <0 unless valid. */
349 SV **longest; /* Either &l_fixed, or &l_float. */
350 SV *longest_fixed; /* longest fixed string found in pattern */
351 I32 offset_fixed; /* offset where it starts */
352 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
353 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
354 SV *longest_float; /* longest floating string found in pattern */
355 I32 offset_float_min; /* earliest point in string it can appear */
356 I32 offset_float_max; /* latest point in string it can appear */
357 I32 *minlen_float; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_float; /* is the position of the string modified by LB */
362 struct regnode_charclass_class *start_class;
366 * Forward declarations for pregcomp()'s friends.
369 static const scan_data_t zero_scan_data =
370 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
372 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
373 #define SF_BEFORE_SEOL 0x0001
374 #define SF_BEFORE_MEOL 0x0002
375 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
376 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
379 # define SF_FIX_SHIFT_EOL (0+2)
380 # define SF_FL_SHIFT_EOL (0+4)
382 # define SF_FIX_SHIFT_EOL (+2)
383 # define SF_FL_SHIFT_EOL (+4)
386 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
387 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
389 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
390 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
391 #define SF_IS_INF 0x0040
392 #define SF_HAS_PAR 0x0080
393 #define SF_IN_PAR 0x0100
394 #define SF_HAS_EVAL 0x0200
395 #define SCF_DO_SUBSTR 0x0400
396 #define SCF_DO_STCLASS_AND 0x0800
397 #define SCF_DO_STCLASS_OR 0x1000
398 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
399 #define SCF_WHILEM_VISITED_POS 0x2000
401 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
402 #define SCF_SEEN_ACCEPT 0x8000
404 #define UTF cBOOL(RExC_utf8)
406 /* The enums for all these are ordered so things work out correctly */
407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
417 #define OOB_NAMEDCLASS -1
419 /* There is no code point that is out-of-bounds, so this is problematic. But
420 * its only current use is to initialize a variable that is always set before
422 #define OOB_UNICODE 0xDEADBEEF
424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
428 /* length of regex to show in messages that don't mark a position within */
429 #define RegexLengthToShowInErrorMessages 127
432 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
433 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
434 * op/pragma/warn/regcomp.
436 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
437 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
442 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
443 * arg. Show regex, up to a maximum length. If it's too long, chop and add
446 #define _FAIL(code) STMT_START { \
447 const char *ellipses = ""; \
448 IV len = RExC_end - RExC_precomp; \
451 SAVEFREESV(RExC_rx_sv); \
452 if (len > RegexLengthToShowInErrorMessages) { \
453 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
454 len = RegexLengthToShowInErrorMessages - 10; \
460 #define FAIL(msg) _FAIL( \
461 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
462 msg, (int)len, RExC_precomp, ellipses))
464 #define FAIL2(msg,arg) _FAIL( \
465 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
466 arg, (int)len, RExC_precomp, ellipses))
469 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
471 #define Simple_vFAIL(m) STMT_START { \
472 const IV offset = RExC_parse - RExC_precomp; \
473 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
480 #define vFAIL(m) STMT_START { \
482 SAVEFREESV(RExC_rx_sv); \
487 * Like Simple_vFAIL(), but accepts two arguments.
489 #define Simple_vFAIL2(m,a1) STMT_START { \
490 const IV offset = RExC_parse - RExC_precomp; \
491 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
492 (int)offset, RExC_precomp, RExC_precomp + offset); \
496 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
498 #define vFAIL2(m,a1) STMT_START { \
500 SAVEFREESV(RExC_rx_sv); \
501 Simple_vFAIL2(m, a1); \
506 * Like Simple_vFAIL(), but accepts three arguments.
508 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
509 const IV offset = RExC_parse - RExC_precomp; \
510 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
515 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
517 #define vFAIL3(m,a1,a2) STMT_START { \
519 SAVEFREESV(RExC_rx_sv); \
520 Simple_vFAIL3(m, a1, a2); \
524 * Like Simple_vFAIL(), but accepts four arguments.
526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
527 const IV offset = RExC_parse - RExC_precomp; \
528 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
529 (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define vFAIL4(m,a1,a2,a3) STMT_START { \
534 SAVEFREESV(RExC_rx_sv); \
535 Simple_vFAIL4(m, a1, a2, a3); \
538 /* m is not necessarily a "literal string", in this macro */
539 #define reg_warn_non_literal_string(loc, m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
542 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARNreg(loc,m) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
548 (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define vWARN_dep(loc, m) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
554 (int)offset, RExC_precomp, RExC_precomp + offset); \
557 #define ckWARNdep(loc,m) STMT_START { \
558 const IV offset = loc - RExC_precomp; \
559 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
561 (int)offset, RExC_precomp, RExC_precomp + offset); \
564 #define ckWARNregdep(loc,m) STMT_START { \
565 const IV offset = loc - RExC_precomp; \
566 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
568 (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define ckWARN2regdep(loc,m, a1) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
575 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
578 #define ckWARN2reg(loc, m, a1) STMT_START { \
579 const IV offset = loc - RExC_precomp; \
580 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
581 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
584 #define vWARN3(loc, m, a1, a2) STMT_START { \
585 const IV offset = loc - RExC_precomp; \
586 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
587 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
591 const IV offset = loc - RExC_precomp; \
592 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
593 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
596 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
597 const IV offset = loc - RExC_precomp; \
598 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
599 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
602 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
603 const IV offset = loc - RExC_precomp; \
604 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
608 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
615 /* Allow for side effects in s */
616 #define REGC(c,s) STMT_START { \
617 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
620 /* Macros for recording node offsets. 20001227 mjd@plover.com
621 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
622 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
623 * Element 0 holds the number n.
624 * Position is 1 indexed.
626 #ifndef RE_TRACK_PATTERN_OFFSETS
627 #define Set_Node_Offset_To_R(node,byte)
628 #define Set_Node_Offset(node,byte)
629 #define Set_Cur_Node_Offset
630 #define Set_Node_Length_To_R(node,len)
631 #define Set_Node_Length(node,len)
632 #define Set_Node_Cur_Length(node)
633 #define Node_Offset(n)
634 #define Node_Length(n)
635 #define Set_Node_Offset_Length(node,offset,len)
636 #define ProgLen(ri) ri->u.proglen
637 #define SetProgLen(ri,x) ri->u.proglen = x
639 #define ProgLen(ri) ri->u.offsets[0]
640 #define SetProgLen(ri,x) ri->u.offsets[0] = x
641 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
643 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
644 __LINE__, (int)(node), (int)(byte))); \
646 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
648 RExC_offsets[2*(node)-1] = (byte); \
653 #define Set_Node_Offset(node,byte) \
654 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
655 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
657 #define Set_Node_Length_To_R(node,len) STMT_START { \
659 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
660 __LINE__, (int)(node), (int)(len))); \
662 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
664 RExC_offsets[2*(node)] = (len); \
669 #define Set_Node_Length(node,len) \
670 Set_Node_Length_To_R((node)-RExC_emit_start, len)
671 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
672 #define Set_Node_Cur_Length(node) \
673 Set_Node_Length(node, RExC_parse - parse_start)
675 /* Get offsets and lengths */
676 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
677 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
679 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
680 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
681 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
685 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
686 #define EXPERIMENTAL_INPLACESCAN
687 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
689 #define DEBUG_STUDYDATA(str,data,depth) \
690 DEBUG_OPTIMISE_MORE_r(if(data){ \
691 PerlIO_printf(Perl_debug_log, \
692 "%*s" str "Pos:%"IVdf"/%"IVdf \
693 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
694 (int)(depth)*2, "", \
695 (IV)((data)->pos_min), \
696 (IV)((data)->pos_delta), \
697 (UV)((data)->flags), \
698 (IV)((data)->whilem_c), \
699 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
700 is_inf ? "INF " : "" \
702 if ((data)->last_found) \
703 PerlIO_printf(Perl_debug_log, \
704 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
705 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
706 SvPVX_const((data)->last_found), \
707 (IV)((data)->last_end), \
708 (IV)((data)->last_start_min), \
709 (IV)((data)->last_start_max), \
710 ((data)->longest && \
711 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
712 SvPVX_const((data)->longest_fixed), \
713 (IV)((data)->offset_fixed), \
714 ((data)->longest && \
715 (data)->longest==&((data)->longest_float)) ? "*" : "", \
716 SvPVX_const((data)->longest_float), \
717 (IV)((data)->offset_float_min), \
718 (IV)((data)->offset_float_max) \
720 PerlIO_printf(Perl_debug_log,"\n"); \
723 /* Mark that we cannot extend a found fixed substring at this point.
724 Update the longest found anchored substring and the longest found
725 floating substrings if needed. */
728 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
730 const STRLEN l = CHR_SVLEN(data->last_found);
731 const STRLEN old_l = CHR_SVLEN(*data->longest);
732 GET_RE_DEBUG_FLAGS_DECL;
734 PERL_ARGS_ASSERT_SCAN_COMMIT;
736 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
737 SvSetMagicSV(*data->longest, data->last_found);
738 if (*data->longest == data->longest_fixed) {
739 data->offset_fixed = l ? data->last_start_min : data->pos_min;
740 if (data->flags & SF_BEFORE_EOL)
742 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
744 data->flags &= ~SF_FIX_BEFORE_EOL;
745 data->minlen_fixed=minlenp;
746 data->lookbehind_fixed=0;
748 else { /* *data->longest == data->longest_float */
749 data->offset_float_min = l ? data->last_start_min : data->pos_min;
750 data->offset_float_max = (l
751 ? data->last_start_max
752 : data->pos_min + data->pos_delta);
753 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
754 data->offset_float_max = I32_MAX;
755 if (data->flags & SF_BEFORE_EOL)
757 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
759 data->flags &= ~SF_FL_BEFORE_EOL;
760 data->minlen_float=minlenp;
761 data->lookbehind_float=0;
764 SvCUR_set(data->last_found, 0);
766 SV * const sv = data->last_found;
767 if (SvUTF8(sv) && SvMAGICAL(sv)) {
768 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
774 data->flags &= ~SF_BEFORE_EOL;
775 DEBUG_STUDYDATA("commit: ",data,0);
778 /* These macros set, clear and test whether the synthetic start class ('ssc',
779 * given by the parameter) matches an empty string (EOS). This uses the
780 * 'next_off' field in the node, to save a bit in the flags field. The ssc
781 * stands alone, so there is never a next_off, so this field is otherwise
782 * unused. The EOS information is used only for compilation, but theoretically
783 * it could be passed on to the execution code. This could be used to store
784 * more than one bit of information, but only this one is currently used. */
785 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
786 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
787 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
789 /* Can match anything (initialization) */
791 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
793 PERL_ARGS_ASSERT_CL_ANYTHING;
795 ANYOF_BITMAP_SETALL(cl);
796 cl->flags = ANYOF_UNICODE_ALL;
799 /* If any portion of the regex is to operate under locale rules,
800 * initialization includes it. The reason this isn't done for all regexes
801 * is that the optimizer was written under the assumption that locale was
802 * all-or-nothing. Given the complexity and lack of documentation in the
803 * optimizer, and that there are inadequate test cases for locale, so many
804 * parts of it may not work properly, it is safest to avoid locale unless
806 if (RExC_contains_locale) {
807 ANYOF_CLASS_SETALL(cl); /* /l uses class */
808 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
811 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
815 /* Can match anything (initialization) */
817 S_cl_is_anything(const struct regnode_charclass_class *cl)
821 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
823 for (value = 0; value < ANYOF_MAX; value += 2)
824 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
826 if (!(cl->flags & ANYOF_UNICODE_ALL))
828 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
833 /* Can match anything (initialization) */
835 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
837 PERL_ARGS_ASSERT_CL_INIT;
839 Zero(cl, 1, struct regnode_charclass_class);
841 cl_anything(pRExC_state, cl);
842 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
845 /* These two functions currently do the exact same thing */
846 #define cl_init_zero S_cl_init
848 /* 'AND' a given class with another one. Can create false positives. 'cl'
849 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
850 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
852 S_cl_and(struct regnode_charclass_class *cl,
853 const struct regnode_charclass_class *and_with)
855 PERL_ARGS_ASSERT_CL_AND;
857 assert(PL_regkind[and_with->type] == ANYOF);
859 /* I (khw) am not sure all these restrictions are necessary XXX */
860 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
861 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
862 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
863 && !(and_with->flags & ANYOF_LOC_FOLD)
864 && !(cl->flags & ANYOF_LOC_FOLD)) {
867 if (and_with->flags & ANYOF_INVERT)
868 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
869 cl->bitmap[i] &= ~and_with->bitmap[i];
871 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
872 cl->bitmap[i] &= and_with->bitmap[i];
873 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
875 if (and_with->flags & ANYOF_INVERT) {
877 /* Here, the and'ed node is inverted. Get the AND of the flags that
878 * aren't affected by the inversion. Those that are affected are
879 * handled individually below */
880 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
881 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
882 cl->flags |= affected_flags;
884 /* We currently don't know how to deal with things that aren't in the
885 * bitmap, but we know that the intersection is no greater than what
886 * is already in cl, so let there be false positives that get sorted
887 * out after the synthetic start class succeeds, and the node is
888 * matched for real. */
890 /* The inversion of these two flags indicate that the resulting
891 * intersection doesn't have them */
892 if (and_with->flags & ANYOF_UNICODE_ALL) {
893 cl->flags &= ~ANYOF_UNICODE_ALL;
895 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
896 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
899 else { /* and'd node is not inverted */
900 U8 outside_bitmap_but_not_utf8; /* Temp variable */
902 if (! ANYOF_NONBITMAP(and_with)) {
904 /* Here 'and_with' doesn't match anything outside the bitmap
905 * (except possibly ANYOF_UNICODE_ALL), which means the
906 * intersection can't either, except for ANYOF_UNICODE_ALL, in
907 * which case we don't know what the intersection is, but it's no
908 * greater than what cl already has, so can just leave it alone,
909 * with possible false positives */
910 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
911 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
912 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
915 else if (! ANYOF_NONBITMAP(cl)) {
917 /* Here, 'and_with' does match something outside the bitmap, and cl
918 * doesn't have a list of things to match outside the bitmap. If
919 * cl can match all code points above 255, the intersection will
920 * be those above-255 code points that 'and_with' matches. If cl
921 * can't match all Unicode code points, it means that it can't
922 * match anything outside the bitmap (since the 'if' that got us
923 * into this block tested for that), so we leave the bitmap empty.
925 if (cl->flags & ANYOF_UNICODE_ALL) {
926 ARG_SET(cl, ARG(and_with));
928 /* and_with's ARG may match things that don't require UTF8.
929 * And now cl's will too, in spite of this being an 'and'. See
930 * the comments below about the kludge */
931 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
935 /* Here, both 'and_with' and cl match something outside the
936 * bitmap. Currently we do not do the intersection, so just match
937 * whatever cl had at the beginning. */
941 /* Take the intersection of the two sets of flags. However, the
942 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
943 * kludge around the fact that this flag is not treated like the others
944 * which are initialized in cl_anything(). The way the optimizer works
945 * is that the synthetic start class (SSC) is initialized to match
946 * anything, and then the first time a real node is encountered, its
947 * values are AND'd with the SSC's with the result being the values of
948 * the real node. However, there are paths through the optimizer where
949 * the AND never gets called, so those initialized bits are set
950 * inappropriately, which is not usually a big deal, as they just cause
951 * false positives in the SSC, which will just mean a probably
952 * imperceptible slow down in execution. However this bit has a
953 * higher false positive consequence in that it can cause utf8.pm,
954 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
955 * bigger slowdown and also causes significant extra memory to be used.
956 * In order to prevent this, the code now takes a different tack. The
957 * bit isn't set unless some part of the regular expression needs it,
958 * but once set it won't get cleared. This means that these extra
959 * modules won't get loaded unless there was some path through the
960 * pattern that would have required them anyway, and so any false
961 * positives that occur by not ANDing them out when they could be
962 * aren't as severe as they would be if we treated this bit like all
964 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
965 & ANYOF_NONBITMAP_NON_UTF8;
966 cl->flags &= and_with->flags;
967 cl->flags |= outside_bitmap_but_not_utf8;
971 /* 'OR' a given class with another one. Can create false positives. 'cl'
972 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
973 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
975 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
977 PERL_ARGS_ASSERT_CL_OR;
979 if (or_with->flags & ANYOF_INVERT) {
981 /* Here, the or'd node is to be inverted. This means we take the
982 * complement of everything not in the bitmap, but currently we don't
983 * know what that is, so give up and match anything */
984 if (ANYOF_NONBITMAP(or_with)) {
985 cl_anything(pRExC_state, cl);
988 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
989 * <= (B1 | !B2) | (CL1 | !CL2)
990 * which is wasteful if CL2 is small, but we ignore CL2:
991 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
992 * XXXX Can we handle case-fold? Unclear:
993 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
994 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
996 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
997 && !(or_with->flags & ANYOF_LOC_FOLD)
998 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1001 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1002 cl->bitmap[i] |= ~or_with->bitmap[i];
1003 } /* XXXX: logic is complicated otherwise */
1005 cl_anything(pRExC_state, cl);
1008 /* And, we can just take the union of the flags that aren't affected
1009 * by the inversion */
1010 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1012 /* For the remaining flags:
1013 ANYOF_UNICODE_ALL and inverted means to not match anything above
1014 255, which means that the union with cl should just be
1015 what cl has in it, so can ignore this flag
1016 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1017 is 127-255 to match them, but then invert that, so the
1018 union with cl should just be what cl has in it, so can
1021 } else { /* 'or_with' is not inverted */
1022 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1023 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 && (!(or_with->flags & ANYOF_LOC_FOLD)
1025 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1028 /* OR char bitmap and class bitmap separately */
1029 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1030 cl->bitmap[i] |= or_with->bitmap[i];
1031 if (or_with->flags & ANYOF_CLASS) {
1032 ANYOF_CLASS_OR(or_with, cl);
1035 else { /* XXXX: logic is complicated, leave it along for a moment. */
1036 cl_anything(pRExC_state, cl);
1039 if (ANYOF_NONBITMAP(or_with)) {
1041 /* Use the added node's outside-the-bit-map match if there isn't a
1042 * conflict. If there is a conflict (both nodes match something
1043 * outside the bitmap, but what they match outside is not the same
1044 * pointer, and hence not easily compared until XXX we extend
1045 * inversion lists this far), give up and allow the start class to
1046 * match everything outside the bitmap. If that stuff is all above
1047 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1048 if (! ANYOF_NONBITMAP(cl)) {
1049 ARG_SET(cl, ARG(or_with));
1051 else if (ARG(cl) != ARG(or_with)) {
1053 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1054 cl_anything(pRExC_state, cl);
1057 cl->flags |= ANYOF_UNICODE_ALL;
1062 /* Take the union */
1063 cl->flags |= or_with->flags;
1067 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1068 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1069 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1070 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1075 dump_trie(trie,widecharmap,revcharmap)
1076 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1077 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1079 These routines dump out a trie in a somewhat readable format.
1080 The _interim_ variants are used for debugging the interim
1081 tables that are used to generate the final compressed
1082 representation which is what dump_trie expects.
1084 Part of the reason for their existence is to provide a form
1085 of documentation as to how the different representations function.
1090 Dumps the final compressed table form of the trie to Perl_debug_log.
1091 Used for debugging make_trie().
1095 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1096 AV *revcharmap, U32 depth)
1099 SV *sv=sv_newmortal();
1100 int colwidth= widecharmap ? 6 : 4;
1102 GET_RE_DEBUG_FLAGS_DECL;
1104 PERL_ARGS_ASSERT_DUMP_TRIE;
1106 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1107 (int)depth * 2 + 2,"",
1108 "Match","Base","Ofs" );
1110 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1111 SV ** const tmp = av_fetch( revcharmap, state, 0);
1113 PerlIO_printf( Perl_debug_log, "%*s",
1115 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1116 PL_colors[0], PL_colors[1],
1117 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1118 PERL_PV_ESCAPE_FIRSTCHAR
1123 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1124 (int)depth * 2 + 2,"");
1126 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1127 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1128 PerlIO_printf( Perl_debug_log, "\n");
1130 for( state = 1 ; state < trie->statecount ; state++ ) {
1131 const U32 base = trie->states[ state ].trans.base;
1133 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1135 if ( trie->states[ state ].wordnum ) {
1136 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1138 PerlIO_printf( Perl_debug_log, "%6s", "" );
1141 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1146 while( ( base + ofs < trie->uniquecharcount ) ||
1147 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1148 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1151 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1153 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1154 if ( ( base + ofs >= trie->uniquecharcount ) &&
1155 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1156 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1158 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1160 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1162 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1166 PerlIO_printf( Perl_debug_log, "]");
1169 PerlIO_printf( Perl_debug_log, "\n" );
1171 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1172 for (word=1; word <= trie->wordcount; word++) {
1173 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1174 (int)word, (int)(trie->wordinfo[word].prev),
1175 (int)(trie->wordinfo[word].len));
1177 PerlIO_printf(Perl_debug_log, "\n" );
1180 Dumps a fully constructed but uncompressed trie in list form.
1181 List tries normally only are used for construction when the number of
1182 possible chars (trie->uniquecharcount) is very high.
1183 Used for debugging make_trie().
1186 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1187 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1191 SV *sv=sv_newmortal();
1192 int colwidth= widecharmap ? 6 : 4;
1193 GET_RE_DEBUG_FLAGS_DECL;
1195 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1197 /* print out the table precompression. */
1198 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1199 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1200 "------:-----+-----------------\n" );
1202 for( state=1 ; state < next_alloc ; state ++ ) {
1205 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1206 (int)depth * 2 + 2,"", (UV)state );
1207 if ( ! trie->states[ state ].wordnum ) {
1208 PerlIO_printf( Perl_debug_log, "%5s| ","");
1210 PerlIO_printf( Perl_debug_log, "W%4x| ",
1211 trie->states[ state ].wordnum
1214 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1215 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1217 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1219 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1220 PL_colors[0], PL_colors[1],
1221 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222 PERL_PV_ESCAPE_FIRSTCHAR
1224 TRIE_LIST_ITEM(state,charid).forid,
1225 (UV)TRIE_LIST_ITEM(state,charid).newstate
1228 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1229 (int)((depth * 2) + 14), "");
1232 PerlIO_printf( Perl_debug_log, "\n");
1237 Dumps a fully constructed but uncompressed trie in table form.
1238 This is the normal DFA style state transition table, with a few
1239 twists to facilitate compression later.
1240 Used for debugging make_trie().
1243 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1244 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1249 SV *sv=sv_newmortal();
1250 int colwidth= widecharmap ? 6 : 4;
1251 GET_RE_DEBUG_FLAGS_DECL;
1253 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1256 print out the table precompression so that we can do a visual check
1257 that they are identical.
1260 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1262 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1263 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1265 PerlIO_printf( Perl_debug_log, "%*s",
1267 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1268 PL_colors[0], PL_colors[1],
1269 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1270 PERL_PV_ESCAPE_FIRSTCHAR
1276 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1278 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1279 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1282 PerlIO_printf( Perl_debug_log, "\n" );
1284 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1286 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1287 (int)depth * 2 + 2,"",
1288 (UV)TRIE_NODENUM( state ) );
1290 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1291 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1293 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1295 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1297 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1298 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1300 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1301 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1309 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1310 startbranch: the first branch in the whole branch sequence
1311 first : start branch of sequence of branch-exact nodes.
1312 May be the same as startbranch
1313 last : Thing following the last branch.
1314 May be the same as tail.
1315 tail : item following the branch sequence
1316 count : words in the sequence
1317 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1318 depth : indent depth
1320 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1322 A trie is an N'ary tree where the branches are determined by digital
1323 decomposition of the key. IE, at the root node you look up the 1st character and
1324 follow that branch repeat until you find the end of the branches. Nodes can be
1325 marked as "accepting" meaning they represent a complete word. Eg:
1329 would convert into the following structure. Numbers represent states, letters
1330 following numbers represent valid transitions on the letter from that state, if
1331 the number is in square brackets it represents an accepting state, otherwise it
1332 will be in parenthesis.
1334 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1338 (1) +-i->(6)-+-s->[7]
1340 +-s->(3)-+-h->(4)-+-e->[5]
1342 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1344 This shows that when matching against the string 'hers' we will begin at state 1
1345 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1346 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1347 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1348 single traverse. We store a mapping from accepting to state to which word was
1349 matched, and then when we have multiple possibilities we try to complete the
1350 rest of the regex in the order in which they occured in the alternation.
1352 The only prior NFA like behaviour that would be changed by the TRIE support is
1353 the silent ignoring of duplicate alternations which are of the form:
1355 / (DUPE|DUPE) X? (?{ ... }) Y /x
1357 Thus EVAL blocks following a trie may be called a different number of times with
1358 and without the optimisation. With the optimisations dupes will be silently
1359 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1360 the following demonstrates:
1362 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1364 which prints out 'word' three times, but
1366 'words'=~/(word|word|word)(?{ print $1 })S/
1368 which doesnt print it out at all. This is due to other optimisations kicking in.
1370 Example of what happens on a structural level:
1372 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1374 1: CURLYM[1] {1,32767}(18)
1385 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1386 and should turn into:
1388 1: CURLYM[1] {1,32767}(18)
1390 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1398 Cases where tail != last would be like /(?foo|bar)baz/:
1408 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1409 and would end up looking like:
1412 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1419 d = uvuni_to_utf8_flags(d, uv, 0);
1421 is the recommended Unicode-aware way of saying
1426 #define TRIE_STORE_REVCHAR(val) \
1429 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1430 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1431 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1432 SvCUR_set(zlopp, kapow - flrbbbbb); \
1435 av_push(revcharmap, zlopp); \
1437 char ooooff = (char)val; \
1438 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1442 #define TRIE_READ_CHAR STMT_START { \
1445 /* if it is UTF then it is either already folded, or does not need folding */ \
1446 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1448 else if (folder == PL_fold_latin1) { \
1449 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1450 if ( foldlen > 0 ) { \
1451 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1457 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1458 skiplen = UNISKIP(uvc); \
1459 foldlen -= skiplen; \
1460 scan = foldbuf + skiplen; \
1463 /* raw data, will be folded later if needed */ \
1471 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1472 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1473 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1474 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1476 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1477 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1478 TRIE_LIST_CUR( state )++; \
1481 #define TRIE_LIST_NEW(state) STMT_START { \
1482 Newxz( trie->states[ state ].trans.list, \
1483 4, reg_trie_trans_le ); \
1484 TRIE_LIST_CUR( state ) = 1; \
1485 TRIE_LIST_LEN( state ) = 4; \
1488 #define TRIE_HANDLE_WORD(state) STMT_START { \
1489 U16 dupe= trie->states[ state ].wordnum; \
1490 regnode * const noper_next = regnext( noper ); \
1493 /* store the word for dumping */ \
1495 if (OP(noper) != NOTHING) \
1496 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1498 tmp = newSVpvn_utf8( "", 0, UTF ); \
1499 av_push( trie_words, tmp ); \
1503 trie->wordinfo[curword].prev = 0; \
1504 trie->wordinfo[curword].len = wordlen; \
1505 trie->wordinfo[curword].accept = state; \
1507 if ( noper_next < tail ) { \
1509 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1510 trie->jump[curword] = (U16)(noper_next - convert); \
1512 jumper = noper_next; \
1514 nextbranch= regnext(cur); \
1518 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1519 /* chain, so that when the bits of chain are later */\
1520 /* linked together, the dups appear in the chain */\
1521 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1522 trie->wordinfo[dupe].prev = curword; \
1524 /* we haven't inserted this word yet. */ \
1525 trie->states[ state ].wordnum = curword; \
1530 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1531 ( ( base + charid >= ucharcount \
1532 && base + charid < ubound \
1533 && state == trie->trans[ base - ucharcount + charid ].check \
1534 && trie->trans[ base - ucharcount + charid ].next ) \
1535 ? trie->trans[ base - ucharcount + charid ].next \
1536 : ( state==1 ? special : 0 ) \
1540 #define MADE_JUMP_TRIE 2
1541 #define MADE_EXACT_TRIE 4
1544 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1547 /* first pass, loop through and scan words */
1548 reg_trie_data *trie;
1549 HV *widecharmap = NULL;
1550 AV *revcharmap = newAV();
1552 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1557 regnode *jumper = NULL;
1558 regnode *nextbranch = NULL;
1559 regnode *convert = NULL;
1560 U32 *prev_states; /* temp array mapping each state to previous one */
1561 /* we just use folder as a flag in utf8 */
1562 const U8 * folder = NULL;
1565 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1566 AV *trie_words = NULL;
1567 /* along with revcharmap, this only used during construction but both are
1568 * useful during debugging so we store them in the struct when debugging.
1571 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1572 STRLEN trie_charcount=0;
1574 SV *re_trie_maxbuff;
1575 GET_RE_DEBUG_FLAGS_DECL;
1577 PERL_ARGS_ASSERT_MAKE_TRIE;
1579 PERL_UNUSED_ARG(depth);
1586 case EXACTFU_TRICKYFOLD:
1587 case EXACTFU: folder = PL_fold_latin1; break;
1588 case EXACTF: folder = PL_fold; break;
1589 case EXACTFL: folder = PL_fold_locale; break;
1590 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1593 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1595 trie->startstate = 1;
1596 trie->wordcount = word_count;
1597 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1598 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1600 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1601 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1602 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1605 trie_words = newAV();
1608 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1609 if (!SvIOK(re_trie_maxbuff)) {
1610 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1612 DEBUG_TRIE_COMPILE_r({
1613 PerlIO_printf( Perl_debug_log,
1614 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1615 (int)depth * 2 + 2, "",
1616 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1617 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1621 /* Find the node we are going to overwrite */
1622 if ( first == startbranch && OP( last ) != BRANCH ) {
1623 /* whole branch chain */
1626 /* branch sub-chain */
1627 convert = NEXTOPER( first );
1630 /* -- First loop and Setup --
1632 We first traverse the branches and scan each word to determine if it
1633 contains widechars, and how many unique chars there are, this is
1634 important as we have to build a table with at least as many columns as we
1637 We use an array of integers to represent the character codes 0..255
1638 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1639 native representation of the character value as the key and IV's for the
1642 *TODO* If we keep track of how many times each character is used we can
1643 remap the columns so that the table compression later on is more
1644 efficient in terms of memory by ensuring the most common value is in the
1645 middle and the least common are on the outside. IMO this would be better
1646 than a most to least common mapping as theres a decent chance the most
1647 common letter will share a node with the least common, meaning the node
1648 will not be compressible. With a middle is most common approach the worst
1649 case is when we have the least common nodes twice.
1653 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1654 regnode *noper = NEXTOPER( cur );
1655 const U8 *uc = (U8*)STRING( noper );
1656 const U8 *e = uc + STR_LEN( noper );
1658 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1660 const U8 *scan = (U8*)NULL;
1661 U32 wordlen = 0; /* required init */
1663 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1665 if (OP(noper) == NOTHING) {
1666 regnode *noper_next= regnext(noper);
1667 if (noper_next != tail && OP(noper_next) == flags) {
1669 uc= (U8*)STRING(noper);
1670 e= uc + STR_LEN(noper);
1671 trie->minlen= STR_LEN(noper);
1678 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1679 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1680 regardless of encoding */
1681 if (OP( noper ) == EXACTFU_SS) {
1682 /* false positives are ok, so just set this */
1683 TRIE_BITMAP_SET(trie,0xDF);
1686 for ( ; uc < e ; uc += len ) {
1687 TRIE_CHARCOUNT(trie)++;
1692 U8 folded= folder[ (U8) uvc ];
1693 if ( !trie->charmap[ folded ] ) {
1694 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1695 TRIE_STORE_REVCHAR( folded );
1698 if ( !trie->charmap[ uvc ] ) {
1699 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1700 TRIE_STORE_REVCHAR( uvc );
1703 /* store the codepoint in the bitmap, and its folded
1705 TRIE_BITMAP_SET(trie, uvc);
1707 /* store the folded codepoint */
1708 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1711 /* store first byte of utf8 representation of
1712 variant codepoints */
1713 if (! UNI_IS_INVARIANT(uvc)) {
1714 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1717 set_bit = 0; /* We've done our bit :-) */
1722 widecharmap = newHV();
1724 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1727 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1729 if ( !SvTRUE( *svpp ) ) {
1730 sv_setiv( *svpp, ++trie->uniquecharcount );
1731 TRIE_STORE_REVCHAR(uvc);
1735 if( cur == first ) {
1736 trie->minlen = chars;
1737 trie->maxlen = chars;
1738 } else if (chars < trie->minlen) {
1739 trie->minlen = chars;
1740 } else if (chars > trie->maxlen) {
1741 trie->maxlen = chars;
1743 if (OP( noper ) == EXACTFU_SS) {
1744 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1745 if (trie->minlen > 1)
1748 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1749 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1750 * - We assume that any such sequence might match a 2 byte string */
1751 if (trie->minlen > 2 )
1755 } /* end first pass */
1756 DEBUG_TRIE_COMPILE_r(
1757 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1758 (int)depth * 2 + 2,"",
1759 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1760 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1761 (int)trie->minlen, (int)trie->maxlen )
1765 We now know what we are dealing with in terms of unique chars and
1766 string sizes so we can calculate how much memory a naive
1767 representation using a flat table will take. If it's over a reasonable
1768 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1769 conservative but potentially much slower representation using an array
1772 At the end we convert both representations into the same compressed
1773 form that will be used in regexec.c for matching with. The latter
1774 is a form that cannot be used to construct with but has memory
1775 properties similar to the list form and access properties similar
1776 to the table form making it both suitable for fast searches and
1777 small enough that its feasable to store for the duration of a program.
1779 See the comment in the code where the compressed table is produced
1780 inplace from the flat tabe representation for an explanation of how
1781 the compression works.
1786 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1789 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1791 Second Pass -- Array Of Lists Representation
1793 Each state will be represented by a list of charid:state records
1794 (reg_trie_trans_le) the first such element holds the CUR and LEN
1795 points of the allocated array. (See defines above).
1797 We build the initial structure using the lists, and then convert
1798 it into the compressed table form which allows faster lookups
1799 (but cant be modified once converted).
1802 STRLEN transcount = 1;
1804 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1805 "%*sCompiling trie using list compiler\n",
1806 (int)depth * 2 + 2, ""));
1808 trie->states = (reg_trie_state *)
1809 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1810 sizeof(reg_trie_state) );
1814 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1816 regnode *noper = NEXTOPER( cur );
1817 U8 *uc = (U8*)STRING( noper );
1818 const U8 *e = uc + STR_LEN( noper );
1819 U32 state = 1; /* required init */
1820 U16 charid = 0; /* sanity init */
1821 U8 *scan = (U8*)NULL; /* sanity init */
1822 STRLEN foldlen = 0; /* required init */
1823 U32 wordlen = 0; /* required init */
1824 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1827 if (OP(noper) == NOTHING) {
1828 regnode *noper_next= regnext(noper);
1829 if (noper_next != tail && OP(noper_next) == flags) {
1831 uc= (U8*)STRING(noper);
1832 e= uc + STR_LEN(noper);
1836 if (OP(noper) != NOTHING) {
1837 for ( ; uc < e ; uc += len ) {
1842 charid = trie->charmap[ uvc ];
1844 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1848 charid=(U16)SvIV( *svpp );
1851 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1858 if ( !trie->states[ state ].trans.list ) {
1859 TRIE_LIST_NEW( state );
1861 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1862 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1863 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1868 newstate = next_alloc++;
1869 prev_states[newstate] = state;
1870 TRIE_LIST_PUSH( state, charid, newstate );
1875 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1879 TRIE_HANDLE_WORD(state);
1881 } /* end second pass */
1883 /* next alloc is the NEXT state to be allocated */
1884 trie->statecount = next_alloc;
1885 trie->states = (reg_trie_state *)
1886 PerlMemShared_realloc( trie->states,
1888 * sizeof(reg_trie_state) );
1890 /* and now dump it out before we compress it */
1891 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1892 revcharmap, next_alloc,
1896 trie->trans = (reg_trie_trans *)
1897 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1904 for( state=1 ; state < next_alloc ; state ++ ) {
1908 DEBUG_TRIE_COMPILE_MORE_r(
1909 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1913 if (trie->states[state].trans.list) {
1914 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1918 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1919 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1920 if ( forid < minid ) {
1922 } else if ( forid > maxid ) {
1926 if ( transcount < tp + maxid - minid + 1) {
1928 trie->trans = (reg_trie_trans *)
1929 PerlMemShared_realloc( trie->trans,
1931 * sizeof(reg_trie_trans) );
1932 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1934 base = trie->uniquecharcount + tp - minid;
1935 if ( maxid == minid ) {
1937 for ( ; zp < tp ; zp++ ) {
1938 if ( ! trie->trans[ zp ].next ) {
1939 base = trie->uniquecharcount + zp - minid;
1940 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1941 trie->trans[ zp ].check = state;
1947 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1948 trie->trans[ tp ].check = state;
1953 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1954 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1955 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1956 trie->trans[ tid ].check = state;
1958 tp += ( maxid - minid + 1 );
1960 Safefree(trie->states[ state ].trans.list);
1963 DEBUG_TRIE_COMPILE_MORE_r(
1964 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1967 trie->states[ state ].trans.base=base;
1969 trie->lasttrans = tp + 1;
1973 Second Pass -- Flat Table Representation.
1975 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1976 We know that we will need Charcount+1 trans at most to store the data
1977 (one row per char at worst case) So we preallocate both structures
1978 assuming worst case.
1980 We then construct the trie using only the .next slots of the entry
1983 We use the .check field of the first entry of the node temporarily to
1984 make compression both faster and easier by keeping track of how many non
1985 zero fields are in the node.
1987 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1990 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1991 number representing the first entry of the node, and state as a
1992 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1993 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1994 are 2 entrys per node. eg:
2002 The table is internally in the right hand, idx form. However as we also
2003 have to deal with the states array which is indexed by nodenum we have to
2004 use TRIE_NODENUM() to convert.
2007 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2008 "%*sCompiling trie using table compiler\n",
2009 (int)depth * 2 + 2, ""));
2011 trie->trans = (reg_trie_trans *)
2012 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2013 * trie->uniquecharcount + 1,
2014 sizeof(reg_trie_trans) );
2015 trie->states = (reg_trie_state *)
2016 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2017 sizeof(reg_trie_state) );
2018 next_alloc = trie->uniquecharcount + 1;
2021 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2023 regnode *noper = NEXTOPER( cur );
2024 const U8 *uc = (U8*)STRING( noper );
2025 const U8 *e = uc + STR_LEN( noper );
2027 U32 state = 1; /* required init */
2029 U16 charid = 0; /* sanity init */
2030 U32 accept_state = 0; /* sanity init */
2031 U8 *scan = (U8*)NULL; /* sanity init */
2033 STRLEN foldlen = 0; /* required init */
2034 U32 wordlen = 0; /* required init */
2036 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2038 if (OP(noper) == NOTHING) {
2039 regnode *noper_next= regnext(noper);
2040 if (noper_next != tail && OP(noper_next) == flags) {
2042 uc= (U8*)STRING(noper);
2043 e= uc + STR_LEN(noper);
2047 if ( OP(noper) != NOTHING ) {
2048 for ( ; uc < e ; uc += len ) {
2053 charid = trie->charmap[ uvc ];
2055 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2056 charid = svpp ? (U16)SvIV(*svpp) : 0;
2060 if ( !trie->trans[ state + charid ].next ) {
2061 trie->trans[ state + charid ].next = next_alloc;
2062 trie->trans[ state ].check++;
2063 prev_states[TRIE_NODENUM(next_alloc)]
2064 = TRIE_NODENUM(state);
2065 next_alloc += trie->uniquecharcount;
2067 state = trie->trans[ state + charid ].next;
2069 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2071 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2074 accept_state = TRIE_NODENUM( state );
2075 TRIE_HANDLE_WORD(accept_state);
2077 } /* end second pass */
2079 /* and now dump it out before we compress it */
2080 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2082 next_alloc, depth+1));
2086 * Inplace compress the table.*
2088 For sparse data sets the table constructed by the trie algorithm will
2089 be mostly 0/FAIL transitions or to put it another way mostly empty.
2090 (Note that leaf nodes will not contain any transitions.)
2092 This algorithm compresses the tables by eliminating most such
2093 transitions, at the cost of a modest bit of extra work during lookup:
2095 - Each states[] entry contains a .base field which indicates the
2096 index in the state[] array wheres its transition data is stored.
2098 - If .base is 0 there are no valid transitions from that node.
2100 - If .base is nonzero then charid is added to it to find an entry in
2103 -If trans[states[state].base+charid].check!=state then the
2104 transition is taken to be a 0/Fail transition. Thus if there are fail
2105 transitions at the front of the node then the .base offset will point
2106 somewhere inside the previous nodes data (or maybe even into a node
2107 even earlier), but the .check field determines if the transition is
2111 The following process inplace converts the table to the compressed
2112 table: We first do not compress the root node 1,and mark all its
2113 .check pointers as 1 and set its .base pointer as 1 as well. This
2114 allows us to do a DFA construction from the compressed table later,
2115 and ensures that any .base pointers we calculate later are greater
2118 - We set 'pos' to indicate the first entry of the second node.
2120 - We then iterate over the columns of the node, finding the first and
2121 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2122 and set the .check pointers accordingly, and advance pos
2123 appropriately and repreat for the next node. Note that when we copy
2124 the next pointers we have to convert them from the original
2125 NODEIDX form to NODENUM form as the former is not valid post
2128 - If a node has no transitions used we mark its base as 0 and do not
2129 advance the pos pointer.
2131 - If a node only has one transition we use a second pointer into the
2132 structure to fill in allocated fail transitions from other states.
2133 This pointer is independent of the main pointer and scans forward
2134 looking for null transitions that are allocated to a state. When it
2135 finds one it writes the single transition into the "hole". If the
2136 pointer doesnt find one the single transition is appended as normal.
2138 - Once compressed we can Renew/realloc the structures to release the
2141 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2142 specifically Fig 3.47 and the associated pseudocode.
2146 const U32 laststate = TRIE_NODENUM( next_alloc );
2149 trie->statecount = laststate;
2151 for ( state = 1 ; state < laststate ; state++ ) {
2153 const U32 stateidx = TRIE_NODEIDX( state );
2154 const U32 o_used = trie->trans[ stateidx ].check;
2155 U32 used = trie->trans[ stateidx ].check;
2156 trie->trans[ stateidx ].check = 0;
2158 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2159 if ( flag || trie->trans[ stateidx + charid ].next ) {
2160 if ( trie->trans[ stateidx + charid ].next ) {
2162 for ( ; zp < pos ; zp++ ) {
2163 if ( ! trie->trans[ zp ].next ) {
2167 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2168 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2169 trie->trans[ zp ].check = state;
2170 if ( ++zp > pos ) pos = zp;
2177 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2179 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2180 trie->trans[ pos ].check = state;
2185 trie->lasttrans = pos + 1;
2186 trie->states = (reg_trie_state *)
2187 PerlMemShared_realloc( trie->states, laststate
2188 * sizeof(reg_trie_state) );
2189 DEBUG_TRIE_COMPILE_MORE_r(
2190 PerlIO_printf( Perl_debug_log,
2191 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2192 (int)depth * 2 + 2,"",
2193 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2196 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2199 } /* end table compress */
2201 DEBUG_TRIE_COMPILE_MORE_r(
2202 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2203 (int)depth * 2 + 2, "",
2204 (UV)trie->statecount,
2205 (UV)trie->lasttrans)
2207 /* resize the trans array to remove unused space */
2208 trie->trans = (reg_trie_trans *)
2209 PerlMemShared_realloc( trie->trans, trie->lasttrans
2210 * sizeof(reg_trie_trans) );
2212 { /* Modify the program and insert the new TRIE node */
2213 U8 nodetype =(U8)(flags & 0xFF);
2217 regnode *optimize = NULL;
2218 #ifdef RE_TRACK_PATTERN_OFFSETS
2221 U32 mjd_nodelen = 0;
2222 #endif /* RE_TRACK_PATTERN_OFFSETS */
2223 #endif /* DEBUGGING */
2225 This means we convert either the first branch or the first Exact,
2226 depending on whether the thing following (in 'last') is a branch
2227 or not and whther first is the startbranch (ie is it a sub part of
2228 the alternation or is it the whole thing.)
2229 Assuming its a sub part we convert the EXACT otherwise we convert
2230 the whole branch sequence, including the first.
2232 /* Find the node we are going to overwrite */
2233 if ( first != startbranch || OP( last ) == BRANCH ) {
2234 /* branch sub-chain */
2235 NEXT_OFF( first ) = (U16)(last - first);
2236 #ifdef RE_TRACK_PATTERN_OFFSETS
2238 mjd_offset= Node_Offset((convert));
2239 mjd_nodelen= Node_Length((convert));
2242 /* whole branch chain */
2244 #ifdef RE_TRACK_PATTERN_OFFSETS
2247 const regnode *nop = NEXTOPER( convert );
2248 mjd_offset= Node_Offset((nop));
2249 mjd_nodelen= Node_Length((nop));
2253 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2254 (int)depth * 2 + 2, "",
2255 (UV)mjd_offset, (UV)mjd_nodelen)
2258 /* But first we check to see if there is a common prefix we can
2259 split out as an EXACT and put in front of the TRIE node. */
2260 trie->startstate= 1;
2261 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2263 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2267 const U32 base = trie->states[ state ].trans.base;
2269 if ( trie->states[state].wordnum )
2272 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2273 if ( ( base + ofs >= trie->uniquecharcount ) &&
2274 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2275 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2277 if ( ++count > 1 ) {
2278 SV **tmp = av_fetch( revcharmap, ofs, 0);
2279 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2280 if ( state == 1 ) break;
2282 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2284 PerlIO_printf(Perl_debug_log,
2285 "%*sNew Start State=%"UVuf" Class: [",
2286 (int)depth * 2 + 2, "",
2289 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2290 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2292 TRIE_BITMAP_SET(trie,*ch);
2294 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2296 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2300 TRIE_BITMAP_SET(trie,*ch);
2302 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2303 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2309 SV **tmp = av_fetch( revcharmap, idx, 0);
2311 char *ch = SvPV( *tmp, len );
2313 SV *sv=sv_newmortal();
2314 PerlIO_printf( Perl_debug_log,
2315 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2316 (int)depth * 2 + 2, "",
2318 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2319 PL_colors[0], PL_colors[1],
2320 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2321 PERL_PV_ESCAPE_FIRSTCHAR
2326 OP( convert ) = nodetype;
2327 str=STRING(convert);
2330 STR_LEN(convert) += len;
2336 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2341 trie->prefixlen = (state-1);
2343 regnode *n = convert+NODE_SZ_STR(convert);
2344 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2345 trie->startstate = state;
2346 trie->minlen -= (state - 1);
2347 trie->maxlen -= (state - 1);
2349 /* At least the UNICOS C compiler choked on this
2350 * being argument to DEBUG_r(), so let's just have
2353 #ifdef PERL_EXT_RE_BUILD
2359 regnode *fix = convert;
2360 U32 word = trie->wordcount;
2362 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2363 while( ++fix < n ) {
2364 Set_Node_Offset_Length(fix, 0, 0);
2367 SV ** const tmp = av_fetch( trie_words, word, 0 );
2369 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2370 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2372 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2380 NEXT_OFF(convert) = (U16)(tail - convert);
2381 DEBUG_r(optimize= n);
2387 if ( trie->maxlen ) {
2388 NEXT_OFF( convert ) = (U16)(tail - convert);
2389 ARG_SET( convert, data_slot );
2390 /* Store the offset to the first unabsorbed branch in
2391 jump[0], which is otherwise unused by the jump logic.
2392 We use this when dumping a trie and during optimisation. */
2394 trie->jump[0] = (U16)(nextbranch - convert);
2396 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2397 * and there is a bitmap
2398 * and the first "jump target" node we found leaves enough room
2399 * then convert the TRIE node into a TRIEC node, with the bitmap
2400 * embedded inline in the opcode - this is hypothetically faster.
2402 if ( !trie->states[trie->startstate].wordnum
2404 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2406 OP( convert ) = TRIEC;
2407 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2408 PerlMemShared_free(trie->bitmap);
2411 OP( convert ) = TRIE;
2413 /* store the type in the flags */
2414 convert->flags = nodetype;
2418 + regarglen[ OP( convert ) ];
2420 /* XXX We really should free up the resource in trie now,
2421 as we won't use them - (which resources?) dmq */
2423 /* needed for dumping*/
2424 DEBUG_r(if (optimize) {
2425 regnode *opt = convert;
2427 while ( ++opt < optimize) {
2428 Set_Node_Offset_Length(opt,0,0);
2431 Try to clean up some of the debris left after the
2434 while( optimize < jumper ) {
2435 mjd_nodelen += Node_Length((optimize));
2436 OP( optimize ) = OPTIMIZED;
2437 Set_Node_Offset_Length(optimize,0,0);
2440 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2442 } /* end node insert */
2444 /* Finish populating the prev field of the wordinfo array. Walk back
2445 * from each accept state until we find another accept state, and if
2446 * so, point the first word's .prev field at the second word. If the
2447 * second already has a .prev field set, stop now. This will be the
2448 * case either if we've already processed that word's accept state,
2449 * or that state had multiple words, and the overspill words were
2450 * already linked up earlier.
2457 for (word=1; word <= trie->wordcount; word++) {
2459 if (trie->wordinfo[word].prev)
2461 state = trie->wordinfo[word].accept;
2463 state = prev_states[state];
2466 prev = trie->states[state].wordnum;
2470 trie->wordinfo[word].prev = prev;
2472 Safefree(prev_states);
2476 /* and now dump out the compressed format */
2477 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2479 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2481 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2482 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2484 SvREFCNT_dec_NN(revcharmap);
2488 : trie->startstate>1
2494 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2496 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2498 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2499 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2502 We find the fail state for each state in the trie, this state is the longest proper
2503 suffix of the current state's 'word' that is also a proper prefix of another word in our
2504 trie. State 1 represents the word '' and is thus the default fail state. This allows
2505 the DFA not to have to restart after its tried and failed a word at a given point, it
2506 simply continues as though it had been matching the other word in the first place.
2508 'abcdgu'=~/abcdefg|cdgu/
2509 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2510 fail, which would bring us to the state representing 'd' in the second word where we would
2511 try 'g' and succeed, proceeding to match 'cdgu'.
2513 /* add a fail transition */
2514 const U32 trie_offset = ARG(source);
2515 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2517 const U32 ucharcount = trie->uniquecharcount;
2518 const U32 numstates = trie->statecount;
2519 const U32 ubound = trie->lasttrans + ucharcount;
2523 U32 base = trie->states[ 1 ].trans.base;
2526 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2527 GET_RE_DEBUG_FLAGS_DECL;
2529 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2531 PERL_UNUSED_ARG(depth);
2535 ARG_SET( stclass, data_slot );
2536 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2537 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2538 aho->trie=trie_offset;
2539 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2540 Copy( trie->states, aho->states, numstates, reg_trie_state );
2541 Newxz( q, numstates, U32);
2542 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2545 /* initialize fail[0..1] to be 1 so that we always have
2546 a valid final fail state */
2547 fail[ 0 ] = fail[ 1 ] = 1;
2549 for ( charid = 0; charid < ucharcount ; charid++ ) {
2550 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2552 q[ q_write ] = newstate;
2553 /* set to point at the root */
2554 fail[ q[ q_write++ ] ]=1;
2557 while ( q_read < q_write) {
2558 const U32 cur = q[ q_read++ % numstates ];
2559 base = trie->states[ cur ].trans.base;
2561 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2562 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2564 U32 fail_state = cur;
2567 fail_state = fail[ fail_state ];
2568 fail_base = aho->states[ fail_state ].trans.base;
2569 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2571 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2572 fail[ ch_state ] = fail_state;
2573 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2575 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2577 q[ q_write++ % numstates] = ch_state;
2581 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2582 when we fail in state 1, this allows us to use the
2583 charclass scan to find a valid start char. This is based on the principle
2584 that theres a good chance the string being searched contains lots of stuff
2585 that cant be a start char.
2587 fail[ 0 ] = fail[ 1 ] = 0;
2588 DEBUG_TRIE_COMPILE_r({
2589 PerlIO_printf(Perl_debug_log,
2590 "%*sStclass Failtable (%"UVuf" states): 0",
2591 (int)(depth * 2), "", (UV)numstates
2593 for( q_read=1; q_read<numstates; q_read++ ) {
2594 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2596 PerlIO_printf(Perl_debug_log, "\n");
2599 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2604 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2605 * These need to be revisited when a newer toolchain becomes available.
2607 #if defined(__sparc64__) && defined(__GNUC__)
2608 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2609 # undef SPARC64_GCC_WORKAROUND
2610 # define SPARC64_GCC_WORKAROUND 1
2614 #define DEBUG_PEEP(str,scan,depth) \
2615 DEBUG_OPTIMISE_r({if (scan){ \
2616 SV * const mysv=sv_newmortal(); \
2617 regnode *Next = regnext(scan); \
2618 regprop(RExC_rx, mysv, scan); \
2619 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2620 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2621 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2625 /* The below joins as many adjacent EXACTish nodes as possible into a single
2626 * one. The regop may be changed if the node(s) contain certain sequences that
2627 * require special handling. The joining is only done if:
2628 * 1) there is room in the current conglomerated node to entirely contain the
2630 * 2) they are the exact same node type
2632 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2633 * these get optimized out
2635 * If a node is to match under /i (folded), the number of characters it matches
2636 * can be different than its character length if it contains a multi-character
2637 * fold. *min_subtract is set to the total delta of the input nodes.
2639 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2640 * and contains LATIN SMALL LETTER SHARP S
2642 * This is as good a place as any to discuss the design of handling these
2643 * multi-character fold sequences. It's been wrong in Perl for a very long
2644 * time. There are three code points in Unicode whose multi-character folds
2645 * were long ago discovered to mess things up. The previous designs for
2646 * dealing with these involved assigning a special node for them. This
2647 * approach doesn't work, as evidenced by this example:
2648 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2649 * Both these fold to "sss", but if the pattern is parsed to create a node that
2650 * would match just the \xDF, it won't be able to handle the case where a
2651 * successful match would have to cross the node's boundary. The new approach
2652 * that hopefully generally solves the problem generates an EXACTFU_SS node
2655 * It turns out that there are problems with all multi-character folds, and not
2656 * just these three. Now the code is general, for all such cases, but the
2657 * three still have some special handling. The approach taken is:
2658 * 1) This routine examines each EXACTFish node that could contain multi-
2659 * character fold sequences. It returns in *min_subtract how much to
2660 * subtract from the the actual length of the string to get a real minimum
2661 * match length; it is 0 if there are no multi-char folds. This delta is
2662 * used by the caller to adjust the min length of the match, and the delta
2663 * between min and max, so that the optimizer doesn't reject these
2664 * possibilities based on size constraints.
2665 * 2) Certain of these sequences require special handling by the trie code,
2666 * so, if found, this code changes the joined node type to special ops:
2667 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2668 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2669 * is used for an EXACTFU node that contains at least one "ss" sequence in
2670 * it. For non-UTF-8 patterns and strings, this is the only case where
2671 * there is a possible fold length change. That means that a regular
2672 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2673 * with length changes, and so can be processed faster. regexec.c takes
2674 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2675 * pre-folded by regcomp.c. This saves effort in regex matching.
2676 * However, the pre-folding isn't done for non-UTF8 patterns because the
2677 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2678 * down by forcing the pattern into UTF8 unless necessary. Also what
2679 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2680 * possibilities for the non-UTF8 patterns are quite simple, except for
2681 * the sharp s. All the ones that don't involve a UTF-8 target string are
2682 * members of a fold-pair, and arrays are set up for all of them so that
2683 * the other member of the pair can be found quickly. Code elsewhere in
2684 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2685 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2686 * described in the next item.
2687 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2688 * 'ss' or not is not knowable at compile time. It will match iff the
2689 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2690 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2691 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2692 * described in item 3). An assumption that the optimizer part of
2693 * regexec.c (probably unwittingly) makes is that a character in the
2694 * pattern corresponds to at most a single character in the target string.
2695 * (And I do mean character, and not byte here, unlike other parts of the
2696 * documentation that have never been updated to account for multibyte
2697 * Unicode.) This assumption is wrong only in this case, as all other
2698 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2699 * virtue of having this file pre-fold UTF-8 patterns. I'm
2700 * reluctant to try to change this assumption, so instead the code punts.
2701 * This routine examines EXACTF nodes for the sharp s, and returns a
2702 * boolean indicating whether or not the node is an EXACTF node that
2703 * contains a sharp s. When it is true, the caller sets a flag that later
2704 * causes the optimizer in this file to not set values for the floating
2705 * and fixed string lengths, and thus avoids the optimizer code in
2706 * regexec.c that makes the invalid assumption. Thus, there is no
2707 * optimization based on string lengths for EXACTF nodes that contain the
2708 * sharp s. This only happens for /id rules (which means the pattern
2712 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2713 if (PL_regkind[OP(scan)] == EXACT) \
2714 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2717 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2718 /* Merge several consecutive EXACTish nodes into one. */
2719 regnode *n = regnext(scan);
2721 regnode *next = scan + NODE_SZ_STR(scan);
2725 regnode *stop = scan;
2726 GET_RE_DEBUG_FLAGS_DECL;
2728 PERL_UNUSED_ARG(depth);
2731 PERL_ARGS_ASSERT_JOIN_EXACT;
2732 #ifndef EXPERIMENTAL_INPLACESCAN
2733 PERL_UNUSED_ARG(flags);
2734 PERL_UNUSED_ARG(val);
2736 DEBUG_PEEP("join",scan,depth);
2738 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2739 * EXACT ones that are mergeable to the current one. */
2741 && (PL_regkind[OP(n)] == NOTHING
2742 || (stringok && OP(n) == OP(scan)))
2744 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2747 if (OP(n) == TAIL || n > next)
2749 if (PL_regkind[OP(n)] == NOTHING) {
2750 DEBUG_PEEP("skip:",n,depth);
2751 NEXT_OFF(scan) += NEXT_OFF(n);
2752 next = n + NODE_STEP_REGNODE;
2759 else if (stringok) {
2760 const unsigned int oldl = STR_LEN(scan);
2761 regnode * const nnext = regnext(n);
2763 /* XXX I (khw) kind of doubt that this works on platforms where
2764 * U8_MAX is above 255 because of lots of other assumptions */
2765 /* Don't join if the sum can't fit into a single node */
2766 if (oldl + STR_LEN(n) > U8_MAX)
2769 DEBUG_PEEP("merg",n,depth);
2772 NEXT_OFF(scan) += NEXT_OFF(n);
2773 STR_LEN(scan) += STR_LEN(n);
2774 next = n + NODE_SZ_STR(n);
2775 /* Now we can overwrite *n : */
2776 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2784 #ifdef EXPERIMENTAL_INPLACESCAN
2785 if (flags && !NEXT_OFF(n)) {
2786 DEBUG_PEEP("atch", val, depth);
2787 if (reg_off_by_arg[OP(n)]) {
2788 ARG_SET(n, val - n);
2791 NEXT_OFF(n) = val - n;
2799 *has_exactf_sharp_s = FALSE;
2801 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2802 * can now analyze for sequences of problematic code points. (Prior to
2803 * this final joining, sequences could have been split over boundaries, and
2804 * hence missed). The sequences only happen in folding, hence for any
2805 * non-EXACT EXACTish node */
2806 if (OP(scan) != EXACT) {
2807 const U8 * const s0 = (U8*) STRING(scan);
2809 const U8 * const s_end = s0 + STR_LEN(scan);
2811 /* One pass is made over the node's string looking for all the
2812 * possibilities. to avoid some tests in the loop, there are two main
2813 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2817 /* Examine the string for a multi-character fold sequence. UTF-8
2818 * patterns have all characters pre-folded by the time this code is
2820 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2821 length sequence we are looking for is 2 */
2824 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2825 if (! len) { /* Not a multi-char fold: get next char */
2830 /* Nodes with 'ss' require special handling, except for EXACTFL
2831 * and EXACTFA for which there is no multi-char fold to this */
2832 if (len == 2 && *s == 's' && *(s+1) == 's'
2833 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2836 OP(scan) = EXACTFU_SS;
2839 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2840 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2841 COMBINING_DIAERESIS_UTF8
2842 COMBINING_ACUTE_ACCENT_UTF8,
2844 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2845 COMBINING_DIAERESIS_UTF8
2846 COMBINING_ACUTE_ACCENT_UTF8,
2851 /* These two folds require special handling by trie's, so
2852 * change the node type to indicate this. If EXACTFA and
2853 * EXACTFL were ever to be handled by trie's, this would
2854 * have to be changed. If this node has already been
2855 * changed to EXACTFU_SS in this loop, leave it as is. (I
2856 * (khw) think it doesn't matter in regexec.c for UTF
2857 * patterns, but no need to change it */
2858 if (OP(scan) == EXACTFU) {
2859 OP(scan) = EXACTFU_TRICKYFOLD;
2863 else { /* Here is a generic multi-char fold. */
2864 const U8* multi_end = s + len;
2866 /* Count how many characters in it. In the case of /l and
2867 * /aa, no folds which contain ASCII code points are
2868 * allowed, so check for those, and skip if found. (In
2869 * EXACTFL, no folds are allowed to any Latin1 code point,
2870 * not just ASCII. But there aren't any of these
2871 * currently, nor ever likely, so don't take the time to
2872 * test for them. The code that generates the
2873 * is_MULTI_foo() macros croaks should one actually get put
2874 * into Unicode .) */
2875 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2876 count = utf8_length(s, multi_end);
2880 while (s < multi_end) {
2883 goto next_iteration;
2893 /* The delta is how long the sequence is minus 1 (1 is how long
2894 * the character that folds to the sequence is) */
2895 *min_subtract += count - 1;
2899 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2901 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2902 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2903 * nodes can't have multi-char folds to this range (and there are
2904 * no existing ones in the upper latin1 range). In the EXACTF
2905 * case we look also for the sharp s, which can be in the final
2906 * position. Otherwise we can stop looking 1 byte earlier because
2907 * have to find at least two characters for a multi-fold */
2908 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2910 /* The below is perhaps overboard, but this allows us to save a
2911 * test each time through the loop at the expense of a mask. This
2912 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2913 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2914 * are 64. This uses an exclusive 'or' to find that bit and then
2915 * inverts it to form a mask, with just a single 0, in the bit
2916 * position where 'S' and 's' differ. */
2917 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2918 const U8 s_masked = 's' & S_or_s_mask;
2921 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2922 if (! len) { /* Not a multi-char fold. */
2923 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2925 *has_exactf_sharp_s = TRUE;
2932 && ((*s & S_or_s_mask) == s_masked)
2933 && ((*(s+1) & S_or_s_mask) == s_masked))
2936 /* EXACTF nodes need to know that the minimum length
2937 * changed so that a sharp s in the string can match this
2938 * ss in the pattern, but they remain EXACTF nodes, as they
2939 * won't match this unless the target string is is UTF-8,
2940 * which we don't know until runtime */
2941 if (OP(scan) != EXACTF) {
2942 OP(scan) = EXACTFU_SS;
2946 *min_subtract += len - 1;
2953 /* Allow dumping but overwriting the collection of skipped
2954 * ops and/or strings with fake optimized ops */
2955 n = scan + NODE_SZ_STR(scan);
2963 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2967 /* REx optimizer. Converts nodes into quicker variants "in place".
2968 Finds fixed substrings. */
2970 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2971 to the position after last scanned or to NULL. */
2973 #define INIT_AND_WITHP \
2974 assert(!and_withp); \
2975 Newx(and_withp,1,struct regnode_charclass_class); \
2976 SAVEFREEPV(and_withp)
2978 /* this is a chain of data about sub patterns we are processing that
2979 need to be handled separately/specially in study_chunk. Its so
2980 we can simulate recursion without losing state. */
2982 typedef struct scan_frame {
2983 regnode *last; /* last node to process in this frame */
2984 regnode *next; /* next node to process when last is reached */
2985 struct scan_frame *prev; /*previous frame*/
2986 I32 stop; /* what stopparen do we use */
2990 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2993 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2994 I32 *minlenp, I32 *deltap,
2999 struct regnode_charclass_class *and_withp,
3000 U32 flags, U32 depth)
3001 /* scanp: Start here (read-write). */
3002 /* deltap: Write maxlen-minlen here. */
3003 /* last: Stop before this one. */
3004 /* data: string data about the pattern */
3005 /* stopparen: treat close N as END */
3006 /* recursed: which subroutines have we recursed into */
3007 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3010 I32 min = 0; /* There must be at least this number of characters to match */
3012 regnode *scan = *scanp, *next;
3014 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3015 int is_inf_internal = 0; /* The studied chunk is infinite */
3016 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3017 scan_data_t data_fake;
3018 SV *re_trie_maxbuff = NULL;
3019 regnode *first_non_open = scan;
3020 I32 stopmin = I32_MAX;
3021 scan_frame *frame = NULL;
3022 GET_RE_DEBUG_FLAGS_DECL;
3024 PERL_ARGS_ASSERT_STUDY_CHUNK;
3027 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3031 while (first_non_open && OP(first_non_open) == OPEN)
3032 first_non_open=regnext(first_non_open);
3037 while ( scan && OP(scan) != END && scan < last ){
3038 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3039 node length to get a real minimum (because
3040 the folded version may be shorter) */
3041 bool has_exactf_sharp_s = FALSE;
3042 /* Peephole optimizer: */
3043 DEBUG_STUDYDATA("Peep:", data,depth);
3044 DEBUG_PEEP("Peep",scan,depth);
3046 /* Its not clear to khw or hv why this is done here, and not in the
3047 * clauses that deal with EXACT nodes. khw's guess is that it's
3048 * because of a previous design */
3049 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3051 /* Follow the next-chain of the current node and optimize
3052 away all the NOTHINGs from it. */
3053 if (OP(scan) != CURLYX) {
3054 const int max = (reg_off_by_arg[OP(scan)]
3056 /* I32 may be smaller than U16 on CRAYs! */
3057 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3058 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3062 /* Skip NOTHING and LONGJMP. */
3063 while ((n = regnext(n))
3064 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3065 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3066 && off + noff < max)
3068 if (reg_off_by_arg[OP(scan)])
3071 NEXT_OFF(scan) = off;
3076 /* The principal pseudo-switch. Cannot be a switch, since we
3077 look into several different things. */
3078 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3079 || OP(scan) == IFTHEN) {
3080 next = regnext(scan);
3082 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3084 if (OP(next) == code || code == IFTHEN) {
3085 /* NOTE - There is similar code to this block below for handling
3086 TRIE nodes on a re-study. If you change stuff here check there
3088 I32 max1 = 0, min1 = I32_MAX, num = 0;
3089 struct regnode_charclass_class accum;
3090 regnode * const startbranch=scan;
3092 if (flags & SCF_DO_SUBSTR)
3093 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3094 if (flags & SCF_DO_STCLASS)
3095 cl_init_zero(pRExC_state, &accum);
3097 while (OP(scan) == code) {
3098 I32 deltanext, minnext, f = 0, fake;
3099 struct regnode_charclass_class this_class;
3102 data_fake.flags = 0;
3104 data_fake.whilem_c = data->whilem_c;
3105 data_fake.last_closep = data->last_closep;
3108 data_fake.last_closep = &fake;
3110 data_fake.pos_delta = delta;
3111 next = regnext(scan);
3112 scan = NEXTOPER(scan);
3114 scan = NEXTOPER(scan);
3115 if (flags & SCF_DO_STCLASS) {
3116 cl_init(pRExC_state, &this_class);
3117 data_fake.start_class = &this_class;
3118 f = SCF_DO_STCLASS_AND;
3120 if (flags & SCF_WHILEM_VISITED_POS)
3121 f |= SCF_WHILEM_VISITED_POS;
3123 /* we suppose the run is continuous, last=next...*/
3124 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3126 stopparen, recursed, NULL, f,depth+1);
3129 if (max1 < minnext + deltanext)
3130 max1 = minnext + deltanext;
3131 if (deltanext == I32_MAX)
3132 is_inf = is_inf_internal = 1;
3134 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3136 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3137 if ( stopmin > minnext)
3138 stopmin = min + min1;
3139 flags &= ~SCF_DO_SUBSTR;
3141 data->flags |= SCF_SEEN_ACCEPT;
3144 if (data_fake.flags & SF_HAS_EVAL)
3145 data->flags |= SF_HAS_EVAL;
3146 data->whilem_c = data_fake.whilem_c;
3148 if (flags & SCF_DO_STCLASS)
3149 cl_or(pRExC_state, &accum, &this_class);
3151 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3153 if (flags & SCF_DO_SUBSTR) {
3154 data->pos_min += min1;
3155 data->pos_delta += max1 - min1;
3156 if (max1 != min1 || is_inf)
3157 data->longest = &(data->longest_float);
3160 delta += max1 - min1;
3161 if (flags & SCF_DO_STCLASS_OR) {
3162 cl_or(pRExC_state, data->start_class, &accum);
3164 cl_and(data->start_class, and_withp);
3165 flags &= ~SCF_DO_STCLASS;
3168 else if (flags & SCF_DO_STCLASS_AND) {
3170 cl_and(data->start_class, &accum);
3171 flags &= ~SCF_DO_STCLASS;
3174 /* Switch to OR mode: cache the old value of
3175 * data->start_class */
3177 StructCopy(data->start_class, and_withp,
3178 struct regnode_charclass_class);
3179 flags &= ~SCF_DO_STCLASS_AND;
3180 StructCopy(&accum, data->start_class,
3181 struct regnode_charclass_class);
3182 flags |= SCF_DO_STCLASS_OR;
3183 SET_SSC_EOS(data->start_class);
3187 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3190 Assuming this was/is a branch we are dealing with: 'scan' now
3191 points at the item that follows the branch sequence, whatever
3192 it is. We now start at the beginning of the sequence and look
3199 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3201 If we can find such a subsequence we need to turn the first
3202 element into a trie and then add the subsequent branch exact
3203 strings to the trie.
3207 1. patterns where the whole set of branches can be converted.
3209 2. patterns where only a subset can be converted.
3211 In case 1 we can replace the whole set with a single regop
3212 for the trie. In case 2 we need to keep the start and end
3215 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3216 becomes BRANCH TRIE; BRANCH X;
3218 There is an additional case, that being where there is a
3219 common prefix, which gets split out into an EXACT like node
3220 preceding the TRIE node.
3222 If x(1..n)==tail then we can do a simple trie, if not we make
3223 a "jump" trie, such that when we match the appropriate word
3224 we "jump" to the appropriate tail node. Essentially we turn
3225 a nested if into a case structure of sorts.
3230 if (!re_trie_maxbuff) {
3231 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3232 if (!SvIOK(re_trie_maxbuff))
3233 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3235 if ( SvIV(re_trie_maxbuff)>=0 ) {
3237 regnode *first = (regnode *)NULL;
3238 regnode *last = (regnode *)NULL;
3239 regnode *tail = scan;
3244 SV * const mysv = sv_newmortal(); /* for dumping */
3246 /* var tail is used because there may be a TAIL
3247 regop in the way. Ie, the exacts will point to the
3248 thing following the TAIL, but the last branch will
3249 point at the TAIL. So we advance tail. If we
3250 have nested (?:) we may have to move through several
3254 while ( OP( tail ) == TAIL ) {
3255 /* this is the TAIL generated by (?:) */
3256 tail = regnext( tail );
3260 DEBUG_TRIE_COMPILE_r({
3261 regprop(RExC_rx, mysv, tail );
3262 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3263 (int)depth * 2 + 2, "",
3264 "Looking for TRIE'able sequences. Tail node is: ",
3265 SvPV_nolen_const( mysv )
3271 Step through the branches
3272 cur represents each branch,
3273 noper is the first thing to be matched as part of that branch
3274 noper_next is the regnext() of that node.
3276 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3277 via a "jump trie" but we also support building with NOJUMPTRIE,
3278 which restricts the trie logic to structures like /FOO|BAR/.
3280 If noper is a trieable nodetype then the branch is a possible optimization
3281 target. If we are building under NOJUMPTRIE then we require that noper_next
3282 is the same as scan (our current position in the regex program).
3284 Once we have two or more consecutive such branches we can create a
3285 trie of the EXACT's contents and stitch it in place into the program.
3287 If the sequence represents all of the branches in the alternation we
3288 replace the entire thing with a single TRIE node.
3290 Otherwise when it is a subsequence we need to stitch it in place and
3291 replace only the relevant branches. This means the first branch has
3292 to remain as it is used by the alternation logic, and its next pointer,
3293 and needs to be repointed at the item on the branch chain following
3294 the last branch we have optimized away.
3296 This could be either a BRANCH, in which case the subsequence is internal,
3297 or it could be the item following the branch sequence in which case the
3298 subsequence is at the end (which does not necessarily mean the first node
3299 is the start of the alternation).
3301 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3304 ----------------+-----------
3308 EXACTFU_SS | EXACTFU
3309 EXACTFU_TRICKYFOLD | EXACTFU
3314 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3315 ( EXACT == (X) ) ? EXACT : \
3316 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3319 /* dont use tail as the end marker for this traverse */
3320 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3321 regnode * const noper = NEXTOPER( cur );
3322 U8 noper_type = OP( noper );
3323 U8 noper_trietype = TRIE_TYPE( noper_type );
3324 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3325 regnode * const noper_next = regnext( noper );
3326 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3327 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3330 DEBUG_TRIE_COMPILE_r({
3331 regprop(RExC_rx, mysv, cur);
3332 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3333 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3335 regprop(RExC_rx, mysv, noper);
3336 PerlIO_printf( Perl_debug_log, " -> %s",
3337 SvPV_nolen_const(mysv));
3340 regprop(RExC_rx, mysv, noper_next );
3341 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3342 SvPV_nolen_const(mysv));
3344 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3345 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3346 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3350 /* Is noper a trieable nodetype that can be merged with the
3351 * current trie (if there is one)? */
3355 ( noper_trietype == NOTHING)
3356 || ( trietype == NOTHING )
3357 || ( trietype == noper_trietype )
3360 && noper_next == tail
3364 /* Handle mergable triable node
3365 * Either we are the first node in a new trieable sequence,
3366 * in which case we do some bookkeeping, otherwise we update
3367 * the end pointer. */
3370 if ( noper_trietype == NOTHING ) {
3371 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3372 regnode * const noper_next = regnext( noper );
3373 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3374 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3377 if ( noper_next_trietype ) {
3378 trietype = noper_next_trietype;
3379 } else if (noper_next_type) {
3380 /* a NOTHING regop is 1 regop wide. We need at least two
3381 * for a trie so we can't merge this in */
3385 trietype = noper_trietype;
3388 if ( trietype == NOTHING )
3389 trietype = noper_trietype;
3394 } /* end handle mergable triable node */
3396 /* handle unmergable node -
3397 * noper may either be a triable node which can not be tried
3398 * together with the current trie, or a non triable node */
3400 /* If last is set and trietype is not NOTHING then we have found
3401 * at least two triable branch sequences in a row of a similar
3402 * trietype so we can turn them into a trie. If/when we
3403 * allow NOTHING to start a trie sequence this condition will be
3404 * required, and it isn't expensive so we leave it in for now. */
3405 if ( trietype && trietype != NOTHING )
3406 make_trie( pRExC_state,
3407 startbranch, first, cur, tail, count,
3408 trietype, depth+1 );
3409 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3413 && noper_next == tail
3416 /* noper is triable, so we can start a new trie sequence */
3419 trietype = noper_trietype;
3421 /* if we already saw a first but the current node is not triable then we have
3422 * to reset the first information. */
3427 } /* end handle unmergable node */
3428 } /* loop over branches */
3429 DEBUG_TRIE_COMPILE_r({
3430 regprop(RExC_rx, mysv, cur);
3431 PerlIO_printf( Perl_debug_log,
3432 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3433 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3436 if ( last && trietype ) {
3437 if ( trietype != NOTHING ) {
3438 /* the last branch of the sequence was part of a trie,
3439 * so we have to construct it here outside of the loop
3441 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3442 #ifdef TRIE_STUDY_OPT
3443 if ( ((made == MADE_EXACT_TRIE &&
3444 startbranch == first)
3445 || ( first_non_open == first )) &&
3447 flags |= SCF_TRIE_RESTUDY;
3448 if ( startbranch == first
3451 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3456 /* at this point we know whatever we have is a NOTHING sequence/branch
3457 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3459 if ( startbranch == first ) {
3461 /* the entire thing is a NOTHING sequence, something like this:
3462 * (?:|) So we can turn it into a plain NOTHING op. */
3463 DEBUG_TRIE_COMPILE_r({
3464 regprop(RExC_rx, mysv, cur);
3465 PerlIO_printf( Perl_debug_log,
3466 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3467 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3470 OP(startbranch)= NOTHING;
3471 NEXT_OFF(startbranch)= tail - startbranch;
3472 for ( opt= startbranch + 1; opt < tail ; opt++ )
3476 } /* end if ( last) */
3477 } /* TRIE_MAXBUF is non zero */
3482 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3483 scan = NEXTOPER(NEXTOPER(scan));
3484 } else /* single branch is optimized. */
3485 scan = NEXTOPER(scan);
3487 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3488 scan_frame *newframe = NULL;
3493 if (OP(scan) != SUSPEND) {
3494 /* set the pointer */
3495 if (OP(scan) == GOSUB) {
3497 RExC_recurse[ARG2L(scan)] = scan;
3498 start = RExC_open_parens[paren-1];
3499 end = RExC_close_parens[paren-1];
3502 start = RExC_rxi->program + 1;
3506 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3507 SAVEFREEPV(recursed);
3509 if (!PAREN_TEST(recursed,paren+1)) {
3510 PAREN_SET(recursed,paren+1);
3511 Newx(newframe,1,scan_frame);
3513 if (flags & SCF_DO_SUBSTR) {
3514 SCAN_COMMIT(pRExC_state,data,minlenp);
3515 data->longest = &(data->longest_float);
3517 is_inf = is_inf_internal = 1;
3518 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3519 cl_anything(pRExC_state, data->start_class);
3520 flags &= ~SCF_DO_STCLASS;
3523 Newx(newframe,1,scan_frame);
3526 end = regnext(scan);
3531 SAVEFREEPV(newframe);
3532 newframe->next = regnext(scan);
3533 newframe->last = last;
3534 newframe->stop = stopparen;
3535 newframe->prev = frame;
3545 else if (OP(scan) == EXACT) {
3546 I32 l = STR_LEN(scan);
3549 const U8 * const s = (U8*)STRING(scan);
3550 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3551 l = utf8_length(s, s + l);
3553 uc = *((U8*)STRING(scan));
3556 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3557 /* The code below prefers earlier match for fixed
3558 offset, later match for variable offset. */
3559 if (data->last_end == -1) { /* Update the start info. */
3560 data->last_start_min = data->pos_min;
3561 data->last_start_max = is_inf
3562 ? I32_MAX : data->pos_min + data->pos_delta;
3564 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3566 SvUTF8_on(data->last_found);
3568 SV * const sv = data->last_found;
3569 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3570 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3571 if (mg && mg->mg_len >= 0)
3572 mg->mg_len += utf8_length((U8*)STRING(scan),
3573 (U8*)STRING(scan)+STR_LEN(scan));
3575 data->last_end = data->pos_min + l;
3576 data->pos_min += l; /* As in the first entry. */
3577 data->flags &= ~SF_BEFORE_EOL;
3579 if (flags & SCF_DO_STCLASS_AND) {
3580 /* Check whether it is compatible with what we know already! */
3584 /* If compatible, we or it in below. It is compatible if is
3585 * in the bitmp and either 1) its bit or its fold is set, or 2)
3586 * it's for a locale. Even if there isn't unicode semantics
3587 * here, at runtime there may be because of matching against a
3588 * utf8 string, so accept a possible false positive for
3589 * latin1-range folds */
3591 (!(data->start_class->flags & ANYOF_LOCALE)
3592 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3593 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3594 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3599 ANYOF_CLASS_ZERO(data->start_class);
3600 ANYOF_BITMAP_ZERO(data->start_class);
3602 ANYOF_BITMAP_SET(data->start_class, uc);
3603 else if (uc >= 0x100) {
3606 /* Some Unicode code points fold to the Latin1 range; as
3607 * XXX temporary code, instead of figuring out if this is
3608 * one, just assume it is and set all the start class bits
3609 * that could be some such above 255 code point's fold
3610 * which will generate fals positives. As the code
3611 * elsewhere that does compute the fold settles down, it
3612 * can be extracted out and re-used here */
3613 for (i = 0; i < 256; i++){
3614 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3615 ANYOF_BITMAP_SET(data->start_class, i);
3619 CLEAR_SSC_EOS(data->start_class);
3621 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3623 else if (flags & SCF_DO_STCLASS_OR) {
3624 /* false positive possible if the class is case-folded */
3626 ANYOF_BITMAP_SET(data->start_class, uc);
3628 data->start_class->flags |= ANYOF_UNICODE_ALL;
3629 CLEAR_SSC_EOS(data->start_class);
3630 cl_and(data->start_class, and_withp);
3632 flags &= ~SCF_DO_STCLASS;
3634 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3635 I32 l = STR_LEN(scan);
3636 UV uc = *((U8*)STRING(scan));
3638 /* Search for fixed substrings supports EXACT only. */
3639 if (flags & SCF_DO_SUBSTR) {
3641 SCAN_COMMIT(pRExC_state, data, minlenp);
3644 const U8 * const s = (U8 *)STRING(scan);
3645 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3646 l = utf8_length(s, s + l);
3648 if (has_exactf_sharp_s) {
3649 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3651 min += l - min_subtract;
3653 delta += min_subtract;
3654 if (flags & SCF_DO_SUBSTR) {
3655 data->pos_min += l - min_subtract;
3656 if (data->pos_min < 0) {
3659 data->pos_delta += min_subtract;
3661 data->longest = &(data->longest_float);
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 /* Check whether it is compatible with what we know already! */
3668 (!(data->start_class->flags & ANYOF_LOCALE)
3669 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3670 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3674 ANYOF_CLASS_ZERO(data->start_class);
3675 ANYOF_BITMAP_ZERO(data->start_class);
3677 ANYOF_BITMAP_SET(data->start_class, uc);
3678 CLEAR_SSC_EOS(data->start_class);
3679 if (OP(scan) == EXACTFL) {
3680 /* XXX This set is probably no longer necessary, and
3681 * probably wrong as LOCALE now is on in the initial
3683 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3687 /* Also set the other member of the fold pair. In case
3688 * that unicode semantics is called for at runtime, use
3689 * the full latin1 fold. (Can't do this for locale,
3690 * because not known until runtime) */
3691 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3693 /* All other (EXACTFL handled above) folds except under
3694 * /iaa that include s, S, and sharp_s also may include
3696 if (OP(scan) != EXACTFA) {
3697 if (uc == 's' || uc == 'S') {
3698 ANYOF_BITMAP_SET(data->start_class,
3699 LATIN_SMALL_LETTER_SHARP_S);
3701 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3702 ANYOF_BITMAP_SET(data->start_class, 's');
3703 ANYOF_BITMAP_SET(data->start_class, 'S');
3708 else if (uc >= 0x100) {
3710 for (i = 0; i < 256; i++){
3711 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3712 ANYOF_BITMAP_SET(data->start_class, i);
3717 else if (flags & SCF_DO_STCLASS_OR) {
3718 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3719 /* false positive possible if the class is case-folded.
3720 Assume that the locale settings are the same... */
3722 ANYOF_BITMAP_SET(data->start_class, uc);
3723 if (OP(scan) != EXACTFL) {
3725 /* And set the other member of the fold pair, but
3726 * can't do that in locale because not known until
3728 ANYOF_BITMAP_SET(data->start_class,
3729 PL_fold_latin1[uc]);
3731 /* All folds except under /iaa that include s, S,
3732 * and sharp_s also may include the others */
3733 if (OP(scan) != EXACTFA) {
3734 if (uc == 's' || uc == 'S') {
3735 ANYOF_BITMAP_SET(data->start_class,
3736 LATIN_SMALL_LETTER_SHARP_S);
3738 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3739 ANYOF_BITMAP_SET(data->start_class, 's');
3740 ANYOF_BITMAP_SET(data->start_class, 'S');
3745 CLEAR_SSC_EOS(data->start_class);
3747 cl_and(data->start_class, and_withp);
3749 flags &= ~SCF_DO_STCLASS;
3751 else if (REGNODE_VARIES(OP(scan))) {
3752 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3753 I32 f = flags, pos_before = 0;
3754 regnode * const oscan = scan;
3755 struct regnode_charclass_class this_class;
3756 struct regnode_charclass_class *oclass = NULL;
3757 I32 next_is_eval = 0;
3759 switch (PL_regkind[OP(scan)]) {
3760 case WHILEM: /* End of (?:...)* . */
3761 scan = NEXTOPER(scan);
3764 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3765 next = NEXTOPER(scan);
3766 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3768 maxcount = REG_INFTY;
3769 next = regnext(scan);
3770 scan = NEXTOPER(scan);
3774 if (flags & SCF_DO_SUBSTR)
3779 if (flags & SCF_DO_STCLASS) {
3781 maxcount = REG_INFTY;
3782 next = regnext(scan);
3783 scan = NEXTOPER(scan);
3786 is_inf = is_inf_internal = 1;
3787 scan = regnext(scan);
3788 if (flags & SCF_DO_SUBSTR) {
3789 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3790 data->longest = &(data->longest_float);
3792 goto optimize_curly_tail;
3794 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3795 && (scan->flags == stopparen))
3800 mincount = ARG1(scan);
3801 maxcount = ARG2(scan);
3803 next = regnext(scan);
3804 if (OP(scan) == CURLYX) {
3805 I32 lp = (data ? *(data->last_closep) : 0);
3806 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3808 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3809 next_is_eval = (OP(scan) == EVAL);
3811 if (flags & SCF_DO_SUBSTR) {
3812 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3813 pos_before = data->pos_min;
3817 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3819 data->flags |= SF_IS_INF;
3821 if (flags & SCF_DO_STCLASS) {
3822 cl_init(pRExC_state, &this_class);
3823 oclass = data->start_class;
3824 data->start_class = &this_class;
3825 f |= SCF_DO_STCLASS_AND;
3826 f &= ~SCF_DO_STCLASS_OR;
3828 /* Exclude from super-linear cache processing any {n,m}
3829 regops for which the combination of input pos and regex
3830 pos is not enough information to determine if a match
3833 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3834 regex pos at the \s*, the prospects for a match depend not
3835 only on the input position but also on how many (bar\s*)
3836 repeats into the {4,8} we are. */
3837 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3838 f &= ~SCF_WHILEM_VISITED_POS;
3840 /* This will finish on WHILEM, setting scan, or on NULL: */
3841 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3842 last, data, stopparen, recursed, NULL,
3844 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3846 if (flags & SCF_DO_STCLASS)
3847 data->start_class = oclass;
3848 if (mincount == 0 || minnext == 0) {
3849 if (flags & SCF_DO_STCLASS_OR) {
3850 cl_or(pRExC_state, data->start_class, &this_class);
3852 else if (flags & SCF_DO_STCLASS_AND) {
3853 /* Switch to OR mode: cache the old value of
3854 * data->start_class */
3856 StructCopy(data->start_class, and_withp,
3857 struct regnode_charclass_class);
3858 flags &= ~SCF_DO_STCLASS_AND;
3859 StructCopy(&this_class, data->start_class,
3860 struct regnode_charclass_class);
3861 flags |= SCF_DO_STCLASS_OR;
3862 SET_SSC_EOS(data->start_class);
3864 } else { /* Non-zero len */
3865 if (flags & SCF_DO_STCLASS_OR) {
3866 cl_or(pRExC_state, data->start_class, &this_class);
3867 cl_and(data->start_class, and_withp);
3869 else if (flags & SCF_DO_STCLASS_AND)
3870 cl_and(data->start_class, &this_class);
3871 flags &= ~SCF_DO_STCLASS;
3873 if (!scan) /* It was not CURLYX, but CURLY. */
3875 if ( /* ? quantifier ok, except for (?{ ... }) */
3876 (next_is_eval || !(mincount == 0 && maxcount == 1))
3877 && (minnext == 0) && (deltanext == 0)
3878 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3879 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3881 /* Fatal warnings may leak the regexp without this: */
3882 SAVEFREESV(RExC_rx_sv);
3883 ckWARNreg(RExC_parse,
3884 "Quantifier unexpected on zero-length expression");
3885 (void)ReREFCNT_inc(RExC_rx_sv);
3888 min += minnext * mincount;
3889 is_inf_internal |= ((maxcount == REG_INFTY
3890 && (minnext + deltanext) > 0)
3891 || deltanext == I32_MAX);
3892 is_inf |= is_inf_internal;
3893 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3895 /* Try powerful optimization CURLYX => CURLYN. */
3896 if ( OP(oscan) == CURLYX && data
3897 && data->flags & SF_IN_PAR
3898 && !(data->flags & SF_HAS_EVAL)
3899 && !deltanext && minnext == 1 ) {
3900 /* Try to optimize to CURLYN. */
3901 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3902 regnode * const nxt1 = nxt;
3909 if (!REGNODE_SIMPLE(OP(nxt))
3910 && !(PL_regkind[OP(nxt)] == EXACT
3911 && STR_LEN(nxt) == 1))
3917 if (OP(nxt) != CLOSE)
3919 if (RExC_open_parens) {
3920 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3921 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3923 /* Now we know that nxt2 is the only contents: */
3924 oscan->flags = (U8)ARG(nxt);
3926 OP(nxt1) = NOTHING; /* was OPEN. */
3929 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3930 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3931 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3932 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3933 OP(nxt + 1) = OPTIMIZED; /* was count. */
3934 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3939 /* Try optimization CURLYX => CURLYM. */
3940 if ( OP(oscan) == CURLYX && data
3941 && !(data->flags & SF_HAS_PAR)
3942 && !(data->flags & SF_HAS_EVAL)
3943 && !deltanext /* atom is fixed width */
3944 && minnext != 0 /* CURLYM can't handle zero width */
3945 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3947 /* XXXX How to optimize if data == 0? */
3948 /* Optimize to a simpler form. */
3949 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3953 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3954 && (OP(nxt2) != WHILEM))
3956 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3957 /* Need to optimize away parenths. */
3958 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3959 /* Set the parenth number. */
3960 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3962 oscan->flags = (U8)ARG(nxt);
3963 if (RExC_open_parens) {
3964 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3965 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3967 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3968 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3971 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3972 OP(nxt + 1) = OPTIMIZED; /* was count. */
3973 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3974 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3977 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3978 regnode *nnxt = regnext(nxt1);
3980 if (reg_off_by_arg[OP(nxt1)])
3981 ARG_SET(nxt1, nxt2 - nxt1);
3982 else if (nxt2 - nxt1 < U16_MAX)
3983 NEXT_OFF(nxt1) = nxt2 - nxt1;
3985 OP(nxt) = NOTHING; /* Cannot beautify */
3990 /* Optimize again: */
3991 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3992 NULL, stopparen, recursed, NULL, 0,depth+1);
3997 else if ((OP(oscan) == CURLYX)
3998 && (flags & SCF_WHILEM_VISITED_POS)
3999 /* See the comment on a similar expression above.
4000 However, this time it's not a subexpression
4001 we care about, but the expression itself. */
4002 && (maxcount == REG_INFTY)
4003 && data && ++data->whilem_c < 16) {
4004 /* This stays as CURLYX, we can put the count/of pair. */
4005 /* Find WHILEM (as in regexec.c) */
4006 regnode *nxt = oscan + NEXT_OFF(oscan);
4008 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4010 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4011 | (RExC_whilem_seen << 4)); /* On WHILEM */
4013 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4015 if (flags & SCF_DO_SUBSTR) {
4016 SV *last_str = NULL;
4017 int counted = mincount != 0;
4019 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4020 #if defined(SPARC64_GCC_WORKAROUND)
4023 const char *s = NULL;
4026 if (pos_before >= data->last_start_min)
4029 b = data->last_start_min;
4032 s = SvPV_const(data->last_found, l);
4033 old = b - data->last_start_min;
4036 I32 b = pos_before >= data->last_start_min
4037 ? pos_before : data->last_start_min;
4039 const char * const s = SvPV_const(data->last_found, l);
4040 I32 old = b - data->last_start_min;
4044 old = utf8_hop((U8*)s, old) - (U8*)s;
4046 /* Get the added string: */
4047 last_str = newSVpvn_utf8(s + old, l, UTF);
4048 if (deltanext == 0 && pos_before == b) {
4049 /* What was added is a constant string */
4051 SvGROW(last_str, (mincount * l) + 1);
4052 repeatcpy(SvPVX(last_str) + l,
4053 SvPVX_const(last_str), l, mincount - 1);
4054 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4055 /* Add additional parts. */
4056 SvCUR_set(data->last_found,
4057 SvCUR(data->last_found) - l);
4058 sv_catsv(data->last_found, last_str);
4060 SV * sv = data->last_found;
4062 SvUTF8(sv) && SvMAGICAL(sv) ?
4063 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4064 if (mg && mg->mg_len >= 0)
4065 mg->mg_len += CHR_SVLEN(last_str) - l;
4067 data->last_end += l * (mincount - 1);
4070 /* start offset must point into the last copy */
4071 data->last_start_min += minnext * (mincount - 1);
4072 data->last_start_max += is_inf ? I32_MAX
4073 : (maxcount - 1) * (minnext + data->pos_delta);
4076 /* It is counted once already... */
4077 data->pos_min += minnext * (mincount - counted);
4078 data->pos_delta += - counted * deltanext +
4079 (minnext + deltanext) * maxcount - minnext * mincount;
4080 if (mincount != maxcount) {
4081 /* Cannot extend fixed substrings found inside
4083 SCAN_COMMIT(pRExC_state,data,minlenp);
4084 if (mincount && last_str) {
4085 SV * const sv = data->last_found;
4086 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4087 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4091 sv_setsv(sv, last_str);
4092 data->last_end = data->pos_min;
4093 data->last_start_min =
4094 data->pos_min - CHR_SVLEN(last_str);
4095 data->last_start_max = is_inf
4097 : data->pos_min + data->pos_delta
4098 - CHR_SVLEN(last_str);
4100 data->longest = &(data->longest_float);
4102 SvREFCNT_dec(last_str);
4104 if (data && (fl & SF_HAS_EVAL))
4105 data->flags |= SF_HAS_EVAL;
4106 optimize_curly_tail:
4107 if (OP(oscan) != CURLYX) {
4108 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4110 NEXT_OFF(oscan) += NEXT_OFF(next);
4113 default: /* REF, and CLUMP only? */
4114 if (flags & SCF_DO_SUBSTR) {
4115 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4116 data->longest = &(data->longest_float);
4118 is_inf = is_inf_internal = 1;
4119 if (flags & SCF_DO_STCLASS_OR)
4120 cl_anything(pRExC_state, data->start_class);
4121 flags &= ~SCF_DO_STCLASS;
4125 else if (OP(scan) == LNBREAK) {
4126 if (flags & SCF_DO_STCLASS) {
4128 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4129 if (flags & SCF_DO_STCLASS_AND) {
4130 for (value = 0; value < 256; value++)
4131 if (!is_VERTWS_cp(value))
4132 ANYOF_BITMAP_CLEAR(data->start_class, value);
4135 for (value = 0; value < 256; value++)
4136 if (is_VERTWS_cp(value))
4137 ANYOF_BITMAP_SET(data->start_class, value);
4139 if (flags & SCF_DO_STCLASS_OR)
4140 cl_and(data->start_class, and_withp);
4141 flags &= ~SCF_DO_STCLASS;
4144 delta++; /* Because of the 2 char string cr-lf */
4145 if (flags & SCF_DO_SUBSTR) {
4146 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4148 data->pos_delta += 1;
4149 data->longest = &(data->longest_float);
4152 else if (REGNODE_SIMPLE(OP(scan))) {
4155 if (flags & SCF_DO_SUBSTR) {
4156 SCAN_COMMIT(pRExC_state,data,minlenp);
4160 if (flags & SCF_DO_STCLASS) {
4162 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4164 /* Some of the logic below assumes that switching
4165 locale on will only add false positives. */
4166 switch (PL_regkind[OP(scan)]) {
4172 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4175 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4176 cl_anything(pRExC_state, data->start_class);
4179 if (OP(scan) == SANY)
4181 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4182 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4183 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4184 cl_anything(pRExC_state, data->start_class);
4186 if (flags & SCF_DO_STCLASS_AND || !value)
4187 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4190 if (flags & SCF_DO_STCLASS_AND)
4191 cl_and(data->start_class,
4192 (struct regnode_charclass_class*)scan);
4194 cl_or(pRExC_state, data->start_class,
4195 (struct regnode_charclass_class*)scan);
4203 classnum = FLAGS(scan);
4204 if (flags & SCF_DO_STCLASS_AND) {
4205 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4206 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4207 for (value = 0; value < loop_max; value++) {
4208 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4209 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4215 if (data->start_class->flags & ANYOF_LOCALE) {
4216 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4220 /* Even if under locale, set the bits for non-locale
4221 * in case it isn't a true locale-node. This will
4222 * create false positives if it truly is locale */
4223 for (value = 0; value < loop_max; value++) {
4224 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4225 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4237 classnum = FLAGS(scan);
4238 if (flags & SCF_DO_STCLASS_AND) {
4239 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4240 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4241 for (value = 0; value < loop_max; value++) {
4242 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4243 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4249 if (data->start_class->flags & ANYOF_LOCALE) {
4250 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4254 /* Even if under locale, set the bits for non-locale in
4255 * case it isn't a true locale-node. This will create
4256 * false positives if it truly is locale */
4257 for (value = 0; value < loop_max; value++) {
4258 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4259 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4262 if (PL_regkind[OP(scan)] == NPOSIXD) {
4263 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4269 if (flags & SCF_DO_STCLASS_OR)
4270 cl_and(data->start_class, and_withp);
4271 flags &= ~SCF_DO_STCLASS;
4274 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4275 data->flags |= (OP(scan) == MEOL
4278 SCAN_COMMIT(pRExC_state, data, minlenp);
4281 else if ( PL_regkind[OP(scan)] == BRANCHJ
4282 /* Lookbehind, or need to calculate parens/evals/stclass: */
4283 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4284 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4285 if ( OP(scan) == UNLESSM &&
4287 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4288 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4291 regnode *upto= regnext(scan);
4293 SV * const mysv_val=sv_newmortal();
4294 DEBUG_STUDYDATA("OPFAIL",data,depth);
4296 /*DEBUG_PARSE_MSG("opfail");*/
4297 regprop(RExC_rx, mysv_val, upto);
4298 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4299 SvPV_nolen_const(mysv_val),
4300 (IV)REG_NODE_NUM(upto),
4305 NEXT_OFF(scan) = upto - scan;
4306 for (opt= scan + 1; opt < upto ; opt++)
4307 OP(opt) = OPTIMIZED;
4311 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4312 || OP(scan) == UNLESSM )
4314 /* Negative Lookahead/lookbehind
4315 In this case we can't do fixed string optimisation.
4318 I32 deltanext, minnext, fake = 0;
4320 struct regnode_charclass_class intrnl;
4323 data_fake.flags = 0;
4325 data_fake.whilem_c = data->whilem_c;
4326 data_fake.last_closep = data->last_closep;
4329 data_fake.last_closep = &fake;
4330 data_fake.pos_delta = delta;
4331 if ( flags & SCF_DO_STCLASS && !scan->flags
4332 && OP(scan) == IFMATCH ) { /* Lookahead */
4333 cl_init(pRExC_state, &intrnl);
4334 data_fake.start_class = &intrnl;
4335 f |= SCF_DO_STCLASS_AND;
4337 if (flags & SCF_WHILEM_VISITED_POS)
4338 f |= SCF_WHILEM_VISITED_POS;
4339 next = regnext(scan);
4340 nscan = NEXTOPER(NEXTOPER(scan));
4341 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4342 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4345 FAIL("Variable length lookbehind not implemented");
4347 else if (minnext > (I32)U8_MAX) {
4348 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4350 scan->flags = (U8)minnext;
4353 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4355 if (data_fake.flags & SF_HAS_EVAL)
4356 data->flags |= SF_HAS_EVAL;
4357 data->whilem_c = data_fake.whilem_c;
4359 if (f & SCF_DO_STCLASS_AND) {
4360 if (flags & SCF_DO_STCLASS_OR) {
4361 /* OR before, AND after: ideally we would recurse with
4362 * data_fake to get the AND applied by study of the
4363 * remainder of the pattern, and then derecurse;
4364 * *** HACK *** for now just treat as "no information".
4365 * See [perl #56690].
4367 cl_init(pRExC_state, data->start_class);
4369 /* AND before and after: combine and continue */
4370 const int was = TEST_SSC_EOS(data->start_class);
4372 cl_and(data->start_class, &intrnl);
4374 SET_SSC_EOS(data->start_class);
4378 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4380 /* Positive Lookahead/lookbehind
4381 In this case we can do fixed string optimisation,
4382 but we must be careful about it. Note in the case of
4383 lookbehind the positions will be offset by the minimum
4384 length of the pattern, something we won't know about
4385 until after the recurse.
4387 I32 deltanext, fake = 0;
4389 struct regnode_charclass_class intrnl;
4391 /* We use SAVEFREEPV so that when the full compile
4392 is finished perl will clean up the allocated
4393 minlens when it's all done. This way we don't
4394 have to worry about freeing them when we know
4395 they wont be used, which would be a pain.
4398 Newx( minnextp, 1, I32 );
4399 SAVEFREEPV(minnextp);
4402 StructCopy(data, &data_fake, scan_data_t);
4403 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4406 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4407 data_fake.last_found=newSVsv(data->last_found);
4411 data_fake.last_closep = &fake;
4412 data_fake.flags = 0;
4413 data_fake.pos_delta = delta;
4415 data_fake.flags |= SF_IS_INF;
4416 if ( flags & SCF_DO_STCLASS && !scan->flags
4417 && OP(scan) == IFMATCH ) { /* Lookahead */
4418 cl_init(pRExC_state, &intrnl);
4419 data_fake.start_class = &intrnl;
4420 f |= SCF_DO_STCLASS_AND;
4422 if (flags & SCF_WHILEM_VISITED_POS)
4423 f |= SCF_WHILEM_VISITED_POS;
4424 next = regnext(scan);
4425 nscan = NEXTOPER(NEXTOPER(scan));
4427 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4428 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4431 FAIL("Variable length lookbehind not implemented");
4433 else if (*minnextp > (I32)U8_MAX) {
4434 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4436 scan->flags = (U8)*minnextp;
4441 if (f & SCF_DO_STCLASS_AND) {
4442 const int was = TEST_SSC_EOS(data.start_class);
4444 cl_and(data->start_class, &intrnl);
4446 SET_SSC_EOS(data->start_class);
4449 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4451 if (data_fake.flags & SF_HAS_EVAL)
4452 data->flags |= SF_HAS_EVAL;
4453 data->whilem_c = data_fake.whilem_c;
4454 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4455 if (RExC_rx->minlen<*minnextp)
4456 RExC_rx->minlen=*minnextp;
4457 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4458 SvREFCNT_dec_NN(data_fake.last_found);
4460 if ( data_fake.minlen_fixed != minlenp )
4462 data->offset_fixed= data_fake.offset_fixed;
4463 data->minlen_fixed= data_fake.minlen_fixed;
4464 data->lookbehind_fixed+= scan->flags;
4466 if ( data_fake.minlen_float != minlenp )
4468 data->minlen_float= data_fake.minlen_float;
4469 data->offset_float_min=data_fake.offset_float_min;
4470 data->offset_float_max=data_fake.offset_float_max;
4471 data->lookbehind_float+= scan->flags;
4478 else if (OP(scan) == OPEN) {
4479 if (stopparen != (I32)ARG(scan))
4482 else if (OP(scan) == CLOSE) {
4483 if (stopparen == (I32)ARG(scan)) {
4486 if ((I32)ARG(scan) == is_par) {
4487 next = regnext(scan);
4489 if ( next && (OP(next) != WHILEM) && next < last)
4490 is_par = 0; /* Disable optimization */
4493 *(data->last_closep) = ARG(scan);
4495 else if (OP(scan) == EVAL) {
4497 data->flags |= SF_HAS_EVAL;
4499 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4500 if (flags & SCF_DO_SUBSTR) {
4501 SCAN_COMMIT(pRExC_state,data,minlenp);
4502 flags &= ~SCF_DO_SUBSTR;
4504 if (data && OP(scan)==ACCEPT) {
4505 data->flags |= SCF_SEEN_ACCEPT;
4510 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4512 if (flags & SCF_DO_SUBSTR) {
4513 SCAN_COMMIT(pRExC_state,data,minlenp);
4514 data->longest = &(data->longest_float);
4516 is_inf = is_inf_internal = 1;
4517 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4518 cl_anything(pRExC_state, data->start_class);
4519 flags &= ~SCF_DO_STCLASS;
4521 else if (OP(scan) == GPOS) {
4522 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4523 !(delta || is_inf || (data && data->pos_delta)))
4525 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4526 RExC_rx->extflags |= RXf_ANCH_GPOS;
4527 if (RExC_rx->gofs < (U32)min)
4528 RExC_rx->gofs = min;
4530 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4534 #ifdef TRIE_STUDY_OPT
4535 #ifdef FULL_TRIE_STUDY
4536 else if (PL_regkind[OP(scan)] == TRIE) {
4537 /* NOTE - There is similar code to this block above for handling
4538 BRANCH nodes on the initial study. If you change stuff here
4540 regnode *trie_node= scan;
4541 regnode *tail= regnext(scan);
4542 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4543 I32 max1 = 0, min1 = I32_MAX;
4544 struct regnode_charclass_class accum;
4546 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4547 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4548 if (flags & SCF_DO_STCLASS)
4549 cl_init_zero(pRExC_state, &accum);
4555 const regnode *nextbranch= NULL;
4558 for ( word=1 ; word <= trie->wordcount ; word++)
4560 I32 deltanext=0, minnext=0, f = 0, fake;
4561 struct regnode_charclass_class this_class;
4563 data_fake.flags = 0;
4565 data_fake.whilem_c = data->whilem_c;
4566 data_fake.last_closep = data->last_closep;
4569 data_fake.last_closep = &fake;
4570 data_fake.pos_delta = delta;
4571 if (flags & SCF_DO_STCLASS) {
4572 cl_init(pRExC_state, &this_class);
4573 data_fake.start_class = &this_class;
4574 f = SCF_DO_STCLASS_AND;
4576 if (flags & SCF_WHILEM_VISITED_POS)
4577 f |= SCF_WHILEM_VISITED_POS;
4579 if (trie->jump[word]) {
4581 nextbranch = trie_node + trie->jump[0];
4582 scan= trie_node + trie->jump[word];
4583 /* We go from the jump point to the branch that follows
4584 it. Note this means we need the vestigal unused branches
4585 even though they arent otherwise used.
4587 minnext = study_chunk(pRExC_state, &scan, minlenp,
4588 &deltanext, (regnode *)nextbranch, &data_fake,
4589 stopparen, recursed, NULL, f,depth+1);
4591 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4592 nextbranch= regnext((regnode*)nextbranch);
4594 if (min1 > (I32)(minnext + trie->minlen))
4595 min1 = minnext + trie->minlen;
4596 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4597 max1 = minnext + deltanext + trie->maxlen;
4598 if (deltanext == I32_MAX)
4599 is_inf = is_inf_internal = 1;
4601 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4603 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4604 if ( stopmin > min + min1)
4605 stopmin = min + min1;
4606 flags &= ~SCF_DO_SUBSTR;
4608 data->flags |= SCF_SEEN_ACCEPT;
4611 if (data_fake.flags & SF_HAS_EVAL)
4612 data->flags |= SF_HAS_EVAL;
4613 data->whilem_c = data_fake.whilem_c;
4615 if (flags & SCF_DO_STCLASS)
4616 cl_or(pRExC_state, &accum, &this_class);
4619 if (flags & SCF_DO_SUBSTR) {
4620 data->pos_min += min1;
4621 data->pos_delta += max1 - min1;
4622 if (max1 != min1 || is_inf)
4623 data->longest = &(data->longest_float);
4626 delta += max1 - min1;
4627 if (flags & SCF_DO_STCLASS_OR) {
4628 cl_or(pRExC_state, data->start_class, &accum);
4630 cl_and(data->start_class, and_withp);
4631 flags &= ~SCF_DO_STCLASS;
4634 else if (flags & SCF_DO_STCLASS_AND) {
4636 cl_and(data->start_class, &accum);
4637 flags &= ~SCF_DO_STCLASS;
4640 /* Switch to OR mode: cache the old value of
4641 * data->start_class */
4643 StructCopy(data->start_class, and_withp,
4644 struct regnode_charclass_class);
4645 flags &= ~SCF_DO_STCLASS_AND;
4646 StructCopy(&accum, data->start_class,
4647 struct regnode_charclass_class);
4648 flags |= SCF_DO_STCLASS_OR;
4649 SET_SSC_EOS(data->start_class);
4656 else if (PL_regkind[OP(scan)] == TRIE) {
4657 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4660 min += trie->minlen;
4661 delta += (trie->maxlen - trie->minlen);
4662 flags &= ~SCF_DO_STCLASS; /* xxx */
4663 if (flags & SCF_DO_SUBSTR) {
4664 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4665 data->pos_min += trie->minlen;
4666 data->pos_delta += (trie->maxlen - trie->minlen);
4667 if (trie->maxlen != trie->minlen)
4668 data->longest = &(data->longest_float);
4670 if (trie->jump) /* no more substrings -- for now /grr*/
4671 flags &= ~SCF_DO_SUBSTR;
4673 #endif /* old or new */
4674 #endif /* TRIE_STUDY_OPT */
4676 /* Else: zero-length, ignore. */
4677 scan = regnext(scan);
4682 stopparen = frame->stop;
4683 frame = frame->prev;
4684 goto fake_study_recurse;
4689 DEBUG_STUDYDATA("pre-fin:",data,depth);
4692 *deltap = is_inf_internal ? I32_MAX : delta;
4693 if (flags & SCF_DO_SUBSTR && is_inf)
4694 data->pos_delta = I32_MAX - data->pos_min;
4695 if (is_par > (I32)U8_MAX)
4697 if (is_par && pars==1 && data) {
4698 data->flags |= SF_IN_PAR;
4699 data->flags &= ~SF_HAS_PAR;
4701 else if (pars && data) {
4702 data->flags |= SF_HAS_PAR;
4703 data->flags &= ~SF_IN_PAR;
4705 if (flags & SCF_DO_STCLASS_OR)
4706 cl_and(data->start_class, and_withp);
4707 if (flags & SCF_TRIE_RESTUDY)
4708 data->flags |= SCF_TRIE_RESTUDY;
4710 DEBUG_STUDYDATA("post-fin:",data,depth);
4712 return min < stopmin ? min : stopmin;
4716 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4718 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4720 PERL_ARGS_ASSERT_ADD_DATA;
4722 Renewc(RExC_rxi->data,
4723 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4724 char, struct reg_data);
4726 Renew(RExC_rxi->data->what, count + n, U8);
4728 Newx(RExC_rxi->data->what, n, U8);
4729 RExC_rxi->data->count = count + n;
4730 Copy(s, RExC_rxi->data->what + count, n, U8);
4734 /*XXX: todo make this not included in a non debugging perl */
4735 #ifndef PERL_IN_XSUB_RE
4737 Perl_reginitcolors(pTHX)
4740 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4742 char *t = savepv(s);
4746 t = strchr(t, '\t');
4752 PL_colors[i] = t = (char *)"";
4757 PL_colors[i++] = (char *)"";
4764 #ifdef TRIE_STUDY_OPT
4765 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4768 (data.flags & SCF_TRIE_RESTUDY) \
4776 #define CHECK_RESTUDY_GOTO_butfirst
4780 * pregcomp - compile a regular expression into internal code
4782 * Decides which engine's compiler to call based on the hint currently in
4786 #ifndef PERL_IN_XSUB_RE
4788 /* return the currently in-scope regex engine (or the default if none) */
4790 regexp_engine const *
4791 Perl_current_re_engine(pTHX)
4795 if (IN_PERL_COMPILETIME) {
4796 HV * const table = GvHV(PL_hintgv);
4800 return &PL_core_reg_engine;
4801 ptr = hv_fetchs(table, "regcomp", FALSE);
4802 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4803 return &PL_core_reg_engine;
4804 return INT2PTR(regexp_engine*,SvIV(*ptr));
4808 if (!PL_curcop->cop_hints_hash)
4809 return &PL_core_reg_engine;
4810 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4811 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4812 return &PL_core_reg_engine;
4813 return INT2PTR(regexp_engine*,SvIV(ptr));
4819 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4822 regexp_engine const *eng = current_re_engine();
4823 GET_RE_DEBUG_FLAGS_DECL;
4825 PERL_ARGS_ASSERT_PREGCOMP;
4827 /* Dispatch a request to compile a regexp to correct regexp engine. */
4829 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4832 return CALLREGCOMP_ENG(eng, pattern, flags);
4836 /* public(ish) entry point for the perl core's own regex compiling code.
4837 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4838 * pattern rather than a list of OPs, and uses the internal engine rather
4839 * than the current one */
4842 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4844 SV *pat = pattern; /* defeat constness! */
4845 PERL_ARGS_ASSERT_RE_COMPILE;
4846 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4847 #ifdef PERL_IN_XSUB_RE
4850 &PL_core_reg_engine,
4852 NULL, NULL, rx_flags, 0);
4855 /* see if there are any run-time code blocks in the pattern.
4856 * False positives are allowed */
4859 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4860 U32 pm_flags, char *pat, STRLEN plen)
4865 /* avoid infinitely recursing when we recompile the pattern parcelled up
4866 * as qr'...'. A single constant qr// string can't have have any
4867 * run-time component in it, and thus, no runtime code. (A non-qr
4868 * string, however, can, e.g. $x =~ '(?{})') */
4869 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4872 for (s = 0; s < plen; s++) {
4873 if (n < pRExC_state->num_code_blocks
4874 && s == pRExC_state->code_blocks[n].start)
4876 s = pRExC_state->code_blocks[n].end;
4880 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4882 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
4884 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
4891 /* Handle run-time code blocks. We will already have compiled any direct
4892 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4893 * copy of it, but with any literal code blocks blanked out and
4894 * appropriate chars escaped; then feed it into
4896 * eval "qr'modified_pattern'"
4900 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4904 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4906 * After eval_sv()-ing that, grab any new code blocks from the returned qr
4907 * and merge them with any code blocks of the original regexp.
4909 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4910 * instead, just save the qr and return FALSE; this tells our caller that
4911 * the original pattern needs upgrading to utf8.
4915 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4916 char *pat, STRLEN plen)
4920 GET_RE_DEBUG_FLAGS_DECL;
4922 if (pRExC_state->runtime_code_qr) {
4923 /* this is the second time we've been called; this should
4924 * only happen if the main pattern got upgraded to utf8
4925 * during compilation; re-use the qr we compiled first time
4926 * round (which should be utf8 too)
4928 qr = pRExC_state->runtime_code_qr;
4929 pRExC_state->runtime_code_qr = NULL;
4930 assert(RExC_utf8 && SvUTF8(qr));
4936 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4940 /* determine how many extra chars we need for ' and \ escaping */
4941 for (s = 0; s < plen; s++) {
4942 if (pat[s] == '\'' || pat[s] == '\\')
4946 Newx(newpat, newlen, char);
4948 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4950 for (s = 0; s < plen; s++) {
4951 if (n < pRExC_state->num_code_blocks
4952 && s == pRExC_state->code_blocks[n].start)
4954 /* blank out literal code block */
4955 assert(pat[s] == '(');
4956 while (s <= pRExC_state->code_blocks[n].end) {
4964 if (pat[s] == '\'' || pat[s] == '\\')
4969 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4973 PerlIO_printf(Perl_debug_log,
4974 "%sre-parsing pattern for runtime code:%s %s\n",
4975 PL_colors[4],PL_colors[5],newpat);
4978 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4984 PUSHSTACKi(PERLSI_REQUIRE);
4985 /* this causes the toker to collapse \\ into \ when parsing
4986 * qr''; normally only q'' does this. It also alters hints
4988 PL_reg_state.re_reparsing = TRUE;
4989 eval_sv(sv, G_SCALAR);
4990 SvREFCNT_dec_NN(sv);
4995 SV * const errsv = ERRSV;
4996 if (SvTRUE_NN(errsv))
4998 Safefree(pRExC_state->code_blocks);
4999 /* use croak_sv ? */
5000 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5003 assert(SvROK(qr_ref));
5005 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5006 /* the leaving below frees the tmp qr_ref.
5007 * Give qr a life of its own */
5015 if (!RExC_utf8 && SvUTF8(qr)) {
5016 /* first time through; the pattern got upgraded; save the
5017 * qr for the next time through */
5018 assert(!pRExC_state->runtime_code_qr);
5019 pRExC_state->runtime_code_qr = qr;
5024 /* extract any code blocks within the returned qr// */
5027 /* merge the main (r1) and run-time (r2) code blocks into one */
5029 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5030 struct reg_code_block *new_block, *dst;
5031 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5034 if (!r2->num_code_blocks) /* we guessed wrong */
5036 SvREFCNT_dec_NN(qr);
5041 r1->num_code_blocks + r2->num_code_blocks,
5042 struct reg_code_block);
5045 while ( i1 < r1->num_code_blocks
5046 || i2 < r2->num_code_blocks)
5048 struct reg_code_block *src;
5051 if (i1 == r1->num_code_blocks) {
5052 src = &r2->code_blocks[i2++];
5055 else if (i2 == r2->num_code_blocks)
5056 src = &r1->code_blocks[i1++];
5057 else if ( r1->code_blocks[i1].start
5058 < r2->code_blocks[i2].start)
5060 src = &r1->code_blocks[i1++];
5061 assert(src->end < r2->code_blocks[i2].start);
5064 assert( r1->code_blocks[i1].start
5065 > r2->code_blocks[i2].start);
5066 src = &r2->code_blocks[i2++];
5068 assert(src->end < r1->code_blocks[i1].start);
5071 assert(pat[src->start] == '(');
5072 assert(pat[src->end] == ')');
5073 dst->start = src->start;
5074 dst->end = src->end;
5075 dst->block = src->block;
5076 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5080 r1->num_code_blocks += r2->num_code_blocks;
5081 Safefree(r1->code_blocks);
5082 r1->code_blocks = new_block;
5085 SvREFCNT_dec_NN(qr);
5091 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5093 /* This is the common code for setting up the floating and fixed length
5094 * string data extracted from Perlre_op_compile() below. Returns a boolean
5095 * as to whether succeeded or not */
5099 if (! (longest_length
5100 || (eol /* Can't have SEOL and MULTI */
5101 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5103 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5104 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5109 /* copy the information about the longest from the reg_scan_data
5110 over to the program. */
5111 if (SvUTF8(sv_longest)) {
5112 *rx_utf8 = sv_longest;
5115 *rx_substr = sv_longest;
5118 /* end_shift is how many chars that must be matched that
5119 follow this item. We calculate it ahead of time as once the
5120 lookbehind offset is added in we lose the ability to correctly
5122 ml = minlen ? *(minlen) : (I32)longest_length;
5123 *rx_end_shift = ml - offset
5124 - longest_length + (SvTAIL(sv_longest) != 0)
5127 t = (eol/* Can't have SEOL and MULTI */
5128 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5129 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5135 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5136 * regular expression into internal code.
5137 * The pattern may be passed either as:
5138 * a list of SVs (patternp plus pat_count)
5139 * a list of OPs (expr)
5140 * If both are passed, the SV list is used, but the OP list indicates
5141 * which SVs are actually pre-compiled code blocks
5143 * The SVs in the list have magic and qr overloading applied to them (and
5144 * the list may be modified in-place with replacement SVs in the latter
5147 * If the pattern hasn't changed from old_re, then old_re will be
5150 * eng is the current engine. If that engine has an op_comp method, then
5151 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5152 * do the initial concatenation of arguments and pass on to the external
5155 * If is_bare_re is not null, set it to a boolean indicating whether the
5156 * arg list reduced (after overloading) to a single bare regex which has
5157 * been returned (i.e. /$qr/).
5159 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5161 * pm_flags contains the PMf_* flags, typically based on those from the
5162 * pm_flags field of the related PMOP. Currently we're only interested in
5163 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5165 * We can't allocate space until we know how big the compiled form will be,
5166 * but we can't compile it (and thus know how big it is) until we've got a
5167 * place to put the code. So we cheat: we compile it twice, once with code
5168 * generation turned off and size counting turned on, and once "for real".
5169 * This also means that we don't allocate space until we are sure that the
5170 * thing really will compile successfully, and we never have to move the
5171 * code and thus invalidate pointers into it. (Note that it has to be in
5172 * one piece because free() must be able to free it all.) [NB: not true in perl]
5174 * Beware that the optimization-preparation code in here knows about some
5175 * of the structure of the compiled regexp. [I'll say.]
5179 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5180 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5181 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5186 regexp_internal *ri;
5195 SV * VOL code_blocksv = NULL;
5197 /* these are all flags - maybe they should be turned
5198 * into a single int with different bit masks */
5199 I32 sawlookahead = 0;
5202 bool used_setjump = FALSE;
5203 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5204 bool code_is_utf8 = 0;
5205 bool VOL recompile = 0;
5206 bool runtime_code = 0;
5210 RExC_state_t RExC_state;
5211 RExC_state_t * const pRExC_state = &RExC_state;
5212 #ifdef TRIE_STUDY_OPT
5214 RExC_state_t copyRExC_state;
5216 GET_RE_DEBUG_FLAGS_DECL;
5218 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5220 DEBUG_r(if (!PL_colorset) reginitcolors());
5222 #ifndef PERL_IN_XSUB_RE
5223 /* Initialize these here instead of as-needed, as is quick and avoids
5224 * having to test them each time otherwise */
5225 if (! PL_AboveLatin1) {
5226 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5227 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5228 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5230 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5231 = _new_invlist_C_array(L1PosixAlnum_invlist);
5232 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5233 = _new_invlist_C_array(PosixAlnum_invlist);
5235 PL_L1Posix_ptrs[_CC_ALPHA]
5236 = _new_invlist_C_array(L1PosixAlpha_invlist);
5237 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5239 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5240 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5242 /* Cased is the same as Alpha in the ASCII range */
5243 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5244 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5246 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5247 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5249 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5250 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5252 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5253 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5255 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5256 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5258 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5259 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5261 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5262 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5264 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5265 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5266 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5267 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5269 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5270 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5272 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5274 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5275 PL_L1Posix_ptrs[_CC_WORDCHAR]
5276 = _new_invlist_C_array(L1PosixWord_invlist);
5278 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5279 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5281 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5285 pRExC_state->code_blocks = NULL;
5286 pRExC_state->num_code_blocks = 0;
5289 *is_bare_re = FALSE;
5291 if (expr && (expr->op_type == OP_LIST ||
5292 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5294 /* is the source UTF8, and how many code blocks are there? */
5298 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5299 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5301 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5302 /* count of DO blocks */
5306 pRExC_state->num_code_blocks = ncode;
5307 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5312 /* handle a list of SVs */
5316 /* apply magic and RE overloading to each arg */
5317 for (svp = patternp; svp < patternp + pat_count; svp++) {
5320 if (SvROK(rx) && SvAMAGIC(rx)) {
5321 SV *sv = AMG_CALLunary(rx, regexp_amg);
5325 if (SvTYPE(sv) != SVt_REGEXP)
5326 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5332 if (pat_count > 1) {
5333 /* concat multiple args and find any code block indexes */
5338 STRLEN orig_patlen = 0;
5340 if (pRExC_state->num_code_blocks) {
5341 o = cLISTOPx(expr)->op_first;
5342 assert( o->op_type == OP_PUSHMARK
5343 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5344 || o->op_type == OP_PADRANGE);
5348 pat = newSVpvn("", 0);
5351 /* determine if the pattern is going to be utf8 (needed
5352 * in advance to align code block indices correctly).
5353 * XXX This could fail to be detected for an arg with
5354 * overloading but not concat overloading; but the main effect
5355 * in this obscure case is to need a 'use re eval' for a
5356 * literal code block */
5357 for (svp = patternp; svp < patternp + pat_count; svp++) {
5364 for (svp = patternp; svp < patternp + pat_count; svp++) {
5365 SV *sv, *msv = *svp;
5368 /* we make the assumption here that each op in the list of
5369 * op_siblings maps to one SV pushed onto the stack,
5370 * except for code blocks, with have both an OP_NULL and
5372 * This allows us to match up the list of SVs against the
5373 * list of OPs to find the next code block.
5375 * Note that PUSHMARK PADSV PADSV ..
5377 * PADRANGE NULL NULL ..
5378 * so the alignment still works. */
5380 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5381 assert(n < pRExC_state->num_code_blocks);
5382 pRExC_state->code_blocks[n].start = SvCUR(pat);
5383 pRExC_state->code_blocks[n].block = o;
5384 pRExC_state->code_blocks[n].src_regex = NULL;
5387 o = o->op_sibling; /* skip CONST */
5393 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5394 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5397 /* overloading involved: all bets are off over literal
5398 * code. Pretend we haven't seen it */
5399 pRExC_state->num_code_blocks -= n;
5405 while (SvAMAGIC(msv)
5406 && (sv = AMG_CALLunary(msv, string_amg))
5410 && SvRV(msv) == SvRV(sv))
5415 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5417 orig_patlen = SvCUR(pat);
5418 sv_catsv_nomg(pat, msv);
5421 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5424 /* extract any code blocks within any embedded qr//'s */
5425 if (rx && SvTYPE(rx) == SVt_REGEXP
5426 && RX_ENGINE((REGEXP*)rx)->op_comp)
5429 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5430 if (ri->num_code_blocks) {
5432 /* the presence of an embedded qr// with code means
5433 * we should always recompile: the text of the
5434 * qr// may not have changed, but it may be a
5435 * different closure than last time */
5437 Renew(pRExC_state->code_blocks,
5438 pRExC_state->num_code_blocks + ri->num_code_blocks,
5439 struct reg_code_block);
5440 pRExC_state->num_code_blocks += ri->num_code_blocks;
5441 for (i=0; i < ri->num_code_blocks; i++) {
5442 struct reg_code_block *src, *dst;
5443 STRLEN offset = orig_patlen
5444 + ReANY((REGEXP *)rx)->pre_prefix;
5445 assert(n < pRExC_state->num_code_blocks);
5446 src = &ri->code_blocks[i];
5447 dst = &pRExC_state->code_blocks[n];
5448 dst->start = src->start + offset;
5449 dst->end = src->end + offset;
5450 dst->block = src->block;
5451 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5465 while (SvAMAGIC(pat)
5466 && (sv = AMG_CALLunary(pat, string_amg))
5474 /* handle bare regex: foo =~ $re */
5479 if (SvTYPE(re) == SVt_REGEXP) {
5483 Safefree(pRExC_state->code_blocks);
5489 /* not a list of SVs, so must be a list of OPs */
5491 if (expr->op_type == OP_LIST) {
5496 pat = newSVpvn("", 0);
5501 /* given a list of CONSTs and DO blocks in expr, append all
5502 * the CONSTs to pat, and record the start and end of each
5503 * code block in code_blocks[] (each DO{} op is followed by an
5504 * OP_CONST containing the corresponding literal '(?{...})
5507 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5508 if (o->op_type == OP_CONST) {
5509 sv_catsv(pat, cSVOPo_sv);
5511 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5515 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5516 assert(i+1 < pRExC_state->num_code_blocks);
5517 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5518 pRExC_state->code_blocks[i].block = o;
5519 pRExC_state->code_blocks[i].src_regex = NULL;
5525 assert(expr->op_type == OP_CONST);
5526 pat = cSVOPx_sv(expr);
5530 exp = SvPV_nomg(pat, plen);
5532 if (!eng->op_comp) {
5533 if ((SvUTF8(pat) && IN_BYTES)
5534 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5536 /* make a temporary copy; either to convert to bytes,
5537 * or to avoid repeating get-magic / overloaded stringify */
5538 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5539 (IN_BYTES ? 0 : SvUTF8(pat)));
5541 Safefree(pRExC_state->code_blocks);
5542 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5545 /* ignore the utf8ness if the pattern is 0 length */
5546 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5547 RExC_uni_semantics = 0;
5548 RExC_contains_locale = 0;
5549 pRExC_state->runtime_code_qr = NULL;
5551 /****************** LONG JUMP TARGET HERE***********************/
5552 /* Longjmp back to here if have to switch in midstream to utf8 */
5553 if (! RExC_orig_utf8) {
5554 JMPENV_PUSH(jump_ret);
5555 used_setjump = TRUE;
5558 if (jump_ret == 0) { /* First time through */
5562 SV *dsv= sv_newmortal();
5563 RE_PV_QUOTED_DECL(s, RExC_utf8,
5564 dsv, exp, plen, 60);
5565 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5566 PL_colors[4],PL_colors[5],s);
5569 else { /* longjumped back */
5572 STRLEN s = 0, d = 0;
5575 /* If the cause for the longjmp was other than changing to utf8, pop
5576 * our own setjmp, and longjmp to the correct handler */
5577 if (jump_ret != UTF8_LONGJMP) {
5579 JMPENV_JUMP(jump_ret);
5584 /* It's possible to write a regexp in ascii that represents Unicode
5585 codepoints outside of the byte range, such as via \x{100}. If we
5586 detect such a sequence we have to convert the entire pattern to utf8
5587 and then recompile, as our sizing calculation will have been based
5588 on 1 byte == 1 character, but we will need to use utf8 to encode
5589 at least some part of the pattern, and therefore must convert the whole
5592 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5593 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5595 /* upgrade pattern to UTF8, and if there are code blocks,
5596 * recalculate the indices.
5597 * This is essentially an unrolled Perl_bytes_to_utf8() */
5599 src = (U8*)SvPV_nomg(pat, plen);
5600 Newx(dst, plen * 2 + 1, U8);
5603 const UV uv = NATIVE_TO_ASCII(src[s]);
5604 if (UNI_IS_INVARIANT(uv))
5605 dst[d] = (U8)UTF_TO_NATIVE(uv);
5607 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5608 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5610 if (n < pRExC_state->num_code_blocks) {
5611 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5612 pRExC_state->code_blocks[n].start = d;
5613 assert(dst[d] == '(');
5616 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5617 pRExC_state->code_blocks[n].end = d;
5618 assert(dst[d] == ')');
5631 RExC_orig_utf8 = RExC_utf8 = 1;
5634 /* return old regex if pattern hasn't changed */
5638 && !!RX_UTF8(old_re) == !!RExC_utf8
5639 && RX_PRECOMP(old_re)
5640 && RX_PRELEN(old_re) == plen
5641 && memEQ(RX_PRECOMP(old_re), exp, plen))
5643 /* with runtime code, always recompile */
5644 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5646 if (!runtime_code) {
5650 Safefree(pRExC_state->code_blocks);
5654 else if ((pm_flags & PMf_USE_RE_EVAL)
5655 /* this second condition covers the non-regex literal case,
5656 * i.e. $foo =~ '(?{})'. */
5657 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5658 && (PL_hints & HINT_RE_EVAL))
5660 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5663 #ifdef TRIE_STUDY_OPT
5667 rx_flags = orig_rx_flags;
5669 if (initial_charset == REGEX_LOCALE_CHARSET) {
5670 RExC_contains_locale = 1;
5672 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5674 /* Set to use unicode semantics if the pattern is in utf8 and has the
5675 * 'depends' charset specified, as it means unicode when utf8 */
5676 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5680 RExC_flags = rx_flags;
5681 RExC_pm_flags = pm_flags;
5684 if (TAINTING_get && TAINT_get)
5685 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5687 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5688 /* whoops, we have a non-utf8 pattern, whilst run-time code
5689 * got compiled as utf8. Try again with a utf8 pattern */
5690 JMPENV_JUMP(UTF8_LONGJMP);
5693 assert(!pRExC_state->runtime_code_qr);
5698 RExC_in_lookbehind = 0;
5699 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5701 RExC_override_recoding = 0;
5702 RExC_in_multi_char_class = 0;
5704 /* First pass: determine size, legality. */
5712 RExC_emit = &PL_regdummy;
5713 RExC_whilem_seen = 0;
5714 RExC_open_parens = NULL;
5715 RExC_close_parens = NULL;
5717 RExC_paren_names = NULL;
5719 RExC_paren_name_list = NULL;
5721 RExC_recurse = NULL;
5722 RExC_recurse_count = 0;
5723 pRExC_state->code_index = 0;
5725 #if 0 /* REGC() is (currently) a NOP at the first pass.
5726 * Clever compilers notice this and complain. --jhi */
5727 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5730 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5732 RExC_lastparse=NULL;
5734 /* reg may croak on us, not giving us a chance to free
5735 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5736 need it to survive as long as the regexp (qr/(?{})/).
5737 We must check that code_blocksv is not already set, because we may
5738 have longjmped back. */
5739 if (pRExC_state->code_blocks && !code_blocksv) {
5740 code_blocksv = newSV_type(SVt_PV);
5741 SAVEFREESV(code_blocksv);
5742 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5743 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5745 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5746 RExC_precomp = NULL;
5750 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5752 /* Here, finished first pass. Get rid of any added setjmp */
5758 PerlIO_printf(Perl_debug_log,
5759 "Required size %"IVdf" nodes\n"
5760 "Starting second pass (creation)\n",
5763 RExC_lastparse=NULL;
5766 /* The first pass could have found things that force Unicode semantics */
5767 if ((RExC_utf8 || RExC_uni_semantics)
5768 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5770 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5773 /* Small enough for pointer-storage convention?
5774 If extralen==0, this means that we will not need long jumps. */
5775 if (RExC_size >= 0x10000L && RExC_extralen)
5776 RExC_size += RExC_extralen;
5779 if (RExC_whilem_seen > 15)
5780 RExC_whilem_seen = 15;
5782 /* Allocate space and zero-initialize. Note, the two step process
5783 of zeroing when in debug mode, thus anything assigned has to
5784 happen after that */
5785 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5787 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5788 char, regexp_internal);
5789 if ( r == NULL || ri == NULL )
5790 FAIL("Regexp out of space");
5792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5793 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5795 /* bulk initialize base fields with 0. */
5796 Zero(ri, sizeof(regexp_internal), char);
5799 /* non-zero initialization begins here */
5802 r->extflags = rx_flags;
5803 if (pm_flags & PMf_IS_QR) {
5804 ri->code_blocks = pRExC_state->code_blocks;
5805 ri->num_code_blocks = pRExC_state->num_code_blocks;
5810 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5811 if (pRExC_state->code_blocks[n].src_regex)
5812 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5813 SAVEFREEPV(pRExC_state->code_blocks);
5817 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5818 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5820 /* The caret is output if there are any defaults: if not all the STD
5821 * flags are set, or if no character set specifier is needed */
5823 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5825 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5826 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5827 >> RXf_PMf_STD_PMMOD_SHIFT);
5828 const char *fptr = STD_PAT_MODS; /*"msix"*/
5830 /* Allocate for the worst case, which is all the std flags are turned
5831 * on. If more precision is desired, we could do a population count of
5832 * the flags set. This could be done with a small lookup table, or by
5833 * shifting, masking and adding, or even, when available, assembly
5834 * language for a machine-language population count.
5835 * We never output a minus, as all those are defaults, so are
5836 * covered by the caret */
5837 const STRLEN wraplen = plen + has_p + has_runon
5838 + has_default /* If needs a caret */
5840 /* If needs a character set specifier */
5841 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5842 + (sizeof(STD_PAT_MODS) - 1)
5843 + (sizeof("(?:)") - 1);
5845 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5846 r->xpv_len_u.xpvlenu_pv = p;
5848 SvFLAGS(rx) |= SVf_UTF8;
5851 /* If a default, cover it using the caret */
5853 *p++= DEFAULT_PAT_MOD;
5857 const char* const name = get_regex_charset_name(r->extflags, &len);
5858 Copy(name, p, len, char);
5862 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5865 while((ch = *fptr++)) {
5873 Copy(RExC_precomp, p, plen, char);
5874 assert ((RX_WRAPPED(rx) - p) < 16);
5875 r->pre_prefix = p - RX_WRAPPED(rx);
5881 SvCUR_set(rx, p - RX_WRAPPED(rx));
5885 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5887 if (RExC_seen & REG_SEEN_RECURSE) {
5888 Newxz(RExC_open_parens, RExC_npar,regnode *);
5889 SAVEFREEPV(RExC_open_parens);
5890 Newxz(RExC_close_parens,RExC_npar,regnode *);
5891 SAVEFREEPV(RExC_close_parens);
5894 /* Useful during FAIL. */
5895 #ifdef RE_TRACK_PATTERN_OFFSETS
5896 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5897 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5898 "%s %"UVuf" bytes for offset annotations.\n",
5899 ri->u.offsets ? "Got" : "Couldn't get",
5900 (UV)((2*RExC_size+1) * sizeof(U32))));
5902 SetProgLen(ri,RExC_size);
5907 /* Second pass: emit code. */
5908 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5909 RExC_pm_flags = pm_flags;
5914 RExC_emit_start = ri->program;
5915 RExC_emit = ri->program;
5916 RExC_emit_bound = ri->program + RExC_size + 1;
5917 pRExC_state->code_index = 0;
5919 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5920 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5924 /* XXXX To minimize changes to RE engine we always allocate
5925 3-units-long substrs field. */
5926 Newx(r->substrs, 1, struct reg_substr_data);
5927 if (RExC_recurse_count) {
5928 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5929 SAVEFREEPV(RExC_recurse);
5933 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5934 Zero(r->substrs, 1, struct reg_substr_data);
5936 #ifdef TRIE_STUDY_OPT
5938 StructCopy(&zero_scan_data, &data, scan_data_t);
5939 copyRExC_state = RExC_state;
5942 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5944 RExC_state = copyRExC_state;
5945 if (seen & REG_TOP_LEVEL_BRANCHES)
5946 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5948 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5949 StructCopy(&zero_scan_data, &data, scan_data_t);
5952 StructCopy(&zero_scan_data, &data, scan_data_t);
5955 /* Dig out information for optimizations. */
5956 r->extflags = RExC_flags; /* was pm_op */
5957 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5960 SvUTF8_on(rx); /* Unicode in it? */
5961 ri->regstclass = NULL;
5962 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5963 r->intflags |= PREGf_NAUGHTY;
5964 scan = ri->program + 1; /* First BRANCH. */
5966 /* testing for BRANCH here tells us whether there is "must appear"
5967 data in the pattern. If there is then we can use it for optimisations */
5968 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5970 STRLEN longest_float_length, longest_fixed_length;
5971 struct regnode_charclass_class ch_class; /* pointed to by data */
5973 I32 last_close = 0; /* pointed to by data */
5974 regnode *first= scan;
5975 regnode *first_next= regnext(first);
5977 * Skip introductions and multiplicators >= 1
5978 * so that we can extract the 'meat' of the pattern that must
5979 * match in the large if() sequence following.
5980 * NOTE that EXACT is NOT covered here, as it is normally
5981 * picked up by the optimiser separately.
5983 * This is unfortunate as the optimiser isnt handling lookahead
5984 * properly currently.
5987 while ((OP(first) == OPEN && (sawopen = 1)) ||
5988 /* An OR of *one* alternative - should not happen now. */
5989 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5990 /* for now we can't handle lookbehind IFMATCH*/
5991 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5992 (OP(first) == PLUS) ||
5993 (OP(first) == MINMOD) ||
5994 /* An {n,m} with n>0 */
5995 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5996 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5999 * the only op that could be a regnode is PLUS, all the rest
6000 * will be regnode_1 or regnode_2.
6003 if (OP(first) == PLUS)
6006 first += regarglen[OP(first)];
6008 first = NEXTOPER(first);
6009 first_next= regnext(first);
6012 /* Starting-point info. */
6014 DEBUG_PEEP("first:",first,0);
6015 /* Ignore EXACT as we deal with it later. */
6016 if (PL_regkind[OP(first)] == EXACT) {
6017 if (OP(first) == EXACT)
6018 NOOP; /* Empty, get anchored substr later. */
6020 ri->regstclass = first;
6023 else if (PL_regkind[OP(first)] == TRIE &&
6024 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6027 /* this can happen only on restudy */
6028 if ( OP(first) == TRIE ) {
6029 struct regnode_1 *trieop = (struct regnode_1 *)
6030 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6031 StructCopy(first,trieop,struct regnode_1);
6032 trie_op=(regnode *)trieop;
6034 struct regnode_charclass *trieop = (struct regnode_charclass *)
6035 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6036 StructCopy(first,trieop,struct regnode_charclass);
6037 trie_op=(regnode *)trieop;
6040 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6041 ri->regstclass = trie_op;
6044 else if (REGNODE_SIMPLE(OP(first)))
6045 ri->regstclass = first;
6046 else if (PL_regkind[OP(first)] == BOUND ||
6047 PL_regkind[OP(first)] == NBOUND)
6048 ri->regstclass = first;
6049 else if (PL_regkind[OP(first)] == BOL) {
6050 r->extflags |= (OP(first) == MBOL
6052 : (OP(first) == SBOL
6055 first = NEXTOPER(first);
6058 else if (OP(first) == GPOS) {
6059 r->extflags |= RXf_ANCH_GPOS;
6060 first = NEXTOPER(first);
6063 else if ((!sawopen || !RExC_sawback) &&
6064 (OP(first) == STAR &&
6065 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6066 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6068 /* turn .* into ^.* with an implied $*=1 */
6070 (OP(NEXTOPER(first)) == REG_ANY)
6073 r->extflags |= type;
6074 r->intflags |= PREGf_IMPLICIT;
6075 first = NEXTOPER(first);
6078 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6079 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6080 /* x+ must match at the 1st pos of run of x's */
6081 r->intflags |= PREGf_SKIP;
6083 /* Scan is after the zeroth branch, first is atomic matcher. */
6084 #ifdef TRIE_STUDY_OPT
6087 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6088 (IV)(first - scan + 1))
6092 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6093 (IV)(first - scan + 1))
6099 * If there's something expensive in the r.e., find the
6100 * longest literal string that must appear and make it the
6101 * regmust. Resolve ties in favor of later strings, since
6102 * the regstart check works with the beginning of the r.e.
6103 * and avoiding duplication strengthens checking. Not a
6104 * strong reason, but sufficient in the absence of others.
6105 * [Now we resolve ties in favor of the earlier string if
6106 * it happens that c_offset_min has been invalidated, since the
6107 * earlier string may buy us something the later one won't.]
6110 data.longest_fixed = newSVpvs("");
6111 data.longest_float = newSVpvs("");
6112 data.last_found = newSVpvs("");
6113 data.longest = &(data.longest_fixed);
6114 ENTER_with_name("study_chunk");
6115 SAVEFREESV(data.longest_fixed);
6116 SAVEFREESV(data.longest_float);
6117 SAVEFREESV(data.last_found);
6119 if (!ri->regstclass) {
6120 cl_init(pRExC_state, &ch_class);
6121 data.start_class = &ch_class;
6122 stclass_flag = SCF_DO_STCLASS_AND;
6123 } else /* XXXX Check for BOUND? */
6125 data.last_closep = &last_close;
6127 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6128 &data, -1, NULL, NULL,
6129 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6132 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6135 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6136 && data.last_start_min == 0 && data.last_end > 0
6137 && !RExC_seen_zerolen
6138 && !(RExC_seen & REG_SEEN_VERBARG)
6139 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6140 r->extflags |= RXf_CHECK_ALL;
6141 scan_commit(pRExC_state, &data,&minlen,0);
6143 longest_float_length = CHR_SVLEN(data.longest_float);
6145 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6146 && data.offset_fixed == data.offset_float_min
6147 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6148 && S_setup_longest (aTHX_ pRExC_state,
6152 &(r->float_end_shift),
6153 data.lookbehind_float,
6154 data.offset_float_min,
6156 longest_float_length,
6157 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6158 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6160 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6161 r->float_max_offset = data.offset_float_max;
6162 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6163 r->float_max_offset -= data.lookbehind_float;
6164 SvREFCNT_inc_simple_void_NN(data.longest_float);
6167 r->float_substr = r->float_utf8 = NULL;
6168 longest_float_length = 0;
6171 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6173 if (S_setup_longest (aTHX_ pRExC_state,
6175 &(r->anchored_utf8),
6176 &(r->anchored_substr),
6177 &(r->anchored_end_shift),
6178 data.lookbehind_fixed,
6181 longest_fixed_length,
6182 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6183 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6185 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6186 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6189 r->anchored_substr = r->anchored_utf8 = NULL;
6190 longest_fixed_length = 0;
6192 LEAVE_with_name("study_chunk");
6195 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6196 ri->regstclass = NULL;
6198 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6200 && ! TEST_SSC_EOS(data.start_class)
6201 && !cl_is_anything(data.start_class))
6203 const U32 n = add_data(pRExC_state, 1, "f");
6204 OP(data.start_class) = ANYOF_SYNTHETIC;
6206 Newx(RExC_rxi->data->data[n], 1,
6207 struct regnode_charclass_class);
6208 StructCopy(data.start_class,
6209 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6210 struct regnode_charclass_class);
6211 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6212 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6213 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6214 regprop(r, sv, (regnode*)data.start_class);
6215 PerlIO_printf(Perl_debug_log,
6216 "synthetic stclass \"%s\".\n",
6217 SvPVX_const(sv));});
6220 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6221 if (longest_fixed_length > longest_float_length) {
6222 r->check_end_shift = r->anchored_end_shift;
6223 r->check_substr = r->anchored_substr;
6224 r->check_utf8 = r->anchored_utf8;
6225 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6226 if (r->extflags & RXf_ANCH_SINGLE)
6227 r->extflags |= RXf_NOSCAN;
6230 r->check_end_shift = r->float_end_shift;
6231 r->check_substr = r->float_substr;
6232 r->check_utf8 = r->float_utf8;
6233 r->check_offset_min = r->float_min_offset;
6234 r->check_offset_max = r->float_max_offset;
6236 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6237 This should be changed ASAP! */
6238 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6239 r->extflags |= RXf_USE_INTUIT;
6240 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6241 r->extflags |= RXf_INTUIT_TAIL;
6243 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6244 if ( (STRLEN)minlen < longest_float_length )
6245 minlen= longest_float_length;
6246 if ( (STRLEN)minlen < longest_fixed_length )
6247 minlen= longest_fixed_length;
6251 /* Several toplevels. Best we can is to set minlen. */
6253 struct regnode_charclass_class ch_class;
6256 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6258 scan = ri->program + 1;
6259 cl_init(pRExC_state, &ch_class);
6260 data.start_class = &ch_class;
6261 data.last_closep = &last_close;
6264 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6265 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6267 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6269 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6270 = r->float_substr = r->float_utf8 = NULL;
6272 if (! TEST_SSC_EOS(data.start_class)
6273 && !cl_is_anything(data.start_class))
6275 const U32 n = add_data(pRExC_state, 1, "f");
6276 OP(data.start_class) = ANYOF_SYNTHETIC;
6278 Newx(RExC_rxi->data->data[n], 1,
6279 struct regnode_charclass_class);
6280 StructCopy(data.start_class,
6281 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6282 struct regnode_charclass_class);
6283 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6284 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6285 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6286 regprop(r, sv, (regnode*)data.start_class);
6287 PerlIO_printf(Perl_debug_log,
6288 "synthetic stclass \"%s\".\n",
6289 SvPVX_const(sv));});
6293 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6294 the "real" pattern. */
6296 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6297 (IV)minlen, (IV)r->minlen);
6299 r->minlenret = minlen;
6300 if (r->minlen < minlen)
6303 if (RExC_seen & REG_SEEN_GPOS)
6304 r->extflags |= RXf_GPOS_SEEN;
6305 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6306 r->extflags |= RXf_LOOKBEHIND_SEEN;
6307 if (pRExC_state->num_code_blocks)
6308 r->extflags |= RXf_EVAL_SEEN;
6309 if (RExC_seen & REG_SEEN_CANY)
6310 r->extflags |= RXf_CANY_SEEN;
6311 if (RExC_seen & REG_SEEN_VERBARG)
6313 r->intflags |= PREGf_VERBARG_SEEN;
6314 r->extflags |= RXf_MODIFIES_VARS;
6316 if (RExC_seen & REG_SEEN_CUTGROUP)
6317 r->intflags |= PREGf_CUTGROUP_SEEN;
6318 if (pm_flags & PMf_USE_RE_EVAL)
6319 r->intflags |= PREGf_USE_RE_EVAL;
6320 if (RExC_paren_names)
6321 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6323 RXp_PAREN_NAMES(r) = NULL;
6325 #ifdef STUPID_PATTERN_CHECKS
6326 if (RX_PRELEN(rx) == 0)
6327 r->extflags |= RXf_NULL;
6328 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6329 r->extflags |= RXf_WHITE;
6330 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6331 r->extflags |= RXf_START_ONLY;
6334 regnode *first = ri->program + 1;
6337 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6338 r->extflags |= RXf_NULL;
6339 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6340 r->extflags |= RXf_START_ONLY;
6341 else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6342 && OP(regnext(first)) == END)
6343 r->extflags |= RXf_WHITE;
6347 if (RExC_paren_names) {
6348 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6349 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6352 ri->name_list_idx = 0;
6354 if (RExC_recurse_count) {
6355 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6356 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6357 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6360 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6361 /* assume we don't need to swap parens around before we match */
6364 PerlIO_printf(Perl_debug_log,"Final program:\n");
6367 #ifdef RE_TRACK_PATTERN_OFFSETS
6368 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6369 const U32 len = ri->u.offsets[0];
6371 GET_RE_DEBUG_FLAGS_DECL;
6372 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6373 for (i = 1; i <= len; i++) {
6374 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6375 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6376 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6378 PerlIO_printf(Perl_debug_log, "\n");
6383 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6384 * by setting the regexp SV to readonly-only instead. If the
6385 * pattern's been recompiled, the USEDness should remain. */
6386 if (old_re && SvREADONLY(old_re))
6394 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6397 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6399 PERL_UNUSED_ARG(value);
6401 if (flags & RXapif_FETCH) {
6402 return reg_named_buff_fetch(rx, key, flags);
6403 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6404 Perl_croak_no_modify();
6406 } else if (flags & RXapif_EXISTS) {
6407 return reg_named_buff_exists(rx, key, flags)
6410 } else if (flags & RXapif_REGNAMES) {
6411 return reg_named_buff_all(rx, flags);
6412 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6413 return reg_named_buff_scalar(rx, flags);
6415 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6421 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6424 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6425 PERL_UNUSED_ARG(lastkey);
6427 if (flags & RXapif_FIRSTKEY)
6428 return reg_named_buff_firstkey(rx, flags);
6429 else if (flags & RXapif_NEXTKEY)
6430 return reg_named_buff_nextkey(rx, flags);
6432 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6438 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6441 AV *retarray = NULL;
6443 struct regexp *const rx = ReANY(r);
6445 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6447 if (flags & RXapif_ALL)
6450 if (rx && RXp_PAREN_NAMES(rx)) {
6451 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6454 SV* sv_dat=HeVAL(he_str);
6455 I32 *nums=(I32*)SvPVX(sv_dat);
6456 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6457 if ((I32)(rx->nparens) >= nums[i]
6458 && rx->offs[nums[i]].start != -1
6459 && rx->offs[nums[i]].end != -1)
6462 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6467 ret = newSVsv(&PL_sv_undef);
6470 av_push(retarray, ret);
6473 return newRV_noinc(MUTABLE_SV(retarray));
6480 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6483 struct regexp *const rx = ReANY(r);
6485 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6487 if (rx && RXp_PAREN_NAMES(rx)) {
6488 if (flags & RXapif_ALL) {
6489 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6491 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6493 SvREFCNT_dec_NN(sv);
6505 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6507 struct regexp *const rx = ReANY(r);
6509 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6511 if ( rx && RXp_PAREN_NAMES(rx) ) {
6512 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6514 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6521 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6523 struct regexp *const rx = ReANY(r);
6524 GET_RE_DEBUG_FLAGS_DECL;
6526 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6528 if (rx && RXp_PAREN_NAMES(rx)) {
6529 HV *hv = RXp_PAREN_NAMES(rx);
6531 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6534 SV* sv_dat = HeVAL(temphe);
6535 I32 *nums = (I32*)SvPVX(sv_dat);
6536 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6537 if ((I32)(rx->lastparen) >= nums[i] &&
6538 rx->offs[nums[i]].start != -1 &&
6539 rx->offs[nums[i]].end != -1)
6545 if (parno || flags & RXapif_ALL) {
6546 return newSVhek(HeKEY_hek(temphe));
6554 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6559 struct regexp *const rx = ReANY(r);
6561 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6563 if (rx && RXp_PAREN_NAMES(rx)) {
6564 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6565 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6566 } else if (flags & RXapif_ONE) {
6567 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6568 av = MUTABLE_AV(SvRV(ret));
6569 length = av_len(av);
6570 SvREFCNT_dec_NN(ret);
6571 return newSViv(length + 1);
6573 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6577 return &PL_sv_undef;
6581 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6583 struct regexp *const rx = ReANY(r);
6586 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6588 if (rx && RXp_PAREN_NAMES(rx)) {
6589 HV *hv= RXp_PAREN_NAMES(rx);
6591 (void)hv_iterinit(hv);
6592 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6595 SV* sv_dat = HeVAL(temphe);
6596 I32 *nums = (I32*)SvPVX(sv_dat);
6597 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6598 if ((I32)(rx->lastparen) >= nums[i] &&
6599 rx->offs[nums[i]].start != -1 &&
6600 rx->offs[nums[i]].end != -1)
6606 if (parno || flags & RXapif_ALL) {
6607 av_push(av, newSVhek(HeKEY_hek(temphe)));
6612 return newRV_noinc(MUTABLE_SV(av));
6616 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6619 struct regexp *const rx = ReANY(r);
6625 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6627 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6628 || n == RX_BUFF_IDX_CARET_FULLMATCH
6629 || n == RX_BUFF_IDX_CARET_POSTMATCH
6631 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6638 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6639 /* no need to distinguish between them any more */
6640 n = RX_BUFF_IDX_FULLMATCH;
6642 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6643 && rx->offs[0].start != -1)
6645 /* $`, ${^PREMATCH} */
6646 i = rx->offs[0].start;
6650 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6651 && rx->offs[0].end != -1)
6653 /* $', ${^POSTMATCH} */
6654 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6655 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6658 if ( 0 <= n && n <= (I32)rx->nparens &&
6659 (s1 = rx->offs[n].start) != -1 &&
6660 (t1 = rx->offs[n].end) != -1)
6662 /* $&, ${^MATCH}, $1 ... */
6664 s = rx->subbeg + s1 - rx->suboffset;
6669 assert(s >= rx->subbeg);
6670 assert(rx->sublen >= (s - rx->subbeg) + i );
6672 #if NO_TAINT_SUPPORT
6673 sv_setpvn(sv, s, i);
6675 const int oldtainted = TAINT_get;
6677 sv_setpvn(sv, s, i);
6678 TAINT_set(oldtainted);
6680 if ( (rx->extflags & RXf_CANY_SEEN)
6681 ? (RXp_MATCH_UTF8(rx)
6682 && (!i || is_utf8_string((U8*)s, i)))
6683 : (RXp_MATCH_UTF8(rx)) )
6690 if (RXp_MATCH_TAINTED(rx)) {
6691 if (SvTYPE(sv) >= SVt_PVMG) {
6692 MAGIC* const mg = SvMAGIC(sv);
6695 SvMAGIC_set(sv, mg->mg_moremagic);
6697 if ((mgt = SvMAGIC(sv))) {
6698 mg->mg_moremagic = mgt;
6699 SvMAGIC_set(sv, mg);
6710 sv_setsv(sv,&PL_sv_undef);
6716 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6717 SV const * const value)
6719 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6721 PERL_UNUSED_ARG(rx);
6722 PERL_UNUSED_ARG(paren);
6723 PERL_UNUSED_ARG(value);
6726 Perl_croak_no_modify();
6730 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6733 struct regexp *const rx = ReANY(r);
6737 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6739 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6741 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6742 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6746 case RX_BUFF_IDX_PREMATCH: /* $` */
6747 if (rx->offs[0].start != -1) {
6748 i = rx->offs[0].start;
6757 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6758 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6760 case RX_BUFF_IDX_POSTMATCH: /* $' */
6761 if (rx->offs[0].end != -1) {
6762 i = rx->sublen - rx->offs[0].end;
6764 s1 = rx->offs[0].end;
6771 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6772 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6776 /* $& / ${^MATCH}, $1, $2, ... */
6778 if (paren <= (I32)rx->nparens &&
6779 (s1 = rx->offs[paren].start) != -1 &&
6780 (t1 = rx->offs[paren].end) != -1)
6786 if (ckWARN(WARN_UNINITIALIZED))
6787 report_uninit((const SV *)sv);
6792 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6793 const char * const s = rx->subbeg - rx->suboffset + s1;
6798 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6805 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6807 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6808 PERL_UNUSED_ARG(rx);
6812 return newSVpvs("Regexp");
6815 /* Scans the name of a named buffer from the pattern.
6816 * If flags is REG_RSN_RETURN_NULL returns null.
6817 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6818 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6819 * to the parsed name as looked up in the RExC_paren_names hash.
6820 * If there is an error throws a vFAIL().. type exception.
6823 #define REG_RSN_RETURN_NULL 0
6824 #define REG_RSN_RETURN_NAME 1
6825 #define REG_RSN_RETURN_DATA 2
6828 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6830 char *name_start = RExC_parse;
6832 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6834 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6835 /* skip IDFIRST by using do...while */
6838 RExC_parse += UTF8SKIP(RExC_parse);
6839 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6843 } while (isWORDCHAR(*RExC_parse));
6845 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6846 vFAIL("Group name must start with a non-digit word character");
6850 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6851 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6852 if ( flags == REG_RSN_RETURN_NAME)
6854 else if (flags==REG_RSN_RETURN_DATA) {
6857 if ( ! sv_name ) /* should not happen*/
6858 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6859 if (RExC_paren_names)
6860 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6862 sv_dat = HeVAL(he_str);
6864 vFAIL("Reference to nonexistent named group");
6868 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6869 (unsigned long) flags);
6871 assert(0); /* NOT REACHED */
6876 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6877 int rem=(int)(RExC_end - RExC_parse); \
6886 if (RExC_lastparse!=RExC_parse) \
6887 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6890 iscut ? "..." : "<" \
6893 PerlIO_printf(Perl_debug_log,"%16s",""); \
6896 num = RExC_size + 1; \
6898 num=REG_NODE_NUM(RExC_emit); \
6899 if (RExC_lastnum!=num) \
6900 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6902 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6903 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6904 (int)((depth*2)), "", \
6908 RExC_lastparse=RExC_parse; \
6913 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6914 DEBUG_PARSE_MSG((funcname)); \
6915 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6917 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6918 DEBUG_PARSE_MSG((funcname)); \
6919 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6922 /* This section of code defines the inversion list object and its methods. The
6923 * interfaces are highly subject to change, so as much as possible is static to
6924 * this file. An inversion list is here implemented as a malloc'd C UV array
6925 * with some added info that is placed as UVs at the beginning in a header
6926 * portion. An inversion list for Unicode is an array of code points, sorted
6927 * by ordinal number. The zeroth element is the first code point in the list.
6928 * The 1th element is the first element beyond that not in the list. In other
6929 * words, the first range is
6930 * invlist[0]..(invlist[1]-1)
6931 * The other ranges follow. Thus every element whose index is divisible by two
6932 * marks the beginning of a range that is in the list, and every element not
6933 * divisible by two marks the beginning of a range not in the list. A single
6934 * element inversion list that contains the single code point N generally
6935 * consists of two elements
6938 * (The exception is when N is the highest representable value on the
6939 * machine, in which case the list containing just it would be a single
6940 * element, itself. By extension, if the last range in the list extends to
6941 * infinity, then the first element of that range will be in the inversion list
6942 * at a position that is divisible by two, and is the final element in the
6944 * Taking the complement (inverting) an inversion list is quite simple, if the
6945 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6946 * This implementation reserves an element at the beginning of each inversion
6947 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6948 * actual beginning of the list is either that element if 0, or the next one if
6951 * More about inversion lists can be found in "Unicode Demystified"
6952 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6953 * More will be coming when functionality is added later.
6955 * The inversion list data structure is currently implemented as an SV pointing
6956 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6957 * array of UV whose memory management is automatically handled by the existing
6958 * facilities for SV's.
6960 * Some of the methods should always be private to the implementation, and some
6961 * should eventually be made public */
6963 /* The header definitions are in F<inline_invlist.c> */
6964 #define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
6965 #define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
6967 #define INVLIST_INITIAL_LEN 10
6969 PERL_STATIC_INLINE UV*
6970 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6972 /* Returns a pointer to the first element in the inversion list's array.
6973 * This is called upon initialization of an inversion list. Where the
6974 * array begins depends on whether the list has the code point U+0000
6975 * in it or not. The other parameter tells it whether the code that
6976 * follows this call is about to put a 0 in the inversion list or not.
6977 * The first element is either the element with 0, if 0, or the next one,
6980 UV* zero = get_invlist_zero_addr(invlist);
6982 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6985 assert(! *_get_invlist_len_addr(invlist));
6987 /* 1^1 = 0; 1^0 = 1 */
6988 *zero = 1 ^ will_have_0;
6989 return zero + *zero;
6992 PERL_STATIC_INLINE UV*
6993 S_invlist_array(pTHX_ SV* const invlist)
6995 /* Returns the pointer to the inversion list's array. Every time the
6996 * length changes, this needs to be called in case malloc or realloc moved
6999 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7001 /* Must not be empty. If these fail, you probably didn't check for <len>
7002 * being non-zero before trying to get the array */
7003 assert(*_get_invlist_len_addr(invlist));
7004 assert(*get_invlist_zero_addr(invlist) == 0
7005 || *get_invlist_zero_addr(invlist) == 1);
7007 /* The array begins either at the element reserved for zero if the
7008 * list contains 0 (that element will be set to 0), or otherwise the next
7009 * element (in which case the reserved element will be set to 1). */
7010 return (UV *) (get_invlist_zero_addr(invlist)
7011 + *get_invlist_zero_addr(invlist));
7014 PERL_STATIC_INLINE void
7015 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7017 /* Sets the current number of elements stored in the inversion list */
7019 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7021 *_get_invlist_len_addr(invlist) = len;
7023 assert(len <= SvLEN(invlist));
7025 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7026 /* If the list contains U+0000, that element is part of the header,
7027 * and should not be counted as part of the array. It will contain
7028 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7030 * SvCUR_set(invlist,
7031 * TO_INTERNAL_SIZE(len
7032 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7033 * But, this is only valid if len is not 0. The consequences of not doing
7034 * this is that the memory allocation code may think that 1 more UV is
7035 * being used than actually is, and so might do an unnecessary grow. That
7036 * seems worth not bothering to make this the precise amount.
7038 * Note that when inverting, SvCUR shouldn't change */
7041 PERL_STATIC_INLINE IV*
7042 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7044 /* Return the address of the UV that is reserved to hold the cached index
7047 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7049 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7052 PERL_STATIC_INLINE IV
7053 S_invlist_previous_index(pTHX_ SV* const invlist)
7055 /* Returns cached index of previous search */
7057 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7059 return *get_invlist_previous_index_addr(invlist);
7062 PERL_STATIC_INLINE void
7063 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7065 /* Caches <index> for later retrieval */
7067 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7069 assert(index == 0 || index < (int) _invlist_len(invlist));
7071 *get_invlist_previous_index_addr(invlist) = index;
7074 PERL_STATIC_INLINE UV
7075 S_invlist_max(pTHX_ SV* const invlist)
7077 /* Returns the maximum number of elements storable in the inversion list's
7078 * array, without having to realloc() */
7080 PERL_ARGS_ASSERT_INVLIST_MAX;
7082 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7083 ? _invlist_len(invlist)
7084 : FROM_INTERNAL_SIZE(SvLEN(invlist));
7087 PERL_STATIC_INLINE UV*
7088 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7090 /* Return the address of the UV that is reserved to hold 0 if the inversion
7091 * list contains 0. This has to be the last element of the heading, as the
7092 * list proper starts with either it if 0, or the next element if not.
7093 * (But we force it to contain either 0 or 1) */
7095 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7097 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7100 #ifndef PERL_IN_XSUB_RE
7102 Perl__new_invlist(pTHX_ IV initial_size)
7105 /* Return a pointer to a newly constructed inversion list, with enough
7106 * space to store 'initial_size' elements. If that number is negative, a
7107 * system default is used instead */
7111 if (initial_size < 0) {
7112 initial_size = INVLIST_INITIAL_LEN;
7115 /* Allocate the initial space */
7116 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7117 invlist_set_len(new_list, 0);
7119 /* Force iterinit() to be used to get iteration to work */
7120 *get_invlist_iter_addr(new_list) = UV_MAX;
7122 /* This should force a segfault if a method doesn't initialize this
7124 *get_invlist_zero_addr(new_list) = UV_MAX;
7126 *get_invlist_previous_index_addr(new_list) = 0;
7127 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7128 #if HEADER_LENGTH != 5
7129 # error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7137 S__new_invlist_C_array(pTHX_ UV* list)
7139 /* Return a pointer to a newly constructed inversion list, initialized to
7140 * point to <list>, which has to be in the exact correct inversion list
7141 * form, including internal fields. Thus this is a dangerous routine that
7142 * should not be used in the wrong hands */
7144 SV* invlist = newSV_type(SVt_PV);
7146 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7148 SvPV_set(invlist, (char *) list);
7149 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7150 shouldn't touch it */
7151 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7153 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7154 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7157 /* Initialize the iteration pointer.
7158 * XXX This could be done at compile time in charclass_invlists.h, but I
7159 * (khw) am not confident that the suffixes for specifying the C constant
7160 * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
7161 * to use 64 bits; might need a Configure probe */
7162 invlist_iterfinish(invlist);
7168 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7170 /* Grow the maximum size of an inversion list */
7172 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7174 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7177 PERL_STATIC_INLINE void
7178 S_invlist_trim(pTHX_ SV* const invlist)
7180 PERL_ARGS_ASSERT_INVLIST_TRIM;
7182 /* Change the length of the inversion list to how many entries it currently
7185 SvPV_shrink_to_cur((SV *) invlist);
7188 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7191 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7193 /* Subject to change or removal. Append the range from 'start' to 'end' at
7194 * the end of the inversion list. The range must be above any existing
7198 UV max = invlist_max(invlist);
7199 UV len = _invlist_len(invlist);
7201 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7203 if (len == 0) { /* Empty lists must be initialized */
7204 array = _invlist_array_init(invlist, start == 0);
7207 /* Here, the existing list is non-empty. The current max entry in the
7208 * list is generally the first value not in the set, except when the
7209 * set extends to the end of permissible values, in which case it is
7210 * the first entry in that final set, and so this call is an attempt to
7211 * append out-of-order */
7213 UV final_element = len - 1;
7214 array = invlist_array(invlist);
7215 if (array[final_element] > start
7216 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7218 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7219 array[final_element], start,
7220 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7223 /* Here, it is a legal append. If the new range begins with the first
7224 * value not in the set, it is extending the set, so the new first
7225 * value not in the set is one greater than the newly extended range.
7227 if (array[final_element] == start) {
7228 if (end != UV_MAX) {
7229 array[final_element] = end + 1;
7232 /* But if the end is the maximum representable on the machine,
7233 * just let the range that this would extend to have no end */
7234 invlist_set_len(invlist, len - 1);
7240 /* Here the new range doesn't extend any existing set. Add it */
7242 len += 2; /* Includes an element each for the start and end of range */
7244 /* If overflows the existing space, extend, which may cause the array to be
7247 invlist_extend(invlist, len);
7248 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7249 failure in invlist_array() */
7250 array = invlist_array(invlist);
7253 invlist_set_len(invlist, len);
7256 /* The next item on the list starts the range, the one after that is
7257 * one past the new range. */
7258 array[len - 2] = start;
7259 if (end != UV_MAX) {
7260 array[len - 1] = end + 1;
7263 /* But if the end is the maximum representable on the machine, just let
7264 * the range have no end */
7265 invlist_set_len(invlist, len - 1);
7269 #ifndef PERL_IN_XSUB_RE
7272 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7274 /* Searches the inversion list for the entry that contains the input code
7275 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7276 * return value is the index into the list's array of the range that
7281 IV high = _invlist_len(invlist);
7282 const IV highest_element = high - 1;
7285 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7287 /* If list is empty, return failure. */
7292 /* (We can't get the array unless we know the list is non-empty) */
7293 array = invlist_array(invlist);
7295 mid = invlist_previous_index(invlist);
7296 assert(mid >=0 && mid <= highest_element);
7298 /* <mid> contains the cache of the result of the previous call to this
7299 * function (0 the first time). See if this call is for the same result,
7300 * or if it is for mid-1. This is under the theory that calls to this
7301 * function will often be for related code points that are near each other.
7302 * And benchmarks show that caching gives better results. We also test
7303 * here if the code point is within the bounds of the list. These tests
7304 * replace others that would have had to be made anyway to make sure that
7305 * the array bounds were not exceeded, and these give us extra information
7306 * at the same time */
7307 if (cp >= array[mid]) {
7308 if (cp >= array[highest_element]) {
7309 return highest_element;
7312 /* Here, array[mid] <= cp < array[highest_element]. This means that
7313 * the final element is not the answer, so can exclude it; it also
7314 * means that <mid> is not the final element, so can refer to 'mid + 1'
7316 if (cp < array[mid + 1]) {
7322 else { /* cp < aray[mid] */
7323 if (cp < array[0]) { /* Fail if outside the array */
7327 if (cp >= array[mid - 1]) {
7332 /* Binary search. What we are looking for is <i> such that
7333 * array[i] <= cp < array[i+1]
7334 * The loop below converges on the i+1. Note that there may not be an
7335 * (i+1)th element in the array, and things work nonetheless */
7336 while (low < high) {
7337 mid = (low + high) / 2;
7338 assert(mid <= highest_element);
7339 if (array[mid] <= cp) { /* cp >= array[mid] */
7342 /* We could do this extra test to exit the loop early.
7343 if (cp < array[low]) {
7348 else { /* cp < array[mid] */
7355 invlist_set_previous_index(invlist, high);
7360 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7362 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7363 * but is used when the swash has an inversion list. This makes this much
7364 * faster, as it uses a binary search instead of a linear one. This is
7365 * intimately tied to that function, and perhaps should be in utf8.c,
7366 * except it is intimately tied to inversion lists as well. It assumes
7367 * that <swatch> is all 0's on input */
7370 const IV len = _invlist_len(invlist);
7374 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7376 if (len == 0) { /* Empty inversion list */
7380 array = invlist_array(invlist);
7382 /* Find which element it is */
7383 i = _invlist_search(invlist, start);
7385 /* We populate from <start> to <end> */
7386 while (current < end) {
7389 /* The inversion list gives the results for every possible code point
7390 * after the first one in the list. Only those ranges whose index is
7391 * even are ones that the inversion list matches. For the odd ones,
7392 * and if the initial code point is not in the list, we have to skip
7393 * forward to the next element */
7394 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7396 if (i >= len) { /* Finished if beyond the end of the array */
7400 if (current >= end) { /* Finished if beyond the end of what we
7402 if (LIKELY(end < UV_MAX)) {
7406 /* We get here when the upper bound is the maximum
7407 * representable on the machine, and we are looking for just
7408 * that code point. Have to special case it */
7410 goto join_end_of_list;
7413 assert(current >= start);
7415 /* The current range ends one below the next one, except don't go past
7418 upper = (i < len && array[i] < end) ? array[i] : end;
7420 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7421 * for each code point in it */
7422 for (; current < upper; current++) {
7423 const STRLEN offset = (STRLEN)(current - start);
7424 swatch[offset >> 3] |= 1 << (offset & 7);
7429 /* Quit if at the end of the list */
7432 /* But first, have to deal with the highest possible code point on
7433 * the platform. The previous code assumes that <end> is one
7434 * beyond where we want to populate, but that is impossible at the
7435 * platform's infinity, so have to handle it specially */
7436 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7438 const STRLEN offset = (STRLEN)(end - start);
7439 swatch[offset >> 3] |= 1 << (offset & 7);
7444 /* Advance to the next range, which will be for code points not in the
7453 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7455 /* Take the union of two inversion lists and point <output> to it. *output
7456 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7457 * the reference count to that list will be decremented. The first list,
7458 * <a>, may be NULL, in which case a copy of the second list is returned.
7459 * If <complement_b> is TRUE, the union is taken of the complement
7460 * (inversion) of <b> instead of b itself.
7462 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7463 * Richard Gillam, published by Addison-Wesley, and explained at some
7464 * length there. The preface says to incorporate its examples into your
7465 * code at your own risk.
7467 * The algorithm is like a merge sort.
7469 * XXX A potential performance improvement is to keep track as we go along
7470 * if only one of the inputs contributes to the result, meaning the other
7471 * is a subset of that one. In that case, we can skip the final copy and
7472 * return the larger of the input lists, but then outside code might need
7473 * to keep track of whether to free the input list or not */
7475 UV* array_a; /* a's array */
7477 UV len_a; /* length of a's array */
7480 SV* u; /* the resulting union */
7484 UV i_a = 0; /* current index into a's array */
7488 /* running count, as explained in the algorithm source book; items are
7489 * stopped accumulating and are output when the count changes to/from 0.
7490 * The count is incremented when we start a range that's in the set, and
7491 * decremented when we start a range that's not in the set. So its range
7492 * is 0 to 2. Only when the count is zero is something not in the set.
7496 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7499 /* If either one is empty, the union is the other one */
7500 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7507 *output = invlist_clone(b);
7509 _invlist_invert(*output);
7511 } /* else *output already = b; */
7514 else if ((len_b = _invlist_len(b)) == 0) {
7519 /* The complement of an empty list is a list that has everything in it,
7520 * so the union with <a> includes everything too */
7525 *output = _new_invlist(1);
7526 _append_range_to_invlist(*output, 0, UV_MAX);
7528 else if (*output != a) {
7529 *output = invlist_clone(a);
7531 /* else *output already = a; */
7535 /* Here both lists exist and are non-empty */
7536 array_a = invlist_array(a);
7537 array_b = invlist_array(b);
7539 /* If are to take the union of 'a' with the complement of b, set it
7540 * up so are looking at b's complement. */
7543 /* To complement, we invert: if the first element is 0, remove it. To
7544 * do this, we just pretend the array starts one later, and clear the
7545 * flag as we don't have to do anything else later */
7546 if (array_b[0] == 0) {
7549 complement_b = FALSE;
7553 /* But if the first element is not zero, we unshift a 0 before the
7554 * array. The data structure reserves a space for that 0 (which
7555 * should be a '1' right now), so physical shifting is unneeded,
7556 * but temporarily change that element to 0. Before exiting the
7557 * routine, we must restore the element to '1' */
7564 /* Size the union for the worst case: that the sets are completely
7566 u = _new_invlist(len_a + len_b);
7568 /* Will contain U+0000 if either component does */
7569 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7570 || (len_b > 0 && array_b[0] == 0));
7572 /* Go through each list item by item, stopping when exhausted one of
7574 while (i_a < len_a && i_b < len_b) {
7575 UV cp; /* The element to potentially add to the union's array */
7576 bool cp_in_set; /* is it in the the input list's set or not */
7578 /* We need to take one or the other of the two inputs for the union.
7579 * Since we are merging two sorted lists, we take the smaller of the
7580 * next items. In case of a tie, we take the one that is in its set
7581 * first. If we took one not in the set first, it would decrement the
7582 * count, possibly to 0 which would cause it to be output as ending the
7583 * range, and the next time through we would take the same number, and
7584 * output it again as beginning the next range. By doing it the
7585 * opposite way, there is no possibility that the count will be
7586 * momentarily decremented to 0, and thus the two adjoining ranges will
7587 * be seamlessly merged. (In a tie and both are in the set or both not
7588 * in the set, it doesn't matter which we take first.) */
7589 if (array_a[i_a] < array_b[i_b]
7590 || (array_a[i_a] == array_b[i_b]
7591 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7593 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7597 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7598 cp = array_b[i_b++];
7601 /* Here, have chosen which of the two inputs to look at. Only output
7602 * if the running count changes to/from 0, which marks the
7603 * beginning/end of a range in that's in the set */
7606 array_u[i_u++] = cp;
7613 array_u[i_u++] = cp;
7618 /* Here, we are finished going through at least one of the lists, which
7619 * means there is something remaining in at most one. We check if the list
7620 * that hasn't been exhausted is positioned such that we are in the middle
7621 * of a range in its set or not. (i_a and i_b point to the element beyond
7622 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7623 * is potentially more to output.
7624 * There are four cases:
7625 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7626 * in the union is entirely from the non-exhausted set.
7627 * 2) Both were in their sets, count is 2. Nothing further should
7628 * be output, as everything that remains will be in the exhausted
7629 * list's set, hence in the union; decrementing to 1 but not 0 insures
7631 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7632 * Nothing further should be output because the union includes
7633 * everything from the exhausted set. Not decrementing ensures that.
7634 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7635 * decrementing to 0 insures that we look at the remainder of the
7636 * non-exhausted set */
7637 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7638 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7643 /* The final length is what we've output so far, plus what else is about to
7644 * be output. (If 'count' is non-zero, then the input list we exhausted
7645 * has everything remaining up to the machine's limit in its set, and hence
7646 * in the union, so there will be no further output. */
7649 /* At most one of the subexpressions will be non-zero */
7650 len_u += (len_a - i_a) + (len_b - i_b);
7653 /* Set result to final length, which can change the pointer to array_u, so
7655 if (len_u != _invlist_len(u)) {
7656 invlist_set_len(u, len_u);
7658 array_u = invlist_array(u);
7661 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7662 * the other) ended with everything above it not in its set. That means
7663 * that the remaining part of the union is precisely the same as the
7664 * non-exhausted list, so can just copy it unchanged. (If both list were
7665 * exhausted at the same time, then the operations below will be both 0.)
7668 IV copy_count; /* At most one will have a non-zero copy count */
7669 if ((copy_count = len_a - i_a) > 0) {
7670 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7672 else if ((copy_count = len_b - i_b) > 0) {
7673 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7677 /* If we've changed b, restore it */
7682 /* We may be removing a reference to one of the inputs */
7683 if (a == *output || b == *output) {
7684 assert(! invlist_is_iterating(*output));
7685 SvREFCNT_dec_NN(*output);
7693 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7695 /* Take the intersection of two inversion lists and point <i> to it. *i
7696 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7697 * the reference count to that list will be decremented.
7698 * If <complement_b> is TRUE, the result will be the intersection of <a>
7699 * and the complement (or inversion) of <b> instead of <b> directly.
7701 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7702 * Richard Gillam, published by Addison-Wesley, and explained at some
7703 * length there. The preface says to incorporate its examples into your
7704 * code at your own risk. In fact, it had bugs
7706 * The algorithm is like a merge sort, and is essentially the same as the
7710 UV* array_a; /* a's array */
7712 UV len_a; /* length of a's array */
7715 SV* r; /* the resulting intersection */
7719 UV i_a = 0; /* current index into a's array */
7723 /* running count, as explained in the algorithm source book; items are
7724 * stopped accumulating and are output when the count changes to/from 2.
7725 * The count is incremented when we start a range that's in the set, and
7726 * decremented when we start a range that's not in the set. So its range
7727 * is 0 to 2. Only when the count is 2 is something in the intersection.
7731 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7734 /* Special case if either one is empty */
7735 len_a = _invlist_len(a);
7736 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7738 if (len_a != 0 && complement_b) {
7740 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7741 * be empty. Here, also we are using 'b's complement, which hence
7742 * must be every possible code point. Thus the intersection is
7745 *i = invlist_clone(a);
7751 /* else *i is already 'a' */
7755 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7756 * intersection must be empty */
7763 *i = _new_invlist(0);
7767 /* Here both lists exist and are non-empty */
7768 array_a = invlist_array(a);
7769 array_b = invlist_array(b);
7771 /* If are to take the intersection of 'a' with the complement of b, set it
7772 * up so are looking at b's complement. */
7775 /* To complement, we invert: if the first element is 0, remove it. To
7776 * do this, we just pretend the array starts one later, and clear the
7777 * flag as we don't have to do anything else later */
7778 if (array_b[0] == 0) {
7781 complement_b = FALSE;
7785 /* But if the first element is not zero, we unshift a 0 before the
7786 * array. The data structure reserves a space for that 0 (which
7787 * should be a '1' right now), so physical shifting is unneeded,
7788 * but temporarily change that element to 0. Before exiting the
7789 * routine, we must restore the element to '1' */
7796 /* Size the intersection for the worst case: that the intersection ends up
7797 * fragmenting everything to be completely disjoint */
7798 r= _new_invlist(len_a + len_b);
7800 /* Will contain U+0000 iff both components do */
7801 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7802 && len_b > 0 && array_b[0] == 0);
7804 /* Go through each list item by item, stopping when exhausted one of
7806 while (i_a < len_a && i_b < len_b) {
7807 UV cp; /* The element to potentially add to the intersection's
7809 bool cp_in_set; /* Is it in the input list's set or not */
7811 /* We need to take one or the other of the two inputs for the
7812 * intersection. Since we are merging two sorted lists, we take the
7813 * smaller of the next items. In case of a tie, we take the one that
7814 * is not in its set first (a difference from the union algorithm). If
7815 * we took one in the set first, it would increment the count, possibly
7816 * to 2 which would cause it to be output as starting a range in the
7817 * intersection, and the next time through we would take that same
7818 * number, and output it again as ending the set. By doing it the
7819 * opposite of this, there is no possibility that the count will be
7820 * momentarily incremented to 2. (In a tie and both are in the set or
7821 * both not in the set, it doesn't matter which we take first.) */
7822 if (array_a[i_a] < array_b[i_b]
7823 || (array_a[i_a] == array_b[i_b]
7824 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7826 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7830 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7834 /* Here, have chosen which of the two inputs to look at. Only output
7835 * if the running count changes to/from 2, which marks the
7836 * beginning/end of a range that's in the intersection */
7840 array_r[i_r++] = cp;
7845 array_r[i_r++] = cp;
7851 /* Here, we are finished going through at least one of the lists, which
7852 * means there is something remaining in at most one. We check if the list
7853 * that has been exhausted is positioned such that we are in the middle
7854 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7855 * the ones we care about.) There are four cases:
7856 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7857 * nothing left in the intersection.
7858 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7859 * above 2. What should be output is exactly that which is in the
7860 * non-exhausted set, as everything it has is also in the intersection
7861 * set, and everything it doesn't have can't be in the intersection
7862 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7863 * gets incremented to 2. Like the previous case, the intersection is
7864 * everything that remains in the non-exhausted set.
7865 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7866 * remains 1. And the intersection has nothing more. */
7867 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7868 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7873 /* The final length is what we've output so far plus what else is in the
7874 * intersection. At most one of the subexpressions below will be non-zero */
7877 len_r += (len_a - i_a) + (len_b - i_b);
7880 /* Set result to final length, which can change the pointer to array_r, so
7882 if (len_r != _invlist_len(r)) {
7883 invlist_set_len(r, len_r);
7885 array_r = invlist_array(r);
7888 /* Finish outputting any remaining */
7889 if (count >= 2) { /* At most one will have a non-zero copy count */
7891 if ((copy_count = len_a - i_a) > 0) {
7892 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7894 else if ((copy_count = len_b - i_b) > 0) {
7895 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7899 /* If we've changed b, restore it */
7904 /* We may be removing a reference to one of the inputs */
7905 if (a == *i || b == *i) {
7906 assert(! invlist_is_iterating(*i));
7907 SvREFCNT_dec_NN(*i);
7915 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7917 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7918 * set. A pointer to the inversion list is returned. This may actually be
7919 * a new list, in which case the passed in one has been destroyed. The
7920 * passed in inversion list can be NULL, in which case a new one is created
7921 * with just the one range in it */
7926 if (invlist == NULL) {
7927 invlist = _new_invlist(2);
7931 len = _invlist_len(invlist);
7934 /* If comes after the final entry actually in the list, can just append it
7937 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
7938 && start >= invlist_array(invlist)[len - 1]))
7940 _append_range_to_invlist(invlist, start, end);
7944 /* Here, can't just append things, create and return a new inversion list
7945 * which is the union of this range and the existing inversion list */
7946 range_invlist = _new_invlist(2);
7947 _append_range_to_invlist(range_invlist, start, end);
7949 _invlist_union(invlist, range_invlist, &invlist);
7951 /* The temporary can be freed */
7952 SvREFCNT_dec_NN(range_invlist);
7959 PERL_STATIC_INLINE SV*
7960 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7961 return _add_range_to_invlist(invlist, cp, cp);
7964 #ifndef PERL_IN_XSUB_RE
7966 Perl__invlist_invert(pTHX_ SV* const invlist)
7968 /* Complement the input inversion list. This adds a 0 if the list didn't
7969 * have a zero; removes it otherwise. As described above, the data
7970 * structure is set up so that this is very efficient */
7972 UV* len_pos = _get_invlist_len_addr(invlist);
7974 PERL_ARGS_ASSERT__INVLIST_INVERT;
7976 assert(! invlist_is_iterating(invlist));
7978 /* The inverse of matching nothing is matching everything */
7979 if (*len_pos == 0) {
7980 _append_range_to_invlist(invlist, 0, UV_MAX);
7984 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7985 * zero element was a 0, so it is being removed, so the length decrements
7986 * by 1; and vice-versa. SvCUR is unaffected */
7987 if (*get_invlist_zero_addr(invlist) ^= 1) {
7996 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7998 /* Complement the input inversion list (which must be a Unicode property,
7999 * all of which don't match above the Unicode maximum code point.) And
8000 * Perl has chosen to not have the inversion match above that either. This
8001 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8007 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8009 _invlist_invert(invlist);
8011 len = _invlist_len(invlist);
8013 if (len != 0) { /* If empty do nothing */
8014 array = invlist_array(invlist);
8015 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8016 /* Add 0x110000. First, grow if necessary */
8018 if (invlist_max(invlist) < len) {
8019 invlist_extend(invlist, len);
8020 array = invlist_array(invlist);
8022 invlist_set_len(invlist, len);
8023 array[len - 1] = PERL_UNICODE_MAX + 1;
8025 else { /* Remove the 0x110000 */
8026 invlist_set_len(invlist, len - 1);
8034 PERL_STATIC_INLINE SV*
8035 S_invlist_clone(pTHX_ SV* const invlist)
8038 /* Return a new inversion list that is a copy of the input one, which is
8041 /* Need to allocate extra space to accommodate Perl's addition of a
8042 * trailing NUL to SvPV's, since it thinks they are always strings */
8043 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8044 STRLEN length = SvCUR(invlist);
8046 PERL_ARGS_ASSERT_INVLIST_CLONE;
8048 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8049 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8054 PERL_STATIC_INLINE UV*
8055 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8057 /* Return the address of the UV that contains the current iteration
8060 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8062 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8065 PERL_STATIC_INLINE UV*
8066 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8068 /* Return the address of the UV that contains the version id. */
8070 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8072 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8075 PERL_STATIC_INLINE void
8076 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8078 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8080 *get_invlist_iter_addr(invlist) = 0;
8083 PERL_STATIC_INLINE void
8084 S_invlist_iterfinish(pTHX_ SV* invlist)
8086 /* Terminate iterator for invlist. This is to catch development errors.
8087 * Any iteration that is interrupted before completed should call this
8088 * function. Functions that add code points anywhere else but to the end
8089 * of an inversion list assert that they are not in the middle of an
8090 * iteration. If they were, the addition would make the iteration
8091 * problematical: if the iteration hadn't reached the place where things
8092 * were being added, it would be ok */
8094 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8096 *get_invlist_iter_addr(invlist) = UV_MAX;
8100 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8102 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8103 * This call sets in <*start> and <*end>, the next range in <invlist>.
8104 * Returns <TRUE> if successful and the next call will return the next
8105 * range; <FALSE> if was already at the end of the list. If the latter,
8106 * <*start> and <*end> are unchanged, and the next call to this function
8107 * will start over at the beginning of the list */
8109 UV* pos = get_invlist_iter_addr(invlist);
8110 UV len = _invlist_len(invlist);
8113 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8116 *pos = UV_MAX; /* Force iterinit() to be required next time */
8120 array = invlist_array(invlist);
8122 *start = array[(*pos)++];
8128 *end = array[(*pos)++] - 1;
8134 PERL_STATIC_INLINE bool
8135 S_invlist_is_iterating(pTHX_ SV* const invlist)
8137 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8139 return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8142 PERL_STATIC_INLINE UV
8143 S_invlist_highest(pTHX_ SV* const invlist)
8145 /* Returns the highest code point that matches an inversion list. This API
8146 * has an ambiguity, as it returns 0 under either the highest is actually
8147 * 0, or if the list is empty. If this distinction matters to you, check
8148 * for emptiness before calling this function */
8150 UV len = _invlist_len(invlist);
8153 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8159 array = invlist_array(invlist);
8161 /* The last element in the array in the inversion list always starts a
8162 * range that goes to infinity. That range may be for code points that are
8163 * matched in the inversion list, or it may be for ones that aren't
8164 * matched. In the latter case, the highest code point in the set is one
8165 * less than the beginning of this range; otherwise it is the final element
8166 * of this range: infinity */
8167 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8169 : array[len - 1] - 1;
8172 #ifndef PERL_IN_XSUB_RE
8174 Perl__invlist_contents(pTHX_ SV* const invlist)
8176 /* Get the contents of an inversion list into a string SV so that they can
8177 * be printed out. It uses the format traditionally done for debug tracing
8181 SV* output = newSVpvs("\n");
8183 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8185 assert(! invlist_is_iterating(invlist));
8187 invlist_iterinit(invlist);
8188 while (invlist_iternext(invlist, &start, &end)) {
8189 if (end == UV_MAX) {
8190 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8192 else if (end != start) {
8193 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8197 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8205 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8207 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8209 /* Dumps out the ranges in an inversion list. The string 'header'
8210 * if present is output on a line before the first range */
8214 PERL_ARGS_ASSERT__INVLIST_DUMP;
8216 if (header && strlen(header)) {
8217 PerlIO_printf(Perl_debug_log, "%s\n", header);
8219 if (invlist_is_iterating(invlist)) {
8220 PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8224 invlist_iterinit(invlist);
8225 while (invlist_iternext(invlist, &start, &end)) {
8226 if (end == UV_MAX) {
8227 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8229 else if (end != start) {
8230 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8234 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8242 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8244 /* Return a boolean as to if the two passed in inversion lists are
8245 * identical. The final argument, if TRUE, says to take the complement of
8246 * the second inversion list before doing the comparison */
8248 UV* array_a = invlist_array(a);
8249 UV* array_b = invlist_array(b);
8250 UV len_a = _invlist_len(a);
8251 UV len_b = _invlist_len(b);
8253 UV i = 0; /* current index into the arrays */
8254 bool retval = TRUE; /* Assume are identical until proven otherwise */
8256 PERL_ARGS_ASSERT__INVLISTEQ;
8258 /* If are to compare 'a' with the complement of b, set it
8259 * up so are looking at b's complement. */
8262 /* The complement of nothing is everything, so <a> would have to have
8263 * just one element, starting at zero (ending at infinity) */
8265 return (len_a == 1 && array_a[0] == 0);
8267 else if (array_b[0] == 0) {
8269 /* Otherwise, to complement, we invert. Here, the first element is
8270 * 0, just remove it. To do this, we just pretend the array starts
8271 * one later, and clear the flag as we don't have to do anything
8276 complement_b = FALSE;
8280 /* But if the first element is not zero, we unshift a 0 before the
8281 * array. The data structure reserves a space for that 0 (which
8282 * should be a '1' right now), so physical shifting is unneeded,
8283 * but temporarily change that element to 0. Before exiting the
8284 * routine, we must restore the element to '1' */
8291 /* Make sure that the lengths are the same, as well as the final element
8292 * before looping through the remainder. (Thus we test the length, final,
8293 * and first elements right off the bat) */
8294 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8297 else for (i = 0; i < len_a - 1; i++) {
8298 if (array_a[i] != array_b[i]) {
8311 #undef HEADER_LENGTH
8312 #undef INVLIST_INITIAL_LENGTH
8313 #undef TO_INTERNAL_SIZE
8314 #undef FROM_INTERNAL_SIZE
8315 #undef INVLIST_LEN_OFFSET
8316 #undef INVLIST_ZERO_OFFSET
8317 #undef INVLIST_ITER_OFFSET
8318 #undef INVLIST_VERSION_ID
8319 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8321 /* End of inversion list object */
8324 S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8326 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8327 * constructs, and updates RExC_flags with them. On input, RExC_parse
8328 * should point to the first flag; it is updated on output to point to the
8329 * final ')' or ':'. There needs to be at least one flag, or this will
8332 /* for (?g), (?gc), and (?o) warnings; warning
8333 about (?c) will warn about (?g) -- japhy */
8335 #define WASTED_O 0x01
8336 #define WASTED_G 0x02
8337 #define WASTED_C 0x04
8338 #define WASTED_GC (0x02|0x04)
8339 I32 wastedflags = 0x00;
8340 U32 posflags = 0, negflags = 0;
8341 U32 *flagsp = &posflags;
8342 char has_charset_modifier = '\0';
8344 bool has_use_defaults = FALSE;
8345 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8347 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8349 /* '^' as an initial flag sets certain defaults */
8350 if (UCHARAT(RExC_parse) == '^') {
8352 has_use_defaults = TRUE;
8353 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8354 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8355 ? REGEX_UNICODE_CHARSET
8356 : REGEX_DEPENDS_CHARSET);
8359 cs = get_regex_charset(RExC_flags);
8360 if (cs == REGEX_DEPENDS_CHARSET
8361 && (RExC_utf8 || RExC_uni_semantics))
8363 cs = REGEX_UNICODE_CHARSET;
8366 while (*RExC_parse) {
8367 /* && strchr("iogcmsx", *RExC_parse) */
8368 /* (?g), (?gc) and (?o) are useless here
8369 and must be globally applied -- japhy */
8370 switch (*RExC_parse) {
8372 /* Code for the imsx flags */
8373 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8375 case LOCALE_PAT_MOD:
8376 if (has_charset_modifier) {
8377 goto excess_modifier;
8379 else if (flagsp == &negflags) {
8382 cs = REGEX_LOCALE_CHARSET;
8383 has_charset_modifier = LOCALE_PAT_MOD;
8384 RExC_contains_locale = 1;
8386 case UNICODE_PAT_MOD:
8387 if (has_charset_modifier) {
8388 goto excess_modifier;
8390 else if (flagsp == &negflags) {
8393 cs = REGEX_UNICODE_CHARSET;
8394 has_charset_modifier = UNICODE_PAT_MOD;
8396 case ASCII_RESTRICT_PAT_MOD:
8397 if (flagsp == &negflags) {
8400 if (has_charset_modifier) {
8401 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8402 goto excess_modifier;
8404 /* Doubled modifier implies more restricted */
8405 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8408 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8410 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8412 case DEPENDS_PAT_MOD:
8413 if (has_use_defaults) {
8414 goto fail_modifiers;
8416 else if (flagsp == &negflags) {
8419 else if (has_charset_modifier) {
8420 goto excess_modifier;
8423 /* The dual charset means unicode semantics if the
8424 * pattern (or target, not known until runtime) are
8425 * utf8, or something in the pattern indicates unicode
8427 cs = (RExC_utf8 || RExC_uni_semantics)
8428 ? REGEX_UNICODE_CHARSET
8429 : REGEX_DEPENDS_CHARSET;
8430 has_charset_modifier = DEPENDS_PAT_MOD;
8434 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8435 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8437 else if (has_charset_modifier == *(RExC_parse - 1)) {
8438 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8441 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8446 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8448 case ONCE_PAT_MOD: /* 'o' */
8449 case GLOBAL_PAT_MOD: /* 'g' */
8450 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8451 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8452 if (! (wastedflags & wflagbit) ) {
8453 wastedflags |= wflagbit;
8456 "Useless (%s%c) - %suse /%c modifier",
8457 flagsp == &negflags ? "?-" : "?",
8459 flagsp == &negflags ? "don't " : "",
8466 case CONTINUE_PAT_MOD: /* 'c' */
8467 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8468 if (! (wastedflags & WASTED_C) ) {
8469 wastedflags |= WASTED_GC;
8472 "Useless (%sc) - %suse /gc modifier",
8473 flagsp == &negflags ? "?-" : "?",
8474 flagsp == &negflags ? "don't " : ""
8479 case KEEPCOPY_PAT_MOD: /* 'p' */
8480 if (flagsp == &negflags) {
8482 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8484 *flagsp |= RXf_PMf_KEEPCOPY;
8488 /* A flag is a default iff it is following a minus, so
8489 * if there is a minus, it means will be trying to
8490 * re-specify a default which is an error */
8491 if (has_use_defaults || flagsp == &negflags) {
8492 goto fail_modifiers;
8495 wastedflags = 0; /* reset so (?g-c) warns twice */
8499 RExC_flags |= posflags;
8500 RExC_flags &= ~negflags;
8501 set_regex_charset(&RExC_flags, cs);
8507 vFAIL3("Sequence (%.*s...) not recognized",
8508 RExC_parse-seqstart, seqstart);
8517 - reg - regular expression, i.e. main body or parenthesized thing
8519 * Caller must absorb opening parenthesis.
8521 * Combining parenthesis handling with the base level of regular expression
8522 * is a trifle forced, but the need to tie the tails of the branches to what
8523 * follows makes it hard to avoid.
8525 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8527 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8529 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8533 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8534 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8537 regnode *ret; /* Will be the head of the group. */
8540 regnode *ender = NULL;
8543 U32 oregflags = RExC_flags;
8544 bool have_branch = 0;
8546 I32 freeze_paren = 0;
8547 I32 after_freeze = 0;
8549 char * parse_start = RExC_parse; /* MJD */
8550 char * const oregcomp_parse = RExC_parse;
8552 GET_RE_DEBUG_FLAGS_DECL;
8554 PERL_ARGS_ASSERT_REG;
8555 DEBUG_PARSE("reg ");
8557 *flagp = 0; /* Tentatively. */
8560 /* Make an OPEN node, if parenthesized. */
8562 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8563 char *start_verb = RExC_parse;
8564 STRLEN verb_len = 0;
8565 char *start_arg = NULL;
8566 unsigned char op = 0;
8568 int internal_argval = 0; /* internal_argval is only useful if !argok */
8569 while ( *RExC_parse && *RExC_parse != ')' ) {
8570 if ( *RExC_parse == ':' ) {
8571 start_arg = RExC_parse + 1;
8577 verb_len = RExC_parse - start_verb;
8580 while ( *RExC_parse && *RExC_parse != ')' )
8582 if ( *RExC_parse != ')' )
8583 vFAIL("Unterminated verb pattern argument");
8584 if ( RExC_parse == start_arg )
8587 if ( *RExC_parse != ')' )
8588 vFAIL("Unterminated verb pattern");
8591 switch ( *start_verb ) {
8592 case 'A': /* (*ACCEPT) */
8593 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8595 internal_argval = RExC_nestroot;
8598 case 'C': /* (*COMMIT) */
8599 if ( memEQs(start_verb,verb_len,"COMMIT") )
8602 case 'F': /* (*FAIL) */
8603 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8608 case ':': /* (*:NAME) */
8609 case 'M': /* (*MARK:NAME) */
8610 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8615 case 'P': /* (*PRUNE) */
8616 if ( memEQs(start_verb,verb_len,"PRUNE") )
8619 case 'S': /* (*SKIP) */
8620 if ( memEQs(start_verb,verb_len,"SKIP") )
8623 case 'T': /* (*THEN) */
8624 /* [19:06] <TimToady> :: is then */
8625 if ( memEQs(start_verb,verb_len,"THEN") ) {
8627 RExC_seen |= REG_SEEN_CUTGROUP;
8633 vFAIL3("Unknown verb pattern '%.*s'",
8634 verb_len, start_verb);
8637 if ( start_arg && internal_argval ) {
8638 vFAIL3("Verb pattern '%.*s' may not have an argument",
8639 verb_len, start_verb);
8640 } else if ( argok < 0 && !start_arg ) {
8641 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8642 verb_len, start_verb);
8644 ret = reganode(pRExC_state, op, internal_argval);
8645 if ( ! internal_argval && ! SIZE_ONLY ) {
8647 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8648 ARG(ret) = add_data( pRExC_state, 1, "S" );
8649 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8656 if (!internal_argval)
8657 RExC_seen |= REG_SEEN_VERBARG;
8658 } else if ( start_arg ) {
8659 vFAIL3("Verb pattern '%.*s' may not have an argument",
8660 verb_len, start_verb);
8662 ret = reg_node(pRExC_state, op);
8664 nextchar(pRExC_state);
8667 if (*RExC_parse == '?') { /* (?...) */
8668 bool is_logical = 0;
8669 const char * const seqstart = RExC_parse;
8672 paren = *RExC_parse++;
8673 ret = NULL; /* For look-ahead/behind. */
8676 case 'P': /* (?P...) variants for those used to PCRE/Python */
8677 paren = *RExC_parse++;
8678 if ( paren == '<') /* (?P<...>) named capture */
8680 else if (paren == '>') { /* (?P>name) named recursion */
8681 goto named_recursion;
8683 else if (paren == '=') { /* (?P=...) named backref */
8684 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8685 you change this make sure you change that */
8686 char* name_start = RExC_parse;
8688 SV *sv_dat = reg_scan_name(pRExC_state,
8689 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8690 if (RExC_parse == name_start || *RExC_parse != ')')
8691 vFAIL2("Sequence %.3s... not terminated",parse_start);
8694 num = add_data( pRExC_state, 1, "S" );
8695 RExC_rxi->data->data[num]=(void*)sv_dat;
8696 SvREFCNT_inc_simple_void(sv_dat);
8699 ret = reganode(pRExC_state,
8702 : (ASCII_FOLD_RESTRICTED)
8704 : (AT_LEAST_UNI_SEMANTICS)
8712 Set_Node_Offset(ret, parse_start+1);
8713 Set_Node_Cur_Length(ret); /* MJD */
8715 nextchar(pRExC_state);
8719 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8721 case '<': /* (?<...) */
8722 if (*RExC_parse == '!')
8724 else if (*RExC_parse != '=')
8730 case '\'': /* (?'...') */
8731 name_start= RExC_parse;
8732 svname = reg_scan_name(pRExC_state,
8733 SIZE_ONLY ? /* reverse test from the others */
8734 REG_RSN_RETURN_NAME :
8735 REG_RSN_RETURN_NULL);
8736 if (RExC_parse == name_start) {
8738 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8741 if (*RExC_parse != paren)
8742 vFAIL2("Sequence (?%c... not terminated",
8743 paren=='>' ? '<' : paren);
8747 if (!svname) /* shouldn't happen */
8749 "panic: reg_scan_name returned NULL");
8750 if (!RExC_paren_names) {
8751 RExC_paren_names= newHV();
8752 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8754 RExC_paren_name_list= newAV();
8755 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8758 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8760 sv_dat = HeVAL(he_str);
8762 /* croak baby croak */
8764 "panic: paren_name hash element allocation failed");
8765 } else if ( SvPOK(sv_dat) ) {
8766 /* (?|...) can mean we have dupes so scan to check
8767 its already been stored. Maybe a flag indicating
8768 we are inside such a construct would be useful,
8769 but the arrays are likely to be quite small, so
8770 for now we punt -- dmq */
8771 IV count = SvIV(sv_dat);
8772 I32 *pv = (I32*)SvPVX(sv_dat);
8774 for ( i = 0 ; i < count ; i++ ) {
8775 if ( pv[i] == RExC_npar ) {
8781 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8782 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8783 pv[count] = RExC_npar;
8784 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8787 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8788 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8790 SvIV_set(sv_dat, 1);
8793 /* Yes this does cause a memory leak in debugging Perls */
8794 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8795 SvREFCNT_dec_NN(svname);
8798 /*sv_dump(sv_dat);*/
8800 nextchar(pRExC_state);
8802 goto capturing_parens;
8804 RExC_seen |= REG_SEEN_LOOKBEHIND;
8805 RExC_in_lookbehind++;
8807 case '=': /* (?=...) */
8808 RExC_seen_zerolen++;
8810 case '!': /* (?!...) */
8811 RExC_seen_zerolen++;
8812 if (*RExC_parse == ')') {
8813 ret=reg_node(pRExC_state, OPFAIL);
8814 nextchar(pRExC_state);
8818 case '|': /* (?|...) */
8819 /* branch reset, behave like a (?:...) except that
8820 buffers in alternations share the same numbers */
8822 after_freeze = freeze_paren = RExC_npar;
8824 case ':': /* (?:...) */
8825 case '>': /* (?>...) */
8827 case '$': /* (?$...) */
8828 case '@': /* (?@...) */
8829 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8831 case '#': /* (?#...) */
8832 while (*RExC_parse && *RExC_parse != ')')
8834 if (*RExC_parse != ')')
8835 FAIL("Sequence (?#... not terminated");
8836 nextchar(pRExC_state);
8839 case '0' : /* (?0) */
8840 case 'R' : /* (?R) */
8841 if (*RExC_parse != ')')
8842 FAIL("Sequence (?R) not terminated");
8843 ret = reg_node(pRExC_state, GOSTART);
8844 *flagp |= POSTPONED;
8845 nextchar(pRExC_state);
8848 { /* named and numeric backreferences */
8850 case '&': /* (?&NAME) */
8851 parse_start = RExC_parse - 1;
8854 SV *sv_dat = reg_scan_name(pRExC_state,
8855 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8856 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8858 goto gen_recurse_regop;
8859 assert(0); /* NOT REACHED */
8861 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8863 vFAIL("Illegal pattern");
8865 goto parse_recursion;
8867 case '-': /* (?-1) */
8868 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8869 RExC_parse--; /* rewind to let it be handled later */
8873 case '1': case '2': case '3': case '4': /* (?1) */
8874 case '5': case '6': case '7': case '8': case '9':
8877 num = atoi(RExC_parse);
8878 parse_start = RExC_parse - 1; /* MJD */
8879 if (*RExC_parse == '-')
8881 while (isDIGIT(*RExC_parse))
8883 if (*RExC_parse!=')')
8884 vFAIL("Expecting close bracket");
8887 if ( paren == '-' ) {
8889 Diagram of capture buffer numbering.
8890 Top line is the normal capture buffer numbers
8891 Bottom line is the negative indexing as from
8895 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8899 num = RExC_npar + num;
8902 vFAIL("Reference to nonexistent group");
8904 } else if ( paren == '+' ) {
8905 num = RExC_npar + num - 1;
8908 ret = reganode(pRExC_state, GOSUB, num);
8910 if (num > (I32)RExC_rx->nparens) {
8912 vFAIL("Reference to nonexistent group");
8914 ARG2L_SET( ret, RExC_recurse_count++);
8916 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8917 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8921 RExC_seen |= REG_SEEN_RECURSE;
8922 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8923 Set_Node_Offset(ret, parse_start); /* MJD */
8925 *flagp |= POSTPONED;
8926 nextchar(pRExC_state);
8928 } /* named and numeric backreferences */
8929 assert(0); /* NOT REACHED */
8931 case '?': /* (??...) */
8933 if (*RExC_parse != '{') {
8935 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8938 *flagp |= POSTPONED;
8939 paren = *RExC_parse++;
8941 case '{': /* (?{...}) */
8944 struct reg_code_block *cb;
8946 RExC_seen_zerolen++;
8948 if ( !pRExC_state->num_code_blocks
8949 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8950 || pRExC_state->code_blocks[pRExC_state->code_index].start
8951 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8954 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8955 FAIL("panic: Sequence (?{...}): no code block found\n");
8956 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8958 /* this is a pre-compiled code block (?{...}) */
8959 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8960 RExC_parse = RExC_start + cb->end;
8963 if (cb->src_regex) {
8964 n = add_data(pRExC_state, 2, "rl");
8965 RExC_rxi->data->data[n] =
8966 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8967 RExC_rxi->data->data[n+1] = (void*)o;
8970 n = add_data(pRExC_state, 1,
8971 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8972 RExC_rxi->data->data[n] = (void*)o;
8975 pRExC_state->code_index++;
8976 nextchar(pRExC_state);
8980 ret = reg_node(pRExC_state, LOGICAL);
8981 eval = reganode(pRExC_state, EVAL, n);
8984 /* for later propagation into (??{}) return value */
8985 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8987 REGTAIL(pRExC_state, ret, eval);
8988 /* deal with the length of this later - MJD */
8991 ret = reganode(pRExC_state, EVAL, n);
8992 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8993 Set_Node_Offset(ret, parse_start);
8996 case '(': /* (?(?{...})...) and (?(?=...)...) */
8999 if (RExC_parse[0] == '?') { /* (?(?...)) */
9000 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9001 || RExC_parse[1] == '<'
9002 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9005 ret = reg_node(pRExC_state, LOGICAL);
9008 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
9012 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9013 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9015 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9016 char *name_start= RExC_parse++;
9018 SV *sv_dat=reg_scan_name(pRExC_state,
9019 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9020 if (RExC_parse == name_start || *RExC_parse != ch)
9021 vFAIL2("Sequence (?(%c... not terminated",
9022 (ch == '>' ? '<' : ch));
9025 num = add_data( pRExC_state, 1, "S" );
9026 RExC_rxi->data->data[num]=(void*)sv_dat;
9027 SvREFCNT_inc_simple_void(sv_dat);
9029 ret = reganode(pRExC_state,NGROUPP,num);
9030 goto insert_if_check_paren;
9032 else if (RExC_parse[0] == 'D' &&
9033 RExC_parse[1] == 'E' &&
9034 RExC_parse[2] == 'F' &&
9035 RExC_parse[3] == 'I' &&
9036 RExC_parse[4] == 'N' &&
9037 RExC_parse[5] == 'E')
9039 ret = reganode(pRExC_state,DEFINEP,0);
9042 goto insert_if_check_paren;
9044 else if (RExC_parse[0] == 'R') {
9047 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9048 parno = atoi(RExC_parse++);
9049 while (isDIGIT(*RExC_parse))
9051 } else if (RExC_parse[0] == '&') {
9054 sv_dat = reg_scan_name(pRExC_state,
9055 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9056 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9058 ret = reganode(pRExC_state,INSUBP,parno);
9059 goto insert_if_check_paren;
9061 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9064 parno = atoi(RExC_parse++);
9066 while (isDIGIT(*RExC_parse))
9068 ret = reganode(pRExC_state, GROUPP, parno);
9070 insert_if_check_paren:
9071 if ((c = *nextchar(pRExC_state)) != ')')
9072 vFAIL("Switch condition not recognized");
9074 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9075 br = regbranch(pRExC_state, &flags, 1,depth+1);
9077 br = reganode(pRExC_state, LONGJMP, 0);
9079 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9080 c = *nextchar(pRExC_state);
9085 vFAIL("(?(DEFINE)....) does not allow branches");
9086 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9087 regbranch(pRExC_state, &flags, 1,depth+1);
9088 REGTAIL(pRExC_state, ret, lastbr);
9091 c = *nextchar(pRExC_state);
9096 vFAIL("Switch (?(condition)... contains too many branches");
9097 ender = reg_node(pRExC_state, TAIL);
9098 REGTAIL(pRExC_state, br, ender);
9100 REGTAIL(pRExC_state, lastbr, ender);
9101 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9104 REGTAIL(pRExC_state, ret, ender);
9105 RExC_size++; /* XXX WHY do we need this?!!
9106 For large programs it seems to be required
9107 but I can't figure out why. -- dmq*/
9111 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9114 case '[': /* (?[ ... ]) */
9115 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9118 RExC_parse--; /* for vFAIL to print correctly */
9119 vFAIL("Sequence (? incomplete");
9121 default: /* e.g., (?i) */
9124 parse_lparen_question_flags(pRExC_state);
9125 if (UCHARAT(RExC_parse) != ':') {
9126 nextchar(pRExC_state);
9131 nextchar(pRExC_state);
9141 ret = reganode(pRExC_state, OPEN, parno);
9144 RExC_nestroot = parno;
9145 if (RExC_seen & REG_SEEN_RECURSE
9146 && !RExC_open_parens[parno-1])
9148 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9149 "Setting open paren #%"IVdf" to %d\n",
9150 (IV)parno, REG_NODE_NUM(ret)));
9151 RExC_open_parens[parno-1]= ret;
9154 Set_Node_Length(ret, 1); /* MJD */
9155 Set_Node_Offset(ret, RExC_parse); /* MJD */
9163 /* Pick up the branches, linking them together. */
9164 parse_start = RExC_parse; /* MJD */
9165 br = regbranch(pRExC_state, &flags, 1,depth+1);
9167 /* branch_len = (paren != 0); */
9171 if (*RExC_parse == '|') {
9172 if (!SIZE_ONLY && RExC_extralen) {
9173 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9176 reginsert(pRExC_state, BRANCH, br, depth+1);
9177 Set_Node_Length(br, paren != 0);
9178 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9182 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9184 else if (paren == ':') {
9185 *flagp |= flags&SIMPLE;
9187 if (is_open) { /* Starts with OPEN. */
9188 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9190 else if (paren != '?') /* Not Conditional */
9192 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9194 while (*RExC_parse == '|') {
9195 if (!SIZE_ONLY && RExC_extralen) {
9196 ender = reganode(pRExC_state, LONGJMP,0);
9197 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9200 RExC_extralen += 2; /* Account for LONGJMP. */
9201 nextchar(pRExC_state);
9203 if (RExC_npar > after_freeze)
9204 after_freeze = RExC_npar;
9205 RExC_npar = freeze_paren;
9207 br = regbranch(pRExC_state, &flags, 0, depth+1);
9211 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9213 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9216 if (have_branch || paren != ':') {
9217 /* Make a closing node, and hook it on the end. */
9220 ender = reg_node(pRExC_state, TAIL);
9223 ender = reganode(pRExC_state, CLOSE, parno);
9224 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9225 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9226 "Setting close paren #%"IVdf" to %d\n",
9227 (IV)parno, REG_NODE_NUM(ender)));
9228 RExC_close_parens[parno-1]= ender;
9229 if (RExC_nestroot == parno)
9232 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9233 Set_Node_Length(ender,1); /* MJD */
9239 *flagp &= ~HASWIDTH;
9242 ender = reg_node(pRExC_state, SUCCEED);
9245 ender = reg_node(pRExC_state, END);
9247 assert(!RExC_opend); /* there can only be one! */
9252 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9253 SV * const mysv_val1=sv_newmortal();
9254 SV * const mysv_val2=sv_newmortal();
9255 DEBUG_PARSE_MSG("lsbr");
9256 regprop(RExC_rx, mysv_val1, lastbr);
9257 regprop(RExC_rx, mysv_val2, ender);
9258 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9259 SvPV_nolen_const(mysv_val1),
9260 (IV)REG_NODE_NUM(lastbr),
9261 SvPV_nolen_const(mysv_val2),
9262 (IV)REG_NODE_NUM(ender),
9263 (IV)(ender - lastbr)
9266 REGTAIL(pRExC_state, lastbr, ender);
9268 if (have_branch && !SIZE_ONLY) {
9271 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9273 /* Hook the tails of the branches to the closing node. */
9274 for (br = ret; br; br = regnext(br)) {
9275 const U8 op = PL_regkind[OP(br)];
9277 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9278 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9281 else if (op == BRANCHJ) {
9282 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9283 /* for now we always disable this optimisation * /
9284 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9290 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9291 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9292 SV * const mysv_val1=sv_newmortal();
9293 SV * const mysv_val2=sv_newmortal();
9294 DEBUG_PARSE_MSG("NADA");
9295 regprop(RExC_rx, mysv_val1, ret);
9296 regprop(RExC_rx, mysv_val2, ender);
9297 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9298 SvPV_nolen_const(mysv_val1),
9299 (IV)REG_NODE_NUM(ret),
9300 SvPV_nolen_const(mysv_val2),
9301 (IV)REG_NODE_NUM(ender),
9306 if (OP(ender) == TAIL) {
9311 for ( opt= br + 1; opt < ender ; opt++ )
9313 NEXT_OFF(br)= ender - br;
9321 static const char parens[] = "=!<,>";
9323 if (paren && (p = strchr(parens, paren))) {
9324 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9325 int flag = (p - parens) > 1;
9328 node = SUSPEND, flag = 0;
9329 reginsert(pRExC_state, node,ret, depth+1);
9330 Set_Node_Cur_Length(ret);
9331 Set_Node_Offset(ret, parse_start + 1);
9333 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9337 /* Check for proper termination. */
9339 RExC_flags = oregflags;
9340 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9341 RExC_parse = oregcomp_parse;
9342 vFAIL("Unmatched (");
9345 else if (!paren && RExC_parse < RExC_end) {
9346 if (*RExC_parse == ')') {
9348 vFAIL("Unmatched )");
9351 FAIL("Junk on end of regexp"); /* "Can't happen". */
9352 assert(0); /* NOTREACHED */
9355 if (RExC_in_lookbehind) {
9356 RExC_in_lookbehind--;
9358 if (after_freeze > RExC_npar)
9359 RExC_npar = after_freeze;
9364 - regbranch - one alternative of an | operator
9366 * Implements the concatenation operator.
9369 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9373 regnode *chain = NULL;
9375 I32 flags = 0, c = 0;
9376 GET_RE_DEBUG_FLAGS_DECL;
9378 PERL_ARGS_ASSERT_REGBRANCH;
9380 DEBUG_PARSE("brnc");
9385 if (!SIZE_ONLY && RExC_extralen)
9386 ret = reganode(pRExC_state, BRANCHJ,0);
9388 ret = reg_node(pRExC_state, BRANCH);
9389 Set_Node_Length(ret, 1);
9393 if (!first && SIZE_ONLY)
9394 RExC_extralen += 1; /* BRANCHJ */
9396 *flagp = WORST; /* Tentatively. */
9399 nextchar(pRExC_state);
9400 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9402 latest = regpiece(pRExC_state, &flags,depth+1);
9403 if (latest == NULL) {
9404 if (flags & TRYAGAIN)
9408 else if (ret == NULL)
9410 *flagp |= flags&(HASWIDTH|POSTPONED);
9411 if (chain == NULL) /* First piece. */
9412 *flagp |= flags&SPSTART;
9415 REGTAIL(pRExC_state, chain, latest);
9420 if (chain == NULL) { /* Loop ran zero times. */
9421 chain = reg_node(pRExC_state, NOTHING);
9426 *flagp |= flags&SIMPLE;
9433 - regpiece - something followed by possible [*+?]
9435 * Note that the branching code sequences used for ? and the general cases
9436 * of * and + are somewhat optimized: they use the same NOTHING node as
9437 * both the endmarker for their branch list and the body of the last branch.
9438 * It might seem that this node could be dispensed with entirely, but the
9439 * endmarker role is not redundant.
9442 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9449 const char * const origparse = RExC_parse;
9451 I32 max = REG_INFTY;
9452 #ifdef RE_TRACK_PATTERN_OFFSETS
9455 const char *maxpos = NULL;
9457 /* Save the original in case we change the emitted regop to a FAIL. */
9458 regnode * const orig_emit = RExC_emit;
9460 GET_RE_DEBUG_FLAGS_DECL;
9462 PERL_ARGS_ASSERT_REGPIECE;
9464 DEBUG_PARSE("piec");
9466 ret = regatom(pRExC_state, &flags,depth+1);
9468 if (flags & TRYAGAIN)
9475 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9477 #ifdef RE_TRACK_PATTERN_OFFSETS
9478 parse_start = RExC_parse; /* MJD */
9480 next = RExC_parse + 1;
9481 while (isDIGIT(*next) || *next == ',') {
9490 if (*next == '}') { /* got one */
9494 min = atoi(RExC_parse);
9498 maxpos = RExC_parse;
9500 if (!max && *maxpos != '0')
9501 max = REG_INFTY; /* meaning "infinity" */
9502 else if (max >= REG_INFTY)
9503 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9505 nextchar(pRExC_state);
9506 if (max < min) { /* If can't match, warn and optimize to fail
9509 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9511 /* We can't back off the size because we have to reserve
9512 * enough space for all the things we are about to throw
9513 * away, but we can shrink it by the ammount we are about
9515 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9518 RExC_emit = orig_emit;
9520 ret = reg_node(pRExC_state, OPFAIL);
9523 else if (max == 0) { /* replace {0} with a nothing node */
9525 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9528 RExC_emit = orig_emit;
9530 ret = reg_node(pRExC_state, NOTHING);
9535 if ((flags&SIMPLE)) {
9536 RExC_naughty += 2 + RExC_naughty / 2;
9537 reginsert(pRExC_state, CURLY, ret, depth+1);
9538 Set_Node_Offset(ret, parse_start+1); /* MJD */
9539 Set_Node_Cur_Length(ret);
9542 regnode * const w = reg_node(pRExC_state, WHILEM);
9545 REGTAIL(pRExC_state, ret, w);
9546 if (!SIZE_ONLY && RExC_extralen) {
9547 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9548 reginsert(pRExC_state, NOTHING,ret, depth+1);
9549 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9551 reginsert(pRExC_state, CURLYX,ret, depth+1);
9553 Set_Node_Offset(ret, parse_start+1);
9554 Set_Node_Length(ret,
9555 op == '{' ? (RExC_parse - parse_start) : 1);
9557 if (!SIZE_ONLY && RExC_extralen)
9558 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9559 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9561 RExC_whilem_seen++, RExC_extralen += 3;
9562 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9571 ARG1_SET(ret, (U16)min);
9572 ARG2_SET(ret, (U16)max);
9584 #if 0 /* Now runtime fix should be reliable. */
9586 /* if this is reinstated, don't forget to put this back into perldiag:
9588 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9590 (F) The part of the regexp subject to either the * or + quantifier
9591 could match an empty string. The {#} shows in the regular
9592 expression about where the problem was discovered.
9596 if (!(flags&HASWIDTH) && op != '?')
9597 vFAIL("Regexp *+ operand could be empty");
9600 #ifdef RE_TRACK_PATTERN_OFFSETS
9601 parse_start = RExC_parse;
9603 nextchar(pRExC_state);
9605 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9607 if (op == '*' && (flags&SIMPLE)) {
9608 reginsert(pRExC_state, STAR, ret, depth+1);
9612 else if (op == '*') {
9616 else if (op == '+' && (flags&SIMPLE)) {
9617 reginsert(pRExC_state, PLUS, ret, depth+1);
9621 else if (op == '+') {
9625 else if (op == '?') {
9630 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9631 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9632 ckWARN3reg(RExC_parse,
9633 "%.*s matches null string many times",
9634 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9636 (void)ReREFCNT_inc(RExC_rx_sv);
9639 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9640 nextchar(pRExC_state);
9641 reginsert(pRExC_state, MINMOD, ret, depth+1);
9642 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9644 #ifndef REG_ALLOW_MINMOD_SUSPEND
9647 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9649 nextchar(pRExC_state);
9650 ender = reg_node(pRExC_state, SUCCEED);
9651 REGTAIL(pRExC_state, ret, ender);
9652 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9654 ender = reg_node(pRExC_state, TAIL);
9655 REGTAIL(pRExC_state, ret, ender);
9659 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9661 vFAIL("Nested quantifiers");
9668 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9669 const bool strict /* Apply stricter parsing rules? */
9673 /* This is expected to be called by a parser routine that has recognized '\N'
9674 and needs to handle the rest. RExC_parse is expected to point at the first
9675 char following the N at the time of the call. On successful return,
9676 RExC_parse has been updated to point to just after the sequence identified
9677 by this routine, and <*flagp> has been updated.
9679 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9682 \N may begin either a named sequence, or if outside a character class, mean
9683 to match a non-newline. For non single-quoted regexes, the tokenizer has
9684 attempted to decide which, and in the case of a named sequence, converted it
9685 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9686 where c1... are the characters in the sequence. For single-quoted regexes,
9687 the tokenizer passes the \N sequence through unchanged; this code will not
9688 attempt to determine this nor expand those, instead raising a syntax error.
9689 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9690 or there is no '}', it signals that this \N occurrence means to match a
9693 Only the \N{U+...} form should occur in a character class, for the same
9694 reason that '.' inside a character class means to just match a period: it
9695 just doesn't make sense.
9697 The function raises an error (via vFAIL), and doesn't return for various
9698 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9699 success; it returns FALSE otherwise.
9701 If <valuep> is non-null, it means the caller can accept an input sequence
9702 consisting of a just a single code point; <*valuep> is set to that value
9703 if the input is such.
9705 If <node_p> is non-null it signifies that the caller can accept any other
9706 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9708 1) \N means not-a-NL: points to a newly created REG_ANY node;
9709 2) \N{}: points to a new NOTHING node;
9710 3) otherwise: points to a new EXACT node containing the resolved
9712 Note that FALSE is returned for single code point sequences if <valuep> is
9716 char * endbrace; /* '}' following the name */
9718 char *endchar; /* Points to '.' or '}' ending cur char in the input
9720 bool has_multiple_chars; /* true if the input stream contains a sequence of
9721 more than one character */
9723 GET_RE_DEBUG_FLAGS_DECL;
9725 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9729 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9731 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9732 * modifier. The other meaning does not */
9733 p = (RExC_flags & RXf_PMf_EXTENDED)
9734 ? regwhite( pRExC_state, RExC_parse )
9737 /* Disambiguate between \N meaning a named character versus \N meaning
9738 * [^\n]. The former is assumed when it can't be the latter. */
9739 if (*p != '{' || regcurly(p, FALSE)) {
9742 /* no bare \N in a charclass */
9743 if (in_char_class) {
9744 vFAIL("\\N in a character class must be a named character: \\N{...}");
9748 nextchar(pRExC_state);
9749 *node_p = reg_node(pRExC_state, REG_ANY);
9750 *flagp |= HASWIDTH|SIMPLE;
9753 Set_Node_Length(*node_p, 1); /* MJD */
9757 /* Here, we have decided it should be a named character or sequence */
9759 /* The test above made sure that the next real character is a '{', but
9760 * under the /x modifier, it could be separated by space (or a comment and
9761 * \n) and this is not allowed (for consistency with \x{...} and the
9762 * tokenizer handling of \N{NAME}). */
9763 if (*RExC_parse != '{') {
9764 vFAIL("Missing braces on \\N{}");
9767 RExC_parse++; /* Skip past the '{' */
9769 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9770 || ! (endbrace == RExC_parse /* nothing between the {} */
9771 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9772 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9774 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9775 vFAIL("\\N{NAME} must be resolved by the lexer");
9778 if (endbrace == RExC_parse) { /* empty: \N{} */
9781 *node_p = reg_node(pRExC_state,NOTHING);
9783 else if (in_char_class) {
9784 if (SIZE_ONLY && in_char_class) {
9786 RExC_parse++; /* Position after the "}" */
9787 vFAIL("Zero length \\N{}");
9790 ckWARNreg(RExC_parse,
9791 "Ignoring zero length \\N{} in character class");
9799 nextchar(pRExC_state);
9803 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9804 RExC_parse += 2; /* Skip past the 'U+' */
9806 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9808 /* Code points are separated by dots. If none, there is only one code
9809 * point, and is terminated by the brace */
9810 has_multiple_chars = (endchar < endbrace);
9812 if (valuep && (! has_multiple_chars || in_char_class)) {
9813 /* We only pay attention to the first char of
9814 multichar strings being returned in char classes. I kinda wonder
9815 if this makes sense as it does change the behaviour
9816 from earlier versions, OTOH that behaviour was broken
9817 as well. XXX Solution is to recharacterize as
9818 [rest-of-class]|multi1|multi2... */
9820 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9821 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9822 | PERL_SCAN_DISALLOW_PREFIX
9823 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9825 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9827 /* The tokenizer should have guaranteed validity, but it's possible to
9828 * bypass it by using single quoting, so check */
9829 if (length_of_hex == 0
9830 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9832 RExC_parse += length_of_hex; /* Includes all the valid */
9833 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9834 ? UTF8SKIP(RExC_parse)
9836 /* Guard against malformed utf8 */
9837 if (RExC_parse >= endchar) {
9838 RExC_parse = endchar;
9840 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9843 if (in_char_class && has_multiple_chars) {
9845 RExC_parse = endbrace;
9846 vFAIL("\\N{} in character class restricted to one character");
9849 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9853 RExC_parse = endbrace + 1;
9855 else if (! node_p || ! has_multiple_chars) {
9857 /* Here, the input is legal, but not according to the caller's
9858 * options. We fail without advancing the parse, so that the
9859 * caller can try again */
9865 /* What is done here is to convert this to a sub-pattern of the form
9866 * (?:\x{char1}\x{char2}...)
9867 * and then call reg recursively. That way, it retains its atomicness,
9868 * while not having to worry about special handling that some code
9869 * points may have. toke.c has converted the original Unicode values
9870 * to native, so that we can just pass on the hex values unchanged. We
9871 * do have to set a flag to keep recoding from happening in the
9874 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9876 char *orig_end = RExC_end;
9879 while (RExC_parse < endbrace) {
9881 /* Convert to notation the rest of the code understands */
9882 sv_catpv(substitute_parse, "\\x{");
9883 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9884 sv_catpv(substitute_parse, "}");
9886 /* Point to the beginning of the next character in the sequence. */
9887 RExC_parse = endchar + 1;
9888 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9890 sv_catpv(substitute_parse, ")");
9892 RExC_parse = SvPV(substitute_parse, len);
9894 /* Don't allow empty number */
9896 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9898 RExC_end = RExC_parse + len;
9900 /* The values are Unicode, and therefore not subject to recoding */
9901 RExC_override_recoding = 1;
9903 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9904 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9906 RExC_parse = endbrace;
9907 RExC_end = orig_end;
9908 RExC_override_recoding = 0;
9910 nextchar(pRExC_state);
9920 * It returns the code point in utf8 for the value in *encp.
9921 * value: a code value in the source encoding
9922 * encp: a pointer to an Encode object
9924 * If the result from Encode is not a single character,
9925 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9928 S_reg_recode(pTHX_ const char value, SV **encp)
9931 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9932 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9933 const STRLEN newlen = SvCUR(sv);
9934 UV uv = UNICODE_REPLACEMENT;
9936 PERL_ARGS_ASSERT_REG_RECODE;
9940 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9943 if (!newlen || numlen != newlen) {
9944 uv = UNICODE_REPLACEMENT;
9950 PERL_STATIC_INLINE U8
9951 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9955 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9961 op = get_regex_charset(RExC_flags);
9962 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9963 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9964 been, so there is no hole */
9970 PERL_STATIC_INLINE void
9971 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9973 /* This knows the details about sizing an EXACTish node, setting flags for
9974 * it (by setting <*flagp>, and potentially populating it with a single
9977 * If <len> (the length in bytes) is non-zero, this function assumes that
9978 * the node has already been populated, and just does the sizing. In this
9979 * case <code_point> should be the final code point that has already been
9980 * placed into the node. This value will be ignored except that under some
9981 * circumstances <*flagp> is set based on it.
9983 * If <len> is zero, the function assumes that the node is to contain only
9984 * the single character given by <code_point> and calculates what <len>
9985 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9986 * additionally will populate the node's STRING with <code_point>, if <len>
9987 * is 0. In both cases <*flagp> is appropriately set
9989 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9990 * folded (the latter only when the rules indicate it can match 'ss') */
9992 bool len_passed_in = cBOOL(len != 0);
9993 U8 character[UTF8_MAXBYTES_CASE+1];
9995 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9997 if (! len_passed_in) {
10000 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10003 uvchr_to_utf8( character, code_point);
10004 len = UTF8SKIP(character);
10008 || code_point != LATIN_SMALL_LETTER_SHARP_S
10009 || ASCII_FOLD_RESTRICTED
10010 || ! AT_LEAST_UNI_SEMANTICS)
10012 *character = (U8) code_point;
10017 *(character + 1) = 's';
10023 RExC_size += STR_SZ(len);
10026 RExC_emit += STR_SZ(len);
10027 STR_LEN(node) = len;
10028 if (! len_passed_in) {
10029 Copy((char *) character, STRING(node), len, char);
10033 *flagp |= HASWIDTH;
10035 /* A single character node is SIMPLE, except for the special-cased SHARP S
10037 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10038 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10039 || ! FOLD || ! DEPENDS_SEMANTICS))
10046 - regatom - the lowest level
10048 Try to identify anything special at the start of the pattern. If there
10049 is, then handle it as required. This may involve generating a single regop,
10050 such as for an assertion; or it may involve recursing, such as to
10051 handle a () structure.
10053 If the string doesn't start with something special then we gobble up
10054 as much literal text as we can.
10056 Once we have been able to handle whatever type of thing started the
10057 sequence, we return.
10059 Note: we have to be careful with escapes, as they can be both literal
10060 and special, and in the case of \10 and friends, context determines which.
10062 A summary of the code structure is:
10064 switch (first_byte) {
10065 cases for each special:
10066 handle this special;
10069 switch (2nd byte) {
10070 cases for each unambiguous special:
10071 handle this special;
10073 cases for each ambigous special/literal:
10075 if (special) handle here
10077 default: // unambiguously literal:
10080 default: // is a literal char
10083 create EXACTish node for literal;
10084 while (more input and node isn't full) {
10085 switch (input_byte) {
10086 cases for each special;
10087 make sure parse pointer is set so that the next call to
10088 regatom will see this special first
10089 goto loopdone; // EXACTish node terminated by prev. char
10091 append char to EXACTISH node;
10093 get next input byte;
10097 return the generated node;
10099 Specifically there are two separate switches for handling
10100 escape sequences, with the one for handling literal escapes requiring
10101 a dummy entry for all of the special escapes that are actually handled
10106 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10109 regnode *ret = NULL;
10111 char *parse_start = RExC_parse;
10115 GET_RE_DEBUG_FLAGS_DECL;
10117 *flagp = WORST; /* Tentatively. */
10119 DEBUG_PARSE("atom");
10121 PERL_ARGS_ASSERT_REGATOM;
10124 switch ((U8)*RExC_parse) {
10126 RExC_seen_zerolen++;
10127 nextchar(pRExC_state);
10128 if (RExC_flags & RXf_PMf_MULTILINE)
10129 ret = reg_node(pRExC_state, MBOL);
10130 else if (RExC_flags & RXf_PMf_SINGLELINE)
10131 ret = reg_node(pRExC_state, SBOL);
10133 ret = reg_node(pRExC_state, BOL);
10134 Set_Node_Length(ret, 1); /* MJD */
10137 nextchar(pRExC_state);
10139 RExC_seen_zerolen++;
10140 if (RExC_flags & RXf_PMf_MULTILINE)
10141 ret = reg_node(pRExC_state, MEOL);
10142 else if (RExC_flags & RXf_PMf_SINGLELINE)
10143 ret = reg_node(pRExC_state, SEOL);
10145 ret = reg_node(pRExC_state, EOL);
10146 Set_Node_Length(ret, 1); /* MJD */
10149 nextchar(pRExC_state);
10150 if (RExC_flags & RXf_PMf_SINGLELINE)
10151 ret = reg_node(pRExC_state, SANY);
10153 ret = reg_node(pRExC_state, REG_ANY);
10154 *flagp |= HASWIDTH|SIMPLE;
10156 Set_Node_Length(ret, 1); /* MJD */
10160 char * const oregcomp_parse = ++RExC_parse;
10161 ret = regclass(pRExC_state, flagp,depth+1,
10162 FALSE, /* means parse the whole char class */
10163 TRUE, /* allow multi-char folds */
10164 FALSE, /* don't silence non-portable warnings. */
10166 if (*RExC_parse != ']') {
10167 RExC_parse = oregcomp_parse;
10168 vFAIL("Unmatched [");
10170 nextchar(pRExC_state);
10171 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10175 nextchar(pRExC_state);
10176 ret = reg(pRExC_state, 1, &flags,depth+1);
10178 if (flags & TRYAGAIN) {
10179 if (RExC_parse == RExC_end) {
10180 /* Make parent create an empty node if needed. */
10181 *flagp |= TRYAGAIN;
10188 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10192 if (flags & TRYAGAIN) {
10193 *flagp |= TRYAGAIN;
10196 vFAIL("Internal urp");
10197 /* Supposed to be caught earlier. */
10200 if (!regcurly(RExC_parse, FALSE)) {
10209 vFAIL("Quantifier follows nothing");
10214 This switch handles escape sequences that resolve to some kind
10215 of special regop and not to literal text. Escape sequnces that
10216 resolve to literal text are handled below in the switch marked
10219 Every entry in this switch *must* have a corresponding entry
10220 in the literal escape switch. However, the opposite is not
10221 required, as the default for this switch is to jump to the
10222 literal text handling code.
10224 switch ((U8)*++RExC_parse) {
10226 /* Special Escapes */
10228 RExC_seen_zerolen++;
10229 ret = reg_node(pRExC_state, SBOL);
10231 goto finish_meta_pat;
10233 ret = reg_node(pRExC_state, GPOS);
10234 RExC_seen |= REG_SEEN_GPOS;
10236 goto finish_meta_pat;
10238 RExC_seen_zerolen++;
10239 ret = reg_node(pRExC_state, KEEPS);
10241 /* XXX:dmq : disabling in-place substitution seems to
10242 * be necessary here to avoid cases of memory corruption, as
10243 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10245 RExC_seen |= REG_SEEN_LOOKBEHIND;
10246 goto finish_meta_pat;
10248 ret = reg_node(pRExC_state, SEOL);
10250 RExC_seen_zerolen++; /* Do not optimize RE away */
10251 goto finish_meta_pat;
10253 ret = reg_node(pRExC_state, EOS);
10255 RExC_seen_zerolen++; /* Do not optimize RE away */
10256 goto finish_meta_pat;
10258 ret = reg_node(pRExC_state, CANY);
10259 RExC_seen |= REG_SEEN_CANY;
10260 *flagp |= HASWIDTH|SIMPLE;
10261 goto finish_meta_pat;
10263 ret = reg_node(pRExC_state, CLUMP);
10264 *flagp |= HASWIDTH;
10265 goto finish_meta_pat;
10271 arg = ANYOF_WORDCHAR;
10275 RExC_seen_zerolen++;
10276 RExC_seen |= REG_SEEN_LOOKBEHIND;
10277 op = BOUND + get_regex_charset(RExC_flags);
10278 if (op > BOUNDA) { /* /aa is same as /a */
10281 ret = reg_node(pRExC_state, op);
10282 FLAGS(ret) = get_regex_charset(RExC_flags);
10284 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10285 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10287 goto finish_meta_pat;
10289 RExC_seen_zerolen++;
10290 RExC_seen |= REG_SEEN_LOOKBEHIND;
10291 op = NBOUND + get_regex_charset(RExC_flags);
10292 if (op > NBOUNDA) { /* /aa is same as /a */
10295 ret = reg_node(pRExC_state, op);
10296 FLAGS(ret) = get_regex_charset(RExC_flags);
10298 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10299 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10301 goto finish_meta_pat;
10311 ret = reg_node(pRExC_state, LNBREAK);
10312 *flagp |= HASWIDTH|SIMPLE;
10313 goto finish_meta_pat;
10321 goto join_posix_op_known;
10327 arg = ANYOF_VERTWS;
10329 goto join_posix_op_known;
10339 op = POSIXD + get_regex_charset(RExC_flags);
10340 if (op > POSIXA) { /* /aa is same as /a */
10344 join_posix_op_known:
10347 op += NPOSIXD - POSIXD;
10350 ret = reg_node(pRExC_state, op);
10352 FLAGS(ret) = namedclass_to_classnum(arg);
10355 *flagp |= HASWIDTH|SIMPLE;
10359 nextchar(pRExC_state);
10360 Set_Node_Length(ret, 2); /* MJD */
10366 char* parse_start = RExC_parse - 2;
10371 ret = regclass(pRExC_state, flagp,depth+1,
10372 TRUE, /* means just parse this element */
10373 FALSE, /* don't allow multi-char folds */
10374 FALSE, /* don't silence non-portable warnings.
10375 It would be a bug if these returned
10381 Set_Node_Offset(ret, parse_start + 2);
10382 Set_Node_Cur_Length(ret);
10383 nextchar(pRExC_state);
10387 /* Handle \N and \N{NAME} with multiple code points here and not
10388 * below because it can be multicharacter. join_exact() will join
10389 * them up later on. Also this makes sure that things like
10390 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10391 * The options to the grok function call causes it to fail if the
10392 * sequence is just a single code point. We then go treat it as
10393 * just another character in the current EXACT node, and hence it
10394 * gets uniform treatment with all the other characters. The
10395 * special treatment for quantifiers is not needed for such single
10396 * character sequences */
10398 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10399 FALSE /* not strict */ )) {
10404 case 'k': /* Handle \k<NAME> and \k'NAME' */
10407 char ch= RExC_parse[1];
10408 if (ch != '<' && ch != '\'' && ch != '{') {
10410 vFAIL2("Sequence %.2s... not terminated",parse_start);
10412 /* this pretty much dupes the code for (?P=...) in reg(), if
10413 you change this make sure you change that */
10414 char* name_start = (RExC_parse += 2);
10416 SV *sv_dat = reg_scan_name(pRExC_state,
10417 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10418 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10419 if (RExC_parse == name_start || *RExC_parse != ch)
10420 vFAIL2("Sequence %.3s... not terminated",parse_start);
10423 num = add_data( pRExC_state, 1, "S" );
10424 RExC_rxi->data->data[num]=(void*)sv_dat;
10425 SvREFCNT_inc_simple_void(sv_dat);
10429 ret = reganode(pRExC_state,
10432 : (ASCII_FOLD_RESTRICTED)
10434 : (AT_LEAST_UNI_SEMANTICS)
10440 *flagp |= HASWIDTH;
10442 /* override incorrect value set in reganode MJD */
10443 Set_Node_Offset(ret, parse_start+1);
10444 Set_Node_Cur_Length(ret); /* MJD */
10445 nextchar(pRExC_state);
10451 case '1': case '2': case '3': case '4':
10452 case '5': case '6': case '7': case '8': case '9':
10455 bool isg = *RExC_parse == 'g';
10460 if (*RExC_parse == '{') {
10464 if (*RExC_parse == '-') {
10468 if (hasbrace && !isDIGIT(*RExC_parse)) {
10469 if (isrel) RExC_parse--;
10471 goto parse_named_seq;
10473 num = atoi(RExC_parse);
10474 if (isg && num == 0)
10475 vFAIL("Reference to invalid group 0");
10477 num = RExC_npar - num;
10479 vFAIL("Reference to nonexistent or unclosed group");
10481 if (!isg && num > 9 && num >= RExC_npar)
10482 /* Probably a character specified in octal, e.g. \35 */
10485 char * const parse_start = RExC_parse - 1; /* MJD */
10486 while (isDIGIT(*RExC_parse))
10488 if (parse_start == RExC_parse - 1)
10489 vFAIL("Unterminated \\g... pattern");
10491 if (*RExC_parse != '}')
10492 vFAIL("Unterminated \\g{...} pattern");
10496 if (num > (I32)RExC_rx->nparens)
10497 vFAIL("Reference to nonexistent group");
10500 ret = reganode(pRExC_state,
10503 : (ASCII_FOLD_RESTRICTED)
10505 : (AT_LEAST_UNI_SEMANTICS)
10511 *flagp |= HASWIDTH;
10513 /* override incorrect value set in reganode MJD */
10514 Set_Node_Offset(ret, parse_start+1);
10515 Set_Node_Cur_Length(ret); /* MJD */
10517 nextchar(pRExC_state);
10522 if (RExC_parse >= RExC_end)
10523 FAIL("Trailing \\");
10526 /* Do not generate "unrecognized" warnings here, we fall
10527 back into the quick-grab loop below */
10534 if (RExC_flags & RXf_PMf_EXTENDED) {
10535 if ( reg_skipcomment( pRExC_state ) )
10542 parse_start = RExC_parse - 1;
10551 #define MAX_NODE_STRING_SIZE 127
10552 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10554 U8 upper_parse = MAX_NODE_STRING_SIZE;
10557 bool next_is_quantifier;
10558 char * oldp = NULL;
10560 /* If a folding node contains only code points that don't
10561 * participate in folds, it can be changed into an EXACT node,
10562 * which allows the optimizer more things to look for */
10566 node_type = compute_EXACTish(pRExC_state);
10567 ret = reg_node(pRExC_state, node_type);
10569 /* In pass1, folded, we use a temporary buffer instead of the
10570 * actual node, as the node doesn't exist yet */
10571 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10577 /* We do the EXACTFish to EXACT node only if folding, and not if in
10578 * locale, as whether a character folds or not isn't known until
10580 maybe_exact = FOLD && ! LOC;
10582 /* XXX The node can hold up to 255 bytes, yet this only goes to
10583 * 127. I (khw) do not know why. Keeping it somewhat less than
10584 * 255 allows us to not have to worry about overflow due to
10585 * converting to utf8 and fold expansion, but that value is
10586 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10587 * split up by this limit into a single one using the real max of
10588 * 255. Even at 127, this breaks under rare circumstances. If
10589 * folding, we do not want to split a node at a character that is a
10590 * non-final in a multi-char fold, as an input string could just
10591 * happen to want to match across the node boundary. The join
10592 * would solve that problem if the join actually happens. But a
10593 * series of more than two nodes in a row each of 127 would cause
10594 * the first join to succeed to get to 254, but then there wouldn't
10595 * be room for the next one, which could at be one of those split
10596 * multi-char folds. I don't know of any fool-proof solution. One
10597 * could back off to end with only a code point that isn't such a
10598 * non-final, but it is possible for there not to be any in the
10600 for (p = RExC_parse - 1;
10601 len < upper_parse && p < RExC_end;
10606 if (RExC_flags & RXf_PMf_EXTENDED)
10607 p = regwhite( pRExC_state, p );
10618 /* Literal Escapes Switch
10620 This switch is meant to handle escape sequences that
10621 resolve to a literal character.
10623 Every escape sequence that represents something
10624 else, like an assertion or a char class, is handled
10625 in the switch marked 'Special Escapes' above in this
10626 routine, but also has an entry here as anything that
10627 isn't explicitly mentioned here will be treated as
10628 an unescaped equivalent literal.
10631 switch ((U8)*++p) {
10632 /* These are all the special escapes. */
10633 case 'A': /* Start assertion */
10634 case 'b': case 'B': /* Word-boundary assertion*/
10635 case 'C': /* Single char !DANGEROUS! */
10636 case 'd': case 'D': /* digit class */
10637 case 'g': case 'G': /* generic-backref, pos assertion */
10638 case 'h': case 'H': /* HORIZWS */
10639 case 'k': case 'K': /* named backref, keep marker */
10640 case 'p': case 'P': /* Unicode property */
10641 case 'R': /* LNBREAK */
10642 case 's': case 'S': /* space class */
10643 case 'v': case 'V': /* VERTWS */
10644 case 'w': case 'W': /* word class */
10645 case 'X': /* eXtended Unicode "combining character sequence" */
10646 case 'z': case 'Z': /* End of line/string assertion */
10650 /* Anything after here is an escape that resolves to a
10651 literal. (Except digits, which may or may not)
10657 case 'N': /* Handle a single-code point named character. */
10658 /* The options cause it to fail if a multiple code
10659 * point sequence. Handle those in the switch() above
10661 RExC_parse = p + 1;
10662 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10663 flagp, depth, FALSE,
10664 FALSE /* not strict */ ))
10666 RExC_parse = p = oldp;
10670 if (ender > 0xff) {
10687 ender = ASCII_TO_NATIVE('\033');
10691 ender = ASCII_TO_NATIVE('\007');
10697 const char* error_msg;
10699 bool valid = grok_bslash_o(&p,
10702 TRUE, /* out warnings */
10703 FALSE, /* not strict */
10704 TRUE, /* Output warnings
10709 RExC_parse = p; /* going to die anyway; point
10710 to exact spot of failure */
10714 if (PL_encoding && ender < 0x100) {
10715 goto recode_encoding;
10717 if (ender > 0xff) {
10724 UV result = UV_MAX; /* initialize to erroneous
10726 const char* error_msg;
10728 bool valid = grok_bslash_x(&p,
10731 TRUE, /* out warnings */
10732 FALSE, /* not strict */
10733 TRUE, /* Output warnings
10738 RExC_parse = p; /* going to die anyway; point
10739 to exact spot of failure */
10744 if (PL_encoding && ender < 0x100) {
10745 goto recode_encoding;
10747 if (ender > 0xff) {
10754 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10756 case '0': case '1': case '2': case '3':case '4':
10757 case '5': case '6': case '7':
10759 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10761 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10763 ender = grok_oct(p, &numlen, &flags, NULL);
10764 if (ender > 0xff) {
10768 if (SIZE_ONLY /* like \08, \178 */
10771 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
10773 reg_warn_non_literal_string(
10775 form_short_octal_warning(p, numlen));
10778 else { /* Not to be treated as an octal constant, go
10783 if (PL_encoding && ender < 0x100)
10784 goto recode_encoding;
10787 if (! RExC_override_recoding) {
10788 SV* enc = PL_encoding;
10789 ender = reg_recode((const char)(U8)ender, &enc);
10790 if (!enc && SIZE_ONLY)
10791 ckWARNreg(p, "Invalid escape in the specified encoding");
10797 FAIL("Trailing \\");
10800 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10801 /* Include any { following the alpha to emphasize
10802 * that it could be part of an escape at some point
10804 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
10805 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
10807 goto normal_default;
10808 } /* End of switch on '\' */
10810 default: /* A literal character */
10813 && RExC_flags & RXf_PMf_EXTENDED
10814 && ckWARN(WARN_DEPRECATED)
10815 && is_PATWS_non_low(p, UTF))
10817 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
10818 "Escape literal pattern white space under /x");
10822 if (UTF8_IS_START(*p) && UTF) {
10824 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10825 &numlen, UTF8_ALLOW_DEFAULT);
10831 } /* End of switch on the literal */
10833 /* Here, have looked at the literal character and <ender>
10834 * contains its ordinal, <p> points to the character after it
10837 if ( RExC_flags & RXf_PMf_EXTENDED)
10838 p = regwhite( pRExC_state, p );
10840 /* If the next thing is a quantifier, it applies to this
10841 * character only, which means that this character has to be in
10842 * its own node and can't just be appended to the string in an
10843 * existing node, so if there are already other characters in
10844 * the node, close the node with just them, and set up to do
10845 * this character again next time through, when it will be the
10846 * only thing in its new node */
10847 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10855 /* See comments for join_exact() as to why we fold
10856 * this non-UTF at compile time */
10857 || (node_type == EXACTFU
10858 && ender == LATIN_SMALL_LETTER_SHARP_S))
10862 /* Prime the casefolded buffer. Locale rules, which
10863 * apply only to code points < 256, aren't known until
10864 * execution, so for them, just output the original
10865 * character using utf8. If we start to fold non-UTF
10866 * patterns, be sure to update join_exact() */
10867 if (LOC && ender < 256) {
10868 if (UNI_IS_INVARIANT(ender)) {
10872 *s = UTF8_TWO_BYTE_HI(ender);
10873 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10878 UV folded = _to_uni_fold_flags(
10883 | ((LOC) ? FOLD_FLAGS_LOCALE
10884 : (ASCII_FOLD_RESTRICTED)
10885 ? FOLD_FLAGS_NOMIX_ASCII
10889 /* If this node only contains non-folding code
10890 * points so far, see if this new one is also
10893 if (folded != ender) {
10894 maybe_exact = FALSE;
10897 /* Here the fold is the original; we have
10898 * to check further to see if anything
10900 if (! PL_utf8_foldable) {
10901 SV* swash = swash_init("utf8",
10903 &PL_sv_undef, 1, 0);
10905 _get_swash_invlist(swash);
10906 SvREFCNT_dec_NN(swash);
10908 if (_invlist_contains_cp(PL_utf8_foldable,
10911 maybe_exact = FALSE;
10919 /* The loop increments <len> each time, as all but this
10920 * path (and the one just below for UTF) through it add
10921 * a single byte to the EXACTish node. But this one
10922 * has changed len to be the correct final value, so
10923 * subtract one to cancel out the increment that
10925 len += foldlen - 1;
10928 *(s++) = (char) ender;
10929 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10933 const STRLEN unilen = reguni(pRExC_state, ender, s);
10939 /* See comment just above for - 1 */
10943 REGC((char)ender, s++);
10946 if (next_is_quantifier) {
10948 /* Here, the next input is a quantifier, and to get here,
10949 * the current character is the only one in the node.
10950 * Also, here <len> doesn't include the final byte for this
10956 } /* End of loop through literal characters */
10958 /* Here we have either exhausted the input or ran out of room in
10959 * the node. (If we encountered a character that can't be in the
10960 * node, transfer is made directly to <loopdone>, and so we
10961 * wouldn't have fallen off the end of the loop.) In the latter
10962 * case, we artificially have to split the node into two, because
10963 * we just don't have enough space to hold everything. This
10964 * creates a problem if the final character participates in a
10965 * multi-character fold in the non-final position, as a match that
10966 * should have occurred won't, due to the way nodes are matched,
10967 * and our artificial boundary. So back off until we find a non-
10968 * problematic character -- one that isn't at the beginning or
10969 * middle of such a fold. (Either it doesn't participate in any
10970 * folds, or appears only in the final position of all the folds it
10971 * does participate in.) A better solution with far fewer false
10972 * positives, and that would fill the nodes more completely, would
10973 * be to actually have available all the multi-character folds to
10974 * test against, and to back-off only far enough to be sure that
10975 * this node isn't ending with a partial one. <upper_parse> is set
10976 * further below (if we need to reparse the node) to include just
10977 * up through that final non-problematic character that this code
10978 * identifies, so when it is set to less than the full node, we can
10979 * skip the rest of this */
10980 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10982 const STRLEN full_len = len;
10984 assert(len >= MAX_NODE_STRING_SIZE);
10986 /* Here, <s> points to the final byte of the final character.
10987 * Look backwards through the string until find a non-
10988 * problematic character */
10992 /* These two have no multi-char folds to non-UTF characters
10994 if (ASCII_FOLD_RESTRICTED || LOC) {
10998 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11002 if (! PL_NonL1NonFinalFold) {
11003 PL_NonL1NonFinalFold = _new_invlist_C_array(
11004 NonL1_Perl_Non_Final_Folds_invlist);
11007 /* Point to the first byte of the final character */
11008 s = (char *) utf8_hop((U8 *) s, -1);
11010 while (s >= s0) { /* Search backwards until find
11011 non-problematic char */
11012 if (UTF8_IS_INVARIANT(*s)) {
11014 /* There are no ascii characters that participate
11015 * in multi-char folds under /aa. In EBCDIC, the
11016 * non-ascii invariants are all control characters,
11017 * so don't ever participate in any folds. */
11018 if (ASCII_FOLD_RESTRICTED
11019 || ! IS_NON_FINAL_FOLD(*s))
11024 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11026 /* No Latin1 characters participate in multi-char
11027 * folds under /l */
11029 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11035 else if (! _invlist_contains_cp(
11036 PL_NonL1NonFinalFold,
11037 valid_utf8_to_uvchr((U8 *) s, NULL)))
11042 /* Here, the current character is problematic in that
11043 * it does occur in the non-final position of some
11044 * fold, so try the character before it, but have to
11045 * special case the very first byte in the string, so
11046 * we don't read outside the string */
11047 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11048 } /* End of loop backwards through the string */
11050 /* If there were only problematic characters in the string,
11051 * <s> will point to before s0, in which case the length
11052 * should be 0, otherwise include the length of the
11053 * non-problematic character just found */
11054 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11057 /* Here, have found the final character, if any, that is
11058 * non-problematic as far as ending the node without splitting
11059 * it across a potential multi-char fold. <len> contains the
11060 * number of bytes in the node up-to and including that
11061 * character, or is 0 if there is no such character, meaning
11062 * the whole node contains only problematic characters. In
11063 * this case, give up and just take the node as-is. We can't
11069 /* Here, the node does contain some characters that aren't
11070 * problematic. If one such is the final character in the
11071 * node, we are done */
11072 if (len == full_len) {
11075 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11077 /* If the final character is problematic, but the
11078 * penultimate is not, back-off that last character to
11079 * later start a new node with it */
11084 /* Here, the final non-problematic character is earlier
11085 * in the input than the penultimate character. What we do
11086 * is reparse from the beginning, going up only as far as
11087 * this final ok one, thus guaranteeing that the node ends
11088 * in an acceptable character. The reason we reparse is
11089 * that we know how far in the character is, but we don't
11090 * know how to correlate its position with the input parse.
11091 * An alternate implementation would be to build that
11092 * correlation as we go along during the original parse,
11093 * but that would entail extra work for every node, whereas
11094 * this code gets executed only when the string is too
11095 * large for the node, and the final two characters are
11096 * problematic, an infrequent occurrence. Yet another
11097 * possible strategy would be to save the tail of the
11098 * string, and the next time regatom is called, initialize
11099 * with that. The problem with this is that unless you
11100 * back off one more character, you won't be guaranteed
11101 * regatom will get called again, unless regbranch,
11102 * regpiece ... are also changed. If you do back off that
11103 * extra character, so that there is input guaranteed to
11104 * force calling regatom, you can't handle the case where
11105 * just the first character in the node is acceptable. I
11106 * (khw) decided to try this method which doesn't have that
11107 * pitfall; if performance issues are found, we can do a
11108 * combination of the current approach plus that one */
11114 } /* End of verifying node ends with an appropriate char */
11116 loopdone: /* Jumped to when encounters something that shouldn't be in
11119 /* If 'maybe_exact' is still set here, means there are no
11120 * code points in the node that participate in folds */
11121 if (FOLD && maybe_exact) {
11125 /* I (khw) don't know if you can get here with zero length, but the
11126 * old code handled this situation by creating a zero-length EXACT
11127 * node. Might as well be NOTHING instead */
11132 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11135 RExC_parse = p - 1;
11136 Set_Node_Cur_Length(ret); /* MJD */
11137 nextchar(pRExC_state);
11139 /* len is STRLEN which is unsigned, need to copy to signed */
11142 vFAIL("Internal disaster");
11145 } /* End of label 'defchar:' */
11147 } /* End of giant switch on input character */
11153 S_regwhite( RExC_state_t *pRExC_state, char *p )
11155 const char *e = RExC_end;
11157 PERL_ARGS_ASSERT_REGWHITE;
11162 else if (*p == '#') {
11165 if (*p++ == '\n') {
11171 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11180 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11182 /* Returns the next non-pattern-white space, non-comment character (the
11183 * latter only if 'recognize_comment is true) in the string p, which is
11184 * ended by RExC_end. If there is no line break ending a comment,
11185 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11186 const char *e = RExC_end;
11188 PERL_ARGS_ASSERT_REGPATWS;
11192 if ((len = is_PATWS_safe(p, e, UTF))) {
11195 else if (recognize_comment && *p == '#') {
11199 if (is_LNBREAK_safe(p, e, UTF)) {
11205 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11213 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11214 Character classes ([:foo:]) can also be negated ([:^foo:]).
11215 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11216 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11217 but trigger failures because they are currently unimplemented. */
11219 #define POSIXCC_DONE(c) ((c) == ':')
11220 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11221 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11223 PERL_STATIC_INLINE I32
11224 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me,
11228 I32 namedclass = OOB_NAMEDCLASS;
11230 PERL_ARGS_ASSERT_REGPPOSIXCC;
11232 if (value == '[' && RExC_parse + 1 < RExC_end &&
11233 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11234 POSIXCC(UCHARAT(RExC_parse)))
11236 const char c = UCHARAT(RExC_parse);
11237 char* const s = RExC_parse++;
11239 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11241 if (RExC_parse == RExC_end) {
11244 /* Try to give a better location for the error (than the end of
11245 * the string) by looking for the matching ']' */
11247 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11250 vFAIL2("Unmatched '%c' in POSIX class", c);
11252 /* Grandfather lone [:, [=, [. */
11256 const char* const t = RExC_parse++; /* skip over the c */
11259 if (UCHARAT(RExC_parse) == ']') {
11260 const char *posixcc = s + 1;
11261 RExC_parse++; /* skip over the ending ] */
11264 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11265 const I32 skip = t - posixcc;
11267 /* Initially switch on the length of the name. */
11270 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11271 this is the Perl \w
11273 namedclass = ANYOF_WORDCHAR;
11276 /* Names all of length 5. */
11277 /* alnum alpha ascii blank cntrl digit graph lower
11278 print punct space upper */
11279 /* Offset 4 gives the best switch position. */
11280 switch (posixcc[4]) {
11282 if (memEQ(posixcc, "alph", 4)) /* alpha */
11283 namedclass = ANYOF_ALPHA;
11286 if (memEQ(posixcc, "spac", 4)) /* space */
11287 namedclass = ANYOF_PSXSPC;
11290 if (memEQ(posixcc, "grap", 4)) /* graph */
11291 namedclass = ANYOF_GRAPH;
11294 if (memEQ(posixcc, "asci", 4)) /* ascii */
11295 namedclass = ANYOF_ASCII;
11298 if (memEQ(posixcc, "blan", 4)) /* blank */
11299 namedclass = ANYOF_BLANK;
11302 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11303 namedclass = ANYOF_CNTRL;
11306 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11307 namedclass = ANYOF_ALPHANUMERIC;
11310 if (memEQ(posixcc, "lowe", 4)) /* lower */
11311 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11312 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11313 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11316 if (memEQ(posixcc, "digi", 4)) /* digit */
11317 namedclass = ANYOF_DIGIT;
11318 else if (memEQ(posixcc, "prin", 4)) /* print */
11319 namedclass = ANYOF_PRINT;
11320 else if (memEQ(posixcc, "punc", 4)) /* punct */
11321 namedclass = ANYOF_PUNCT;
11326 if (memEQ(posixcc, "xdigit", 6))
11327 namedclass = ANYOF_XDIGIT;
11331 if (namedclass == OOB_NAMEDCLASS)
11332 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11335 /* The #defines are structured so each complement is +1 to
11336 * the normal one */
11340 assert (posixcc[skip] == ':');
11341 assert (posixcc[skip+1] == ']');
11342 } else if (!SIZE_ONLY) {
11343 /* [[=foo=]] and [[.foo.]] are still future. */
11345 /* adjust RExC_parse so the warning shows after
11346 the class closes */
11347 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11349 SvREFCNT_dec(free_me);
11350 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11353 /* Maternal grandfather:
11354 * "[:" ending in ":" but not in ":]" */
11356 vFAIL("Unmatched '[' in POSIX class");
11359 /* Grandfather lone [:, [=, [. */
11369 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11371 /* This applies some heuristics at the current parse position (which should
11372 * be at a '[') to see if what follows might be intended to be a [:posix:]
11373 * class. It returns true if it really is a posix class, of course, but it
11374 * also can return true if it thinks that what was intended was a posix
11375 * class that didn't quite make it.
11377 * It will return true for
11379 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11380 * ')' indicating the end of the (?[
11381 * [:any garbage including %^&$ punctuation:]
11383 * This is designed to be called only from S_handle_regex_sets; it could be
11384 * easily adapted to be called from the spot at the beginning of regclass()
11385 * that checks to see in a normal bracketed class if the surrounding []
11386 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11387 * change long-standing behavior, so I (khw) didn't do that */
11388 char* p = RExC_parse + 1;
11389 char first_char = *p;
11391 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11393 assert(*(p - 1) == '[');
11395 if (! POSIXCC(first_char)) {
11400 while (p < RExC_end && isWORDCHAR(*p)) p++;
11402 if (p >= RExC_end) {
11406 if (p - RExC_parse > 2 /* Got at least 1 word character */
11407 && (*p == first_char
11408 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11413 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11416 && p - RExC_parse > 2 /* [:] evaluates to colon;
11417 [::] is a bad posix class. */
11418 && first_char == *(p - 1));
11422 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11423 char * const oregcomp_parse)
11425 /* Handle the (?[...]) construct to do set operations */
11428 UV start, end; /* End points of code point ranges */
11430 char *save_end, *save_parse;
11435 const bool save_fold = FOLD;
11437 GET_RE_DEBUG_FLAGS_DECL;
11439 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11442 vFAIL("(?[...]) not valid in locale");
11444 RExC_uni_semantics = 1;
11446 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11447 * (such as EXACT). Thus we can skip most everything if just sizing. We
11448 * call regclass to handle '[]' so as to not have to reinvent its parsing
11449 * rules here (throwing away the size it computes each time). And, we exit
11450 * upon an unescaped ']' that isn't one ending a regclass. To do both
11451 * these things, we need to realize that something preceded by a backslash
11452 * is escaped, so we have to keep track of backslashes */
11455 Perl_ck_warner_d(aTHX_
11456 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11457 "The regex_sets feature is experimental" REPORT_LOCATION,
11458 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11460 while (RExC_parse < RExC_end) {
11461 SV* current = NULL;
11462 RExC_parse = regpatws(pRExC_state, RExC_parse,
11463 TRUE); /* means recognize comments */
11464 switch (*RExC_parse) {
11468 /* Skip the next byte (which could cause us to end up in
11469 * the middle of a UTF-8 character, but since none of those
11470 * are confusable with anything we currently handle in this
11471 * switch (invariants all), it's safe. We'll just hit the
11472 * default: case next time and keep on incrementing until
11473 * we find one of the invariants we do handle. */
11478 /* If this looks like it is a [:posix:] class, leave the
11479 * parse pointer at the '[' to fool regclass() into
11480 * thinking it is part of a '[[:posix:]]'. That function
11481 * will use strict checking to force a syntax error if it
11482 * doesn't work out to a legitimate class */
11483 bool is_posix_class
11484 = could_it_be_a_POSIX_class(pRExC_state);
11485 if (! is_posix_class) {
11489 (void) regclass(pRExC_state, flagp,depth+1,
11490 is_posix_class, /* parse the whole char
11491 class only if not a
11493 FALSE, /* don't allow multi-char folds */
11494 TRUE, /* silence non-portable warnings. */
11496 /* function call leaves parse pointing to the ']', except
11497 * if we faked it */
11498 if (is_posix_class) {
11502 SvREFCNT_dec(current); /* In case it returned something */
11508 if (RExC_parse < RExC_end
11509 && *RExC_parse == ')')
11511 node = reganode(pRExC_state, ANYOF, 0);
11512 RExC_size += ANYOF_SKIP;
11513 nextchar(pRExC_state);
11514 Set_Node_Length(node,
11515 RExC_parse - oregcomp_parse + 1); /* MJD */
11524 FAIL("Syntax error in (?[...])");
11527 /* Pass 2 only after this. Everything in this construct is a
11528 * metacharacter. Operands begin with either a '\' (for an escape
11529 * sequence), or a '[' for a bracketed character class. Any other
11530 * character should be an operator, or parenthesis for grouping. Both
11531 * types of operands are handled by calling regclass() to parse them. It
11532 * is called with a parameter to indicate to return the computed inversion
11533 * list. The parsing here is implemented via a stack. Each entry on the
11534 * stack is a single character representing one of the operators, or the
11535 * '('; or else a pointer to an operand inversion list. */
11537 #define IS_OPERAND(a) (! SvIOK(a))
11539 /* The stack starts empty. It is a syntax error if the first thing parsed
11540 * is a binary operator; everything else is pushed on the stack. When an
11541 * operand is parsed, the top of the stack is examined. If it is a binary
11542 * operator, the item before it should be an operand, and both are replaced
11543 * by the result of doing that operation on the new operand and the one on
11544 * the stack. Thus a sequence of binary operands is reduced to a single
11545 * one before the next one is parsed.
11547 * A unary operator may immediately follow a binary in the input, for
11550 * When an operand is parsed and the top of the stack is a unary operator,
11551 * the operation is performed, and then the stack is rechecked to see if
11552 * this new operand is part of a binary operation; if so, it is handled as
11555 * A '(' is simply pushed on the stack; it is valid only if the stack is
11556 * empty, or the top element of the stack is an operator or another '('
11557 * (for which the parenthesized expression will become an operand). By the
11558 * time the corresponding ')' is parsed everything in between should have
11559 * been parsed and evaluated to a single operand (or else is a syntax
11560 * error), and is handled as a regular operand */
11564 while (RExC_parse < RExC_end) {
11565 I32 top_index = av_tindex(stack);
11567 SV* current = NULL;
11569 /* Skip white space */
11570 RExC_parse = regpatws(pRExC_state, RExC_parse,
11571 TRUE); /* means recognize comments */
11572 if (RExC_parse >= RExC_end) {
11573 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11575 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11582 if (av_tindex(stack) >= 0 /* This makes sure that we can
11583 safely subtract 1 from
11584 RExC_parse in the next clause.
11585 If we have something on the
11586 stack, we have parsed something
11588 && UCHARAT(RExC_parse - 1) == '('
11589 && RExC_parse < RExC_end)
11591 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11592 * This happens when we have some thing like
11594 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11596 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11598 * Here we would be handling the interpolated
11599 * '$thai_or_lao'. We handle this by a recursive call to
11600 * ourselves which returns the inversion list the
11601 * interpolated expression evaluates to. We use the flags
11602 * from the interpolated pattern. */
11603 U32 save_flags = RExC_flags;
11604 const char * const save_parse = ++RExC_parse;
11606 parse_lparen_question_flags(pRExC_state);
11608 if (RExC_parse == save_parse /* Makes sure there was at
11609 least one flag (or this
11610 embedding wasn't compiled)
11612 || RExC_parse >= RExC_end - 4
11613 || UCHARAT(RExC_parse) != ':'
11614 || UCHARAT(++RExC_parse) != '('
11615 || UCHARAT(++RExC_parse) != '?'
11616 || UCHARAT(++RExC_parse) != '[')
11619 /* In combination with the above, this moves the
11620 * pointer to the point just after the first erroneous
11621 * character (or if there are no flags, to where they
11622 * should have been) */
11623 if (RExC_parse >= RExC_end - 4) {
11624 RExC_parse = RExC_end;
11626 else if (RExC_parse != save_parse) {
11627 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11629 vFAIL("Expecting '(?flags:(?[...'");
11632 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
11633 depth+1, oregcomp_parse);
11635 /* Here, 'current' contains the embedded expression's
11636 * inversion list, and RExC_parse points to the trailing
11637 * ']'; the next character should be the ')' which will be
11638 * paired with the '(' that has been put on the stack, so
11639 * the whole embedded expression reduces to '(operand)' */
11642 RExC_flags = save_flags;
11643 goto handle_operand;
11648 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11649 vFAIL("Unexpected character");
11652 (void) regclass(pRExC_state, flagp,depth+1,
11653 TRUE, /* means parse just the next thing */
11654 FALSE, /* don't allow multi-char folds */
11655 FALSE, /* don't silence non-portable warnings.
11658 /* regclass() will return with parsing just the \ sequence,
11659 * leaving the parse pointer at the next thing to parse */
11661 goto handle_operand;
11663 case '[': /* Is a bracketed character class */
11665 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11667 if (! is_posix_class) {
11671 (void) regclass(pRExC_state, flagp,depth+1,
11672 is_posix_class, /* parse the whole char class
11673 only if not a posix class */
11674 FALSE, /* don't allow multi-char folds */
11675 FALSE, /* don't silence non-portable warnings.
11678 /* function call leaves parse pointing to the ']', except if we
11680 if (is_posix_class) {
11684 goto handle_operand;
11693 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11694 || ! IS_OPERAND(*top_ptr))
11697 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
11699 av_push(stack, newSVuv(curchar));
11703 av_push(stack, newSVuv(curchar));
11707 if (top_index >= 0) {
11708 top_ptr = av_fetch(stack, top_index, FALSE);
11710 if (IS_OPERAND(*top_ptr)) {
11712 vFAIL("Unexpected '(' with no preceding operator");
11715 av_push(stack, newSVuv(curchar));
11722 || ! (current = av_pop(stack))
11723 || ! IS_OPERAND(current)
11724 || ! (lparen = av_pop(stack))
11725 || IS_OPERAND(lparen)
11726 || SvUV(lparen) != '(')
11729 vFAIL("Unexpected ')'");
11732 SvREFCNT_dec_NN(lparen);
11739 /* Here, we have an operand to process, in 'current' */
11741 if (top_index < 0) { /* Just push if stack is empty */
11742 av_push(stack, current);
11745 SV* top = av_pop(stack);
11746 char current_operator;
11748 if (IS_OPERAND(top)) {
11749 vFAIL("Operand with no preceding operator");
11751 current_operator = (char) SvUV(top);
11752 switch (current_operator) {
11753 case '(': /* Push the '(' back on followed by the new
11755 av_push(stack, top);
11756 av_push(stack, current);
11757 SvREFCNT_inc(top); /* Counters the '_dec' done
11758 just after the 'break', so
11759 it doesn't get wrongly freed
11764 _invlist_invert(current);
11766 /* Unlike binary operators, the top of the stack,
11767 * now that this unary one has been popped off, may
11768 * legally be an operator, and we now have operand
11771 SvREFCNT_dec_NN(top);
11772 goto handle_operand;
11775 _invlist_intersection(av_pop(stack),
11778 av_push(stack, current);
11783 _invlist_union(av_pop(stack), current, ¤t);
11784 av_push(stack, current);
11788 _invlist_subtract(av_pop(stack), current, ¤t);
11789 av_push(stack, current);
11792 case '^': /* The union minus the intersection */
11798 element = av_pop(stack);
11799 _invlist_union(element, current, &u);
11800 _invlist_intersection(element, current, &i);
11801 _invlist_subtract(u, i, ¤t);
11802 av_push(stack, current);
11803 SvREFCNT_dec_NN(i);
11804 SvREFCNT_dec_NN(u);
11805 SvREFCNT_dec_NN(element);
11810 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
11812 SvREFCNT_dec_NN(top);
11816 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11819 if (av_tindex(stack) < 0 /* Was empty */
11820 || ((final = av_pop(stack)) == NULL)
11821 || ! IS_OPERAND(final)
11822 || av_tindex(stack) >= 0) /* More left on stack */
11824 vFAIL("Incomplete expression within '(?[ ])'");
11827 /* Here, 'final' is the resultant inversion list from evaluating the
11828 * expression. Return it if so requested */
11829 if (return_invlist) {
11830 *return_invlist = final;
11834 /* Otherwise generate a resultant node, based on 'final'. regclass() is
11835 * expecting a string of ranges and individual code points */
11836 invlist_iterinit(final);
11837 result_string = newSVpvs("");
11838 while (invlist_iternext(final, &start, &end)) {
11839 if (start == end) {
11840 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
11843 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
11848 save_parse = RExC_parse;
11849 RExC_parse = SvPV(result_string, len);
11850 save_end = RExC_end;
11851 RExC_end = RExC_parse + len;
11853 /* We turn off folding around the call, as the class we have constructed
11854 * already has all folding taken into consideration, and we don't want
11855 * regclass() to add to that */
11856 RExC_flags &= ~RXf_PMf_FOLD;
11857 node = regclass(pRExC_state, flagp,depth+1,
11858 FALSE, /* means parse the whole char class */
11859 FALSE, /* don't allow multi-char folds */
11860 TRUE, /* silence non-portable warnings. The above may very
11861 well have generated non-portable code points, but
11862 they're valid on this machine */
11865 RExC_flags |= RXf_PMf_FOLD;
11867 RExC_parse = save_parse + 1;
11868 RExC_end = save_end;
11869 SvREFCNT_dec_NN(final);
11870 SvREFCNT_dec_NN(result_string);
11871 SvREFCNT_dec_NN(stack);
11873 nextchar(pRExC_state);
11874 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
11879 /* The names of properties whose definitions are not known at compile time are
11880 * stored in this SV, after a constant heading. So if the length has been
11881 * changed since initialization, then there is a run-time definition. */
11882 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11885 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
11886 const bool stop_at_1, /* Just parse the next thing, don't
11887 look for a full character class */
11888 bool allow_multi_folds,
11889 const bool silence_non_portable, /* Don't output warnings
11892 SV** ret_invlist) /* Return an inversion list, not a node */
11894 /* parse a bracketed class specification. Most of these will produce an
11895 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
11896 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
11897 * under /i with multi-character folds: it will be rewritten following the
11898 * paradigm of this example, where the <multi-fold>s are characters which
11899 * fold to multiple character sequences:
11900 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11901 * gets effectively rewritten as:
11902 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11903 * reg() gets called (recursively) on the rewritten version, and this
11904 * function will return what it constructs. (Actually the <multi-fold>s
11905 * aren't physically removed from the [abcdefghi], it's just that they are
11906 * ignored in the recursion by means of a flag:
11907 * <RExC_in_multi_char_class>.)
11909 * ANYOF nodes contain a bit map for the first 256 characters, with the
11910 * corresponding bit set if that character is in the list. For characters
11911 * above 255, a range list or swash is used. There are extra bits for \w,
11912 * etc. in locale ANYOFs, as what these match is not determinable at
11916 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11918 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11921 IV namedclass = OOB_NAMEDCLASS;
11922 char *rangebegin = NULL;
11923 bool need_class = 0;
11925 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11926 than just initialized. */
11927 SV* properties = NULL; /* Code points that match \p{} \P{} */
11928 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11929 extended beyond the Latin1 range */
11930 UV element_count = 0; /* Number of distinct elements in the class.
11931 Optimizations may be possible if this is tiny */
11932 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11933 character; used under /i */
11935 char * stop_ptr = RExC_end; /* where to stop parsing */
11936 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
11938 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
11940 /* Unicode properties are stored in a swash; this holds the current one
11941 * being parsed. If this swash is the only above-latin1 component of the
11942 * character class, an optimization is to pass it directly on to the
11943 * execution engine. Otherwise, it is set to NULL to indicate that there
11944 * are other things in the class that have to be dealt with at execution
11946 SV* swash = NULL; /* Code points that match \p{} \P{} */
11948 /* Set if a component of this character class is user-defined; just passed
11949 * on to the engine */
11950 bool has_user_defined_property = FALSE;
11952 /* inversion list of code points this node matches only when the target
11953 * string is in UTF-8. (Because is under /d) */
11954 SV* depends_list = NULL;
11956 /* inversion list of code points this node matches. For much of the
11957 * function, it includes only those that match regardless of the utf8ness
11958 * of the target string */
11959 SV* cp_list = NULL;
11962 /* In a range, counts how many 0-2 of the ends of it came from literals,
11963 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11964 UV literal_endpoint = 0;
11966 bool invert = FALSE; /* Is this class to be complemented */
11968 /* Is there any thing like \W or [:^digit:] that matches above the legal
11969 * Unicode range? */
11970 bool runtime_posix_matches_above_Unicode = FALSE;
11972 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11973 case we need to change the emitted regop to an EXACT. */
11974 const char * orig_parse = RExC_parse;
11975 const I32 orig_size = RExC_size;
11976 GET_RE_DEBUG_FLAGS_DECL;
11978 PERL_ARGS_ASSERT_REGCLASS;
11980 PERL_UNUSED_ARG(depth);
11983 DEBUG_PARSE("clas");
11985 /* Assume we are going to generate an ANYOF node. */
11986 ret = reganode(pRExC_state, ANYOF, 0);
11989 RExC_size += ANYOF_SKIP;
11990 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11993 ANYOF_FLAGS(ret) = 0;
11995 RExC_emit += ANYOF_SKIP;
11997 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11999 listsv = newSVpvs("# comment\n");
12000 initial_listsv_len = SvCUR(listsv);
12004 RExC_parse = regpatws(pRExC_state, RExC_parse,
12005 FALSE /* means don't recognize comments */);
12008 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12011 allow_multi_folds = FALSE;
12014 RExC_parse = regpatws(pRExC_state, RExC_parse,
12015 FALSE /* means don't recognize comments */);
12019 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12020 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12021 const char *s = RExC_parse;
12022 const char c = *s++;
12024 while (isWORDCHAR(*s))
12026 if (*s && c == *s && s[1] == ']') {
12027 SAVEFREESV(RExC_rx_sv);
12028 SAVEFREESV(listsv);
12030 "POSIX syntax [%c %c] belongs inside character classes",
12032 (void)ReREFCNT_inc(RExC_rx_sv);
12033 SvREFCNT_inc_simple_void_NN(listsv);
12037 /* If the caller wants us to just parse a single element, accomplish this
12038 * by faking the loop ending condition */
12039 if (stop_at_1 && RExC_end > RExC_parse) {
12040 stop_ptr = RExC_parse + 1;
12043 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12044 if (UCHARAT(RExC_parse) == ']')
12045 goto charclassloop;
12049 if (RExC_parse >= stop_ptr) {
12054 RExC_parse = regpatws(pRExC_state, RExC_parse,
12055 FALSE /* means don't recognize comments */);
12058 if (UCHARAT(RExC_parse) == ']') {
12064 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12065 save_value = value;
12066 save_prevvalue = prevvalue;
12069 rangebegin = RExC_parse;
12073 value = utf8n_to_uvchr((U8*)RExC_parse,
12074 RExC_end - RExC_parse,
12075 &numlen, UTF8_ALLOW_DEFAULT);
12076 RExC_parse += numlen;
12079 value = UCHARAT(RExC_parse++);
12082 && RExC_parse < RExC_end
12083 && POSIXCC(UCHARAT(RExC_parse)))
12085 namedclass = regpposixcc(pRExC_state, value, listsv, strict);
12087 else if (value == '\\') {
12089 value = utf8n_to_uvchr((U8*)RExC_parse,
12090 RExC_end - RExC_parse,
12091 &numlen, UTF8_ALLOW_DEFAULT);
12092 RExC_parse += numlen;
12095 value = UCHARAT(RExC_parse++);
12097 /* Some compilers cannot handle switching on 64-bit integer
12098 * values, therefore value cannot be an UV. Yes, this will
12099 * be a problem later if we want switch on Unicode.
12100 * A similar issue a little bit later when switching on
12101 * namedclass. --jhi */
12103 /* If the \ is escaping white space when white space is being
12104 * skipped, it means that that white space is wanted literally, and
12105 * is already in 'value'. Otherwise, need to translate the escape
12106 * into what it signifies. */
12107 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12109 case 'w': namedclass = ANYOF_WORDCHAR; break;
12110 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12111 case 's': namedclass = ANYOF_SPACE; break;
12112 case 'S': namedclass = ANYOF_NSPACE; break;
12113 case 'd': namedclass = ANYOF_DIGIT; break;
12114 case 'D': namedclass = ANYOF_NDIGIT; break;
12115 case 'v': namedclass = ANYOF_VERTWS; break;
12116 case 'V': namedclass = ANYOF_NVERTWS; break;
12117 case 'h': namedclass = ANYOF_HORIZWS; break;
12118 case 'H': namedclass = ANYOF_NHORIZWS; break;
12119 case 'N': /* Handle \N{NAME} in class */
12121 /* We only pay attention to the first char of
12122 multichar strings being returned. I kinda wonder
12123 if this makes sense as it does change the behaviour
12124 from earlier versions, OTOH that behaviour was broken
12126 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12127 TRUE, /* => charclass */
12139 /* We will handle any undefined properties ourselves */
12140 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12142 if (RExC_parse >= RExC_end)
12143 vFAIL2("Empty \\%c{}", (U8)value);
12144 if (*RExC_parse == '{') {
12145 const U8 c = (U8)value;
12146 e = strchr(RExC_parse++, '}');
12148 vFAIL2("Missing right brace on \\%c{}", c);
12149 while (isSPACE(UCHARAT(RExC_parse)))
12151 if (e == RExC_parse)
12152 vFAIL2("Empty \\%c{}", c);
12153 n = e - RExC_parse;
12154 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12165 if (UCHARAT(RExC_parse) == '^') {
12168 /* toggle. (The rhs xor gets the single bit that
12169 * differs between P and p; the other xor inverts just
12171 value ^= 'P' ^ 'p';
12173 while (isSPACE(UCHARAT(RExC_parse))) {
12178 /* Try to get the definition of the property into
12179 * <invlist>. If /i is in effect, the effective property
12180 * will have its name be <__NAME_i>. The design is
12181 * discussed in commit
12182 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12183 Newx(name, n + sizeof("_i__\n"), char);
12185 sprintf(name, "%s%.*s%s\n",
12186 (FOLD) ? "__" : "",
12192 /* Look up the property name, and get its swash and
12193 * inversion list, if the property is found */
12195 SvREFCNT_dec_NN(swash);
12197 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12200 NULL, /* No inversion list */
12203 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12205 SvREFCNT_dec_NN(swash);
12209 /* Here didn't find it. It could be a user-defined
12210 * property that will be available at run-time. If we
12211 * accept only compile-time properties, is an error;
12212 * otherwise add it to the list for run-time look up */
12214 RExC_parse = e + 1;
12215 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12217 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12218 (value == 'p' ? '+' : '!'),
12220 has_user_defined_property = TRUE;
12222 /* We don't know yet, so have to assume that the
12223 * property could match something in the Latin1 range,
12224 * hence something that isn't utf8. Note that this
12225 * would cause things in <depends_list> to match
12226 * inappropriately, except that any \p{}, including
12227 * this one forces Unicode semantics, which means there
12228 * is <no depends_list> */
12229 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12233 /* Here, did get the swash and its inversion list. If
12234 * the swash is from a user-defined property, then this
12235 * whole character class should be regarded as such */
12236 has_user_defined_property =
12238 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12240 /* Invert if asking for the complement */
12241 if (value == 'P') {
12242 _invlist_union_complement_2nd(properties,
12246 /* The swash can't be used as-is, because we've
12247 * inverted things; delay removing it to here after
12248 * have copied its invlist above */
12249 SvREFCNT_dec_NN(swash);
12253 _invlist_union(properties, invlist, &properties);
12258 RExC_parse = e + 1;
12259 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12262 /* \p means they want Unicode semantics */
12263 RExC_uni_semantics = 1;
12266 case 'n': value = '\n'; break;
12267 case 'r': value = '\r'; break;
12268 case 't': value = '\t'; break;
12269 case 'f': value = '\f'; break;
12270 case 'b': value = '\b'; break;
12271 case 'e': value = ASCII_TO_NATIVE('\033');break;
12272 case 'a': value = ASCII_TO_NATIVE('\007');break;
12274 RExC_parse--; /* function expects to be pointed at the 'o' */
12276 const char* error_msg;
12277 bool valid = grok_bslash_o(&RExC_parse,
12280 SIZE_ONLY, /* warnings in pass
12283 silence_non_portable,
12289 if (PL_encoding && value < 0x100) {
12290 goto recode_encoding;
12294 RExC_parse--; /* function expects to be pointed at the 'x' */
12296 const char* error_msg;
12297 bool valid = grok_bslash_x(&RExC_parse,
12300 TRUE, /* Output warnings */
12302 silence_non_portable,
12308 if (PL_encoding && value < 0x100)
12309 goto recode_encoding;
12312 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12314 case '0': case '1': case '2': case '3': case '4':
12315 case '5': case '6': case '7':
12317 /* Take 1-3 octal digits */
12318 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12319 numlen = (strict) ? 4 : 3;
12320 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12321 RExC_parse += numlen;
12323 SAVEFREESV(listsv); /* In case warnings are fatalized */
12325 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12326 vFAIL("Need exactly 3 octal digits");
12328 else if (! SIZE_ONLY /* like \08, \178 */
12330 && RExC_parse < RExC_end
12331 && isDIGIT(*RExC_parse)
12332 && ckWARN(WARN_REGEXP))
12334 SAVEFREESV(RExC_rx_sv);
12335 reg_warn_non_literal_string(
12337 form_short_octal_warning(RExC_parse, numlen));
12338 (void)ReREFCNT_inc(RExC_rx_sv);
12340 SvREFCNT_inc_simple_void_NN(listsv);
12342 if (PL_encoding && value < 0x100)
12343 goto recode_encoding;
12347 if (! RExC_override_recoding) {
12348 SV* enc = PL_encoding;
12349 value = reg_recode((const char)(U8)value, &enc);
12352 vFAIL("Invalid escape in the specified encoding");
12354 else if (SIZE_ONLY) {
12355 ckWARNreg(RExC_parse,
12356 "Invalid escape in the specified encoding");
12362 /* Allow \_ to not give an error */
12363 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12364 SAVEFREESV(listsv);
12366 vFAIL2("Unrecognized escape \\%c in character class",
12370 SAVEFREESV(RExC_rx_sv);
12371 ckWARN2reg(RExC_parse,
12372 "Unrecognized escape \\%c in character class passed through",
12374 (void)ReREFCNT_inc(RExC_rx_sv);
12376 SvREFCNT_inc_simple_void_NN(listsv);
12379 } /* End of switch on char following backslash */
12380 } /* end of handling backslash escape sequences */
12383 literal_endpoint++;
12386 /* Here, we have the current token in 'value' */
12388 /* What matches in a locale is not known until runtime. This includes
12389 * what the Posix classes (like \w, [:space:]) match. Room must be
12390 * reserved (one time per class) to store such classes, either if Perl
12391 * is compiled so that locale nodes always should have this space, or
12392 * if there is such class info to be stored. The space will contain a
12393 * bit for each named class that is to be matched against. This isn't
12394 * needed for \p{} and pseudo-classes, as they are not affected by
12395 * locale, and hence are dealt with separately */
12398 && (ANYOF_LOCALE == ANYOF_CLASS
12399 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12403 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12406 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12407 ANYOF_CLASS_ZERO(ret);
12409 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12412 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12414 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12415 * literal, as is the character that began the false range, i.e.
12416 * the 'a' in the examples */
12419 const int w = (RExC_parse >= rangebegin)
12420 ? RExC_parse - rangebegin
12422 SAVEFREESV(listsv); /* in case of fatal warnings */
12424 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12427 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12428 ckWARN4reg(RExC_parse,
12429 "False [] range \"%*.*s\"",
12431 (void)ReREFCNT_inc(RExC_rx_sv);
12432 cp_list = add_cp_to_invlist(cp_list, '-');
12433 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12435 SvREFCNT_inc_simple_void_NN(listsv);
12438 range = 0; /* this was not a true range */
12439 element_count += 2; /* So counts for three values */
12443 U8 classnum = namedclass_to_classnum(namedclass);
12444 if (namedclass >= ANYOF_MAX) { /* If a special class */
12445 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12447 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12448 * /l make a difference in what these match. There
12449 * would be problems if these characters had folds
12450 * other than themselves, as cp_list is subject to
12452 if (classnum != _CC_VERTSPACE) {
12453 assert( namedclass == ANYOF_HORIZWS
12454 || namedclass == ANYOF_NHORIZWS);
12456 /* It turns out that \h is just a synonym for
12458 classnum = _CC_BLANK;
12461 _invlist_union_maybe_complement_2nd(
12463 PL_XPosix_ptrs[classnum],
12464 cBOOL(namedclass % 2), /* Complement if odd
12465 (NHORIZWS, NVERTWS)
12470 else if (classnum == _CC_ASCII) {
12473 ANYOF_CLASS_SET(ret, namedclass);
12476 #endif /* Not isascii(); just use the hard-coded definition for it */
12477 _invlist_union_maybe_complement_2nd(
12480 cBOOL(namedclass % 2), /* Complement if odd
12484 else { /* Garden variety class */
12486 /* The ascii range inversion list */
12487 SV* ascii_source = PL_Posix_ptrs[classnum];
12489 /* The full Latin1 range inversion list */
12490 SV* l1_source = PL_L1Posix_ptrs[classnum];
12492 /* This code is structured into two major clauses. The
12493 * first is for classes whose complete definitions may not
12494 * already be known. It not, the Latin1 definition
12495 * (guaranteed to already known) is used plus code is
12496 * generated to load the rest at run-time (only if needed).
12497 * If the complete definition is known, it drops down to
12498 * the second clause, where the complete definition is
12501 if (classnum < _FIRST_NON_SWASH_CC) {
12503 /* Here, the class has a swash, which may or not
12504 * already be loaded */
12506 /* The name of the property to use to match the full
12507 * eXtended Unicode range swash for this character
12509 const char *Xname = swash_property_names[classnum];
12511 /* If returning the inversion list, we can't defer
12512 * getting this until runtime */
12513 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12514 PL_utf8_swash_ptrs[classnum] =
12515 _core_swash_init("utf8", Xname, &PL_sv_undef,
12518 NULL, /* No inversion list */
12519 NULL /* No flags */
12521 assert(PL_utf8_swash_ptrs[classnum]);
12523 if ( ! PL_utf8_swash_ptrs[classnum]) {
12524 if (namedclass % 2 == 0) { /* A non-complemented
12526 /* If not /a matching, there are code points we
12527 * don't know at compile time. Arrange for the
12528 * unknown matches to be loaded at run-time, if
12530 if (! AT_LEAST_ASCII_RESTRICTED) {
12531 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12534 if (LOC) { /* Under locale, set run-time
12536 ANYOF_CLASS_SET(ret, namedclass);
12539 /* Add the current class's code points to
12540 * the running total */
12541 _invlist_union(posixes,
12542 (AT_LEAST_ASCII_RESTRICTED)
12548 else { /* A complemented class */
12549 if (AT_LEAST_ASCII_RESTRICTED) {
12550 /* Under /a should match everything above
12551 * ASCII, plus the complement of the set's
12553 _invlist_union_complement_2nd(posixes,
12558 /* Arrange for the unknown matches to be
12559 * loaded at run-time, if needed */
12560 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12562 runtime_posix_matches_above_Unicode = TRUE;
12564 ANYOF_CLASS_SET(ret, namedclass);
12568 /* We want to match everything in
12569 * Latin1, except those things that
12570 * l1_source matches */
12571 SV* scratch_list = NULL;
12572 _invlist_subtract(PL_Latin1, l1_source,
12575 /* Add the list from this class to the
12578 posixes = scratch_list;
12581 _invlist_union(posixes,
12584 SvREFCNT_dec_NN(scratch_list);
12586 if (DEPENDS_SEMANTICS) {
12588 |= ANYOF_NON_UTF8_LATIN1_ALL;
12593 goto namedclass_done;
12596 /* Here, there is a swash loaded for the class. If no
12597 * inversion list for it yet, get it */
12598 if (! PL_XPosix_ptrs[classnum]) {
12599 PL_XPosix_ptrs[classnum]
12600 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12604 /* Here there is an inversion list already loaded for the
12607 if (namedclass % 2 == 0) { /* A non-complemented class,
12608 like ANYOF_PUNCT */
12610 /* For non-locale, just add it to any existing list
12612 _invlist_union(posixes,
12613 (AT_LEAST_ASCII_RESTRICTED)
12615 : PL_XPosix_ptrs[classnum],
12618 else { /* Locale */
12619 SV* scratch_list = NULL;
12621 /* For above Latin1 code points, we use the full
12623 _invlist_intersection(PL_AboveLatin1,
12624 PL_XPosix_ptrs[classnum],
12626 /* And set the output to it, adding instead if
12627 * there already is an output. Checking if
12628 * 'posixes' is NULL first saves an extra clone.
12629 * Its reference count will be decremented at the
12630 * next union, etc, or if this is the only
12631 * instance, at the end of the routine */
12633 posixes = scratch_list;
12636 _invlist_union(posixes, scratch_list, &posixes);
12637 SvREFCNT_dec_NN(scratch_list);
12640 #ifndef HAS_ISBLANK
12641 if (namedclass != ANYOF_BLANK) {
12643 /* Set this class in the node for runtime
12645 ANYOF_CLASS_SET(ret, namedclass);
12646 #ifndef HAS_ISBLANK
12649 /* No isblank(), use the hard-coded ASCII-range
12650 * blanks, adding them to the running total. */
12652 _invlist_union(posixes, ascii_source, &posixes);
12657 else { /* A complemented class, like ANYOF_NPUNCT */
12659 _invlist_union_complement_2nd(
12661 (AT_LEAST_ASCII_RESTRICTED)
12663 : PL_XPosix_ptrs[classnum],
12665 /* Under /d, everything in the upper half of the
12666 * Latin1 range matches this complement */
12667 if (DEPENDS_SEMANTICS) {
12668 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12671 else { /* Locale */
12672 SV* scratch_list = NULL;
12673 _invlist_subtract(PL_AboveLatin1,
12674 PL_XPosix_ptrs[classnum],
12677 posixes = scratch_list;
12680 _invlist_union(posixes, scratch_list, &posixes);
12681 SvREFCNT_dec_NN(scratch_list);
12683 #ifndef HAS_ISBLANK
12684 if (namedclass != ANYOF_NBLANK) {
12686 ANYOF_CLASS_SET(ret, namedclass);
12687 #ifndef HAS_ISBLANK
12690 /* Get the list of all code points in Latin1
12691 * that are not ASCII blanks, and add them to
12692 * the running total */
12693 _invlist_subtract(PL_Latin1, ascii_source,
12695 _invlist_union(posixes, scratch_list, &posixes);
12696 SvREFCNT_dec_NN(scratch_list);
12703 continue; /* Go get next character */
12705 } /* end of namedclass \blah */
12707 /* Here, we have a single value. If 'range' is set, it is the ending
12708 * of a range--check its validity. Later, we will handle each
12709 * individual code point in the range. If 'range' isn't set, this
12710 * could be the beginning of a range, so check for that by looking
12711 * ahead to see if the next real character to be processed is the range
12712 * indicator--the minus sign */
12715 RExC_parse = regpatws(pRExC_state, RExC_parse,
12716 FALSE /* means don't recognize comments */);
12720 if (prevvalue > value) /* b-a */ {
12721 const int w = RExC_parse - rangebegin;
12722 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12723 range = 0; /* not a valid range */
12727 prevvalue = value; /* save the beginning of the potential range */
12728 if (! stop_at_1 /* Can't be a range if parsing just one thing */
12729 && *RExC_parse == '-')
12731 char* next_char_ptr = RExC_parse + 1;
12732 if (skip_white) { /* Get the next real char after the '-' */
12733 next_char_ptr = regpatws(pRExC_state,
12735 FALSE); /* means don't recognize
12739 /* If the '-' is at the end of the class (just before the ']',
12740 * it is a literal minus; otherwise it is a range */
12741 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
12742 RExC_parse = next_char_ptr;
12744 /* a bad range like \w-, [:word:]- ? */
12745 if (namedclass > OOB_NAMEDCLASS) {
12746 if (strict || ckWARN(WARN_REGEXP)) {
12748 RExC_parse >= rangebegin ?
12749 RExC_parse - rangebegin : 0;
12751 vFAIL4("False [] range \"%*.*s\"",
12756 "False [] range \"%*.*s\"",
12761 cp_list = add_cp_to_invlist(cp_list, '-');
12765 range = 1; /* yeah, it's a range! */
12766 continue; /* but do it the next time */
12771 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12774 /* non-Latin1 code point implies unicode semantics. Must be set in
12775 * pass1 so is there for the whole of pass 2 */
12777 RExC_uni_semantics = 1;
12780 /* Ready to process either the single value, or the completed range.
12781 * For single-valued non-inverted ranges, we consider the possibility
12782 * of multi-char folds. (We made a conscious decision to not do this
12783 * for the other cases because it can often lead to non-intuitive
12784 * results. For example, you have the peculiar case that:
12785 * "s s" =~ /^[^\xDF]+$/i => Y
12786 * "ss" =~ /^[^\xDF]+$/i => N
12788 * See [perl #89750] */
12789 if (FOLD && allow_multi_folds && value == prevvalue) {
12790 if (value == LATIN_SMALL_LETTER_SHARP_S
12791 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12794 /* Here <value> is indeed a multi-char fold. Get what it is */
12796 U8 foldbuf[UTF8_MAXBYTES_CASE];
12799 UV folded = _to_uni_fold_flags(
12804 | ((LOC) ? FOLD_FLAGS_LOCALE
12805 : (ASCII_FOLD_RESTRICTED)
12806 ? FOLD_FLAGS_NOMIX_ASCII
12810 /* Here, <folded> should be the first character of the
12811 * multi-char fold of <value>, with <foldbuf> containing the
12812 * whole thing. But, if this fold is not allowed (because of
12813 * the flags), <fold> will be the same as <value>, and should
12814 * be processed like any other character, so skip the special
12816 if (folded != value) {
12818 /* Skip if we are recursed, currently parsing the class
12819 * again. Otherwise add this character to the list of
12820 * multi-char folds. */
12821 if (! RExC_in_multi_char_class) {
12822 AV** this_array_ptr;
12824 STRLEN cp_count = utf8_length(foldbuf,
12825 foldbuf + foldlen);
12826 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12828 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12831 if (! multi_char_matches) {
12832 multi_char_matches = newAV();
12835 /* <multi_char_matches> is actually an array of arrays.
12836 * There will be one or two top-level elements: [2],
12837 * and/or [3]. The [2] element is an array, each
12838 * element thereof is a character which folds to two
12839 * characters; likewise for [3]. (Unicode guarantees a
12840 * maximum of 3 characters in any fold.) When we
12841 * rewrite the character class below, we will do so
12842 * such that the longest folds are written first, so
12843 * that it prefers the longest matching strings first.
12844 * This is done even if it turns out that any
12845 * quantifier is non-greedy, out of programmer
12846 * laziness. Tom Christiansen has agreed that this is
12847 * ok. This makes the test for the ligature 'ffi' come
12848 * before the test for 'ff' */
12849 if (av_exists(multi_char_matches, cp_count)) {
12850 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12852 this_array = *this_array_ptr;
12855 this_array = newAV();
12856 av_store(multi_char_matches, cp_count,
12859 av_push(this_array, multi_fold);
12862 /* This element should not be processed further in this
12865 value = save_value;
12866 prevvalue = save_prevvalue;
12872 /* Deal with this element of the class */
12875 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12877 SV* this_range = _new_invlist(1);
12878 _append_range_to_invlist(this_range, prevvalue, value);
12880 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12881 * If this range was specified using something like 'i-j', we want
12882 * to include only the 'i' and the 'j', and not anything in
12883 * between, so exclude non-ASCII, non-alphabetics from it.
12884 * However, if the range was specified with something like
12885 * [\x89-\x91] or [\x89-j], all code points within it should be
12886 * included. literal_endpoint==2 means both ends of the range used
12887 * a literal character, not \x{foo} */
12888 if (literal_endpoint == 2
12889 && (prevvalue >= 'a' && value <= 'z')
12890 || (prevvalue >= 'A' && value <= 'Z'))
12892 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
12895 _invlist_union(cp_list, this_range, &cp_list);
12896 literal_endpoint = 0;
12900 range = 0; /* this range (if it was one) is done now */
12901 } /* End of loop through all the text within the brackets */
12903 /* If anything in the class expands to more than one character, we have to
12904 * deal with them by building up a substitute parse string, and recursively
12905 * calling reg() on it, instead of proceeding */
12906 if (multi_char_matches) {
12907 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12910 char *save_end = RExC_end;
12911 char *save_parse = RExC_parse;
12912 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12917 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12918 because too confusing */
12920 sv_catpv(substitute_parse, "(?:");
12924 /* Look at the longest folds first */
12925 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12927 if (av_exists(multi_char_matches, cp_count)) {
12928 AV** this_array_ptr;
12931 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12933 while ((this_sequence = av_pop(*this_array_ptr)) !=
12936 if (! first_time) {
12937 sv_catpv(substitute_parse, "|");
12939 first_time = FALSE;
12941 sv_catpv(substitute_parse, SvPVX(this_sequence));
12946 /* If the character class contains anything else besides these
12947 * multi-character folds, have to include it in recursive parsing */
12948 if (element_count) {
12949 sv_catpv(substitute_parse, "|[");
12950 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12951 sv_catpv(substitute_parse, "]");
12954 sv_catpv(substitute_parse, ")");
12957 /* This is a way to get the parse to skip forward a whole named
12958 * sequence instead of matching the 2nd character when it fails the
12960 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12964 RExC_parse = SvPV(substitute_parse, len);
12965 RExC_end = RExC_parse + len;
12966 RExC_in_multi_char_class = 1;
12967 RExC_emit = (regnode *)orig_emit;
12969 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12971 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12973 RExC_parse = save_parse;
12974 RExC_end = save_end;
12975 RExC_in_multi_char_class = 0;
12976 SvREFCNT_dec_NN(multi_char_matches);
12977 SvREFCNT_dec_NN(listsv);
12981 /* If the character class contains only a single element, it may be
12982 * optimizable into another node type which is smaller and runs faster.
12983 * Check if this is the case for this class */
12984 if (element_count == 1 && ! ret_invlist) {
12988 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12989 [:digit:] or \p{foo} */
12991 /* All named classes are mapped into POSIXish nodes, with its FLAG
12992 * argument giving which class it is */
12993 switch ((I32)namedclass) {
12994 case ANYOF_UNIPROP:
12997 /* These don't depend on the charset modifiers. They always
12998 * match under /u rules */
12999 case ANYOF_NHORIZWS:
13000 case ANYOF_HORIZWS:
13001 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13004 case ANYOF_NVERTWS:
13009 /* The actual POSIXish node for all the rest depends on the
13010 * charset modifier. The ones in the first set depend only on
13011 * ASCII or, if available on this platform, locale */
13015 op = (LOC) ? POSIXL : POSIXA;
13026 /* under /a could be alpha */
13028 if (ASCII_RESTRICTED) {
13029 namedclass = ANYOF_ALPHA + (namedclass % 2);
13037 /* The rest have more possibilities depending on the charset.
13038 * We take advantage of the enum ordering of the charset
13039 * modifiers to get the exact node type, */
13041 op = POSIXD + get_regex_charset(RExC_flags);
13042 if (op > POSIXA) { /* /aa is same as /a */
13045 #ifndef HAS_ISBLANK
13047 && (namedclass == ANYOF_BLANK
13048 || namedclass == ANYOF_NBLANK))
13055 /* The odd numbered ones are the complements of the
13056 * next-lower even number one */
13057 if (namedclass % 2 == 1) {
13061 arg = namedclass_to_classnum(namedclass);
13065 else if (value == prevvalue) {
13067 /* Here, the class consists of just a single code point */
13070 if (! LOC && value == '\n') {
13071 op = REG_ANY; /* Optimize [^\n] */
13072 *flagp |= HASWIDTH|SIMPLE;
13076 else if (value < 256 || UTF) {
13078 /* Optimize a single value into an EXACTish node, but not if it
13079 * would require converting the pattern to UTF-8. */
13080 op = compute_EXACTish(pRExC_state);
13082 } /* Otherwise is a range */
13083 else if (! LOC) { /* locale could vary these */
13084 if (prevvalue == '0') {
13085 if (value == '9') {
13092 /* Here, we have changed <op> away from its initial value iff we found
13093 * an optimization */
13096 /* Throw away this ANYOF regnode, and emit the calculated one,
13097 * which should correspond to the beginning, not current, state of
13099 const char * cur_parse = RExC_parse;
13100 RExC_parse = (char *)orig_parse;
13104 /* To get locale nodes to not use the full ANYOF size would
13105 * require moving the code above that writes the portions
13106 * of it that aren't in other nodes to after this point.
13107 * e.g. ANYOF_CLASS_SET */
13108 RExC_size = orig_size;
13112 RExC_emit = (regnode *)orig_emit;
13113 if (PL_regkind[op] == POSIXD) {
13115 op += NPOSIXD - POSIXD;
13120 ret = reg_node(pRExC_state, op);
13122 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13126 *flagp |= HASWIDTH|SIMPLE;
13128 else if (PL_regkind[op] == EXACT) {
13129 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13132 RExC_parse = (char *) cur_parse;
13134 SvREFCNT_dec(posixes);
13135 SvREFCNT_dec_NN(listsv);
13136 SvREFCNT_dec(cp_list);
13143 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13145 /* If folding, we calculate all characters that could fold to or from the
13146 * ones already on the list */
13147 if (FOLD && cp_list) {
13148 UV start, end; /* End points of code point ranges */
13150 SV* fold_intersection = NULL;
13152 /* If the highest code point is within Latin1, we can use the
13153 * compiled-in Alphas list, and not have to go out to disk. This
13154 * yields two false positives, the masculine and feminine ordinal
13155 * indicators, which are weeded out below using the
13156 * IS_IN_SOME_FOLD_L1() macro */
13157 if (invlist_highest(cp_list) < 256) {
13158 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13159 &fold_intersection);
13163 /* Here, there are non-Latin1 code points, so we will have to go
13164 * fetch the list of all the characters that participate in folds
13166 if (! PL_utf8_foldable) {
13167 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13168 &PL_sv_undef, 1, 0);
13169 PL_utf8_foldable = _get_swash_invlist(swash);
13170 SvREFCNT_dec_NN(swash);
13173 /* This is a hash that for a particular fold gives all characters
13174 * that are involved in it */
13175 if (! PL_utf8_foldclosures) {
13177 /* If we were unable to find any folds, then we likely won't be
13178 * able to find the closures. So just create an empty list.
13179 * Folding will effectively be restricted to the non-Unicode
13180 * rules hard-coded into Perl. (This case happens legitimately
13181 * during compilation of Perl itself before the Unicode tables
13182 * are generated) */
13183 if (_invlist_len(PL_utf8_foldable) == 0) {
13184 PL_utf8_foldclosures = newHV();
13187 /* If the folds haven't been read in, call a fold function
13189 if (! PL_utf8_tofold) {
13190 U8 dummy[UTF8_MAXBYTES+1];
13192 /* This string is just a short named one above \xff */
13193 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13194 assert(PL_utf8_tofold); /* Verify that worked */
13196 PL_utf8_foldclosures =
13197 _swash_inversion_hash(PL_utf8_tofold);
13201 /* Only the characters in this class that participate in folds need
13202 * be checked. Get the intersection of this class and all the
13203 * possible characters that are foldable. This can quickly narrow
13204 * down a large class */
13205 _invlist_intersection(PL_utf8_foldable, cp_list,
13206 &fold_intersection);
13209 /* Now look at the foldable characters in this class individually */
13210 invlist_iterinit(fold_intersection);
13211 while (invlist_iternext(fold_intersection, &start, &end)) {
13214 /* Locale folding for Latin1 characters is deferred until runtime */
13215 if (LOC && start < 256) {
13219 /* Look at every character in the range */
13220 for (j = start; j <= end; j++) {
13222 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13228 /* We have the latin1 folding rules hard-coded here so that
13229 * an innocent-looking character class, like /[ks]/i won't
13230 * have to go out to disk to find the possible matches.
13231 * XXX It would be better to generate these via regen, in
13232 * case a new version of the Unicode standard adds new
13233 * mappings, though that is not really likely, and may be
13234 * caught by the default: case of the switch below. */
13236 if (IS_IN_SOME_FOLD_L1(j)) {
13238 /* ASCII is always matched; non-ASCII is matched only
13239 * under Unicode rules */
13240 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13242 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13246 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13250 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13251 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13253 /* Certain Latin1 characters have matches outside
13254 * Latin1. To get here, <j> is one of those
13255 * characters. None of these matches is valid for
13256 * ASCII characters under /aa, which is why the 'if'
13257 * just above excludes those. These matches only
13258 * happen when the target string is utf8. The code
13259 * below adds the single fold closures for <j> to the
13260 * inversion list. */
13265 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13269 cp_list = add_cp_to_invlist(cp_list,
13270 LATIN_SMALL_LETTER_LONG_S);
13273 cp_list = add_cp_to_invlist(cp_list,
13274 GREEK_CAPITAL_LETTER_MU);
13275 cp_list = add_cp_to_invlist(cp_list,
13276 GREEK_SMALL_LETTER_MU);
13278 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13279 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13281 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13283 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13284 cp_list = add_cp_to_invlist(cp_list,
13285 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13287 case LATIN_SMALL_LETTER_SHARP_S:
13288 cp_list = add_cp_to_invlist(cp_list,
13289 LATIN_CAPITAL_LETTER_SHARP_S);
13291 case 'F': case 'f':
13292 case 'I': case 'i':
13293 case 'L': case 'l':
13294 case 'T': case 't':
13295 case 'A': case 'a':
13296 case 'H': case 'h':
13297 case 'J': case 'j':
13298 case 'N': case 'n':
13299 case 'W': case 'w':
13300 case 'Y': case 'y':
13301 /* These all are targets of multi-character
13302 * folds from code points that require UTF8 to
13303 * express, so they can't match unless the
13304 * target string is in UTF-8, so no action here
13305 * is necessary, as regexec.c properly handles
13306 * the general case for UTF-8 matching and
13307 * multi-char folds */
13310 /* Use deprecated warning to increase the
13311 * chances of this being output */
13312 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13319 /* Here is an above Latin1 character. We don't have the rules
13320 * hard-coded for it. First, get its fold. This is the simple
13321 * fold, as the multi-character folds have been handled earlier
13322 * and separated out */
13323 _to_uni_fold_flags(j, foldbuf, &foldlen,
13325 ? FOLD_FLAGS_LOCALE
13326 : (ASCII_FOLD_RESTRICTED)
13327 ? FOLD_FLAGS_NOMIX_ASCII
13330 /* Single character fold of above Latin1. Add everything in
13331 * its fold closure to the list that this node should match.
13332 * The fold closures data structure is a hash with the keys
13333 * being the UTF-8 of every character that is folded to, like
13334 * 'k', and the values each an array of all code points that
13335 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13336 * Multi-character folds are not included */
13337 if ((listp = hv_fetch(PL_utf8_foldclosures,
13338 (char *) foldbuf, foldlen, FALSE)))
13340 AV* list = (AV*) *listp;
13342 for (k = 0; k <= av_len(list); k++) {
13343 SV** c_p = av_fetch(list, k, FALSE);
13346 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13350 /* /aa doesn't allow folds between ASCII and non-; /l
13351 * doesn't allow them between above and below 256 */
13352 if ((ASCII_FOLD_RESTRICTED
13353 && (isASCII(c) != isASCII(j)))
13354 || (LOC && ((c < 256) != (j < 256))))
13359 /* Folds involving non-ascii Latin1 characters
13360 * under /d are added to a separate list */
13361 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13363 cp_list = add_cp_to_invlist(cp_list, c);
13366 depends_list = add_cp_to_invlist(depends_list, c);
13372 SvREFCNT_dec_NN(fold_intersection);
13375 /* And combine the result (if any) with any inversion list from posix
13376 * classes. The lists are kept separate up to now because we don't want to
13377 * fold the classes (folding of those is automatically handled by the swash
13378 * fetching code) */
13380 if (! DEPENDS_SEMANTICS) {
13382 _invlist_union(cp_list, posixes, &cp_list);
13383 SvREFCNT_dec_NN(posixes);
13390 /* Under /d, we put into a separate list the Latin1 things that
13391 * match only when the target string is utf8 */
13392 SV* nonascii_but_latin1_properties = NULL;
13393 _invlist_intersection(posixes, PL_Latin1,
13394 &nonascii_but_latin1_properties);
13395 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13396 &nonascii_but_latin1_properties);
13397 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13400 _invlist_union(cp_list, posixes, &cp_list);
13401 SvREFCNT_dec_NN(posixes);
13407 if (depends_list) {
13408 _invlist_union(depends_list, nonascii_but_latin1_properties,
13410 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13413 depends_list = nonascii_but_latin1_properties;
13418 /* And combine the result (if any) with any inversion list from properties.
13419 * The lists are kept separate up to now so that we can distinguish the two
13420 * in regards to matching above-Unicode. A run-time warning is generated
13421 * if a Unicode property is matched against a non-Unicode code point. But,
13422 * we allow user-defined properties to match anything, without any warning,
13423 * and we also suppress the warning if there is a portion of the character
13424 * class that isn't a Unicode property, and which matches above Unicode, \W
13425 * or [\x{110000}] for example.
13426 * (Note that in this case, unlike the Posix one above, there is no
13427 * <depends_list>, because having a Unicode property forces Unicode
13430 bool warn_super = ! has_user_defined_property;
13433 /* If it matters to the final outcome, see if a non-property
13434 * component of the class matches above Unicode. If so, the
13435 * warning gets suppressed. This is true even if just a single
13436 * such code point is specified, as though not strictly correct if
13437 * another such code point is matched against, the fact that they
13438 * are using above-Unicode code points indicates they should know
13439 * the issues involved */
13441 bool non_prop_matches_above_Unicode =
13442 runtime_posix_matches_above_Unicode
13443 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13445 non_prop_matches_above_Unicode =
13446 ! non_prop_matches_above_Unicode;
13448 warn_super = ! non_prop_matches_above_Unicode;
13451 _invlist_union(properties, cp_list, &cp_list);
13452 SvREFCNT_dec_NN(properties);
13455 cp_list = properties;
13459 OP(ret) = ANYOF_WARN_SUPER;
13463 /* Here, we have calculated what code points should be in the character
13466 * Now we can see about various optimizations. Fold calculation (which we
13467 * did above) needs to take place before inversion. Otherwise /[^k]/i
13468 * would invert to include K, which under /i would match k, which it
13469 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13470 * folded until runtime */
13472 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13473 * at compile time. Besides not inverting folded locale now, we can't
13474 * invert if there are things such as \w, which aren't known until runtime
13477 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13479 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13481 _invlist_invert(cp_list);
13483 /* Any swash can't be used as-is, because we've inverted things */
13485 SvREFCNT_dec_NN(swash);
13489 /* Clear the invert flag since have just done it here */
13494 *ret_invlist = cp_list;
13496 /* Discard the generated node */
13498 RExC_size = orig_size;
13501 RExC_emit = orig_emit;
13506 /* If we didn't do folding, it's because some information isn't available
13507 * until runtime; set the run-time fold flag for these. (We don't have to
13508 * worry about properties folding, as that is taken care of by the swash
13512 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13515 /* Some character classes are equivalent to other nodes. Such nodes take
13516 * up less room and generally fewer operations to execute than ANYOF nodes.
13517 * Above, we checked for and optimized into some such equivalents for
13518 * certain common classes that are easy to test. Getting to this point in
13519 * the code means that the class didn't get optimized there. Since this
13520 * code is only executed in Pass 2, it is too late to save space--it has
13521 * been allocated in Pass 1, and currently isn't given back. But turning
13522 * things into an EXACTish node can allow the optimizer to join it to any
13523 * adjacent such nodes. And if the class is equivalent to things like /./,
13524 * expensive run-time swashes can be avoided. Now that we have more
13525 * complete information, we can find things necessarily missed by the
13526 * earlier code. I (khw) am not sure how much to look for here. It would
13527 * be easy, but perhaps too slow, to check any candidates against all the
13528 * node types they could possibly match using _invlistEQ(). */
13533 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13534 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13537 U8 op = END; /* The optimzation node-type */
13538 const char * cur_parse= RExC_parse;
13540 invlist_iterinit(cp_list);
13541 if (! invlist_iternext(cp_list, &start, &end)) {
13543 /* Here, the list is empty. This happens, for example, when a
13544 * Unicode property is the only thing in the character class, and
13545 * it doesn't match anything. (perluniprops.pod notes such
13548 *flagp |= HASWIDTH|SIMPLE;
13550 else if (start == end) { /* The range is a single code point */
13551 if (! invlist_iternext(cp_list, &start, &end)
13553 /* Don't do this optimization if it would require changing
13554 * the pattern to UTF-8 */
13555 && (start < 256 || UTF))
13557 /* Here, the list contains a single code point. Can optimize
13558 * into an EXACT node */
13567 /* A locale node under folding with one code point can be
13568 * an EXACTFL, as its fold won't be calculated until
13574 /* Here, we are generally folding, but there is only one
13575 * code point to match. If we have to, we use an EXACT
13576 * node, but it would be better for joining with adjacent
13577 * nodes in the optimization pass if we used the same
13578 * EXACTFish node that any such are likely to be. We can
13579 * do this iff the code point doesn't participate in any
13580 * folds. For example, an EXACTF of a colon is the same as
13581 * an EXACT one, since nothing folds to or from a colon. */
13583 if (IS_IN_SOME_FOLD_L1(value)) {
13588 if (! PL_utf8_foldable) {
13589 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13590 &PL_sv_undef, 1, 0);
13591 PL_utf8_foldable = _get_swash_invlist(swash);
13592 SvREFCNT_dec_NN(swash);
13594 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13599 /* If we haven't found the node type, above, it means we
13600 * can use the prevailing one */
13602 op = compute_EXACTish(pRExC_state);
13607 else if (start == 0) {
13608 if (end == UV_MAX) {
13610 *flagp |= HASWIDTH|SIMPLE;
13613 else if (end == '\n' - 1
13614 && invlist_iternext(cp_list, &start, &end)
13615 && start == '\n' + 1 && end == UV_MAX)
13618 *flagp |= HASWIDTH|SIMPLE;
13622 invlist_iterfinish(cp_list);
13625 RExC_parse = (char *)orig_parse;
13626 RExC_emit = (regnode *)orig_emit;
13628 ret = reg_node(pRExC_state, op);
13630 RExC_parse = (char *)cur_parse;
13632 if (PL_regkind[op] == EXACT) {
13633 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13636 SvREFCNT_dec_NN(cp_list);
13637 SvREFCNT_dec_NN(listsv);
13642 /* Here, <cp_list> contains all the code points we can determine at
13643 * compile time that match under all conditions. Go through it, and
13644 * for things that belong in the bitmap, put them there, and delete from
13645 * <cp_list>. While we are at it, see if everything above 255 is in the
13646 * list, and if so, set a flag to speed up execution */
13647 ANYOF_BITMAP_ZERO(ret);
13650 /* This gets set if we actually need to modify things */
13651 bool change_invlist = FALSE;
13655 /* Start looking through <cp_list> */
13656 invlist_iterinit(cp_list);
13657 while (invlist_iternext(cp_list, &start, &end)) {
13661 if (end == UV_MAX && start <= 256) {
13662 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13665 /* Quit if are above what we should change */
13670 change_invlist = TRUE;
13672 /* Set all the bits in the range, up to the max that we are doing */
13673 high = (end < 255) ? end : 255;
13674 for (i = start; i <= (int) high; i++) {
13675 if (! ANYOF_BITMAP_TEST(ret, i)) {
13676 ANYOF_BITMAP_SET(ret, i);
13682 invlist_iterfinish(cp_list);
13684 /* Done with loop; remove any code points that are in the bitmap from
13686 if (change_invlist) {
13687 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13690 /* If have completely emptied it, remove it completely */
13691 if (_invlist_len(cp_list) == 0) {
13692 SvREFCNT_dec_NN(cp_list);
13698 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13701 /* Here, the bitmap has been populated with all the Latin1 code points that
13702 * always match. Can now add to the overall list those that match only
13703 * when the target string is UTF-8 (<depends_list>). */
13704 if (depends_list) {
13706 _invlist_union(cp_list, depends_list, &cp_list);
13707 SvREFCNT_dec_NN(depends_list);
13710 cp_list = depends_list;
13714 /* If there is a swash and more than one element, we can't use the swash in
13715 * the optimization below. */
13716 if (swash && element_count > 1) {
13717 SvREFCNT_dec_NN(swash);
13722 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13724 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13725 SvREFCNT_dec_NN(listsv);
13728 /* av[0] stores the character class description in its textual form:
13729 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13730 * appropriate swash, and is also useful for dumping the regnode.
13731 * av[1] if NULL, is a placeholder to later contain the swash computed
13732 * from av[0]. But if no further computation need be done, the
13733 * swash is stored there now.
13734 * av[2] stores the cp_list inversion list for use in addition or
13735 * instead of av[0]; used only if av[1] is NULL
13736 * av[3] is set if any component of the class is from a user-defined
13737 * property; used only if av[1] is NULL */
13738 AV * const av = newAV();
13741 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13743 : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
13745 av_store(av, 1, swash);
13746 SvREFCNT_dec_NN(cp_list);
13749 av_store(av, 1, NULL);
13751 av_store(av, 2, cp_list);
13752 av_store(av, 3, newSVuv(has_user_defined_property));
13756 rv = newRV_noinc(MUTABLE_SV(av));
13757 n = add_data(pRExC_state, 1, "s");
13758 RExC_rxi->data->data[n] = (void*)rv;
13762 *flagp |= HASWIDTH|SIMPLE;
13765 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13768 /* reg_skipcomment()
13770 Absorbs an /x style # comments from the input stream.
13771 Returns true if there is more text remaining in the stream.
13772 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13773 terminates the pattern without including a newline.
13775 Note its the callers responsibility to ensure that we are
13776 actually in /x mode
13781 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13785 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13787 while (RExC_parse < RExC_end)
13788 if (*RExC_parse++ == '\n') {
13793 /* we ran off the end of the pattern without ending
13794 the comment, so we have to add an \n when wrapping */
13795 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13803 Advances the parse position, and optionally absorbs
13804 "whitespace" from the inputstream.
13806 Without /x "whitespace" means (?#...) style comments only,
13807 with /x this means (?#...) and # comments and whitespace proper.
13809 Returns the RExC_parse point from BEFORE the scan occurs.
13811 This is the /x friendly way of saying RExC_parse++.
13815 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13817 char* const retval = RExC_parse++;
13819 PERL_ARGS_ASSERT_NEXTCHAR;
13822 if (RExC_end - RExC_parse >= 3
13823 && *RExC_parse == '('
13824 && RExC_parse[1] == '?'
13825 && RExC_parse[2] == '#')
13827 while (*RExC_parse != ')') {
13828 if (RExC_parse == RExC_end)
13829 FAIL("Sequence (?#... not terminated");
13835 if (RExC_flags & RXf_PMf_EXTENDED) {
13836 if (isSPACE(*RExC_parse)) {
13840 else if (*RExC_parse == '#') {
13841 if ( reg_skipcomment( pRExC_state ) )
13850 - reg_node - emit a node
13852 STATIC regnode * /* Location. */
13853 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13857 regnode * const ret = RExC_emit;
13858 GET_RE_DEBUG_FLAGS_DECL;
13860 PERL_ARGS_ASSERT_REG_NODE;
13863 SIZE_ALIGN(RExC_size);
13867 if (RExC_emit >= RExC_emit_bound)
13868 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13869 op, RExC_emit, RExC_emit_bound);
13871 NODE_ALIGN_FILL(ret);
13873 FILL_ADVANCE_NODE(ptr, op);
13874 #ifdef RE_TRACK_PATTERN_OFFSETS
13875 if (RExC_offsets) { /* MJD */
13876 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13877 "reg_node", __LINE__,
13879 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13880 ? "Overwriting end of array!\n" : "OK",
13881 (UV)(RExC_emit - RExC_emit_start),
13882 (UV)(RExC_parse - RExC_start),
13883 (UV)RExC_offsets[0]));
13884 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13892 - reganode - emit a node with an argument
13894 STATIC regnode * /* Location. */
13895 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13899 regnode * const ret = RExC_emit;
13900 GET_RE_DEBUG_FLAGS_DECL;
13902 PERL_ARGS_ASSERT_REGANODE;
13905 SIZE_ALIGN(RExC_size);
13910 assert(2==regarglen[op]+1);
13912 Anything larger than this has to allocate the extra amount.
13913 If we changed this to be:
13915 RExC_size += (1 + regarglen[op]);
13917 then it wouldn't matter. Its not clear what side effect
13918 might come from that so its not done so far.
13923 if (RExC_emit >= RExC_emit_bound)
13924 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13925 op, RExC_emit, RExC_emit_bound);
13927 NODE_ALIGN_FILL(ret);
13929 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13930 #ifdef RE_TRACK_PATTERN_OFFSETS
13931 if (RExC_offsets) { /* MJD */
13932 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13936 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13937 "Overwriting end of array!\n" : "OK",
13938 (UV)(RExC_emit - RExC_emit_start),
13939 (UV)(RExC_parse - RExC_start),
13940 (UV)RExC_offsets[0]));
13941 Set_Cur_Node_Offset;
13949 - reguni - emit (if appropriate) a Unicode character
13952 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13956 PERL_ARGS_ASSERT_REGUNI;
13958 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13962 - reginsert - insert an operator in front of already-emitted operand
13964 * Means relocating the operand.
13967 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13973 const int offset = regarglen[(U8)op];
13974 const int size = NODE_STEP_REGNODE + offset;
13975 GET_RE_DEBUG_FLAGS_DECL;
13977 PERL_ARGS_ASSERT_REGINSERT;
13978 PERL_UNUSED_ARG(depth);
13979 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13980 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13989 if (RExC_open_parens) {
13991 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13992 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13993 if ( RExC_open_parens[paren] >= opnd ) {
13994 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13995 RExC_open_parens[paren] += size;
13997 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13999 if ( RExC_close_parens[paren] >= opnd ) {
14000 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14001 RExC_close_parens[paren] += size;
14003 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14008 while (src > opnd) {
14009 StructCopy(--src, --dst, regnode);
14010 #ifdef RE_TRACK_PATTERN_OFFSETS
14011 if (RExC_offsets) { /* MJD 20010112 */
14012 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14016 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14017 ? "Overwriting end of array!\n" : "OK",
14018 (UV)(src - RExC_emit_start),
14019 (UV)(dst - RExC_emit_start),
14020 (UV)RExC_offsets[0]));
14021 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14022 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14028 place = opnd; /* Op node, where operand used to be. */
14029 #ifdef RE_TRACK_PATTERN_OFFSETS
14030 if (RExC_offsets) { /* MJD */
14031 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14035 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14036 ? "Overwriting end of array!\n" : "OK",
14037 (UV)(place - RExC_emit_start),
14038 (UV)(RExC_parse - RExC_start),
14039 (UV)RExC_offsets[0]));
14040 Set_Node_Offset(place, RExC_parse);
14041 Set_Node_Length(place, 1);
14044 src = NEXTOPER(place);
14045 FILL_ADVANCE_NODE(place, op);
14046 Zero(src, offset, regnode);
14050 - regtail - set the next-pointer at the end of a node chain of p to val.
14051 - SEE ALSO: regtail_study
14053 /* TODO: All three parms should be const */
14055 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14059 GET_RE_DEBUG_FLAGS_DECL;
14061 PERL_ARGS_ASSERT_REGTAIL;
14063 PERL_UNUSED_ARG(depth);
14069 /* Find last node. */
14072 regnode * const temp = regnext(scan);
14074 SV * const mysv=sv_newmortal();
14075 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14076 regprop(RExC_rx, mysv, scan);
14077 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14078 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14079 (temp == NULL ? "->" : ""),
14080 (temp == NULL ? PL_reg_name[OP(val)] : "")
14088 if (reg_off_by_arg[OP(scan)]) {
14089 ARG_SET(scan, val - scan);
14092 NEXT_OFF(scan) = val - scan;
14098 - regtail_study - set the next-pointer at the end of a node chain of p to val.
14099 - Look for optimizable sequences at the same time.
14100 - currently only looks for EXACT chains.
14102 This is experimental code. The idea is to use this routine to perform
14103 in place optimizations on branches and groups as they are constructed,
14104 with the long term intention of removing optimization from study_chunk so
14105 that it is purely analytical.
14107 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14108 to control which is which.
14111 /* TODO: All four parms should be const */
14114 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14119 #ifdef EXPERIMENTAL_INPLACESCAN
14122 GET_RE_DEBUG_FLAGS_DECL;
14124 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14130 /* Find last node. */
14134 regnode * const temp = regnext(scan);
14135 #ifdef EXPERIMENTAL_INPLACESCAN
14136 if (PL_regkind[OP(scan)] == EXACT) {
14137 bool has_exactf_sharp_s; /* Unexamined in this routine */
14138 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14143 switch (OP(scan)) {
14149 case EXACTFU_TRICKYFOLD:
14151 if( exact == PSEUDO )
14153 else if ( exact != OP(scan) )
14162 SV * const mysv=sv_newmortal();
14163 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14164 regprop(RExC_rx, mysv, scan);
14165 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14166 SvPV_nolen_const(mysv),
14167 REG_NODE_NUM(scan),
14168 PL_reg_name[exact]);
14175 SV * const mysv_val=sv_newmortal();
14176 DEBUG_PARSE_MSG("");
14177 regprop(RExC_rx, mysv_val, val);
14178 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14179 SvPV_nolen_const(mysv_val),
14180 (IV)REG_NODE_NUM(val),
14184 if (reg_off_by_arg[OP(scan)]) {
14185 ARG_SET(scan, val - scan);
14188 NEXT_OFF(scan) = val - scan;
14196 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14200 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14206 for (bit=0; bit<32; bit++) {
14207 if (flags & (1<<bit)) {
14208 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14211 if (!set++ && lead)
14212 PerlIO_printf(Perl_debug_log, "%s",lead);
14213 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14216 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14217 if (!set++ && lead) {
14218 PerlIO_printf(Perl_debug_log, "%s",lead);
14221 case REGEX_UNICODE_CHARSET:
14222 PerlIO_printf(Perl_debug_log, "UNICODE");
14224 case REGEX_LOCALE_CHARSET:
14225 PerlIO_printf(Perl_debug_log, "LOCALE");
14227 case REGEX_ASCII_RESTRICTED_CHARSET:
14228 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14230 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14231 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14234 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14240 PerlIO_printf(Perl_debug_log, "\n");
14242 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14248 Perl_regdump(pTHX_ const regexp *r)
14252 SV * const sv = sv_newmortal();
14253 SV *dsv= sv_newmortal();
14254 RXi_GET_DECL(r,ri);
14255 GET_RE_DEBUG_FLAGS_DECL;
14257 PERL_ARGS_ASSERT_REGDUMP;
14259 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14261 /* Header fields of interest. */
14262 if (r->anchored_substr) {
14263 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14264 RE_SV_DUMPLEN(r->anchored_substr), 30);
14265 PerlIO_printf(Perl_debug_log,
14266 "anchored %s%s at %"IVdf" ",
14267 s, RE_SV_TAIL(r->anchored_substr),
14268 (IV)r->anchored_offset);
14269 } else if (r->anchored_utf8) {
14270 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14271 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14272 PerlIO_printf(Perl_debug_log,
14273 "anchored utf8 %s%s at %"IVdf" ",
14274 s, RE_SV_TAIL(r->anchored_utf8),
14275 (IV)r->anchored_offset);
14277 if (r->float_substr) {
14278 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14279 RE_SV_DUMPLEN(r->float_substr), 30);
14280 PerlIO_printf(Perl_debug_log,
14281 "floating %s%s at %"IVdf"..%"UVuf" ",
14282 s, RE_SV_TAIL(r->float_substr),
14283 (IV)r->float_min_offset, (UV)r->float_max_offset);
14284 } else if (r->float_utf8) {
14285 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14286 RE_SV_DUMPLEN(r->float_utf8), 30);
14287 PerlIO_printf(Perl_debug_log,
14288 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14289 s, RE_SV_TAIL(r->float_utf8),
14290 (IV)r->float_min_offset, (UV)r->float_max_offset);
14292 if (r->check_substr || r->check_utf8)
14293 PerlIO_printf(Perl_debug_log,
14295 (r->check_substr == r->float_substr
14296 && r->check_utf8 == r->float_utf8
14297 ? "(checking floating" : "(checking anchored"));
14298 if (r->extflags & RXf_NOSCAN)
14299 PerlIO_printf(Perl_debug_log, " noscan");
14300 if (r->extflags & RXf_CHECK_ALL)
14301 PerlIO_printf(Perl_debug_log, " isall");
14302 if (r->check_substr || r->check_utf8)
14303 PerlIO_printf(Perl_debug_log, ") ");
14305 if (ri->regstclass) {
14306 regprop(r, sv, ri->regstclass);
14307 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14309 if (r->extflags & RXf_ANCH) {
14310 PerlIO_printf(Perl_debug_log, "anchored");
14311 if (r->extflags & RXf_ANCH_BOL)
14312 PerlIO_printf(Perl_debug_log, "(BOL)");
14313 if (r->extflags & RXf_ANCH_MBOL)
14314 PerlIO_printf(Perl_debug_log, "(MBOL)");
14315 if (r->extflags & RXf_ANCH_SBOL)
14316 PerlIO_printf(Perl_debug_log, "(SBOL)");
14317 if (r->extflags & RXf_ANCH_GPOS)
14318 PerlIO_printf(Perl_debug_log, "(GPOS)");
14319 PerlIO_putc(Perl_debug_log, ' ');
14321 if (r->extflags & RXf_GPOS_SEEN)
14322 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14323 if (r->intflags & PREGf_SKIP)
14324 PerlIO_printf(Perl_debug_log, "plus ");
14325 if (r->intflags & PREGf_IMPLICIT)
14326 PerlIO_printf(Perl_debug_log, "implicit ");
14327 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14328 if (r->extflags & RXf_EVAL_SEEN)
14329 PerlIO_printf(Perl_debug_log, "with eval ");
14330 PerlIO_printf(Perl_debug_log, "\n");
14331 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
14333 PERL_ARGS_ASSERT_REGDUMP;
14334 PERL_UNUSED_CONTEXT;
14335 PERL_UNUSED_ARG(r);
14336 #endif /* DEBUGGING */
14340 - regprop - printable representation of opcode
14342 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14345 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14346 if (flags & ANYOF_INVERT) \
14347 /*make sure the invert info is in each */ \
14348 sv_catpvs(sv, "^"); \
14354 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14360 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14361 static const char * const anyofs[] = {
14362 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14363 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14364 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14365 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14366 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14367 || _CC_VERTSPACE != 16
14368 #error Need to adjust order of anyofs[]
14405 RXi_GET_DECL(prog,progi);
14406 GET_RE_DEBUG_FLAGS_DECL;
14408 PERL_ARGS_ASSERT_REGPROP;
14412 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14413 /* It would be nice to FAIL() here, but this may be called from
14414 regexec.c, and it would be hard to supply pRExC_state. */
14415 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14416 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14418 k = PL_regkind[OP(o)];
14421 sv_catpvs(sv, " ");
14422 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14423 * is a crude hack but it may be the best for now since
14424 * we have no flag "this EXACTish node was UTF-8"
14426 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14427 PERL_PV_ESCAPE_UNI_DETECT |
14428 PERL_PV_ESCAPE_NONASCII |
14429 PERL_PV_PRETTY_ELLIPSES |
14430 PERL_PV_PRETTY_LTGT |
14431 PERL_PV_PRETTY_NOCLEAR
14433 } else if (k == TRIE) {
14434 /* print the details of the trie in dumpuntil instead, as
14435 * progi->data isn't available here */
14436 const char op = OP(o);
14437 const U32 n = ARG(o);
14438 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14439 (reg_ac_data *)progi->data->data[n] :
14441 const reg_trie_data * const trie
14442 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14444 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14445 DEBUG_TRIE_COMPILE_r(
14446 Perl_sv_catpvf(aTHX_ sv,
14447 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14448 (UV)trie->startstate,
14449 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14450 (UV)trie->wordcount,
14453 (UV)TRIE_CHARCOUNT(trie),
14454 (UV)trie->uniquecharcount
14457 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14459 int rangestart = -1;
14460 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
14461 sv_catpvs(sv, "[");
14462 for (i = 0; i <= 256; i++) {
14463 if (i < 256 && BITMAP_TEST(bitmap,i)) {
14464 if (rangestart == -1)
14466 } else if (rangestart != -1) {
14467 if (i <= rangestart + 3)
14468 for (; rangestart < i; rangestart++)
14469 put_byte(sv, rangestart);
14471 put_byte(sv, rangestart);
14472 sv_catpvs(sv, "-");
14473 put_byte(sv, i - 1);
14478 sv_catpvs(sv, "]");
14481 } else if (k == CURLY) {
14482 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14483 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14484 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14486 else if (k == WHILEM && o->flags) /* Ordinal/of */
14487 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14488 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14489 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14490 if ( RXp_PAREN_NAMES(prog) ) {
14491 if ( k != REF || (OP(o) < NREF)) {
14492 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14493 SV **name= av_fetch(list, ARG(o), 0 );
14495 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14498 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14499 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14500 I32 *nums=(I32*)SvPVX(sv_dat);
14501 SV **name= av_fetch(list, nums[0], 0 );
14504 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14505 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14506 (n ? "," : ""), (IV)nums[n]);
14508 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14512 } else if (k == GOSUB)
14513 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14514 else if (k == VERB) {
14516 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14517 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14518 } else if (k == LOGICAL)
14519 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14520 else if (k == ANYOF) {
14521 int i, rangestart = -1;
14522 const U8 flags = ANYOF_FLAGS(o);
14526 if (flags & ANYOF_LOCALE)
14527 sv_catpvs(sv, "{loc}");
14528 if (flags & ANYOF_LOC_FOLD)
14529 sv_catpvs(sv, "{i}");
14530 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14531 if (flags & ANYOF_INVERT)
14532 sv_catpvs(sv, "^");
14534 /* output what the standard cp 0-255 bitmap matches */
14535 for (i = 0; i <= 256; i++) {
14536 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
14537 if (rangestart == -1)
14539 } else if (rangestart != -1) {
14540 if (i <= rangestart + 3)
14541 for (; rangestart < i; rangestart++)
14542 put_byte(sv, rangestart);
14544 put_byte(sv, rangestart);
14545 sv_catpvs(sv, "-");
14546 put_byte(sv, i - 1);
14553 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14554 /* output any special charclass tests (used entirely under use locale) */
14555 if (ANYOF_CLASS_TEST_ANY_SET(o))
14556 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14557 if (ANYOF_CLASS_TEST(o,i)) {
14558 sv_catpv(sv, anyofs[i]);
14562 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14564 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14565 sv_catpvs(sv, "{non-utf8-latin1-all}");
14568 /* output information about the unicode matching */
14569 if (flags & ANYOF_UNICODE_ALL)
14570 sv_catpvs(sv, "{unicode_all}");
14571 else if (ANYOF_NONBITMAP(o))
14572 sv_catpvs(sv, "{unicode}");
14573 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14574 sv_catpvs(sv, "{outside bitmap}");
14576 if (ANYOF_NONBITMAP(o)) {
14577 SV *lv; /* Set if there is something outside the bit map */
14578 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14579 bool byte_output = FALSE; /* If something in the bitmap has been
14582 if (lv && lv != &PL_sv_undef) {
14584 U8 s[UTF8_MAXBYTES_CASE+1];
14586 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14587 uvchr_to_utf8(s, i);
14590 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14594 && swash_fetch(sw, s, TRUE))
14596 if (rangestart == -1)
14598 } else if (rangestart != -1) {
14599 byte_output = TRUE;
14600 if (i <= rangestart + 3)
14601 for (; rangestart < i; rangestart++) {
14602 put_byte(sv, rangestart);
14605 put_byte(sv, rangestart);
14606 sv_catpvs(sv, "-");
14615 char *s = savesvpv(lv);
14616 char * const origs = s;
14618 while (*s && *s != '\n')
14622 const char * const t = ++s;
14625 sv_catpvs(sv, " ");
14631 /* Truncate very long output */
14632 if (s - origs > 256) {
14633 Perl_sv_catpvf(aTHX_ sv,
14635 (int) (s - origs - 1),
14641 else if (*s == '\t') {
14656 SvREFCNT_dec_NN(lv);
14660 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14662 else if (k == POSIXD || k == NPOSIXD) {
14663 U8 index = FLAGS(o) * 2;
14664 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14665 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14668 sv_catpv(sv, anyofs[index]);
14671 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14672 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14674 PERL_UNUSED_CONTEXT;
14675 PERL_UNUSED_ARG(sv);
14676 PERL_UNUSED_ARG(o);
14677 PERL_UNUSED_ARG(prog);
14678 #endif /* DEBUGGING */
14682 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14683 { /* Assume that RE_INTUIT is set */
14685 struct regexp *const prog = ReANY(r);
14686 GET_RE_DEBUG_FLAGS_DECL;
14688 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14689 PERL_UNUSED_CONTEXT;
14693 const char * const s = SvPV_nolen_const(prog->check_substr
14694 ? prog->check_substr : prog->check_utf8);
14696 if (!PL_colorset) reginitcolors();
14697 PerlIO_printf(Perl_debug_log,
14698 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14700 prog->check_substr ? "" : "utf8 ",
14701 PL_colors[5],PL_colors[0],
14704 (strlen(s) > 60 ? "..." : ""));
14707 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14713 handles refcounting and freeing the perl core regexp structure. When
14714 it is necessary to actually free the structure the first thing it
14715 does is call the 'free' method of the regexp_engine associated to
14716 the regexp, allowing the handling of the void *pprivate; member
14717 first. (This routine is not overridable by extensions, which is why
14718 the extensions free is called first.)
14720 See regdupe and regdupe_internal if you change anything here.
14722 #ifndef PERL_IN_XSUB_RE
14724 Perl_pregfree(pTHX_ REGEXP *r)
14730 Perl_pregfree2(pTHX_ REGEXP *rx)
14733 struct regexp *const r = ReANY(rx);
14734 GET_RE_DEBUG_FLAGS_DECL;
14736 PERL_ARGS_ASSERT_PREGFREE2;
14738 if (r->mother_re) {
14739 ReREFCNT_dec(r->mother_re);
14741 CALLREGFREE_PVT(rx); /* free the private data */
14742 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14743 Safefree(r->xpv_len_u.xpvlenu_pv);
14746 SvREFCNT_dec(r->anchored_substr);
14747 SvREFCNT_dec(r->anchored_utf8);
14748 SvREFCNT_dec(r->float_substr);
14749 SvREFCNT_dec(r->float_utf8);
14750 Safefree(r->substrs);
14752 RX_MATCH_COPY_FREE(rx);
14753 #ifdef PERL_ANY_COW
14754 SvREFCNT_dec(r->saved_copy);
14757 SvREFCNT_dec(r->qr_anoncv);
14758 rx->sv_u.svu_rx = 0;
14763 This is a hacky workaround to the structural issue of match results
14764 being stored in the regexp structure which is in turn stored in
14765 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14766 could be PL_curpm in multiple contexts, and could require multiple
14767 result sets being associated with the pattern simultaneously, such
14768 as when doing a recursive match with (??{$qr})
14770 The solution is to make a lightweight copy of the regexp structure
14771 when a qr// is returned from the code executed by (??{$qr}) this
14772 lightweight copy doesn't actually own any of its data except for
14773 the starp/end and the actual regexp structure itself.
14779 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14781 struct regexp *ret;
14782 struct regexp *const r = ReANY(rx);
14783 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14785 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14788 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14790 SvOK_off((SV *)ret_x);
14792 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14793 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14794 made both spots point to the same regexp body.) */
14795 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14796 assert(!SvPVX(ret_x));
14797 ret_x->sv_u.svu_rx = temp->sv_any;
14798 temp->sv_any = NULL;
14799 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14800 SvREFCNT_dec_NN(temp);
14801 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14802 ing below will not set it. */
14803 SvCUR_set(ret_x, SvCUR(rx));
14806 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14807 sv_force_normal(sv) is called. */
14809 ret = ReANY(ret_x);
14811 SvFLAGS(ret_x) |= SvUTF8(rx);
14812 /* We share the same string buffer as the original regexp, on which we
14813 hold a reference count, incremented when mother_re is set below.
14814 The string pointer is copied here, being part of the regexp struct.
14816 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14817 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14819 const I32 npar = r->nparens+1;
14820 Newx(ret->offs, npar, regexp_paren_pair);
14821 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14824 Newx(ret->substrs, 1, struct reg_substr_data);
14825 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14827 SvREFCNT_inc_void(ret->anchored_substr);
14828 SvREFCNT_inc_void(ret->anchored_utf8);
14829 SvREFCNT_inc_void(ret->float_substr);
14830 SvREFCNT_inc_void(ret->float_utf8);
14832 /* check_substr and check_utf8, if non-NULL, point to either their
14833 anchored or float namesakes, and don't hold a second reference. */
14835 RX_MATCH_COPIED_off(ret_x);
14836 #ifdef PERL_ANY_COW
14837 ret->saved_copy = NULL;
14839 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14840 SvREFCNT_inc_void(ret->qr_anoncv);
14846 /* regfree_internal()
14848 Free the private data in a regexp. This is overloadable by
14849 extensions. Perl takes care of the regexp structure in pregfree(),
14850 this covers the *pprivate pointer which technically perl doesn't
14851 know about, however of course we have to handle the
14852 regexp_internal structure when no extension is in use.
14854 Note this is called before freeing anything in the regexp
14859 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14862 struct regexp *const r = ReANY(rx);
14863 RXi_GET_DECL(r,ri);
14864 GET_RE_DEBUG_FLAGS_DECL;
14866 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14872 SV *dsv= sv_newmortal();
14873 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14874 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14875 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14876 PL_colors[4],PL_colors[5],s);
14879 #ifdef RE_TRACK_PATTERN_OFFSETS
14881 Safefree(ri->u.offsets); /* 20010421 MJD */
14883 if (ri->code_blocks) {
14885 for (n = 0; n < ri->num_code_blocks; n++)
14886 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14887 Safefree(ri->code_blocks);
14891 int n = ri->data->count;
14894 /* If you add a ->what type here, update the comment in regcomp.h */
14895 switch (ri->data->what[n]) {
14901 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14904 Safefree(ri->data->data[n]);
14910 { /* Aho Corasick add-on structure for a trie node.
14911 Used in stclass optimization only */
14913 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14915 refcount = --aho->refcount;
14918 PerlMemShared_free(aho->states);
14919 PerlMemShared_free(aho->fail);
14920 /* do this last!!!! */
14921 PerlMemShared_free(ri->data->data[n]);
14922 PerlMemShared_free(ri->regstclass);
14928 /* trie structure. */
14930 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14932 refcount = --trie->refcount;
14935 PerlMemShared_free(trie->charmap);
14936 PerlMemShared_free(trie->states);
14937 PerlMemShared_free(trie->trans);
14939 PerlMemShared_free(trie->bitmap);
14941 PerlMemShared_free(trie->jump);
14942 PerlMemShared_free(trie->wordinfo);
14943 /* do this last!!!! */
14944 PerlMemShared_free(ri->data->data[n]);
14949 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14952 Safefree(ri->data->what);
14953 Safefree(ri->data);
14959 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14960 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14961 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14964 re_dup - duplicate a regexp.
14966 This routine is expected to clone a given regexp structure. It is only
14967 compiled under USE_ITHREADS.
14969 After all of the core data stored in struct regexp is duplicated
14970 the regexp_engine.dupe method is used to copy any private data
14971 stored in the *pprivate pointer. This allows extensions to handle
14972 any duplication it needs to do.
14974 See pregfree() and regfree_internal() if you change anything here.
14976 #if defined(USE_ITHREADS)
14977 #ifndef PERL_IN_XSUB_RE
14979 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14983 const struct regexp *r = ReANY(sstr);
14984 struct regexp *ret = ReANY(dstr);
14986 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14988 npar = r->nparens+1;
14989 Newx(ret->offs, npar, regexp_paren_pair);
14990 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14992 if (ret->substrs) {
14993 /* Do it this way to avoid reading from *r after the StructCopy().
14994 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14995 cache, it doesn't matter. */
14996 const bool anchored = r->check_substr
14997 ? r->check_substr == r->anchored_substr
14998 : r->check_utf8 == r->anchored_utf8;
14999 Newx(ret->substrs, 1, struct reg_substr_data);
15000 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15002 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15003 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15004 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15005 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15007 /* check_substr and check_utf8, if non-NULL, point to either their
15008 anchored or float namesakes, and don't hold a second reference. */
15010 if (ret->check_substr) {
15012 assert(r->check_utf8 == r->anchored_utf8);
15013 ret->check_substr = ret->anchored_substr;
15014 ret->check_utf8 = ret->anchored_utf8;
15016 assert(r->check_substr == r->float_substr);
15017 assert(r->check_utf8 == r->float_utf8);
15018 ret->check_substr = ret->float_substr;
15019 ret->check_utf8 = ret->float_utf8;
15021 } else if (ret->check_utf8) {
15023 ret->check_utf8 = ret->anchored_utf8;
15025 ret->check_utf8 = ret->float_utf8;
15030 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15031 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15034 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15036 if (RX_MATCH_COPIED(dstr))
15037 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15039 ret->subbeg = NULL;
15040 #ifdef PERL_ANY_COW
15041 ret->saved_copy = NULL;
15044 /* Whether mother_re be set or no, we need to copy the string. We
15045 cannot refrain from copying it when the storage points directly to
15046 our mother regexp, because that's
15047 1: a buffer in a different thread
15048 2: something we no longer hold a reference on
15049 so we need to copy it locally. */
15050 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15051 ret->mother_re = NULL;
15054 #endif /* PERL_IN_XSUB_RE */
15059 This is the internal complement to regdupe() which is used to copy
15060 the structure pointed to by the *pprivate pointer in the regexp.
15061 This is the core version of the extension overridable cloning hook.
15062 The regexp structure being duplicated will be copied by perl prior
15063 to this and will be provided as the regexp *r argument, however
15064 with the /old/ structures pprivate pointer value. Thus this routine
15065 may override any copying normally done by perl.
15067 It returns a pointer to the new regexp_internal structure.
15071 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15074 struct regexp *const r = ReANY(rx);
15075 regexp_internal *reti;
15077 RXi_GET_DECL(r,ri);
15079 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15083 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15084 Copy(ri->program, reti->program, len+1, regnode);
15086 reti->num_code_blocks = ri->num_code_blocks;
15087 if (ri->code_blocks) {
15089 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15090 struct reg_code_block);
15091 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15092 struct reg_code_block);
15093 for (n = 0; n < ri->num_code_blocks; n++)
15094 reti->code_blocks[n].src_regex = (REGEXP*)
15095 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15098 reti->code_blocks = NULL;
15100 reti->regstclass = NULL;
15103 struct reg_data *d;
15104 const int count = ri->data->count;
15107 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15108 char, struct reg_data);
15109 Newx(d->what, count, U8);
15112 for (i = 0; i < count; i++) {
15113 d->what[i] = ri->data->what[i];
15114 switch (d->what[i]) {
15115 /* see also regcomp.h and regfree_internal() */
15116 case 'a': /* actually an AV, but the dup function is identical. */
15120 case 'u': /* actually an HV, but the dup function is identical. */
15121 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15124 /* This is cheating. */
15125 Newx(d->data[i], 1, struct regnode_charclass_class);
15126 StructCopy(ri->data->data[i], d->data[i],
15127 struct regnode_charclass_class);
15128 reti->regstclass = (regnode*)d->data[i];
15131 /* Trie stclasses are readonly and can thus be shared
15132 * without duplication. We free the stclass in pregfree
15133 * when the corresponding reg_ac_data struct is freed.
15135 reti->regstclass= ri->regstclass;
15139 ((reg_trie_data*)ri->data->data[i])->refcount++;
15144 d->data[i] = ri->data->data[i];
15147 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15156 reti->name_list_idx = ri->name_list_idx;
15158 #ifdef RE_TRACK_PATTERN_OFFSETS
15159 if (ri->u.offsets) {
15160 Newx(reti->u.offsets, 2*len+1, U32);
15161 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15164 SetProgLen(reti,len);
15167 return (void*)reti;
15170 #endif /* USE_ITHREADS */
15172 #ifndef PERL_IN_XSUB_RE
15175 - regnext - dig the "next" pointer out of a node
15178 Perl_regnext(pTHX_ regnode *p)
15186 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15187 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15190 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15199 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15202 STRLEN l1 = strlen(pat1);
15203 STRLEN l2 = strlen(pat2);
15206 const char *message;
15208 PERL_ARGS_ASSERT_RE_CROAK2;
15214 Copy(pat1, buf, l1 , char);
15215 Copy(pat2, buf + l1, l2 , char);
15216 buf[l1 + l2] = '\n';
15217 buf[l1 + l2 + 1] = '\0';
15219 /* ANSI variant takes additional second argument */
15220 va_start(args, pat2);
15224 msv = vmess(buf, &args);
15226 message = SvPV_const(msv,l1);
15229 Copy(message, buf, l1 , char);
15230 buf[l1-1] = '\0'; /* Overwrite \n */
15231 Perl_croak(aTHX_ "%s", buf);
15234 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15236 #ifndef PERL_IN_XSUB_RE
15238 Perl_save_re_context(pTHX)
15242 struct re_save_state *state;
15244 SAVEVPTR(PL_curcop);
15245 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
15247 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
15248 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
15249 SSPUSHUV(SAVEt_RE_STATE);
15251 Copy(&PL_reg_state, state, 1, struct re_save_state);
15253 PL_reg_oldsaved = NULL;
15254 PL_reg_oldsavedlen = 0;
15255 PL_reg_oldsavedoffset = 0;
15256 PL_reg_oldsavedcoffset = 0;
15257 PL_reg_maxiter = 0;
15258 PL_reg_leftiter = 0;
15259 PL_reg_poscache = NULL;
15260 PL_reg_poscache_size = 0;
15261 #ifdef PERL_ANY_COW
15265 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15267 const REGEXP * const rx = PM_GETRE(PL_curpm);
15270 for (i = 1; i <= RX_NPARENS(rx); i++) {
15271 char digits[TYPE_CHARS(long)];
15272 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15273 GV *const *const gvp
15274 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15277 GV * const gv = *gvp;
15278 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15290 S_put_byte(pTHX_ SV *sv, int c)
15292 PERL_ARGS_ASSERT_PUT_BYTE;
15294 /* Our definition of isPRINT() ignores locales, so only bytes that are
15295 not part of UTF-8 are considered printable. I assume that the same
15296 holds for UTF-EBCDIC.
15297 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15298 which Wikipedia says:
15300 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15301 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15302 identical, to the ASCII delete (DEL) or rubout control character. ...
15303 it is typically mapped to hexadecimal code 9F, in order to provide a
15304 unique character mapping in both directions)
15306 So the old condition can be simplified to !isPRINT(c) */
15309 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
15312 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15316 const char string = c;
15317 if (c == '-' || c == ']' || c == '\\' || c == '^')
15318 sv_catpvs(sv, "\\");
15319 sv_catpvn(sv, &string, 1);
15324 #define CLEAR_OPTSTART \
15325 if (optstart) STMT_START { \
15326 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15330 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15332 STATIC const regnode *
15333 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15334 const regnode *last, const regnode *plast,
15335 SV* sv, I32 indent, U32 depth)
15338 U8 op = PSEUDO; /* Arbitrary non-END op. */
15339 const regnode *next;
15340 const regnode *optstart= NULL;
15342 RXi_GET_DECL(r,ri);
15343 GET_RE_DEBUG_FLAGS_DECL;
15345 PERL_ARGS_ASSERT_DUMPUNTIL;
15347 #ifdef DEBUG_DUMPUNTIL
15348 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15349 last ? last-start : 0,plast ? plast-start : 0);
15352 if (plast && plast < last)
15355 while (PL_regkind[op] != END && (!last || node < last)) {
15356 /* While that wasn't END last time... */
15359 if (op == CLOSE || op == WHILEM)
15361 next = regnext((regnode *)node);
15364 if (OP(node) == OPTIMIZED) {
15365 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15372 regprop(r, sv, node);
15373 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15374 (int)(2*indent + 1), "", SvPVX_const(sv));
15376 if (OP(node) != OPTIMIZED) {
15377 if (next == NULL) /* Next ptr. */
15378 PerlIO_printf(Perl_debug_log, " (0)");
15379 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15380 PerlIO_printf(Perl_debug_log, " (FAIL)");
15382 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15383 (void)PerlIO_putc(Perl_debug_log, '\n');
15387 if (PL_regkind[(U8)op] == BRANCHJ) {
15390 const regnode *nnode = (OP(next) == LONGJMP
15391 ? regnext((regnode *)next)
15393 if (last && nnode > last)
15395 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15398 else if (PL_regkind[(U8)op] == BRANCH) {
15400 DUMPUNTIL(NEXTOPER(node), next);
15402 else if ( PL_regkind[(U8)op] == TRIE ) {
15403 const regnode *this_trie = node;
15404 const char op = OP(node);
15405 const U32 n = ARG(node);
15406 const reg_ac_data * const ac = op>=AHOCORASICK ?
15407 (reg_ac_data *)ri->data->data[n] :
15409 const reg_trie_data * const trie =
15410 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15412 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15414 const regnode *nextbranch= NULL;
15417 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15418 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15420 PerlIO_printf(Perl_debug_log, "%*s%s ",
15421 (int)(2*(indent+3)), "",
15422 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15423 PL_colors[0], PL_colors[1],
15424 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15425 PERL_PV_PRETTY_ELLIPSES |
15426 PERL_PV_PRETTY_LTGT
15431 U16 dist= trie->jump[word_idx+1];
15432 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15433 (UV)((dist ? this_trie + dist : next) - start));
15436 nextbranch= this_trie + trie->jump[0];
15437 DUMPUNTIL(this_trie + dist, nextbranch);
15439 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15440 nextbranch= regnext((regnode *)nextbranch);
15442 PerlIO_printf(Perl_debug_log, "\n");
15445 if (last && next > last)
15450 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15451 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15452 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15454 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15456 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15458 else if ( op == PLUS || op == STAR) {
15459 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15461 else if (PL_regkind[(U8)op] == ANYOF) {
15462 /* arglen 1 + class block */
15463 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15464 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15465 node = NEXTOPER(node);
15467 else if (PL_regkind[(U8)op] == EXACT) {
15468 /* Literal string, where present. */
15469 node += NODE_SZ_STR(node) - 1;
15470 node = NEXTOPER(node);
15473 node = NEXTOPER(node);
15474 node += regarglen[(U8)op];
15476 if (op == CURLYX || op == OPEN)
15480 #ifdef DEBUG_DUMPUNTIL
15481 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15486 #endif /* DEBUGGING */
15490 * c-indentation-style: bsd
15491 * c-basic-offset: 4
15492 * indent-tabs-mode: nil
15495 * ex: set ts=8 sts=4 sw=4 et: