5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
103 # if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105 # pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107 # pragma optimize("w",on )
108 # endif /* BUGGY_MSC6 */
112 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 I32 whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* Capture buffer count, (OPEN). */
135 I32 cpar; /* Capture buffer count, (CLOSE). */
136 I32 nestroot; /* root parens we are in - used by accept */
139 regnode **open_parens; /* pointers to open parens */
140 regnode **close_parens; /* pointers to close parens */
141 regnode *opend; /* END node in program */
142 I32 utf8; /* whether the pattern is utf8 or not */
143 I32 orig_utf8; /* whether the pattern was originally in utf8 */
144 /* XXX use this for future optimisation of case
145 * where pattern must be upgraded to utf8. */
146 I32 uni_semantics; /* If a d charset modifier should use unicode
147 rules, even if the pattern is not in
149 HV *paren_names; /* Paren names */
151 regnode **recurse; /* Recurse regops */
152 I32 recurse_count; /* Number of recurse regops */
155 I32 override_recoding;
156 I32 in_multi_char_class;
157 struct reg_code_block *code_blocks; /* positions of literal (?{})
159 int num_code_blocks; /* size of code_blocks[] */
160 int code_index; /* next code_blocks[] slot */
162 char *starttry; /* -Dr: where regtry was called. */
163 #define RExC_starttry (pRExC_state->starttry)
165 SV *runtime_code_qr; /* qr with the runtime code blocks */
167 const char *lastparse;
169 AV *paren_name_list; /* idx -> name */
170 #define RExC_lastparse (pRExC_state->lastparse)
171 #define RExC_lastnum (pRExC_state->lastnum)
172 #define RExC_paren_name_list (pRExC_state->paren_name_list)
176 #define RExC_flags (pRExC_state->flags)
177 #define RExC_pm_flags (pRExC_state->pm_flags)
178 #define RExC_precomp (pRExC_state->precomp)
179 #define RExC_rx_sv (pRExC_state->rx_sv)
180 #define RExC_rx (pRExC_state->rx)
181 #define RExC_rxi (pRExC_state->rxi)
182 #define RExC_start (pRExC_state->start)
183 #define RExC_end (pRExC_state->end)
184 #define RExC_parse (pRExC_state->parse)
185 #define RExC_whilem_seen (pRExC_state->whilem_seen)
186 #ifdef RE_TRACK_PATTERN_OFFSETS
187 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
189 #define RExC_emit (pRExC_state->emit)
190 #define RExC_emit_start (pRExC_state->emit_start)
191 #define RExC_emit_bound (pRExC_state->emit_bound)
192 #define RExC_naughty (pRExC_state->naughty)
193 #define RExC_sawback (pRExC_state->sawback)
194 #define RExC_seen (pRExC_state->seen)
195 #define RExC_size (pRExC_state->size)
196 #define RExC_npar (pRExC_state->npar)
197 #define RExC_nestroot (pRExC_state->nestroot)
198 #define RExC_extralen (pRExC_state->extralen)
199 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
200 #define RExC_utf8 (pRExC_state->utf8)
201 #define RExC_uni_semantics (pRExC_state->uni_semantics)
202 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
203 #define RExC_open_parens (pRExC_state->open_parens)
204 #define RExC_close_parens (pRExC_state->close_parens)
205 #define RExC_opend (pRExC_state->opend)
206 #define RExC_paren_names (pRExC_state->paren_names)
207 #define RExC_recurse (pRExC_state->recurse)
208 #define RExC_recurse_count (pRExC_state->recurse_count)
209 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
210 #define RExC_contains_locale (pRExC_state->contains_locale)
211 #define RExC_override_recoding (pRExC_state->override_recoding)
212 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
215 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
216 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
217 ((*s) == '{' && regcurly(s)))
220 #undef SPSTART /* dratted cpp namespace... */
223 * Flags to be passed up and down.
225 #define WORST 0 /* Worst case. */
226 #define HASWIDTH 0x01 /* Known to match non-null strings. */
228 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
229 * character. (There needs to be a case: in the switch statement in regexec.c
230 * for any node marked SIMPLE.) Note that this is not the same thing as
233 #define SPSTART 0x04 /* Starts with * or + */
234 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
235 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
237 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
239 /* whether trie related optimizations are enabled */
240 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
241 #define TRIE_STUDY_OPT
242 #define FULL_TRIE_STUDY
248 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
249 #define PBITVAL(paren) (1 << ((paren) & 7))
250 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
251 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
252 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
254 /* If not already in utf8, do a longjmp back to the beginning */
255 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
256 #define REQUIRE_UTF8 STMT_START { \
257 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
260 /* This converts the named class defined in regcomp.h to its equivalent class
261 * number defined in handy.h. */
262 #define namedclass_to_classnum(class) ((int) ((class) / 2))
263 #define classnum_to_namedclass(classnum) ((classnum) * 2)
265 /* About scan_data_t.
267 During optimisation we recurse through the regexp program performing
268 various inplace (keyhole style) optimisations. In addition study_chunk
269 and scan_commit populate this data structure with information about
270 what strings MUST appear in the pattern. We look for the longest
271 string that must appear at a fixed location, and we look for the
272 longest string that may appear at a floating location. So for instance
277 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
278 strings (because they follow a .* construct). study_chunk will identify
279 both FOO and BAR as being the longest fixed and floating strings respectively.
281 The strings can be composites, for instance
285 will result in a composite fixed substring 'foo'.
287 For each string some basic information is maintained:
289 - offset or min_offset
290 This is the position the string must appear at, or not before.
291 It also implicitly (when combined with minlenp) tells us how many
292 characters must match before the string we are searching for.
293 Likewise when combined with minlenp and the length of the string it
294 tells us how many characters must appear after the string we have
298 Only used for floating strings. This is the rightmost point that
299 the string can appear at. If set to I32 max it indicates that the
300 string can occur infinitely far to the right.
303 A pointer to the minimum number of characters of the pattern that the
304 string was found inside. This is important as in the case of positive
305 lookahead or positive lookbehind we can have multiple patterns
310 The minimum length of the pattern overall is 3, the minimum length
311 of the lookahead part is 3, but the minimum length of the part that
312 will actually match is 1. So 'FOO's minimum length is 3, but the
313 minimum length for the F is 1. This is important as the minimum length
314 is used to determine offsets in front of and behind the string being
315 looked for. Since strings can be composites this is the length of the
316 pattern at the time it was committed with a scan_commit. Note that
317 the length is calculated by study_chunk, so that the minimum lengths
318 are not known until the full pattern has been compiled, thus the
319 pointer to the value.
323 In the case of lookbehind the string being searched for can be
324 offset past the start point of the final matching string.
325 If this value was just blithely removed from the min_offset it would
326 invalidate some of the calculations for how many chars must match
327 before or after (as they are derived from min_offset and minlen and
328 the length of the string being searched for).
329 When the final pattern is compiled and the data is moved from the
330 scan_data_t structure into the regexp structure the information
331 about lookbehind is factored in, with the information that would
332 have been lost precalculated in the end_shift field for the
335 The fields pos_min and pos_delta are used to store the minimum offset
336 and the delta to the maximum offset at the current point in the pattern.
340 typedef struct scan_data_t {
341 /*I32 len_min; unused */
342 /*I32 len_delta; unused */
346 I32 last_end; /* min value, <0 unless valid. */
349 SV **longest; /* Either &l_fixed, or &l_float. */
350 SV *longest_fixed; /* longest fixed string found in pattern */
351 I32 offset_fixed; /* offset where it starts */
352 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
353 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
354 SV *longest_float; /* longest floating string found in pattern */
355 I32 offset_float_min; /* earliest point in string it can appear */
356 I32 offset_float_max; /* latest point in string it can appear */
357 I32 *minlen_float; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_float; /* is the position of the string modified by LB */
362 struct regnode_charclass_class *start_class;
366 * Forward declarations for pregcomp()'s friends.
369 static const scan_data_t zero_scan_data =
370 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
372 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
373 #define SF_BEFORE_SEOL 0x0001
374 #define SF_BEFORE_MEOL 0x0002
375 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
376 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
379 # define SF_FIX_SHIFT_EOL (0+2)
380 # define SF_FL_SHIFT_EOL (0+4)
382 # define SF_FIX_SHIFT_EOL (+2)
383 # define SF_FL_SHIFT_EOL (+4)
386 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
387 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
389 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
390 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
391 #define SF_IS_INF 0x0040
392 #define SF_HAS_PAR 0x0080
393 #define SF_IN_PAR 0x0100
394 #define SF_HAS_EVAL 0x0200
395 #define SCF_DO_SUBSTR 0x0400
396 #define SCF_DO_STCLASS_AND 0x0800
397 #define SCF_DO_STCLASS_OR 0x1000
398 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
399 #define SCF_WHILEM_VISITED_POS 0x2000
401 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
402 #define SCF_SEEN_ACCEPT 0x8000
404 #define UTF cBOOL(RExC_utf8)
406 /* The enums for all these are ordered so things work out correctly */
407 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
408 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
409 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
410 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
411 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
412 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
413 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
415 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
417 #define OOB_NAMEDCLASS -1
419 /* There is no code point that is out-of-bounds, so this is problematic. But
420 * its only current use is to initialize a variable that is always set before
422 #define OOB_UNICODE 0xDEADBEEF
424 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
425 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
428 /* length of regex to show in messages that don't mark a position within */
429 #define RegexLengthToShowInErrorMessages 127
432 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
433 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
434 * op/pragma/warn/regcomp.
436 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
437 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
439 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
442 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
443 * arg. Show regex, up to a maximum length. If it's too long, chop and add
446 #define _FAIL(code) STMT_START { \
447 const char *ellipses = ""; \
448 IV len = RExC_end - RExC_precomp; \
451 SAVEFREESV(RExC_rx_sv); \
452 if (len > RegexLengthToShowInErrorMessages) { \
453 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
454 len = RegexLengthToShowInErrorMessages - 10; \
460 #define FAIL(msg) _FAIL( \
461 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
462 msg, (int)len, RExC_precomp, ellipses))
464 #define FAIL2(msg,arg) _FAIL( \
465 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
466 arg, (int)len, RExC_precomp, ellipses))
469 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
471 #define Simple_vFAIL(m) STMT_START { \
472 const IV offset = RExC_parse - RExC_precomp; \
473 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
474 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
480 #define vFAIL(m) STMT_START { \
482 SAVEFREESV(RExC_rx_sv); \
487 * Like Simple_vFAIL(), but accepts two arguments.
489 #define Simple_vFAIL2(m,a1) STMT_START { \
490 const IV offset = RExC_parse - RExC_precomp; \
491 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
492 (int)offset, RExC_precomp, RExC_precomp + offset); \
496 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
498 #define vFAIL2(m,a1) STMT_START { \
500 SAVEFREESV(RExC_rx_sv); \
501 Simple_vFAIL2(m, a1); \
506 * Like Simple_vFAIL(), but accepts three arguments.
508 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
509 const IV offset = RExC_parse - RExC_precomp; \
510 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
515 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
517 #define vFAIL3(m,a1,a2) STMT_START { \
519 SAVEFREESV(RExC_rx_sv); \
520 Simple_vFAIL3(m, a1, a2); \
524 * Like Simple_vFAIL(), but accepts four arguments.
526 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
527 const IV offset = RExC_parse - RExC_precomp; \
528 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
529 (int)offset, RExC_precomp, RExC_precomp + offset); \
532 #define ckWARNreg(loc,m) STMT_START { \
533 const IV offset = loc - RExC_precomp; \
534 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
535 (int)offset, RExC_precomp, RExC_precomp + offset); \
538 #define ckWARNregdep(loc,m) STMT_START { \
539 const IV offset = loc - RExC_precomp; \
540 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
542 (int)offset, RExC_precomp, RExC_precomp + offset); \
545 #define ckWARN2regdep(loc,m, a1) STMT_START { \
546 const IV offset = loc - RExC_precomp; \
547 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
549 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
552 #define ckWARN2reg(loc, m, a1) STMT_START { \
553 const IV offset = loc - RExC_precomp; \
554 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
555 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
558 #define vWARN3(loc, m, a1, a2) STMT_START { \
559 const IV offset = loc - RExC_precomp; \
560 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
561 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
564 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
565 const IV offset = loc - RExC_precomp; \
566 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
567 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
570 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
571 const IV offset = loc - RExC_precomp; \
572 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
573 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
576 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
577 const IV offset = loc - RExC_precomp; \
578 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
579 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
582 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
583 const IV offset = loc - RExC_precomp; \
584 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
585 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
589 /* Allow for side effects in s */
590 #define REGC(c,s) STMT_START { \
591 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
594 /* Macros for recording node offsets. 20001227 mjd@plover.com
595 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
596 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
597 * Element 0 holds the number n.
598 * Position is 1 indexed.
600 #ifndef RE_TRACK_PATTERN_OFFSETS
601 #define Set_Node_Offset_To_R(node,byte)
602 #define Set_Node_Offset(node,byte)
603 #define Set_Cur_Node_Offset
604 #define Set_Node_Length_To_R(node,len)
605 #define Set_Node_Length(node,len)
606 #define Set_Node_Cur_Length(node)
607 #define Node_Offset(n)
608 #define Node_Length(n)
609 #define Set_Node_Offset_Length(node,offset,len)
610 #define ProgLen(ri) ri->u.proglen
611 #define SetProgLen(ri,x) ri->u.proglen = x
613 #define ProgLen(ri) ri->u.offsets[0]
614 #define SetProgLen(ri,x) ri->u.offsets[0] = x
615 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
617 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
618 __LINE__, (int)(node), (int)(byte))); \
620 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
622 RExC_offsets[2*(node)-1] = (byte); \
627 #define Set_Node_Offset(node,byte) \
628 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
629 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
631 #define Set_Node_Length_To_R(node,len) STMT_START { \
633 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
634 __LINE__, (int)(node), (int)(len))); \
636 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
638 RExC_offsets[2*(node)] = (len); \
643 #define Set_Node_Length(node,len) \
644 Set_Node_Length_To_R((node)-RExC_emit_start, len)
645 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
646 #define Set_Node_Cur_Length(node) \
647 Set_Node_Length(node, RExC_parse - parse_start)
649 /* Get offsets and lengths */
650 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
651 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
653 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
654 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
655 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
659 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
660 #define EXPERIMENTAL_INPLACESCAN
661 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
663 #define DEBUG_STUDYDATA(str,data,depth) \
664 DEBUG_OPTIMISE_MORE_r(if(data){ \
665 PerlIO_printf(Perl_debug_log, \
666 "%*s" str "Pos:%"IVdf"/%"IVdf \
667 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
668 (int)(depth)*2, "", \
669 (IV)((data)->pos_min), \
670 (IV)((data)->pos_delta), \
671 (UV)((data)->flags), \
672 (IV)((data)->whilem_c), \
673 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
674 is_inf ? "INF " : "" \
676 if ((data)->last_found) \
677 PerlIO_printf(Perl_debug_log, \
678 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
679 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
680 SvPVX_const((data)->last_found), \
681 (IV)((data)->last_end), \
682 (IV)((data)->last_start_min), \
683 (IV)((data)->last_start_max), \
684 ((data)->longest && \
685 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
686 SvPVX_const((data)->longest_fixed), \
687 (IV)((data)->offset_fixed), \
688 ((data)->longest && \
689 (data)->longest==&((data)->longest_float)) ? "*" : "", \
690 SvPVX_const((data)->longest_float), \
691 (IV)((data)->offset_float_min), \
692 (IV)((data)->offset_float_max) \
694 PerlIO_printf(Perl_debug_log,"\n"); \
697 /* Mark that we cannot extend a found fixed substring at this point.
698 Update the longest found anchored substring and the longest found
699 floating substrings if needed. */
702 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
704 const STRLEN l = CHR_SVLEN(data->last_found);
705 const STRLEN old_l = CHR_SVLEN(*data->longest);
706 GET_RE_DEBUG_FLAGS_DECL;
708 PERL_ARGS_ASSERT_SCAN_COMMIT;
710 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
711 SvSetMagicSV(*data->longest, data->last_found);
712 if (*data->longest == data->longest_fixed) {
713 data->offset_fixed = l ? data->last_start_min : data->pos_min;
714 if (data->flags & SF_BEFORE_EOL)
716 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
718 data->flags &= ~SF_FIX_BEFORE_EOL;
719 data->minlen_fixed=minlenp;
720 data->lookbehind_fixed=0;
722 else { /* *data->longest == data->longest_float */
723 data->offset_float_min = l ? data->last_start_min : data->pos_min;
724 data->offset_float_max = (l
725 ? data->last_start_max
726 : data->pos_min + data->pos_delta);
727 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
728 data->offset_float_max = I32_MAX;
729 if (data->flags & SF_BEFORE_EOL)
731 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
733 data->flags &= ~SF_FL_BEFORE_EOL;
734 data->minlen_float=minlenp;
735 data->lookbehind_float=0;
738 SvCUR_set(data->last_found, 0);
740 SV * const sv = data->last_found;
741 if (SvUTF8(sv) && SvMAGICAL(sv)) {
742 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
748 data->flags &= ~SF_BEFORE_EOL;
749 DEBUG_STUDYDATA("commit: ",data,0);
752 /* These macros set, clear and test whether the synthetic start class ('ssc',
753 * given by the parameter) matches an empty string (EOS). This uses the
754 * 'next_off' field in the node, to save a bit in the flags field. The ssc
755 * stands alone, so there is never a next_off, so this field is otherwise
756 * unused. The EOS information is used only for compilation, but theoretically
757 * it could be passed on to the execution code. This could be used to store
758 * more than one bit of information, but only this one is currently used. */
759 #define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
760 #define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
761 #define TEST_SSC_EOS(node) cBOOL((node)->next_off)
763 /* Can match anything (initialization) */
765 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
767 PERL_ARGS_ASSERT_CL_ANYTHING;
769 ANYOF_BITMAP_SETALL(cl);
770 cl->flags = ANYOF_UNICODE_ALL;
773 /* If any portion of the regex is to operate under locale rules,
774 * initialization includes it. The reason this isn't done for all regexes
775 * is that the optimizer was written under the assumption that locale was
776 * all-or-nothing. Given the complexity and lack of documentation in the
777 * optimizer, and that there are inadequate test cases for locale, so many
778 * parts of it may not work properly, it is safest to avoid locale unless
780 if (RExC_contains_locale) {
781 ANYOF_CLASS_SETALL(cl); /* /l uses class */
782 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
785 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
789 /* Can match anything (initialization) */
791 S_cl_is_anything(const struct regnode_charclass_class *cl)
795 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
797 for (value = 0; value <= ANYOF_MAX; value += 2)
798 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
800 if (!(cl->flags & ANYOF_UNICODE_ALL))
802 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
807 /* Can match anything (initialization) */
809 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
811 PERL_ARGS_ASSERT_CL_INIT;
813 Zero(cl, 1, struct regnode_charclass_class);
815 cl_anything(pRExC_state, cl);
816 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
819 /* These two functions currently do the exact same thing */
820 #define cl_init_zero S_cl_init
822 /* 'AND' a given class with another one. Can create false positives. 'cl'
823 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
824 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
826 S_cl_and(struct regnode_charclass_class *cl,
827 const struct regnode_charclass_class *and_with)
829 PERL_ARGS_ASSERT_CL_AND;
831 assert(PL_regkind[and_with->type] == ANYOF);
833 /* I (khw) am not sure all these restrictions are necessary XXX */
834 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
835 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
836 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
837 && !(and_with->flags & ANYOF_LOC_FOLD)
838 && !(cl->flags & ANYOF_LOC_FOLD)) {
841 if (and_with->flags & ANYOF_INVERT)
842 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
843 cl->bitmap[i] &= ~and_with->bitmap[i];
845 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
846 cl->bitmap[i] &= and_with->bitmap[i];
847 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
849 if (and_with->flags & ANYOF_INVERT) {
851 /* Here, the and'ed node is inverted. Get the AND of the flags that
852 * aren't affected by the inversion. Those that are affected are
853 * handled individually below */
854 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
855 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
856 cl->flags |= affected_flags;
858 /* We currently don't know how to deal with things that aren't in the
859 * bitmap, but we know that the intersection is no greater than what
860 * is already in cl, so let there be false positives that get sorted
861 * out after the synthetic start class succeeds, and the node is
862 * matched for real. */
864 /* The inversion of these two flags indicate that the resulting
865 * intersection doesn't have them */
866 if (and_with->flags & ANYOF_UNICODE_ALL) {
867 cl->flags &= ~ANYOF_UNICODE_ALL;
869 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
870 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
873 else { /* and'd node is not inverted */
874 U8 outside_bitmap_but_not_utf8; /* Temp variable */
876 if (! ANYOF_NONBITMAP(and_with)) {
878 /* Here 'and_with' doesn't match anything outside the bitmap
879 * (except possibly ANYOF_UNICODE_ALL), which means the
880 * intersection can't either, except for ANYOF_UNICODE_ALL, in
881 * which case we don't know what the intersection is, but it's no
882 * greater than what cl already has, so can just leave it alone,
883 * with possible false positives */
884 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
885 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
886 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
889 else if (! ANYOF_NONBITMAP(cl)) {
891 /* Here, 'and_with' does match something outside the bitmap, and cl
892 * doesn't have a list of things to match outside the bitmap. If
893 * cl can match all code points above 255, the intersection will
894 * be those above-255 code points that 'and_with' matches. If cl
895 * can't match all Unicode code points, it means that it can't
896 * match anything outside the bitmap (since the 'if' that got us
897 * into this block tested for that), so we leave the bitmap empty.
899 if (cl->flags & ANYOF_UNICODE_ALL) {
900 ARG_SET(cl, ARG(and_with));
902 /* and_with's ARG may match things that don't require UTF8.
903 * And now cl's will too, in spite of this being an 'and'. See
904 * the comments below about the kludge */
905 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
909 /* Here, both 'and_with' and cl match something outside the
910 * bitmap. Currently we do not do the intersection, so just match
911 * whatever cl had at the beginning. */
915 /* Take the intersection of the two sets of flags. However, the
916 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
917 * kludge around the fact that this flag is not treated like the others
918 * which are initialized in cl_anything(). The way the optimizer works
919 * is that the synthetic start class (SSC) is initialized to match
920 * anything, and then the first time a real node is encountered, its
921 * values are AND'd with the SSC's with the result being the values of
922 * the real node. However, there are paths through the optimizer where
923 * the AND never gets called, so those initialized bits are set
924 * inappropriately, which is not usually a big deal, as they just cause
925 * false positives in the SSC, which will just mean a probably
926 * imperceptible slow down in execution. However this bit has a
927 * higher false positive consequence in that it can cause utf8.pm,
928 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
929 * bigger slowdown and also causes significant extra memory to be used.
930 * In order to prevent this, the code now takes a different tack. The
931 * bit isn't set unless some part of the regular expression needs it,
932 * but once set it won't get cleared. This means that these extra
933 * modules won't get loaded unless there was some path through the
934 * pattern that would have required them anyway, and so any false
935 * positives that occur by not ANDing them out when they could be
936 * aren't as severe as they would be if we treated this bit like all
938 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
939 & ANYOF_NONBITMAP_NON_UTF8;
940 cl->flags &= and_with->flags;
941 cl->flags |= outside_bitmap_but_not_utf8;
945 /* 'OR' a given class with another one. Can create false positives. 'cl'
946 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
947 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
949 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
951 PERL_ARGS_ASSERT_CL_OR;
953 if (or_with->flags & ANYOF_INVERT) {
955 /* Here, the or'd node is to be inverted. This means we take the
956 * complement of everything not in the bitmap, but currently we don't
957 * know what that is, so give up and match anything */
958 if (ANYOF_NONBITMAP(or_with)) {
959 cl_anything(pRExC_state, cl);
962 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
963 * <= (B1 | !B2) | (CL1 | !CL2)
964 * which is wasteful if CL2 is small, but we ignore CL2:
965 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
966 * XXXX Can we handle case-fold? Unclear:
967 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
968 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
970 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
971 && !(or_with->flags & ANYOF_LOC_FOLD)
972 && !(cl->flags & ANYOF_LOC_FOLD) ) {
975 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
976 cl->bitmap[i] |= ~or_with->bitmap[i];
977 } /* XXXX: logic is complicated otherwise */
979 cl_anything(pRExC_state, cl);
982 /* And, we can just take the union of the flags that aren't affected
983 * by the inversion */
984 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
986 /* For the remaining flags:
987 ANYOF_UNICODE_ALL and inverted means to not match anything above
988 255, which means that the union with cl should just be
989 what cl has in it, so can ignore this flag
990 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
991 is 127-255 to match them, but then invert that, so the
992 union with cl should just be what cl has in it, so can
995 } else { /* 'or_with' is not inverted */
996 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
997 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
998 && (!(or_with->flags & ANYOF_LOC_FOLD)
999 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1002 /* OR char bitmap and class bitmap separately */
1003 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1004 cl->bitmap[i] |= or_with->bitmap[i];
1005 ANYOF_CLASS_OR(or_with, cl);
1007 else { /* XXXX: logic is complicated, leave it along for a moment. */
1008 cl_anything(pRExC_state, cl);
1011 if (ANYOF_NONBITMAP(or_with)) {
1013 /* Use the added node's outside-the-bit-map match if there isn't a
1014 * conflict. If there is a conflict (both nodes match something
1015 * outside the bitmap, but what they match outside is not the same
1016 * pointer, and hence not easily compared until XXX we extend
1017 * inversion lists this far), give up and allow the start class to
1018 * match everything outside the bitmap. If that stuff is all above
1019 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1020 if (! ANYOF_NONBITMAP(cl)) {
1021 ARG_SET(cl, ARG(or_with));
1023 else if (ARG(cl) != ARG(or_with)) {
1025 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1026 cl_anything(pRExC_state, cl);
1029 cl->flags |= ANYOF_UNICODE_ALL;
1034 /* Take the union */
1035 cl->flags |= or_with->flags;
1039 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1040 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1041 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1042 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1047 dump_trie(trie,widecharmap,revcharmap)
1048 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1049 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1051 These routines dump out a trie in a somewhat readable format.
1052 The _interim_ variants are used for debugging the interim
1053 tables that are used to generate the final compressed
1054 representation which is what dump_trie expects.
1056 Part of the reason for their existence is to provide a form
1057 of documentation as to how the different representations function.
1062 Dumps the final compressed table form of the trie to Perl_debug_log.
1063 Used for debugging make_trie().
1067 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1068 AV *revcharmap, U32 depth)
1071 SV *sv=sv_newmortal();
1072 int colwidth= widecharmap ? 6 : 4;
1074 GET_RE_DEBUG_FLAGS_DECL;
1076 PERL_ARGS_ASSERT_DUMP_TRIE;
1078 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1079 (int)depth * 2 + 2,"",
1080 "Match","Base","Ofs" );
1082 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1083 SV ** const tmp = av_fetch( revcharmap, state, 0);
1085 PerlIO_printf( Perl_debug_log, "%*s",
1087 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1088 PL_colors[0], PL_colors[1],
1089 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1090 PERL_PV_ESCAPE_FIRSTCHAR
1095 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1096 (int)depth * 2 + 2,"");
1098 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1099 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1100 PerlIO_printf( Perl_debug_log, "\n");
1102 for( state = 1 ; state < trie->statecount ; state++ ) {
1103 const U32 base = trie->states[ state ].trans.base;
1105 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1107 if ( trie->states[ state ].wordnum ) {
1108 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1110 PerlIO_printf( Perl_debug_log, "%6s", "" );
1113 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1118 while( ( base + ofs < trie->uniquecharcount ) ||
1119 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1120 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1123 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1125 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1126 if ( ( base + ofs >= trie->uniquecharcount ) &&
1127 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1128 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1130 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1132 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1134 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1138 PerlIO_printf( Perl_debug_log, "]");
1141 PerlIO_printf( Perl_debug_log, "\n" );
1143 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1144 for (word=1; word <= trie->wordcount; word++) {
1145 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1146 (int)word, (int)(trie->wordinfo[word].prev),
1147 (int)(trie->wordinfo[word].len));
1149 PerlIO_printf(Perl_debug_log, "\n" );
1152 Dumps a fully constructed but uncompressed trie in list form.
1153 List tries normally only are used for construction when the number of
1154 possible chars (trie->uniquecharcount) is very high.
1155 Used for debugging make_trie().
1158 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1159 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1163 SV *sv=sv_newmortal();
1164 int colwidth= widecharmap ? 6 : 4;
1165 GET_RE_DEBUG_FLAGS_DECL;
1167 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1169 /* print out the table precompression. */
1170 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1171 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1172 "------:-----+-----------------\n" );
1174 for( state=1 ; state < next_alloc ; state ++ ) {
1177 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1178 (int)depth * 2 + 2,"", (UV)state );
1179 if ( ! trie->states[ state ].wordnum ) {
1180 PerlIO_printf( Perl_debug_log, "%5s| ","");
1182 PerlIO_printf( Perl_debug_log, "W%4x| ",
1183 trie->states[ state ].wordnum
1186 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1187 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1189 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1191 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1192 PL_colors[0], PL_colors[1],
1193 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1194 PERL_PV_ESCAPE_FIRSTCHAR
1196 TRIE_LIST_ITEM(state,charid).forid,
1197 (UV)TRIE_LIST_ITEM(state,charid).newstate
1200 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1201 (int)((depth * 2) + 14), "");
1204 PerlIO_printf( Perl_debug_log, "\n");
1209 Dumps a fully constructed but uncompressed trie in table form.
1210 This is the normal DFA style state transition table, with a few
1211 twists to facilitate compression later.
1212 Used for debugging make_trie().
1215 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1216 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1221 SV *sv=sv_newmortal();
1222 int colwidth= widecharmap ? 6 : 4;
1223 GET_RE_DEBUG_FLAGS_DECL;
1225 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1228 print out the table precompression so that we can do a visual check
1229 that they are identical.
1232 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1234 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1235 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1237 PerlIO_printf( Perl_debug_log, "%*s",
1239 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1240 PL_colors[0], PL_colors[1],
1241 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1242 PERL_PV_ESCAPE_FIRSTCHAR
1248 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1250 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1251 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1254 PerlIO_printf( Perl_debug_log, "\n" );
1256 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1258 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1259 (int)depth * 2 + 2,"",
1260 (UV)TRIE_NODENUM( state ) );
1262 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1263 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1265 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1267 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1269 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1270 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1272 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1273 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1281 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1282 startbranch: the first branch in the whole branch sequence
1283 first : start branch of sequence of branch-exact nodes.
1284 May be the same as startbranch
1285 last : Thing following the last branch.
1286 May be the same as tail.
1287 tail : item following the branch sequence
1288 count : words in the sequence
1289 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1290 depth : indent depth
1292 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1294 A trie is an N'ary tree where the branches are determined by digital
1295 decomposition of the key. IE, at the root node you look up the 1st character and
1296 follow that branch repeat until you find the end of the branches. Nodes can be
1297 marked as "accepting" meaning they represent a complete word. Eg:
1301 would convert into the following structure. Numbers represent states, letters
1302 following numbers represent valid transitions on the letter from that state, if
1303 the number is in square brackets it represents an accepting state, otherwise it
1304 will be in parenthesis.
1306 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1310 (1) +-i->(6)-+-s->[7]
1312 +-s->(3)-+-h->(4)-+-e->[5]
1314 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1316 This shows that when matching against the string 'hers' we will begin at state 1
1317 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1318 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1319 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1320 single traverse. We store a mapping from accepting to state to which word was
1321 matched, and then when we have multiple possibilities we try to complete the
1322 rest of the regex in the order in which they occured in the alternation.
1324 The only prior NFA like behaviour that would be changed by the TRIE support is
1325 the silent ignoring of duplicate alternations which are of the form:
1327 / (DUPE|DUPE) X? (?{ ... }) Y /x
1329 Thus EVAL blocks following a trie may be called a different number of times with
1330 and without the optimisation. With the optimisations dupes will be silently
1331 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1332 the following demonstrates:
1334 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1336 which prints out 'word' three times, but
1338 'words'=~/(word|word|word)(?{ print $1 })S/
1340 which doesnt print it out at all. This is due to other optimisations kicking in.
1342 Example of what happens on a structural level:
1344 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1346 1: CURLYM[1] {1,32767}(18)
1357 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1358 and should turn into:
1360 1: CURLYM[1] {1,32767}(18)
1362 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1370 Cases where tail != last would be like /(?foo|bar)baz/:
1380 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1381 and would end up looking like:
1384 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1391 d = uvuni_to_utf8_flags(d, uv, 0);
1393 is the recommended Unicode-aware way of saying
1398 #define TRIE_STORE_REVCHAR(val) \
1401 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1402 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1403 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1404 SvCUR_set(zlopp, kapow - flrbbbbb); \
1407 av_push(revcharmap, zlopp); \
1409 char ooooff = (char)val; \
1410 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1414 #define TRIE_READ_CHAR STMT_START { \
1417 /* if it is UTF then it is either already folded, or does not need folding */ \
1418 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1420 else if (folder == PL_fold_latin1) { \
1421 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1422 if ( foldlen > 0 ) { \
1423 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1429 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1430 skiplen = UNISKIP(uvc); \
1431 foldlen -= skiplen; \
1432 scan = foldbuf + skiplen; \
1435 /* raw data, will be folded later if needed */ \
1443 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1444 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1445 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1446 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1448 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1449 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1450 TRIE_LIST_CUR( state )++; \
1453 #define TRIE_LIST_NEW(state) STMT_START { \
1454 Newxz( trie->states[ state ].trans.list, \
1455 4, reg_trie_trans_le ); \
1456 TRIE_LIST_CUR( state ) = 1; \
1457 TRIE_LIST_LEN( state ) = 4; \
1460 #define TRIE_HANDLE_WORD(state) STMT_START { \
1461 U16 dupe= trie->states[ state ].wordnum; \
1462 regnode * const noper_next = regnext( noper ); \
1465 /* store the word for dumping */ \
1467 if (OP(noper) != NOTHING) \
1468 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1470 tmp = newSVpvn_utf8( "", 0, UTF ); \
1471 av_push( trie_words, tmp ); \
1475 trie->wordinfo[curword].prev = 0; \
1476 trie->wordinfo[curword].len = wordlen; \
1477 trie->wordinfo[curword].accept = state; \
1479 if ( noper_next < tail ) { \
1481 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1482 trie->jump[curword] = (U16)(noper_next - convert); \
1484 jumper = noper_next; \
1486 nextbranch= regnext(cur); \
1490 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1491 /* chain, so that when the bits of chain are later */\
1492 /* linked together, the dups appear in the chain */\
1493 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1494 trie->wordinfo[dupe].prev = curword; \
1496 /* we haven't inserted this word yet. */ \
1497 trie->states[ state ].wordnum = curword; \
1502 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1503 ( ( base + charid >= ucharcount \
1504 && base + charid < ubound \
1505 && state == trie->trans[ base - ucharcount + charid ].check \
1506 && trie->trans[ base - ucharcount + charid ].next ) \
1507 ? trie->trans[ base - ucharcount + charid ].next \
1508 : ( state==1 ? special : 0 ) \
1512 #define MADE_JUMP_TRIE 2
1513 #define MADE_EXACT_TRIE 4
1516 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1519 /* first pass, loop through and scan words */
1520 reg_trie_data *trie;
1521 HV *widecharmap = NULL;
1522 AV *revcharmap = newAV();
1524 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1529 regnode *jumper = NULL;
1530 regnode *nextbranch = NULL;
1531 regnode *convert = NULL;
1532 U32 *prev_states; /* temp array mapping each state to previous one */
1533 /* we just use folder as a flag in utf8 */
1534 const U8 * folder = NULL;
1537 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1538 AV *trie_words = NULL;
1539 /* along with revcharmap, this only used during construction but both are
1540 * useful during debugging so we store them in the struct when debugging.
1543 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1544 STRLEN trie_charcount=0;
1546 SV *re_trie_maxbuff;
1547 GET_RE_DEBUG_FLAGS_DECL;
1549 PERL_ARGS_ASSERT_MAKE_TRIE;
1551 PERL_UNUSED_ARG(depth);
1558 case EXACTFU_TRICKYFOLD:
1559 case EXACTFU: folder = PL_fold_latin1; break;
1560 case EXACTF: folder = PL_fold; break;
1561 case EXACTFL: folder = PL_fold_locale; break;
1562 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1565 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1567 trie->startstate = 1;
1568 trie->wordcount = word_count;
1569 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1570 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1572 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1573 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1574 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1577 trie_words = newAV();
1580 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1581 if (!SvIOK(re_trie_maxbuff)) {
1582 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1584 DEBUG_TRIE_COMPILE_r({
1585 PerlIO_printf( Perl_debug_log,
1586 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1587 (int)depth * 2 + 2, "",
1588 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1589 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1593 /* Find the node we are going to overwrite */
1594 if ( first == startbranch && OP( last ) != BRANCH ) {
1595 /* whole branch chain */
1598 /* branch sub-chain */
1599 convert = NEXTOPER( first );
1602 /* -- First loop and Setup --
1604 We first traverse the branches and scan each word to determine if it
1605 contains widechars, and how many unique chars there are, this is
1606 important as we have to build a table with at least as many columns as we
1609 We use an array of integers to represent the character codes 0..255
1610 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1611 native representation of the character value as the key and IV's for the
1614 *TODO* If we keep track of how many times each character is used we can
1615 remap the columns so that the table compression later on is more
1616 efficient in terms of memory by ensuring the most common value is in the
1617 middle and the least common are on the outside. IMO this would be better
1618 than a most to least common mapping as theres a decent chance the most
1619 common letter will share a node with the least common, meaning the node
1620 will not be compressible. With a middle is most common approach the worst
1621 case is when we have the least common nodes twice.
1625 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1626 regnode *noper = NEXTOPER( cur );
1627 const U8 *uc = (U8*)STRING( noper );
1628 const U8 *e = uc + STR_LEN( noper );
1630 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1632 const U8 *scan = (U8*)NULL;
1633 U32 wordlen = 0; /* required init */
1635 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1637 if (OP(noper) == NOTHING) {
1638 regnode *noper_next= regnext(noper);
1639 if (noper_next != tail && OP(noper_next) == flags) {
1641 uc= (U8*)STRING(noper);
1642 e= uc + STR_LEN(noper);
1643 trie->minlen= STR_LEN(noper);
1650 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1651 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1652 regardless of encoding */
1653 if (OP( noper ) == EXACTFU_SS) {
1654 /* false positives are ok, so just set this */
1655 TRIE_BITMAP_SET(trie,0xDF);
1658 for ( ; uc < e ; uc += len ) {
1659 TRIE_CHARCOUNT(trie)++;
1664 U8 folded= folder[ (U8) uvc ];
1665 if ( !trie->charmap[ folded ] ) {
1666 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1667 TRIE_STORE_REVCHAR( folded );
1670 if ( !trie->charmap[ uvc ] ) {
1671 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1672 TRIE_STORE_REVCHAR( uvc );
1675 /* store the codepoint in the bitmap, and its folded
1677 TRIE_BITMAP_SET(trie, uvc);
1679 /* store the folded codepoint */
1680 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1683 /* store first byte of utf8 representation of
1684 variant codepoints */
1685 if (! UNI_IS_INVARIANT(uvc)) {
1686 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1689 set_bit = 0; /* We've done our bit :-) */
1694 widecharmap = newHV();
1696 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1699 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1701 if ( !SvTRUE( *svpp ) ) {
1702 sv_setiv( *svpp, ++trie->uniquecharcount );
1703 TRIE_STORE_REVCHAR(uvc);
1707 if( cur == first ) {
1708 trie->minlen = chars;
1709 trie->maxlen = chars;
1710 } else if (chars < trie->minlen) {
1711 trie->minlen = chars;
1712 } else if (chars > trie->maxlen) {
1713 trie->maxlen = chars;
1715 if (OP( noper ) == EXACTFU_SS) {
1716 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1717 if (trie->minlen > 1)
1720 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1721 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1722 * - We assume that any such sequence might match a 2 byte string */
1723 if (trie->minlen > 2 )
1727 } /* end first pass */
1728 DEBUG_TRIE_COMPILE_r(
1729 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1730 (int)depth * 2 + 2,"",
1731 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1732 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1733 (int)trie->minlen, (int)trie->maxlen )
1737 We now know what we are dealing with in terms of unique chars and
1738 string sizes so we can calculate how much memory a naive
1739 representation using a flat table will take. If it's over a reasonable
1740 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1741 conservative but potentially much slower representation using an array
1744 At the end we convert both representations into the same compressed
1745 form that will be used in regexec.c for matching with. The latter
1746 is a form that cannot be used to construct with but has memory
1747 properties similar to the list form and access properties similar
1748 to the table form making it both suitable for fast searches and
1749 small enough that its feasable to store for the duration of a program.
1751 See the comment in the code where the compressed table is produced
1752 inplace from the flat tabe representation for an explanation of how
1753 the compression works.
1758 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1761 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1763 Second Pass -- Array Of Lists Representation
1765 Each state will be represented by a list of charid:state records
1766 (reg_trie_trans_le) the first such element holds the CUR and LEN
1767 points of the allocated array. (See defines above).
1769 We build the initial structure using the lists, and then convert
1770 it into the compressed table form which allows faster lookups
1771 (but cant be modified once converted).
1774 STRLEN transcount = 1;
1776 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1777 "%*sCompiling trie using list compiler\n",
1778 (int)depth * 2 + 2, ""));
1780 trie->states = (reg_trie_state *)
1781 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1782 sizeof(reg_trie_state) );
1786 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1788 regnode *noper = NEXTOPER( cur );
1789 U8 *uc = (U8*)STRING( noper );
1790 const U8 *e = uc + STR_LEN( noper );
1791 U32 state = 1; /* required init */
1792 U16 charid = 0; /* sanity init */
1793 U8 *scan = (U8*)NULL; /* sanity init */
1794 STRLEN foldlen = 0; /* required init */
1795 U32 wordlen = 0; /* required init */
1796 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1799 if (OP(noper) == NOTHING) {
1800 regnode *noper_next= regnext(noper);
1801 if (noper_next != tail && OP(noper_next) == flags) {
1803 uc= (U8*)STRING(noper);
1804 e= uc + STR_LEN(noper);
1808 if (OP(noper) != NOTHING) {
1809 for ( ; uc < e ; uc += len ) {
1814 charid = trie->charmap[ uvc ];
1816 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1820 charid=(U16)SvIV( *svpp );
1823 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1830 if ( !trie->states[ state ].trans.list ) {
1831 TRIE_LIST_NEW( state );
1833 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1834 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1835 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1840 newstate = next_alloc++;
1841 prev_states[newstate] = state;
1842 TRIE_LIST_PUSH( state, charid, newstate );
1847 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1851 TRIE_HANDLE_WORD(state);
1853 } /* end second pass */
1855 /* next alloc is the NEXT state to be allocated */
1856 trie->statecount = next_alloc;
1857 trie->states = (reg_trie_state *)
1858 PerlMemShared_realloc( trie->states,
1860 * sizeof(reg_trie_state) );
1862 /* and now dump it out before we compress it */
1863 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1864 revcharmap, next_alloc,
1868 trie->trans = (reg_trie_trans *)
1869 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1876 for( state=1 ; state < next_alloc ; state ++ ) {
1880 DEBUG_TRIE_COMPILE_MORE_r(
1881 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1885 if (trie->states[state].trans.list) {
1886 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1890 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1891 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1892 if ( forid < minid ) {
1894 } else if ( forid > maxid ) {
1898 if ( transcount < tp + maxid - minid + 1) {
1900 trie->trans = (reg_trie_trans *)
1901 PerlMemShared_realloc( trie->trans,
1903 * sizeof(reg_trie_trans) );
1904 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1906 base = trie->uniquecharcount + tp - minid;
1907 if ( maxid == minid ) {
1909 for ( ; zp < tp ; zp++ ) {
1910 if ( ! trie->trans[ zp ].next ) {
1911 base = trie->uniquecharcount + zp - minid;
1912 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1913 trie->trans[ zp ].check = state;
1919 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1920 trie->trans[ tp ].check = state;
1925 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1926 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1927 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1928 trie->trans[ tid ].check = state;
1930 tp += ( maxid - minid + 1 );
1932 Safefree(trie->states[ state ].trans.list);
1935 DEBUG_TRIE_COMPILE_MORE_r(
1936 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1939 trie->states[ state ].trans.base=base;
1941 trie->lasttrans = tp + 1;
1945 Second Pass -- Flat Table Representation.
1947 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1948 We know that we will need Charcount+1 trans at most to store the data
1949 (one row per char at worst case) So we preallocate both structures
1950 assuming worst case.
1952 We then construct the trie using only the .next slots of the entry
1955 We use the .check field of the first entry of the node temporarily to
1956 make compression both faster and easier by keeping track of how many non
1957 zero fields are in the node.
1959 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1962 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1963 number representing the first entry of the node, and state as a
1964 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1965 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1966 are 2 entrys per node. eg:
1974 The table is internally in the right hand, idx form. However as we also
1975 have to deal with the states array which is indexed by nodenum we have to
1976 use TRIE_NODENUM() to convert.
1979 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1980 "%*sCompiling trie using table compiler\n",
1981 (int)depth * 2 + 2, ""));
1983 trie->trans = (reg_trie_trans *)
1984 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1985 * trie->uniquecharcount + 1,
1986 sizeof(reg_trie_trans) );
1987 trie->states = (reg_trie_state *)
1988 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1989 sizeof(reg_trie_state) );
1990 next_alloc = trie->uniquecharcount + 1;
1993 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1995 regnode *noper = NEXTOPER( cur );
1996 const U8 *uc = (U8*)STRING( noper );
1997 const U8 *e = uc + STR_LEN( noper );
1999 U32 state = 1; /* required init */
2001 U16 charid = 0; /* sanity init */
2002 U32 accept_state = 0; /* sanity init */
2003 U8 *scan = (U8*)NULL; /* sanity init */
2005 STRLEN foldlen = 0; /* required init */
2006 U32 wordlen = 0; /* required init */
2008 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2010 if (OP(noper) == NOTHING) {
2011 regnode *noper_next= regnext(noper);
2012 if (noper_next != tail && OP(noper_next) == flags) {
2014 uc= (U8*)STRING(noper);
2015 e= uc + STR_LEN(noper);
2019 if ( OP(noper) != NOTHING ) {
2020 for ( ; uc < e ; uc += len ) {
2025 charid = trie->charmap[ uvc ];
2027 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2028 charid = svpp ? (U16)SvIV(*svpp) : 0;
2032 if ( !trie->trans[ state + charid ].next ) {
2033 trie->trans[ state + charid ].next = next_alloc;
2034 trie->trans[ state ].check++;
2035 prev_states[TRIE_NODENUM(next_alloc)]
2036 = TRIE_NODENUM(state);
2037 next_alloc += trie->uniquecharcount;
2039 state = trie->trans[ state + charid ].next;
2041 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2043 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2046 accept_state = TRIE_NODENUM( state );
2047 TRIE_HANDLE_WORD(accept_state);
2049 } /* end second pass */
2051 /* and now dump it out before we compress it */
2052 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2054 next_alloc, depth+1));
2058 * Inplace compress the table.*
2060 For sparse data sets the table constructed by the trie algorithm will
2061 be mostly 0/FAIL transitions or to put it another way mostly empty.
2062 (Note that leaf nodes will not contain any transitions.)
2064 This algorithm compresses the tables by eliminating most such
2065 transitions, at the cost of a modest bit of extra work during lookup:
2067 - Each states[] entry contains a .base field which indicates the
2068 index in the state[] array wheres its transition data is stored.
2070 - If .base is 0 there are no valid transitions from that node.
2072 - If .base is nonzero then charid is added to it to find an entry in
2075 -If trans[states[state].base+charid].check!=state then the
2076 transition is taken to be a 0/Fail transition. Thus if there are fail
2077 transitions at the front of the node then the .base offset will point
2078 somewhere inside the previous nodes data (or maybe even into a node
2079 even earlier), but the .check field determines if the transition is
2083 The following process inplace converts the table to the compressed
2084 table: We first do not compress the root node 1,and mark all its
2085 .check pointers as 1 and set its .base pointer as 1 as well. This
2086 allows us to do a DFA construction from the compressed table later,
2087 and ensures that any .base pointers we calculate later are greater
2090 - We set 'pos' to indicate the first entry of the second node.
2092 - We then iterate over the columns of the node, finding the first and
2093 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2094 and set the .check pointers accordingly, and advance pos
2095 appropriately and repreat for the next node. Note that when we copy
2096 the next pointers we have to convert them from the original
2097 NODEIDX form to NODENUM form as the former is not valid post
2100 - If a node has no transitions used we mark its base as 0 and do not
2101 advance the pos pointer.
2103 - If a node only has one transition we use a second pointer into the
2104 structure to fill in allocated fail transitions from other states.
2105 This pointer is independent of the main pointer and scans forward
2106 looking for null transitions that are allocated to a state. When it
2107 finds one it writes the single transition into the "hole". If the
2108 pointer doesnt find one the single transition is appended as normal.
2110 - Once compressed we can Renew/realloc the structures to release the
2113 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2114 specifically Fig 3.47 and the associated pseudocode.
2118 const U32 laststate = TRIE_NODENUM( next_alloc );
2121 trie->statecount = laststate;
2123 for ( state = 1 ; state < laststate ; state++ ) {
2125 const U32 stateidx = TRIE_NODEIDX( state );
2126 const U32 o_used = trie->trans[ stateidx ].check;
2127 U32 used = trie->trans[ stateidx ].check;
2128 trie->trans[ stateidx ].check = 0;
2130 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2131 if ( flag || trie->trans[ stateidx + charid ].next ) {
2132 if ( trie->trans[ stateidx + charid ].next ) {
2134 for ( ; zp < pos ; zp++ ) {
2135 if ( ! trie->trans[ zp ].next ) {
2139 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2140 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2141 trie->trans[ zp ].check = state;
2142 if ( ++zp > pos ) pos = zp;
2149 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2151 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2152 trie->trans[ pos ].check = state;
2157 trie->lasttrans = pos + 1;
2158 trie->states = (reg_trie_state *)
2159 PerlMemShared_realloc( trie->states, laststate
2160 * sizeof(reg_trie_state) );
2161 DEBUG_TRIE_COMPILE_MORE_r(
2162 PerlIO_printf( Perl_debug_log,
2163 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2164 (int)depth * 2 + 2,"",
2165 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2168 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2171 } /* end table compress */
2173 DEBUG_TRIE_COMPILE_MORE_r(
2174 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2175 (int)depth * 2 + 2, "",
2176 (UV)trie->statecount,
2177 (UV)trie->lasttrans)
2179 /* resize the trans array to remove unused space */
2180 trie->trans = (reg_trie_trans *)
2181 PerlMemShared_realloc( trie->trans, trie->lasttrans
2182 * sizeof(reg_trie_trans) );
2184 { /* Modify the program and insert the new TRIE node */
2185 U8 nodetype =(U8)(flags & 0xFF);
2189 regnode *optimize = NULL;
2190 #ifdef RE_TRACK_PATTERN_OFFSETS
2193 U32 mjd_nodelen = 0;
2194 #endif /* RE_TRACK_PATTERN_OFFSETS */
2195 #endif /* DEBUGGING */
2197 This means we convert either the first branch or the first Exact,
2198 depending on whether the thing following (in 'last') is a branch
2199 or not and whther first is the startbranch (ie is it a sub part of
2200 the alternation or is it the whole thing.)
2201 Assuming its a sub part we convert the EXACT otherwise we convert
2202 the whole branch sequence, including the first.
2204 /* Find the node we are going to overwrite */
2205 if ( first != startbranch || OP( last ) == BRANCH ) {
2206 /* branch sub-chain */
2207 NEXT_OFF( first ) = (U16)(last - first);
2208 #ifdef RE_TRACK_PATTERN_OFFSETS
2210 mjd_offset= Node_Offset((convert));
2211 mjd_nodelen= Node_Length((convert));
2214 /* whole branch chain */
2216 #ifdef RE_TRACK_PATTERN_OFFSETS
2219 const regnode *nop = NEXTOPER( convert );
2220 mjd_offset= Node_Offset((nop));
2221 mjd_nodelen= Node_Length((nop));
2225 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2226 (int)depth * 2 + 2, "",
2227 (UV)mjd_offset, (UV)mjd_nodelen)
2230 /* But first we check to see if there is a common prefix we can
2231 split out as an EXACT and put in front of the TRIE node. */
2232 trie->startstate= 1;
2233 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2235 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2239 const U32 base = trie->states[ state ].trans.base;
2241 if ( trie->states[state].wordnum )
2244 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2245 if ( ( base + ofs >= trie->uniquecharcount ) &&
2246 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2247 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2249 if ( ++count > 1 ) {
2250 SV **tmp = av_fetch( revcharmap, ofs, 0);
2251 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2252 if ( state == 1 ) break;
2254 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2256 PerlIO_printf(Perl_debug_log,
2257 "%*sNew Start State=%"UVuf" Class: [",
2258 (int)depth * 2 + 2, "",
2261 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2262 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2264 TRIE_BITMAP_SET(trie,*ch);
2266 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2268 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2272 TRIE_BITMAP_SET(trie,*ch);
2274 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2275 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2281 SV **tmp = av_fetch( revcharmap, idx, 0);
2283 char *ch = SvPV( *tmp, len );
2285 SV *sv=sv_newmortal();
2286 PerlIO_printf( Perl_debug_log,
2287 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2288 (int)depth * 2 + 2, "",
2290 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2291 PL_colors[0], PL_colors[1],
2292 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2293 PERL_PV_ESCAPE_FIRSTCHAR
2298 OP( convert ) = nodetype;
2299 str=STRING(convert);
2302 STR_LEN(convert) += len;
2308 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2313 trie->prefixlen = (state-1);
2315 regnode *n = convert+NODE_SZ_STR(convert);
2316 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2317 trie->startstate = state;
2318 trie->minlen -= (state - 1);
2319 trie->maxlen -= (state - 1);
2321 /* At least the UNICOS C compiler choked on this
2322 * being argument to DEBUG_r(), so let's just have
2325 #ifdef PERL_EXT_RE_BUILD
2331 regnode *fix = convert;
2332 U32 word = trie->wordcount;
2334 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2335 while( ++fix < n ) {
2336 Set_Node_Offset_Length(fix, 0, 0);
2339 SV ** const tmp = av_fetch( trie_words, word, 0 );
2341 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2342 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2344 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2352 NEXT_OFF(convert) = (U16)(tail - convert);
2353 DEBUG_r(optimize= n);
2359 if ( trie->maxlen ) {
2360 NEXT_OFF( convert ) = (U16)(tail - convert);
2361 ARG_SET( convert, data_slot );
2362 /* Store the offset to the first unabsorbed branch in
2363 jump[0], which is otherwise unused by the jump logic.
2364 We use this when dumping a trie and during optimisation. */
2366 trie->jump[0] = (U16)(nextbranch - convert);
2368 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2369 * and there is a bitmap
2370 * and the first "jump target" node we found leaves enough room
2371 * then convert the TRIE node into a TRIEC node, with the bitmap
2372 * embedded inline in the opcode - this is hypothetically faster.
2374 if ( !trie->states[trie->startstate].wordnum
2376 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2378 OP( convert ) = TRIEC;
2379 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2380 PerlMemShared_free(trie->bitmap);
2383 OP( convert ) = TRIE;
2385 /* store the type in the flags */
2386 convert->flags = nodetype;
2390 + regarglen[ OP( convert ) ];
2392 /* XXX We really should free up the resource in trie now,
2393 as we won't use them - (which resources?) dmq */
2395 /* needed for dumping*/
2396 DEBUG_r(if (optimize) {
2397 regnode *opt = convert;
2399 while ( ++opt < optimize) {
2400 Set_Node_Offset_Length(opt,0,0);
2403 Try to clean up some of the debris left after the
2406 while( optimize < jumper ) {
2407 mjd_nodelen += Node_Length((optimize));
2408 OP( optimize ) = OPTIMIZED;
2409 Set_Node_Offset_Length(optimize,0,0);
2412 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2414 } /* end node insert */
2416 /* Finish populating the prev field of the wordinfo array. Walk back
2417 * from each accept state until we find another accept state, and if
2418 * so, point the first word's .prev field at the second word. If the
2419 * second already has a .prev field set, stop now. This will be the
2420 * case either if we've already processed that word's accept state,
2421 * or that state had multiple words, and the overspill words were
2422 * already linked up earlier.
2429 for (word=1; word <= trie->wordcount; word++) {
2431 if (trie->wordinfo[word].prev)
2433 state = trie->wordinfo[word].accept;
2435 state = prev_states[state];
2438 prev = trie->states[state].wordnum;
2442 trie->wordinfo[word].prev = prev;
2444 Safefree(prev_states);
2448 /* and now dump out the compressed format */
2449 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2451 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2453 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2454 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2456 SvREFCNT_dec_NN(revcharmap);
2460 : trie->startstate>1
2466 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2468 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2470 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2471 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2474 We find the fail state for each state in the trie, this state is the longest proper
2475 suffix of the current state's 'word' that is also a proper prefix of another word in our
2476 trie. State 1 represents the word '' and is thus the default fail state. This allows
2477 the DFA not to have to restart after its tried and failed a word at a given point, it
2478 simply continues as though it had been matching the other word in the first place.
2480 'abcdgu'=~/abcdefg|cdgu/
2481 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2482 fail, which would bring us to the state representing 'd' in the second word where we would
2483 try 'g' and succeed, proceeding to match 'cdgu'.
2485 /* add a fail transition */
2486 const U32 trie_offset = ARG(source);
2487 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2489 const U32 ucharcount = trie->uniquecharcount;
2490 const U32 numstates = trie->statecount;
2491 const U32 ubound = trie->lasttrans + ucharcount;
2495 U32 base = trie->states[ 1 ].trans.base;
2498 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2499 GET_RE_DEBUG_FLAGS_DECL;
2501 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2503 PERL_UNUSED_ARG(depth);
2507 ARG_SET( stclass, data_slot );
2508 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2509 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2510 aho->trie=trie_offset;
2511 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2512 Copy( trie->states, aho->states, numstates, reg_trie_state );
2513 Newxz( q, numstates, U32);
2514 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2517 /* initialize fail[0..1] to be 1 so that we always have
2518 a valid final fail state */
2519 fail[ 0 ] = fail[ 1 ] = 1;
2521 for ( charid = 0; charid < ucharcount ; charid++ ) {
2522 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2524 q[ q_write ] = newstate;
2525 /* set to point at the root */
2526 fail[ q[ q_write++ ] ]=1;
2529 while ( q_read < q_write) {
2530 const U32 cur = q[ q_read++ % numstates ];
2531 base = trie->states[ cur ].trans.base;
2533 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2534 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2536 U32 fail_state = cur;
2539 fail_state = fail[ fail_state ];
2540 fail_base = aho->states[ fail_state ].trans.base;
2541 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2543 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2544 fail[ ch_state ] = fail_state;
2545 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2547 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2549 q[ q_write++ % numstates] = ch_state;
2553 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2554 when we fail in state 1, this allows us to use the
2555 charclass scan to find a valid start char. This is based on the principle
2556 that theres a good chance the string being searched contains lots of stuff
2557 that cant be a start char.
2559 fail[ 0 ] = fail[ 1 ] = 0;
2560 DEBUG_TRIE_COMPILE_r({
2561 PerlIO_printf(Perl_debug_log,
2562 "%*sStclass Failtable (%"UVuf" states): 0",
2563 (int)(depth * 2), "", (UV)numstates
2565 for( q_read=1; q_read<numstates; q_read++ ) {
2566 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2568 PerlIO_printf(Perl_debug_log, "\n");
2571 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2576 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2577 * These need to be revisited when a newer toolchain becomes available.
2579 #if defined(__sparc64__) && defined(__GNUC__)
2580 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2581 # undef SPARC64_GCC_WORKAROUND
2582 # define SPARC64_GCC_WORKAROUND 1
2586 #define DEBUG_PEEP(str,scan,depth) \
2587 DEBUG_OPTIMISE_r({if (scan){ \
2588 SV * const mysv=sv_newmortal(); \
2589 regnode *Next = regnext(scan); \
2590 regprop(RExC_rx, mysv, scan); \
2591 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2592 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2593 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2597 /* The below joins as many adjacent EXACTish nodes as possible into a single
2598 * one. The regop may be changed if the node(s) contain certain sequences that
2599 * require special handling. The joining is only done if:
2600 * 1) there is room in the current conglomerated node to entirely contain the
2602 * 2) they are the exact same node type
2604 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2605 * these get optimized out
2607 * If a node is to match under /i (folded), the number of characters it matches
2608 * can be different than its character length if it contains a multi-character
2609 * fold. *min_subtract is set to the total delta of the input nodes.
2611 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2612 * and contains LATIN SMALL LETTER SHARP S
2614 * This is as good a place as any to discuss the design of handling these
2615 * multi-character fold sequences. It's been wrong in Perl for a very long
2616 * time. There are three code points in Unicode whose multi-character folds
2617 * were long ago discovered to mess things up. The previous designs for
2618 * dealing with these involved assigning a special node for them. This
2619 * approach doesn't work, as evidenced by this example:
2620 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2621 * Both these fold to "sss", but if the pattern is parsed to create a node that
2622 * would match just the \xDF, it won't be able to handle the case where a
2623 * successful match would have to cross the node's boundary. The new approach
2624 * that hopefully generally solves the problem generates an EXACTFU_SS node
2627 * It turns out that there are problems with all multi-character folds, and not
2628 * just these three. Now the code is general, for all such cases, but the
2629 * three still have some special handling. The approach taken is:
2630 * 1) This routine examines each EXACTFish node that could contain multi-
2631 * character fold sequences. It returns in *min_subtract how much to
2632 * subtract from the the actual length of the string to get a real minimum
2633 * match length; it is 0 if there are no multi-char folds. This delta is
2634 * used by the caller to adjust the min length of the match, and the delta
2635 * between min and max, so that the optimizer doesn't reject these
2636 * possibilities based on size constraints.
2637 * 2) Certain of these sequences require special handling by the trie code,
2638 * so, if found, this code changes the joined node type to special ops:
2639 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2640 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2641 * is used for an EXACTFU node that contains at least one "ss" sequence in
2642 * it. For non-UTF-8 patterns and strings, this is the only case where
2643 * there is a possible fold length change. That means that a regular
2644 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2645 * with length changes, and so can be processed faster. regexec.c takes
2646 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2647 * pre-folded by regcomp.c. This saves effort in regex matching.
2648 * However, the pre-folding isn't done for non-UTF8 patterns because the
2649 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2650 * down by forcing the pattern into UTF8 unless necessary. Also what
2651 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2652 * possibilities for the non-UTF8 patterns are quite simple, except for
2653 * the sharp s. All the ones that don't involve a UTF-8 target string are
2654 * members of a fold-pair, and arrays are set up for all of them so that
2655 * the other member of the pair can be found quickly. Code elsewhere in
2656 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2657 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2658 * described in the next item.
2659 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2660 * 'ss' or not is not knowable at compile time. It will match iff the
2661 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2662 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2663 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2664 * described in item 3). An assumption that the optimizer part of
2665 * regexec.c (probably unwittingly) makes is that a character in the
2666 * pattern corresponds to at most a single character in the target string.
2667 * (And I do mean character, and not byte here, unlike other parts of the
2668 * documentation that have never been updated to account for multibyte
2669 * Unicode.) This assumption is wrong only in this case, as all other
2670 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2671 * virtue of having this file pre-fold UTF-8 patterns. I'm
2672 * reluctant to try to change this assumption, so instead the code punts.
2673 * This routine examines EXACTF nodes for the sharp s, and returns a
2674 * boolean indicating whether or not the node is an EXACTF node that
2675 * contains a sharp s. When it is true, the caller sets a flag that later
2676 * causes the optimizer in this file to not set values for the floating
2677 * and fixed string lengths, and thus avoids the optimizer code in
2678 * regexec.c that makes the invalid assumption. Thus, there is no
2679 * optimization based on string lengths for EXACTF nodes that contain the
2680 * sharp s. This only happens for /id rules (which means the pattern
2684 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2685 if (PL_regkind[OP(scan)] == EXACT) \
2686 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2689 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2690 /* Merge several consecutive EXACTish nodes into one. */
2691 regnode *n = regnext(scan);
2693 regnode *next = scan + NODE_SZ_STR(scan);
2697 regnode *stop = scan;
2698 GET_RE_DEBUG_FLAGS_DECL;
2700 PERL_UNUSED_ARG(depth);
2703 PERL_ARGS_ASSERT_JOIN_EXACT;
2704 #ifndef EXPERIMENTAL_INPLACESCAN
2705 PERL_UNUSED_ARG(flags);
2706 PERL_UNUSED_ARG(val);
2708 DEBUG_PEEP("join",scan,depth);
2710 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2711 * EXACT ones that are mergeable to the current one. */
2713 && (PL_regkind[OP(n)] == NOTHING
2714 || (stringok && OP(n) == OP(scan)))
2716 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2719 if (OP(n) == TAIL || n > next)
2721 if (PL_regkind[OP(n)] == NOTHING) {
2722 DEBUG_PEEP("skip:",n,depth);
2723 NEXT_OFF(scan) += NEXT_OFF(n);
2724 next = n + NODE_STEP_REGNODE;
2731 else if (stringok) {
2732 const unsigned int oldl = STR_LEN(scan);
2733 regnode * const nnext = regnext(n);
2735 /* XXX I (khw) kind of doubt that this works on platforms where
2736 * U8_MAX is above 255 because of lots of other assumptions */
2737 /* Don't join if the sum can't fit into a single node */
2738 if (oldl + STR_LEN(n) > U8_MAX)
2741 DEBUG_PEEP("merg",n,depth);
2744 NEXT_OFF(scan) += NEXT_OFF(n);
2745 STR_LEN(scan) += STR_LEN(n);
2746 next = n + NODE_SZ_STR(n);
2747 /* Now we can overwrite *n : */
2748 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2756 #ifdef EXPERIMENTAL_INPLACESCAN
2757 if (flags && !NEXT_OFF(n)) {
2758 DEBUG_PEEP("atch", val, depth);
2759 if (reg_off_by_arg[OP(n)]) {
2760 ARG_SET(n, val - n);
2763 NEXT_OFF(n) = val - n;
2771 *has_exactf_sharp_s = FALSE;
2773 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2774 * can now analyze for sequences of problematic code points. (Prior to
2775 * this final joining, sequences could have been split over boundaries, and
2776 * hence missed). The sequences only happen in folding, hence for any
2777 * non-EXACT EXACTish node */
2778 if (OP(scan) != EXACT) {
2779 const U8 * const s0 = (U8*) STRING(scan);
2781 const U8 * const s_end = s0 + STR_LEN(scan);
2783 /* One pass is made over the node's string looking for all the
2784 * possibilities. to avoid some tests in the loop, there are two main
2785 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2789 /* Examine the string for a multi-character fold sequence. UTF-8
2790 * patterns have all characters pre-folded by the time this code is
2792 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2793 length sequence we are looking for is 2 */
2796 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2797 if (! len) { /* Not a multi-char fold: get next char */
2802 /* Nodes with 'ss' require special handling, except for EXACTFL
2803 * and EXACTFA for which there is no multi-char fold to this */
2804 if (len == 2 && *s == 's' && *(s+1) == 's'
2805 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2808 OP(scan) = EXACTFU_SS;
2811 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2812 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2813 COMBINING_DIAERESIS_UTF8
2814 COMBINING_ACUTE_ACCENT_UTF8,
2816 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2817 COMBINING_DIAERESIS_UTF8
2818 COMBINING_ACUTE_ACCENT_UTF8,
2823 /* These two folds require special handling by trie's, so
2824 * change the node type to indicate this. If EXACTFA and
2825 * EXACTFL were ever to be handled by trie's, this would
2826 * have to be changed. If this node has already been
2827 * changed to EXACTFU_SS in this loop, leave it as is. (I
2828 * (khw) think it doesn't matter in regexec.c for UTF
2829 * patterns, but no need to change it */
2830 if (OP(scan) == EXACTFU) {
2831 OP(scan) = EXACTFU_TRICKYFOLD;
2835 else { /* Here is a generic multi-char fold. */
2836 const U8* multi_end = s + len;
2838 /* Count how many characters in it. In the case of /l and
2839 * /aa, no folds which contain ASCII code points are
2840 * allowed, so check for those, and skip if found. (In
2841 * EXACTFL, no folds are allowed to any Latin1 code point,
2842 * not just ASCII. But there aren't any of these
2843 * currently, nor ever likely, so don't take the time to
2844 * test for them. The code that generates the
2845 * is_MULTI_foo() macros croaks should one actually get put
2846 * into Unicode .) */
2847 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2848 count = utf8_length(s, multi_end);
2852 while (s < multi_end) {
2855 goto next_iteration;
2865 /* The delta is how long the sequence is minus 1 (1 is how long
2866 * the character that folds to the sequence is) */
2867 *min_subtract += count - 1;
2871 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2873 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2874 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2875 * nodes can't have multi-char folds to this range (and there are
2876 * no existing ones in the upper latin1 range). In the EXACTF
2877 * case we look also for the sharp s, which can be in the final
2878 * position. Otherwise we can stop looking 1 byte earlier because
2879 * have to find at least two characters for a multi-fold */
2880 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2882 /* The below is perhaps overboard, but this allows us to save a
2883 * test each time through the loop at the expense of a mask. This
2884 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2885 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2886 * are 64. This uses an exclusive 'or' to find that bit and then
2887 * inverts it to form a mask, with just a single 0, in the bit
2888 * position where 'S' and 's' differ. */
2889 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2890 const U8 s_masked = 's' & S_or_s_mask;
2893 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2894 if (! len) { /* Not a multi-char fold. */
2895 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2897 *has_exactf_sharp_s = TRUE;
2904 && ((*s & S_or_s_mask) == s_masked)
2905 && ((*(s+1) & S_or_s_mask) == s_masked))
2908 /* EXACTF nodes need to know that the minimum length
2909 * changed so that a sharp s in the string can match this
2910 * ss in the pattern, but they remain EXACTF nodes, as they
2911 * won't match this unless the target string is is UTF-8,
2912 * which we don't know until runtime */
2913 if (OP(scan) != EXACTF) {
2914 OP(scan) = EXACTFU_SS;
2918 *min_subtract += len - 1;
2925 /* Allow dumping but overwriting the collection of skipped
2926 * ops and/or strings with fake optimized ops */
2927 n = scan + NODE_SZ_STR(scan);
2935 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2939 /* REx optimizer. Converts nodes into quicker variants "in place".
2940 Finds fixed substrings. */
2942 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2943 to the position after last scanned or to NULL. */
2945 #define INIT_AND_WITHP \
2946 assert(!and_withp); \
2947 Newx(and_withp,1,struct regnode_charclass_class); \
2948 SAVEFREEPV(and_withp)
2950 /* this is a chain of data about sub patterns we are processing that
2951 need to be handled separately/specially in study_chunk. Its so
2952 we can simulate recursion without losing state. */
2954 typedef struct scan_frame {
2955 regnode *last; /* last node to process in this frame */
2956 regnode *next; /* next node to process when last is reached */
2957 struct scan_frame *prev; /*previous frame*/
2958 I32 stop; /* what stopparen do we use */
2962 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2965 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2966 I32 *minlenp, I32 *deltap,
2971 struct regnode_charclass_class *and_withp,
2972 U32 flags, U32 depth)
2973 /* scanp: Start here (read-write). */
2974 /* deltap: Write maxlen-minlen here. */
2975 /* last: Stop before this one. */
2976 /* data: string data about the pattern */
2977 /* stopparen: treat close N as END */
2978 /* recursed: which subroutines have we recursed into */
2979 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2982 I32 min = 0; /* There must be at least this number of characters to match */
2984 regnode *scan = *scanp, *next;
2986 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2987 int is_inf_internal = 0; /* The studied chunk is infinite */
2988 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2989 scan_data_t data_fake;
2990 SV *re_trie_maxbuff = NULL;
2991 regnode *first_non_open = scan;
2992 I32 stopmin = I32_MAX;
2993 scan_frame *frame = NULL;
2994 GET_RE_DEBUG_FLAGS_DECL;
2996 PERL_ARGS_ASSERT_STUDY_CHUNK;
2999 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3003 while (first_non_open && OP(first_non_open) == OPEN)
3004 first_non_open=regnext(first_non_open);
3009 while ( scan && OP(scan) != END && scan < last ){
3010 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3011 node length to get a real minimum (because
3012 the folded version may be shorter) */
3013 bool has_exactf_sharp_s = FALSE;
3014 /* Peephole optimizer: */
3015 DEBUG_STUDYDATA("Peep:", data,depth);
3016 DEBUG_PEEP("Peep",scan,depth);
3018 /* Its not clear to khw or hv why this is done here, and not in the
3019 * clauses that deal with EXACT nodes. khw's guess is that it's
3020 * because of a previous design */
3021 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3023 /* Follow the next-chain of the current node and optimize
3024 away all the NOTHINGs from it. */
3025 if (OP(scan) != CURLYX) {
3026 const int max = (reg_off_by_arg[OP(scan)]
3028 /* I32 may be smaller than U16 on CRAYs! */
3029 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3030 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3034 /* Skip NOTHING and LONGJMP. */
3035 while ((n = regnext(n))
3036 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3037 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3038 && off + noff < max)
3040 if (reg_off_by_arg[OP(scan)])
3043 NEXT_OFF(scan) = off;
3048 /* The principal pseudo-switch. Cannot be a switch, since we
3049 look into several different things. */
3050 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3051 || OP(scan) == IFTHEN) {
3052 next = regnext(scan);
3054 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3056 if (OP(next) == code || code == IFTHEN) {
3057 /* NOTE - There is similar code to this block below for handling
3058 TRIE nodes on a re-study. If you change stuff here check there
3060 I32 max1 = 0, min1 = I32_MAX, num = 0;
3061 struct regnode_charclass_class accum;
3062 regnode * const startbranch=scan;
3064 if (flags & SCF_DO_SUBSTR)
3065 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3066 if (flags & SCF_DO_STCLASS)
3067 cl_init_zero(pRExC_state, &accum);
3069 while (OP(scan) == code) {
3070 I32 deltanext, minnext, f = 0, fake;
3071 struct regnode_charclass_class this_class;
3074 data_fake.flags = 0;
3076 data_fake.whilem_c = data->whilem_c;
3077 data_fake.last_closep = data->last_closep;
3080 data_fake.last_closep = &fake;
3082 data_fake.pos_delta = delta;
3083 next = regnext(scan);
3084 scan = NEXTOPER(scan);
3086 scan = NEXTOPER(scan);
3087 if (flags & SCF_DO_STCLASS) {
3088 cl_init(pRExC_state, &this_class);
3089 data_fake.start_class = &this_class;
3090 f = SCF_DO_STCLASS_AND;
3092 if (flags & SCF_WHILEM_VISITED_POS)
3093 f |= SCF_WHILEM_VISITED_POS;
3095 /* we suppose the run is continuous, last=next...*/
3096 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3098 stopparen, recursed, NULL, f,depth+1);
3101 if (max1 < minnext + deltanext)
3102 max1 = minnext + deltanext;
3103 if (deltanext == I32_MAX)
3104 is_inf = is_inf_internal = 1;
3106 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3108 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3109 if ( stopmin > minnext)
3110 stopmin = min + min1;
3111 flags &= ~SCF_DO_SUBSTR;
3113 data->flags |= SCF_SEEN_ACCEPT;
3116 if (data_fake.flags & SF_HAS_EVAL)
3117 data->flags |= SF_HAS_EVAL;
3118 data->whilem_c = data_fake.whilem_c;
3120 if (flags & SCF_DO_STCLASS)
3121 cl_or(pRExC_state, &accum, &this_class);
3123 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3125 if (flags & SCF_DO_SUBSTR) {
3126 data->pos_min += min1;
3127 data->pos_delta += max1 - min1;
3128 if (max1 != min1 || is_inf)
3129 data->longest = &(data->longest_float);
3132 delta += max1 - min1;
3133 if (flags & SCF_DO_STCLASS_OR) {
3134 cl_or(pRExC_state, data->start_class, &accum);
3136 cl_and(data->start_class, and_withp);
3137 flags &= ~SCF_DO_STCLASS;
3140 else if (flags & SCF_DO_STCLASS_AND) {
3142 cl_and(data->start_class, &accum);
3143 flags &= ~SCF_DO_STCLASS;
3146 /* Switch to OR mode: cache the old value of
3147 * data->start_class */
3149 StructCopy(data->start_class, and_withp,
3150 struct regnode_charclass_class);
3151 flags &= ~SCF_DO_STCLASS_AND;
3152 StructCopy(&accum, data->start_class,
3153 struct regnode_charclass_class);
3154 flags |= SCF_DO_STCLASS_OR;
3155 SET_SSC_EOS(data->start_class);
3159 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3162 Assuming this was/is a branch we are dealing with: 'scan' now
3163 points at the item that follows the branch sequence, whatever
3164 it is. We now start at the beginning of the sequence and look
3171 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3173 If we can find such a subsequence we need to turn the first
3174 element into a trie and then add the subsequent branch exact
3175 strings to the trie.
3179 1. patterns where the whole set of branches can be converted.
3181 2. patterns where only a subset can be converted.
3183 In case 1 we can replace the whole set with a single regop
3184 for the trie. In case 2 we need to keep the start and end
3187 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3188 becomes BRANCH TRIE; BRANCH X;
3190 There is an additional case, that being where there is a
3191 common prefix, which gets split out into an EXACT like node
3192 preceding the TRIE node.
3194 If x(1..n)==tail then we can do a simple trie, if not we make
3195 a "jump" trie, such that when we match the appropriate word
3196 we "jump" to the appropriate tail node. Essentially we turn
3197 a nested if into a case structure of sorts.
3202 if (!re_trie_maxbuff) {
3203 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3204 if (!SvIOK(re_trie_maxbuff))
3205 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3207 if ( SvIV(re_trie_maxbuff)>=0 ) {
3209 regnode *first = (regnode *)NULL;
3210 regnode *last = (regnode *)NULL;
3211 regnode *tail = scan;
3216 SV * const mysv = sv_newmortal(); /* for dumping */
3218 /* var tail is used because there may be a TAIL
3219 regop in the way. Ie, the exacts will point to the
3220 thing following the TAIL, but the last branch will
3221 point at the TAIL. So we advance tail. If we
3222 have nested (?:) we may have to move through several
3226 while ( OP( tail ) == TAIL ) {
3227 /* this is the TAIL generated by (?:) */
3228 tail = regnext( tail );
3232 DEBUG_TRIE_COMPILE_r({
3233 regprop(RExC_rx, mysv, tail );
3234 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3235 (int)depth * 2 + 2, "",
3236 "Looking for TRIE'able sequences. Tail node is: ",
3237 SvPV_nolen_const( mysv )
3243 Step through the branches
3244 cur represents each branch,
3245 noper is the first thing to be matched as part of that branch
3246 noper_next is the regnext() of that node.
3248 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3249 via a "jump trie" but we also support building with NOJUMPTRIE,
3250 which restricts the trie logic to structures like /FOO|BAR/.
3252 If noper is a trieable nodetype then the branch is a possible optimization
3253 target. If we are building under NOJUMPTRIE then we require that noper_next
3254 is the same as scan (our current position in the regex program).
3256 Once we have two or more consecutive such branches we can create a
3257 trie of the EXACT's contents and stitch it in place into the program.
3259 If the sequence represents all of the branches in the alternation we
3260 replace the entire thing with a single TRIE node.
3262 Otherwise when it is a subsequence we need to stitch it in place and
3263 replace only the relevant branches. This means the first branch has
3264 to remain as it is used by the alternation logic, and its next pointer,
3265 and needs to be repointed at the item on the branch chain following
3266 the last branch we have optimized away.
3268 This could be either a BRANCH, in which case the subsequence is internal,
3269 or it could be the item following the branch sequence in which case the
3270 subsequence is at the end (which does not necessarily mean the first node
3271 is the start of the alternation).
3273 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3276 ----------------+-----------
3280 EXACTFU_SS | EXACTFU
3281 EXACTFU_TRICKYFOLD | EXACTFU
3286 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3287 ( EXACT == (X) ) ? EXACT : \
3288 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3291 /* dont use tail as the end marker for this traverse */
3292 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3293 regnode * const noper = NEXTOPER( cur );
3294 U8 noper_type = OP( noper );
3295 U8 noper_trietype = TRIE_TYPE( noper_type );
3296 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3297 regnode * const noper_next = regnext( noper );
3298 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3299 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3302 DEBUG_TRIE_COMPILE_r({
3303 regprop(RExC_rx, mysv, cur);
3304 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3305 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3307 regprop(RExC_rx, mysv, noper);
3308 PerlIO_printf( Perl_debug_log, " -> %s",
3309 SvPV_nolen_const(mysv));
3312 regprop(RExC_rx, mysv, noper_next );
3313 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3314 SvPV_nolen_const(mysv));
3316 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3317 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3318 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3322 /* Is noper a trieable nodetype that can be merged with the
3323 * current trie (if there is one)? */
3327 ( noper_trietype == NOTHING)
3328 || ( trietype == NOTHING )
3329 || ( trietype == noper_trietype )
3332 && noper_next == tail
3336 /* Handle mergable triable node
3337 * Either we are the first node in a new trieable sequence,
3338 * in which case we do some bookkeeping, otherwise we update
3339 * the end pointer. */
3342 if ( noper_trietype == NOTHING ) {
3343 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3344 regnode * const noper_next = regnext( noper );
3345 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3346 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3349 if ( noper_next_trietype ) {
3350 trietype = noper_next_trietype;
3351 } else if (noper_next_type) {
3352 /* a NOTHING regop is 1 regop wide. We need at least two
3353 * for a trie so we can't merge this in */
3357 trietype = noper_trietype;
3360 if ( trietype == NOTHING )
3361 trietype = noper_trietype;
3366 } /* end handle mergable triable node */
3368 /* handle unmergable node -
3369 * noper may either be a triable node which can not be tried
3370 * together with the current trie, or a non triable node */
3372 /* If last is set and trietype is not NOTHING then we have found
3373 * at least two triable branch sequences in a row of a similar
3374 * trietype so we can turn them into a trie. If/when we
3375 * allow NOTHING to start a trie sequence this condition will be
3376 * required, and it isn't expensive so we leave it in for now. */
3377 if ( trietype && trietype != NOTHING )
3378 make_trie( pRExC_state,
3379 startbranch, first, cur, tail, count,
3380 trietype, depth+1 );
3381 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3385 && noper_next == tail
3388 /* noper is triable, so we can start a new trie sequence */
3391 trietype = noper_trietype;
3393 /* if we already saw a first but the current node is not triable then we have
3394 * to reset the first information. */
3399 } /* end handle unmergable node */
3400 } /* loop over branches */
3401 DEBUG_TRIE_COMPILE_r({
3402 regprop(RExC_rx, mysv, cur);
3403 PerlIO_printf( Perl_debug_log,
3404 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3405 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3408 if ( last && trietype ) {
3409 if ( trietype != NOTHING ) {
3410 /* the last branch of the sequence was part of a trie,
3411 * so we have to construct it here outside of the loop
3413 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3414 #ifdef TRIE_STUDY_OPT
3415 if ( ((made == MADE_EXACT_TRIE &&
3416 startbranch == first)
3417 || ( first_non_open == first )) &&
3419 flags |= SCF_TRIE_RESTUDY;
3420 if ( startbranch == first
3423 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3428 /* at this point we know whatever we have is a NOTHING sequence/branch
3429 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3431 if ( startbranch == first ) {
3433 /* the entire thing is a NOTHING sequence, something like this:
3434 * (?:|) So we can turn it into a plain NOTHING op. */
3435 DEBUG_TRIE_COMPILE_r({
3436 regprop(RExC_rx, mysv, cur);
3437 PerlIO_printf( Perl_debug_log,
3438 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3439 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3442 OP(startbranch)= NOTHING;
3443 NEXT_OFF(startbranch)= tail - startbranch;
3444 for ( opt= startbranch + 1; opt < tail ; opt++ )
3448 } /* end if ( last) */
3449 } /* TRIE_MAXBUF is non zero */
3454 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3455 scan = NEXTOPER(NEXTOPER(scan));
3456 } else /* single branch is optimized. */
3457 scan = NEXTOPER(scan);
3459 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3460 scan_frame *newframe = NULL;
3465 if (OP(scan) != SUSPEND) {
3466 /* set the pointer */
3467 if (OP(scan) == GOSUB) {
3469 RExC_recurse[ARG2L(scan)] = scan;
3470 start = RExC_open_parens[paren-1];
3471 end = RExC_close_parens[paren-1];
3474 start = RExC_rxi->program + 1;
3478 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3479 SAVEFREEPV(recursed);
3481 if (!PAREN_TEST(recursed,paren+1)) {
3482 PAREN_SET(recursed,paren+1);
3483 Newx(newframe,1,scan_frame);
3485 if (flags & SCF_DO_SUBSTR) {
3486 SCAN_COMMIT(pRExC_state,data,minlenp);
3487 data->longest = &(data->longest_float);
3489 is_inf = is_inf_internal = 1;
3490 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3491 cl_anything(pRExC_state, data->start_class);
3492 flags &= ~SCF_DO_STCLASS;
3495 Newx(newframe,1,scan_frame);
3498 end = regnext(scan);
3503 SAVEFREEPV(newframe);
3504 newframe->next = regnext(scan);
3505 newframe->last = last;
3506 newframe->stop = stopparen;
3507 newframe->prev = frame;
3517 else if (OP(scan) == EXACT) {
3518 I32 l = STR_LEN(scan);
3521 const U8 * const s = (U8*)STRING(scan);
3522 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3523 l = utf8_length(s, s + l);
3525 uc = *((U8*)STRING(scan));
3528 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3529 /* The code below prefers earlier match for fixed
3530 offset, later match for variable offset. */
3531 if (data->last_end == -1) { /* Update the start info. */
3532 data->last_start_min = data->pos_min;
3533 data->last_start_max = is_inf
3534 ? I32_MAX : data->pos_min + data->pos_delta;
3536 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3538 SvUTF8_on(data->last_found);
3540 SV * const sv = data->last_found;
3541 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3542 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3543 if (mg && mg->mg_len >= 0)
3544 mg->mg_len += utf8_length((U8*)STRING(scan),
3545 (U8*)STRING(scan)+STR_LEN(scan));
3547 data->last_end = data->pos_min + l;
3548 data->pos_min += l; /* As in the first entry. */
3549 data->flags &= ~SF_BEFORE_EOL;
3551 if (flags & SCF_DO_STCLASS_AND) {
3552 /* Check whether it is compatible with what we know already! */
3556 /* If compatible, we or it in below. It is compatible if is
3557 * in the bitmp and either 1) its bit or its fold is set, or 2)
3558 * it's for a locale. Even if there isn't unicode semantics
3559 * here, at runtime there may be because of matching against a
3560 * utf8 string, so accept a possible false positive for
3561 * latin1-range folds */
3563 (!(data->start_class->flags & ANYOF_LOCALE)
3564 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3565 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3566 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3571 ANYOF_CLASS_ZERO(data->start_class);
3572 ANYOF_BITMAP_ZERO(data->start_class);
3574 ANYOF_BITMAP_SET(data->start_class, uc);
3575 else if (uc >= 0x100) {
3578 /* Some Unicode code points fold to the Latin1 range; as
3579 * XXX temporary code, instead of figuring out if this is
3580 * one, just assume it is and set all the start class bits
3581 * that could be some such above 255 code point's fold
3582 * which will generate fals positives. As the code
3583 * elsewhere that does compute the fold settles down, it
3584 * can be extracted out and re-used here */
3585 for (i = 0; i < 256; i++){
3586 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3587 ANYOF_BITMAP_SET(data->start_class, i);
3591 CLEAR_SSC_EOS(data->start_class);
3593 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3595 else if (flags & SCF_DO_STCLASS_OR) {
3596 /* false positive possible if the class is case-folded */
3598 ANYOF_BITMAP_SET(data->start_class, uc);
3600 data->start_class->flags |= ANYOF_UNICODE_ALL;
3601 CLEAR_SSC_EOS(data->start_class);
3602 cl_and(data->start_class, and_withp);
3604 flags &= ~SCF_DO_STCLASS;
3606 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3607 I32 l = STR_LEN(scan);
3608 UV uc = *((U8*)STRING(scan));
3610 /* Search for fixed substrings supports EXACT only. */
3611 if (flags & SCF_DO_SUBSTR) {
3613 SCAN_COMMIT(pRExC_state, data, minlenp);
3616 const U8 * const s = (U8 *)STRING(scan);
3617 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3618 l = utf8_length(s, s + l);
3620 if (has_exactf_sharp_s) {
3621 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3623 min += l - min_subtract;
3625 delta += min_subtract;
3626 if (flags & SCF_DO_SUBSTR) {
3627 data->pos_min += l - min_subtract;
3628 if (data->pos_min < 0) {
3631 data->pos_delta += min_subtract;
3633 data->longest = &(data->longest_float);
3636 if (flags & SCF_DO_STCLASS_AND) {
3637 /* Check whether it is compatible with what we know already! */
3640 (!(data->start_class->flags & ANYOF_LOCALE)
3641 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3642 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3646 ANYOF_CLASS_ZERO(data->start_class);
3647 ANYOF_BITMAP_ZERO(data->start_class);
3649 ANYOF_BITMAP_SET(data->start_class, uc);
3650 CLEAR_SSC_EOS(data->start_class);
3651 if (OP(scan) == EXACTFL) {
3652 /* XXX This set is probably no longer necessary, and
3653 * probably wrong as LOCALE now is on in the initial
3655 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3659 /* Also set the other member of the fold pair. In case
3660 * that unicode semantics is called for at runtime, use
3661 * the full latin1 fold. (Can't do this for locale,
3662 * because not known until runtime) */
3663 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3665 /* All other (EXACTFL handled above) folds except under
3666 * /iaa that include s, S, and sharp_s also may include
3668 if (OP(scan) != EXACTFA) {
3669 if (uc == 's' || uc == 'S') {
3670 ANYOF_BITMAP_SET(data->start_class,
3671 LATIN_SMALL_LETTER_SHARP_S);
3673 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3674 ANYOF_BITMAP_SET(data->start_class, 's');
3675 ANYOF_BITMAP_SET(data->start_class, 'S');
3680 else if (uc >= 0x100) {
3682 for (i = 0; i < 256; i++){
3683 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3684 ANYOF_BITMAP_SET(data->start_class, i);
3689 else if (flags & SCF_DO_STCLASS_OR) {
3690 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3691 /* false positive possible if the class is case-folded.
3692 Assume that the locale settings are the same... */
3694 ANYOF_BITMAP_SET(data->start_class, uc);
3695 if (OP(scan) != EXACTFL) {
3697 /* And set the other member of the fold pair, but
3698 * can't do that in locale because not known until
3700 ANYOF_BITMAP_SET(data->start_class,
3701 PL_fold_latin1[uc]);
3703 /* All folds except under /iaa that include s, S,
3704 * and sharp_s also may include the others */
3705 if (OP(scan) != EXACTFA) {
3706 if (uc == 's' || uc == 'S') {
3707 ANYOF_BITMAP_SET(data->start_class,
3708 LATIN_SMALL_LETTER_SHARP_S);
3710 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3711 ANYOF_BITMAP_SET(data->start_class, 's');
3712 ANYOF_BITMAP_SET(data->start_class, 'S');
3717 CLEAR_SSC_EOS(data->start_class);
3719 cl_and(data->start_class, and_withp);
3721 flags &= ~SCF_DO_STCLASS;
3723 else if (REGNODE_VARIES(OP(scan))) {
3724 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3725 I32 f = flags, pos_before = 0;
3726 regnode * const oscan = scan;
3727 struct regnode_charclass_class this_class;
3728 struct regnode_charclass_class *oclass = NULL;
3729 I32 next_is_eval = 0;
3731 switch (PL_regkind[OP(scan)]) {
3732 case WHILEM: /* End of (?:...)* . */
3733 scan = NEXTOPER(scan);
3736 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3737 next = NEXTOPER(scan);
3738 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3740 maxcount = REG_INFTY;
3741 next = regnext(scan);
3742 scan = NEXTOPER(scan);
3746 if (flags & SCF_DO_SUBSTR)
3751 if (flags & SCF_DO_STCLASS) {
3753 maxcount = REG_INFTY;
3754 next = regnext(scan);
3755 scan = NEXTOPER(scan);
3758 is_inf = is_inf_internal = 1;
3759 scan = regnext(scan);
3760 if (flags & SCF_DO_SUBSTR) {
3761 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3762 data->longest = &(data->longest_float);
3764 goto optimize_curly_tail;
3766 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3767 && (scan->flags == stopparen))
3772 mincount = ARG1(scan);
3773 maxcount = ARG2(scan);
3775 next = regnext(scan);
3776 if (OP(scan) == CURLYX) {
3777 I32 lp = (data ? *(data->last_closep) : 0);
3778 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3780 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3781 next_is_eval = (OP(scan) == EVAL);
3783 if (flags & SCF_DO_SUBSTR) {
3784 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3785 pos_before = data->pos_min;
3789 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3791 data->flags |= SF_IS_INF;
3793 if (flags & SCF_DO_STCLASS) {
3794 cl_init(pRExC_state, &this_class);
3795 oclass = data->start_class;
3796 data->start_class = &this_class;
3797 f |= SCF_DO_STCLASS_AND;
3798 f &= ~SCF_DO_STCLASS_OR;
3800 /* Exclude from super-linear cache processing any {n,m}
3801 regops for which the combination of input pos and regex
3802 pos is not enough information to determine if a match
3805 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3806 regex pos at the \s*, the prospects for a match depend not
3807 only on the input position but also on how many (bar\s*)
3808 repeats into the {4,8} we are. */
3809 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3810 f &= ~SCF_WHILEM_VISITED_POS;
3812 /* This will finish on WHILEM, setting scan, or on NULL: */
3813 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3814 last, data, stopparen, recursed, NULL,
3816 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3818 if (flags & SCF_DO_STCLASS)
3819 data->start_class = oclass;
3820 if (mincount == 0 || minnext == 0) {
3821 if (flags & SCF_DO_STCLASS_OR) {
3822 cl_or(pRExC_state, data->start_class, &this_class);
3824 else if (flags & SCF_DO_STCLASS_AND) {
3825 /* Switch to OR mode: cache the old value of
3826 * data->start_class */
3828 StructCopy(data->start_class, and_withp,
3829 struct regnode_charclass_class);
3830 flags &= ~SCF_DO_STCLASS_AND;
3831 StructCopy(&this_class, data->start_class,
3832 struct regnode_charclass_class);
3833 flags |= SCF_DO_STCLASS_OR;
3834 SET_SSC_EOS(data->start_class);
3836 } else { /* Non-zero len */
3837 if (flags & SCF_DO_STCLASS_OR) {
3838 cl_or(pRExC_state, data->start_class, &this_class);
3839 cl_and(data->start_class, and_withp);
3841 else if (flags & SCF_DO_STCLASS_AND)
3842 cl_and(data->start_class, &this_class);
3843 flags &= ~SCF_DO_STCLASS;
3845 if (!scan) /* It was not CURLYX, but CURLY. */
3847 if ( /* ? quantifier ok, except for (?{ ... }) */
3848 (next_is_eval || !(mincount == 0 && maxcount == 1))
3849 && (minnext == 0) && (deltanext == 0)
3850 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3851 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3853 /* Fatal warnings may leak the regexp without this: */
3854 SAVEFREESV(RExC_rx_sv);
3855 ckWARNreg(RExC_parse,
3856 "Quantifier unexpected on zero-length expression");
3857 (void)ReREFCNT_inc(RExC_rx_sv);
3860 min += minnext * mincount;
3861 is_inf_internal |= ((maxcount == REG_INFTY
3862 && (minnext + deltanext) > 0)
3863 || deltanext == I32_MAX);
3864 is_inf |= is_inf_internal;
3865 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3867 /* Try powerful optimization CURLYX => CURLYN. */
3868 if ( OP(oscan) == CURLYX && data
3869 && data->flags & SF_IN_PAR
3870 && !(data->flags & SF_HAS_EVAL)
3871 && !deltanext && minnext == 1 ) {
3872 /* Try to optimize to CURLYN. */
3873 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3874 regnode * const nxt1 = nxt;
3881 if (!REGNODE_SIMPLE(OP(nxt))
3882 && !(PL_regkind[OP(nxt)] == EXACT
3883 && STR_LEN(nxt) == 1))
3889 if (OP(nxt) != CLOSE)
3891 if (RExC_open_parens) {
3892 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3893 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3895 /* Now we know that nxt2 is the only contents: */
3896 oscan->flags = (U8)ARG(nxt);
3898 OP(nxt1) = NOTHING; /* was OPEN. */
3901 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3902 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3903 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3904 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3905 OP(nxt + 1) = OPTIMIZED; /* was count. */
3906 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3911 /* Try optimization CURLYX => CURLYM. */
3912 if ( OP(oscan) == CURLYX && data
3913 && !(data->flags & SF_HAS_PAR)
3914 && !(data->flags & SF_HAS_EVAL)
3915 && !deltanext /* atom is fixed width */
3916 && minnext != 0 /* CURLYM can't handle zero width */
3917 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3919 /* XXXX How to optimize if data == 0? */
3920 /* Optimize to a simpler form. */
3921 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3925 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3926 && (OP(nxt2) != WHILEM))
3928 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3929 /* Need to optimize away parenths. */
3930 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3931 /* Set the parenth number. */
3932 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3934 oscan->flags = (U8)ARG(nxt);
3935 if (RExC_open_parens) {
3936 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3937 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3939 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3940 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3943 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3944 OP(nxt + 1) = OPTIMIZED; /* was count. */
3945 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3946 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3949 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3950 regnode *nnxt = regnext(nxt1);
3952 if (reg_off_by_arg[OP(nxt1)])
3953 ARG_SET(nxt1, nxt2 - nxt1);
3954 else if (nxt2 - nxt1 < U16_MAX)
3955 NEXT_OFF(nxt1) = nxt2 - nxt1;
3957 OP(nxt) = NOTHING; /* Cannot beautify */
3962 /* Optimize again: */
3963 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3964 NULL, stopparen, recursed, NULL, 0,depth+1);
3969 else if ((OP(oscan) == CURLYX)
3970 && (flags & SCF_WHILEM_VISITED_POS)
3971 /* See the comment on a similar expression above.
3972 However, this time it's not a subexpression
3973 we care about, but the expression itself. */
3974 && (maxcount == REG_INFTY)
3975 && data && ++data->whilem_c < 16) {
3976 /* This stays as CURLYX, we can put the count/of pair. */
3977 /* Find WHILEM (as in regexec.c) */
3978 regnode *nxt = oscan + NEXT_OFF(oscan);
3980 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3982 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3983 | (RExC_whilem_seen << 4)); /* On WHILEM */
3985 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3987 if (flags & SCF_DO_SUBSTR) {
3988 SV *last_str = NULL;
3989 int counted = mincount != 0;
3991 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3992 #if defined(SPARC64_GCC_WORKAROUND)
3995 const char *s = NULL;
3998 if (pos_before >= data->last_start_min)
4001 b = data->last_start_min;
4004 s = SvPV_const(data->last_found, l);
4005 old = b - data->last_start_min;
4008 I32 b = pos_before >= data->last_start_min
4009 ? pos_before : data->last_start_min;
4011 const char * const s = SvPV_const(data->last_found, l);
4012 I32 old = b - data->last_start_min;
4016 old = utf8_hop((U8*)s, old) - (U8*)s;
4018 /* Get the added string: */
4019 last_str = newSVpvn_utf8(s + old, l, UTF);
4020 if (deltanext == 0 && pos_before == b) {
4021 /* What was added is a constant string */
4023 SvGROW(last_str, (mincount * l) + 1);
4024 repeatcpy(SvPVX(last_str) + l,
4025 SvPVX_const(last_str), l, mincount - 1);
4026 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4027 /* Add additional parts. */
4028 SvCUR_set(data->last_found,
4029 SvCUR(data->last_found) - l);
4030 sv_catsv(data->last_found, last_str);
4032 SV * sv = data->last_found;
4034 SvUTF8(sv) && SvMAGICAL(sv) ?
4035 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4036 if (mg && mg->mg_len >= 0)
4037 mg->mg_len += CHR_SVLEN(last_str) - l;
4039 data->last_end += l * (mincount - 1);
4042 /* start offset must point into the last copy */
4043 data->last_start_min += minnext * (mincount - 1);
4044 data->last_start_max += is_inf ? I32_MAX
4045 : (maxcount - 1) * (minnext + data->pos_delta);
4048 /* It is counted once already... */
4049 data->pos_min += minnext * (mincount - counted);
4050 data->pos_delta += - counted * deltanext +
4051 (minnext + deltanext) * maxcount - minnext * mincount;
4052 if (mincount != maxcount) {
4053 /* Cannot extend fixed substrings found inside
4055 SCAN_COMMIT(pRExC_state,data,minlenp);
4056 if (mincount && last_str) {
4057 SV * const sv = data->last_found;
4058 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4059 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4063 sv_setsv(sv, last_str);
4064 data->last_end = data->pos_min;
4065 data->last_start_min =
4066 data->pos_min - CHR_SVLEN(last_str);
4067 data->last_start_max = is_inf
4069 : data->pos_min + data->pos_delta
4070 - CHR_SVLEN(last_str);
4072 data->longest = &(data->longest_float);
4074 SvREFCNT_dec(last_str);
4076 if (data && (fl & SF_HAS_EVAL))
4077 data->flags |= SF_HAS_EVAL;
4078 optimize_curly_tail:
4079 if (OP(oscan) != CURLYX) {
4080 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4082 NEXT_OFF(oscan) += NEXT_OFF(next);
4085 default: /* REF, ANYOFV, and CLUMP only? */
4086 if (flags & SCF_DO_SUBSTR) {
4087 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4088 data->longest = &(data->longest_float);
4090 is_inf = is_inf_internal = 1;
4091 if (flags & SCF_DO_STCLASS_OR)
4092 cl_anything(pRExC_state, data->start_class);
4093 flags &= ~SCF_DO_STCLASS;
4097 else if (OP(scan) == LNBREAK) {
4098 if (flags & SCF_DO_STCLASS) {
4100 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4101 if (flags & SCF_DO_STCLASS_AND) {
4102 for (value = 0; value < 256; value++)
4103 if (!is_VERTWS_cp(value))
4104 ANYOF_BITMAP_CLEAR(data->start_class, value);
4107 for (value = 0; value < 256; value++)
4108 if (is_VERTWS_cp(value))
4109 ANYOF_BITMAP_SET(data->start_class, value);
4111 if (flags & SCF_DO_STCLASS_OR)
4112 cl_and(data->start_class, and_withp);
4113 flags &= ~SCF_DO_STCLASS;
4116 delta++; /* Because of the 2 char string cr-lf */
4117 if (flags & SCF_DO_SUBSTR) {
4118 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4120 data->pos_delta += 1;
4121 data->longest = &(data->longest_float);
4124 else if (REGNODE_SIMPLE(OP(scan))) {
4127 if (flags & SCF_DO_SUBSTR) {
4128 SCAN_COMMIT(pRExC_state,data,minlenp);
4132 if (flags & SCF_DO_STCLASS) {
4134 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4136 /* Some of the logic below assumes that switching
4137 locale on will only add false positives. */
4138 switch (PL_regkind[OP(scan)]) {
4144 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4147 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4148 cl_anything(pRExC_state, data->start_class);
4151 if (OP(scan) == SANY)
4153 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4154 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4155 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4156 cl_anything(pRExC_state, data->start_class);
4158 if (flags & SCF_DO_STCLASS_AND || !value)
4159 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4162 if (flags & SCF_DO_STCLASS_AND)
4163 cl_and(data->start_class,
4164 (struct regnode_charclass_class*)scan);
4166 cl_or(pRExC_state, data->start_class,
4167 (struct regnode_charclass_class*)scan);
4175 classnum = FLAGS(scan);
4176 if (flags & SCF_DO_STCLASS_AND) {
4177 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4178 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4179 for (value = 0; value < loop_max; value++) {
4180 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4181 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4187 if (data->start_class->flags & ANYOF_LOCALE) {
4188 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4192 /* Even if under locale, set the bits for non-locale
4193 * in case it isn't a true locale-node. This will
4194 * create false positives if it truly is locale */
4195 for (value = 0; value < loop_max; value++) {
4196 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4197 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4209 classnum = FLAGS(scan);
4210 if (flags & SCF_DO_STCLASS_AND) {
4211 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4212 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4213 for (value = 0; value < loop_max; value++) {
4214 if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4215 ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
4221 if (data->start_class->flags & ANYOF_LOCALE) {
4222 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4226 /* Even if under locale, set the bits for non-locale in
4227 * case it isn't a true locale-node. This will create
4228 * false positives if it truly is locale */
4229 for (value = 0; value < loop_max; value++) {
4230 if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
4231 ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
4234 if (PL_regkind[OP(scan)] == NPOSIXD) {
4235 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4241 if (flags & SCF_DO_STCLASS_OR)
4242 cl_and(data->start_class, and_withp);
4243 flags &= ~SCF_DO_STCLASS;
4246 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4247 data->flags |= (OP(scan) == MEOL
4250 SCAN_COMMIT(pRExC_state, data, minlenp);
4253 else if ( PL_regkind[OP(scan)] == BRANCHJ
4254 /* Lookbehind, or need to calculate parens/evals/stclass: */
4255 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4256 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4257 if ( OP(scan) == UNLESSM &&
4259 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4260 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4263 regnode *upto= regnext(scan);
4265 SV * const mysv_val=sv_newmortal();
4266 DEBUG_STUDYDATA("OPFAIL",data,depth);
4268 /*DEBUG_PARSE_MSG("opfail");*/
4269 regprop(RExC_rx, mysv_val, upto);
4270 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4271 SvPV_nolen_const(mysv_val),
4272 (IV)REG_NODE_NUM(upto),
4277 NEXT_OFF(scan) = upto - scan;
4278 for (opt= scan + 1; opt < upto ; opt++)
4279 OP(opt) = OPTIMIZED;
4283 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4284 || OP(scan) == UNLESSM )
4286 /* Negative Lookahead/lookbehind
4287 In this case we can't do fixed string optimisation.
4290 I32 deltanext, minnext, fake = 0;
4292 struct regnode_charclass_class intrnl;
4295 data_fake.flags = 0;
4297 data_fake.whilem_c = data->whilem_c;
4298 data_fake.last_closep = data->last_closep;
4301 data_fake.last_closep = &fake;
4302 data_fake.pos_delta = delta;
4303 if ( flags & SCF_DO_STCLASS && !scan->flags
4304 && OP(scan) == IFMATCH ) { /* Lookahead */
4305 cl_init(pRExC_state, &intrnl);
4306 data_fake.start_class = &intrnl;
4307 f |= SCF_DO_STCLASS_AND;
4309 if (flags & SCF_WHILEM_VISITED_POS)
4310 f |= SCF_WHILEM_VISITED_POS;
4311 next = regnext(scan);
4312 nscan = NEXTOPER(NEXTOPER(scan));
4313 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4314 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4317 FAIL("Variable length lookbehind not implemented");
4319 else if (minnext > (I32)U8_MAX) {
4320 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4322 scan->flags = (U8)minnext;
4325 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4327 if (data_fake.flags & SF_HAS_EVAL)
4328 data->flags |= SF_HAS_EVAL;
4329 data->whilem_c = data_fake.whilem_c;
4331 if (f & SCF_DO_STCLASS_AND) {
4332 if (flags & SCF_DO_STCLASS_OR) {
4333 /* OR before, AND after: ideally we would recurse with
4334 * data_fake to get the AND applied by study of the
4335 * remainder of the pattern, and then derecurse;
4336 * *** HACK *** for now just treat as "no information".
4337 * See [perl #56690].
4339 cl_init(pRExC_state, data->start_class);
4341 /* AND before and after: combine and continue */
4342 const int was = TEST_SSC_EOS(data->start_class);
4344 cl_and(data->start_class, &intrnl);
4346 SET_SSC_EOS(data->start_class);
4350 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4352 /* Positive Lookahead/lookbehind
4353 In this case we can do fixed string optimisation,
4354 but we must be careful about it. Note in the case of
4355 lookbehind the positions will be offset by the minimum
4356 length of the pattern, something we won't know about
4357 until after the recurse.
4359 I32 deltanext, fake = 0;
4361 struct regnode_charclass_class intrnl;
4363 /* We use SAVEFREEPV so that when the full compile
4364 is finished perl will clean up the allocated
4365 minlens when it's all done. This way we don't
4366 have to worry about freeing them when we know
4367 they wont be used, which would be a pain.
4370 Newx( minnextp, 1, I32 );
4371 SAVEFREEPV(minnextp);
4374 StructCopy(data, &data_fake, scan_data_t);
4375 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4378 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4379 data_fake.last_found=newSVsv(data->last_found);
4383 data_fake.last_closep = &fake;
4384 data_fake.flags = 0;
4385 data_fake.pos_delta = delta;
4387 data_fake.flags |= SF_IS_INF;
4388 if ( flags & SCF_DO_STCLASS && !scan->flags
4389 && OP(scan) == IFMATCH ) { /* Lookahead */
4390 cl_init(pRExC_state, &intrnl);
4391 data_fake.start_class = &intrnl;
4392 f |= SCF_DO_STCLASS_AND;
4394 if (flags & SCF_WHILEM_VISITED_POS)
4395 f |= SCF_WHILEM_VISITED_POS;
4396 next = regnext(scan);
4397 nscan = NEXTOPER(NEXTOPER(scan));
4399 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4400 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4403 FAIL("Variable length lookbehind not implemented");
4405 else if (*minnextp > (I32)U8_MAX) {
4406 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4408 scan->flags = (U8)*minnextp;
4413 if (f & SCF_DO_STCLASS_AND) {
4414 const int was = TEST_SSC_EOS(data.start_class);
4416 cl_and(data->start_class, &intrnl);
4418 SET_SSC_EOS(data->start_class);
4421 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4423 if (data_fake.flags & SF_HAS_EVAL)
4424 data->flags |= SF_HAS_EVAL;
4425 data->whilem_c = data_fake.whilem_c;
4426 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4427 if (RExC_rx->minlen<*minnextp)
4428 RExC_rx->minlen=*minnextp;
4429 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4430 SvREFCNT_dec_NN(data_fake.last_found);
4432 if ( data_fake.minlen_fixed != minlenp )
4434 data->offset_fixed= data_fake.offset_fixed;
4435 data->minlen_fixed= data_fake.minlen_fixed;
4436 data->lookbehind_fixed+= scan->flags;
4438 if ( data_fake.minlen_float != minlenp )
4440 data->minlen_float= data_fake.minlen_float;
4441 data->offset_float_min=data_fake.offset_float_min;
4442 data->offset_float_max=data_fake.offset_float_max;
4443 data->lookbehind_float+= scan->flags;
4450 else if (OP(scan) == OPEN) {
4451 if (stopparen != (I32)ARG(scan))
4454 else if (OP(scan) == CLOSE) {
4455 if (stopparen == (I32)ARG(scan)) {
4458 if ((I32)ARG(scan) == is_par) {
4459 next = regnext(scan);
4461 if ( next && (OP(next) != WHILEM) && next < last)
4462 is_par = 0; /* Disable optimization */
4465 *(data->last_closep) = ARG(scan);
4467 else if (OP(scan) == EVAL) {
4469 data->flags |= SF_HAS_EVAL;
4471 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4472 if (flags & SCF_DO_SUBSTR) {
4473 SCAN_COMMIT(pRExC_state,data,minlenp);
4474 flags &= ~SCF_DO_SUBSTR;
4476 if (data && OP(scan)==ACCEPT) {
4477 data->flags |= SCF_SEEN_ACCEPT;
4482 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4484 if (flags & SCF_DO_SUBSTR) {
4485 SCAN_COMMIT(pRExC_state,data,minlenp);
4486 data->longest = &(data->longest_float);
4488 is_inf = is_inf_internal = 1;
4489 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4490 cl_anything(pRExC_state, data->start_class);
4491 flags &= ~SCF_DO_STCLASS;
4493 else if (OP(scan) == GPOS) {
4494 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4495 !(delta || is_inf || (data && data->pos_delta)))
4497 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4498 RExC_rx->extflags |= RXf_ANCH_GPOS;
4499 if (RExC_rx->gofs < (U32)min)
4500 RExC_rx->gofs = min;
4502 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4506 #ifdef TRIE_STUDY_OPT
4507 #ifdef FULL_TRIE_STUDY
4508 else if (PL_regkind[OP(scan)] == TRIE) {
4509 /* NOTE - There is similar code to this block above for handling
4510 BRANCH nodes on the initial study. If you change stuff here
4512 regnode *trie_node= scan;
4513 regnode *tail= regnext(scan);
4514 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4515 I32 max1 = 0, min1 = I32_MAX;
4516 struct regnode_charclass_class accum;
4518 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4519 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4520 if (flags & SCF_DO_STCLASS)
4521 cl_init_zero(pRExC_state, &accum);
4527 const regnode *nextbranch= NULL;
4530 for ( word=1 ; word <= trie->wordcount ; word++)
4532 I32 deltanext=0, minnext=0, f = 0, fake;
4533 struct regnode_charclass_class this_class;
4535 data_fake.flags = 0;
4537 data_fake.whilem_c = data->whilem_c;
4538 data_fake.last_closep = data->last_closep;
4541 data_fake.last_closep = &fake;
4542 data_fake.pos_delta = delta;
4543 if (flags & SCF_DO_STCLASS) {
4544 cl_init(pRExC_state, &this_class);
4545 data_fake.start_class = &this_class;
4546 f = SCF_DO_STCLASS_AND;
4548 if (flags & SCF_WHILEM_VISITED_POS)
4549 f |= SCF_WHILEM_VISITED_POS;
4551 if (trie->jump[word]) {
4553 nextbranch = trie_node + trie->jump[0];
4554 scan= trie_node + trie->jump[word];
4555 /* We go from the jump point to the branch that follows
4556 it. Note this means we need the vestigal unused branches
4557 even though they arent otherwise used.
4559 minnext = study_chunk(pRExC_state, &scan, minlenp,
4560 &deltanext, (regnode *)nextbranch, &data_fake,
4561 stopparen, recursed, NULL, f,depth+1);
4563 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4564 nextbranch= regnext((regnode*)nextbranch);
4566 if (min1 > (I32)(minnext + trie->minlen))
4567 min1 = minnext + trie->minlen;
4568 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4569 max1 = minnext + deltanext + trie->maxlen;
4570 if (deltanext == I32_MAX)
4571 is_inf = is_inf_internal = 1;
4573 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4575 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4576 if ( stopmin > min + min1)
4577 stopmin = min + min1;
4578 flags &= ~SCF_DO_SUBSTR;
4580 data->flags |= SCF_SEEN_ACCEPT;
4583 if (data_fake.flags & SF_HAS_EVAL)
4584 data->flags |= SF_HAS_EVAL;
4585 data->whilem_c = data_fake.whilem_c;
4587 if (flags & SCF_DO_STCLASS)
4588 cl_or(pRExC_state, &accum, &this_class);
4591 if (flags & SCF_DO_SUBSTR) {
4592 data->pos_min += min1;
4593 data->pos_delta += max1 - min1;
4594 if (max1 != min1 || is_inf)
4595 data->longest = &(data->longest_float);
4598 delta += max1 - min1;
4599 if (flags & SCF_DO_STCLASS_OR) {
4600 cl_or(pRExC_state, data->start_class, &accum);
4602 cl_and(data->start_class, and_withp);
4603 flags &= ~SCF_DO_STCLASS;
4606 else if (flags & SCF_DO_STCLASS_AND) {
4608 cl_and(data->start_class, &accum);
4609 flags &= ~SCF_DO_STCLASS;
4612 /* Switch to OR mode: cache the old value of
4613 * data->start_class */
4615 StructCopy(data->start_class, and_withp,
4616 struct regnode_charclass_class);
4617 flags &= ~SCF_DO_STCLASS_AND;
4618 StructCopy(&accum, data->start_class,
4619 struct regnode_charclass_class);
4620 flags |= SCF_DO_STCLASS_OR;
4621 SET_SSC_EOS(data->start_class);
4628 else if (PL_regkind[OP(scan)] == TRIE) {
4629 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4632 min += trie->minlen;
4633 delta += (trie->maxlen - trie->minlen);
4634 flags &= ~SCF_DO_STCLASS; /* xxx */
4635 if (flags & SCF_DO_SUBSTR) {
4636 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4637 data->pos_min += trie->minlen;
4638 data->pos_delta += (trie->maxlen - trie->minlen);
4639 if (trie->maxlen != trie->minlen)
4640 data->longest = &(data->longest_float);
4642 if (trie->jump) /* no more substrings -- for now /grr*/
4643 flags &= ~SCF_DO_SUBSTR;
4645 #endif /* old or new */
4646 #endif /* TRIE_STUDY_OPT */
4648 /* Else: zero-length, ignore. */
4649 scan = regnext(scan);
4654 stopparen = frame->stop;
4655 frame = frame->prev;
4656 goto fake_study_recurse;
4661 DEBUG_STUDYDATA("pre-fin:",data,depth);
4664 *deltap = is_inf_internal ? I32_MAX : delta;
4665 if (flags & SCF_DO_SUBSTR && is_inf)
4666 data->pos_delta = I32_MAX - data->pos_min;
4667 if (is_par > (I32)U8_MAX)
4669 if (is_par && pars==1 && data) {
4670 data->flags |= SF_IN_PAR;
4671 data->flags &= ~SF_HAS_PAR;
4673 else if (pars && data) {
4674 data->flags |= SF_HAS_PAR;
4675 data->flags &= ~SF_IN_PAR;
4677 if (flags & SCF_DO_STCLASS_OR)
4678 cl_and(data->start_class, and_withp);
4679 if (flags & SCF_TRIE_RESTUDY)
4680 data->flags |= SCF_TRIE_RESTUDY;
4682 DEBUG_STUDYDATA("post-fin:",data,depth);
4684 return min < stopmin ? min : stopmin;
4688 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4690 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4692 PERL_ARGS_ASSERT_ADD_DATA;
4694 Renewc(RExC_rxi->data,
4695 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4696 char, struct reg_data);
4698 Renew(RExC_rxi->data->what, count + n, U8);
4700 Newx(RExC_rxi->data->what, n, U8);
4701 RExC_rxi->data->count = count + n;
4702 Copy(s, RExC_rxi->data->what + count, n, U8);
4706 /*XXX: todo make this not included in a non debugging perl */
4707 #ifndef PERL_IN_XSUB_RE
4709 Perl_reginitcolors(pTHX)
4712 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4714 char *t = savepv(s);
4718 t = strchr(t, '\t');
4724 PL_colors[i] = t = (char *)"";
4729 PL_colors[i++] = (char *)"";
4736 #ifdef TRIE_STUDY_OPT
4737 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4740 (data.flags & SCF_TRIE_RESTUDY) \
4748 #define CHECK_RESTUDY_GOTO_butfirst
4752 * pregcomp - compile a regular expression into internal code
4754 * Decides which engine's compiler to call based on the hint currently in
4758 #ifndef PERL_IN_XSUB_RE
4760 /* return the currently in-scope regex engine (or the default if none) */
4762 regexp_engine const *
4763 Perl_current_re_engine(pTHX)
4767 if (IN_PERL_COMPILETIME) {
4768 HV * const table = GvHV(PL_hintgv);
4772 return &PL_core_reg_engine;
4773 ptr = hv_fetchs(table, "regcomp", FALSE);
4774 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4775 return &PL_core_reg_engine;
4776 return INT2PTR(regexp_engine*,SvIV(*ptr));
4780 if (!PL_curcop->cop_hints_hash)
4781 return &PL_core_reg_engine;
4782 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4783 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4784 return &PL_core_reg_engine;
4785 return INT2PTR(regexp_engine*,SvIV(ptr));
4791 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4794 regexp_engine const *eng = current_re_engine();
4795 GET_RE_DEBUG_FLAGS_DECL;
4797 PERL_ARGS_ASSERT_PREGCOMP;
4799 /* Dispatch a request to compile a regexp to correct regexp engine. */
4801 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4804 return CALLREGCOMP_ENG(eng, pattern, flags);
4808 /* public(ish) entry point for the perl core's own regex compiling code.
4809 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4810 * pattern rather than a list of OPs, and uses the internal engine rather
4811 * than the current one */
4814 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4816 SV *pat = pattern; /* defeat constness! */
4817 PERL_ARGS_ASSERT_RE_COMPILE;
4818 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4819 #ifdef PERL_IN_XSUB_RE
4822 &PL_core_reg_engine,
4824 NULL, NULL, rx_flags, 0);
4827 /* see if there are any run-time code blocks in the pattern.
4828 * False positives are allowed */
4831 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4832 U32 pm_flags, char *pat, STRLEN plen)
4837 /* avoid infinitely recursing when we recompile the pattern parcelled up
4838 * as qr'...'. A single constant qr// string can't have have any
4839 * run-time component in it, and thus, no runtime code. (A non-qr
4840 * string, however, can, e.g. $x =~ '(?{})') */
4841 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4844 for (s = 0; s < plen; s++) {
4845 if (n < pRExC_state->num_code_blocks
4846 && s == pRExC_state->code_blocks[n].start)
4848 s = pRExC_state->code_blocks[n].end;
4852 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4854 if (pat[s] == '(' && pat[s+1] == '?' &&
4855 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4862 /* Handle run-time code blocks. We will already have compiled any direct
4863 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4864 * copy of it, but with any literal code blocks blanked out and
4865 * appropriate chars escaped; then feed it into
4867 * eval "qr'modified_pattern'"
4871 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
4875 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
4877 * After eval_sv()-ing that, grab any new code blocks from the returned qr
4878 * and merge them with any code blocks of the original regexp.
4880 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
4881 * instead, just save the qr and return FALSE; this tells our caller that
4882 * the original pattern needs upgrading to utf8.
4886 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
4887 char *pat, STRLEN plen)
4891 GET_RE_DEBUG_FLAGS_DECL;
4893 if (pRExC_state->runtime_code_qr) {
4894 /* this is the second time we've been called; this should
4895 * only happen if the main pattern got upgraded to utf8
4896 * during compilation; re-use the qr we compiled first time
4897 * round (which should be utf8 too)
4899 qr = pRExC_state->runtime_code_qr;
4900 pRExC_state->runtime_code_qr = NULL;
4901 assert(RExC_utf8 && SvUTF8(qr));
4907 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
4911 /* determine how many extra chars we need for ' and \ escaping */
4912 for (s = 0; s < plen; s++) {
4913 if (pat[s] == '\'' || pat[s] == '\\')
4917 Newx(newpat, newlen, char);
4919 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
4921 for (s = 0; s < plen; s++) {
4922 if (n < pRExC_state->num_code_blocks
4923 && s == pRExC_state->code_blocks[n].start)
4925 /* blank out literal code block */
4926 assert(pat[s] == '(');
4927 while (s <= pRExC_state->code_blocks[n].end) {
4935 if (pat[s] == '\'' || pat[s] == '\\')
4940 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
4944 PerlIO_printf(Perl_debug_log,
4945 "%sre-parsing pattern for runtime code:%s %s\n",
4946 PL_colors[4],PL_colors[5],newpat);
4949 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
4955 PUSHSTACKi(PERLSI_REQUIRE);
4956 /* this causes the toker to collapse \\ into \ when parsing
4957 * qr''; normally only q'' does this. It also alters hints
4959 PL_reg_state.re_reparsing = TRUE;
4960 eval_sv(sv, G_SCALAR);
4961 SvREFCNT_dec_NN(sv);
4966 SV * const errsv = ERRSV;
4967 if (SvTRUE_NN(errsv))
4969 Safefree(pRExC_state->code_blocks);
4970 /* use croak_sv ? */
4971 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
4974 assert(SvROK(qr_ref));
4976 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
4977 /* the leaving below frees the tmp qr_ref.
4978 * Give qr a life of its own */
4986 if (!RExC_utf8 && SvUTF8(qr)) {
4987 /* first time through; the pattern got upgraded; save the
4988 * qr for the next time through */
4989 assert(!pRExC_state->runtime_code_qr);
4990 pRExC_state->runtime_code_qr = qr;
4995 /* extract any code blocks within the returned qr// */
4998 /* merge the main (r1) and run-time (r2) code blocks into one */
5000 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5001 struct reg_code_block *new_block, *dst;
5002 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5005 if (!r2->num_code_blocks) /* we guessed wrong */
5007 SvREFCNT_dec_NN(qr);
5012 r1->num_code_blocks + r2->num_code_blocks,
5013 struct reg_code_block);
5016 while ( i1 < r1->num_code_blocks
5017 || i2 < r2->num_code_blocks)
5019 struct reg_code_block *src;
5022 if (i1 == r1->num_code_blocks) {
5023 src = &r2->code_blocks[i2++];
5026 else if (i2 == r2->num_code_blocks)
5027 src = &r1->code_blocks[i1++];
5028 else if ( r1->code_blocks[i1].start
5029 < r2->code_blocks[i2].start)
5031 src = &r1->code_blocks[i1++];
5032 assert(src->end < r2->code_blocks[i2].start);
5035 assert( r1->code_blocks[i1].start
5036 > r2->code_blocks[i2].start);
5037 src = &r2->code_blocks[i2++];
5039 assert(src->end < r1->code_blocks[i1].start);
5042 assert(pat[src->start] == '(');
5043 assert(pat[src->end] == ')');
5044 dst->start = src->start;
5045 dst->end = src->end;
5046 dst->block = src->block;
5047 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5051 r1->num_code_blocks += r2->num_code_blocks;
5052 Safefree(r1->code_blocks);
5053 r1->code_blocks = new_block;
5056 SvREFCNT_dec_NN(qr);
5062 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5064 /* This is the common code for setting up the floating and fixed length
5065 * string data extracted from Perlre_op_compile() below. Returns a boolean
5066 * as to whether succeeded or not */
5070 if (! (longest_length
5071 || (eol /* Can't have SEOL and MULTI */
5072 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5074 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5075 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5080 /* copy the information about the longest from the reg_scan_data
5081 over to the program. */
5082 if (SvUTF8(sv_longest)) {
5083 *rx_utf8 = sv_longest;
5086 *rx_substr = sv_longest;
5089 /* end_shift is how many chars that must be matched that
5090 follow this item. We calculate it ahead of time as once the
5091 lookbehind offset is added in we lose the ability to correctly
5093 ml = minlen ? *(minlen) : (I32)longest_length;
5094 *rx_end_shift = ml - offset
5095 - longest_length + (SvTAIL(sv_longest) != 0)
5098 t = (eol/* Can't have SEOL and MULTI */
5099 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5100 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5106 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5107 * regular expression into internal code.
5108 * The pattern may be passed either as:
5109 * a list of SVs (patternp plus pat_count)
5110 * a list of OPs (expr)
5111 * If both are passed, the SV list is used, but the OP list indicates
5112 * which SVs are actually pre-compiled code blocks
5114 * The SVs in the list have magic and qr overloading applied to them (and
5115 * the list may be modified in-place with replacement SVs in the latter
5118 * If the pattern hasn't changed from old_re, then old_re will be
5121 * eng is the current engine. If that engine has an op_comp method, then
5122 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5123 * do the initial concatenation of arguments and pass on to the external
5126 * If is_bare_re is not null, set it to a boolean indicating whether the
5127 * arg list reduced (after overloading) to a single bare regex which has
5128 * been returned (i.e. /$qr/).
5130 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5132 * pm_flags contains the PMf_* flags, typically based on those from the
5133 * pm_flags field of the related PMOP. Currently we're only interested in
5134 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5136 * We can't allocate space until we know how big the compiled form will be,
5137 * but we can't compile it (and thus know how big it is) until we've got a
5138 * place to put the code. So we cheat: we compile it twice, once with code
5139 * generation turned off and size counting turned on, and once "for real".
5140 * This also means that we don't allocate space until we are sure that the
5141 * thing really will compile successfully, and we never have to move the
5142 * code and thus invalidate pointers into it. (Note that it has to be in
5143 * one piece because free() must be able to free it all.) [NB: not true in perl]
5145 * Beware that the optimization-preparation code in here knows about some
5146 * of the structure of the compiled regexp. [I'll say.]
5150 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5151 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5152 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5157 regexp_internal *ri;
5166 SV * VOL code_blocksv = NULL;
5168 /* these are all flags - maybe they should be turned
5169 * into a single int with different bit masks */
5170 I32 sawlookahead = 0;
5173 bool used_setjump = FALSE;
5174 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5175 bool code_is_utf8 = 0;
5176 bool VOL recompile = 0;
5177 bool runtime_code = 0;
5181 RExC_state_t RExC_state;
5182 RExC_state_t * const pRExC_state = &RExC_state;
5183 #ifdef TRIE_STUDY_OPT
5185 RExC_state_t copyRExC_state;
5187 GET_RE_DEBUG_FLAGS_DECL;
5189 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5191 DEBUG_r(if (!PL_colorset) reginitcolors());
5193 #ifndef PERL_IN_XSUB_RE
5194 /* Initialize these here instead of as-needed, as is quick and avoids
5195 * having to test them each time otherwise */
5196 if (! PL_AboveLatin1) {
5197 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5198 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5199 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5201 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5202 = _new_invlist_C_array(L1PosixAlnum_invlist);
5203 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5204 = _new_invlist_C_array(PosixAlnum_invlist);
5206 PL_L1Posix_ptrs[_CC_ALPHA]
5207 = _new_invlist_C_array(L1PosixAlpha_invlist);
5208 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5210 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5211 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5213 /* Cased is the same as Alpha in the ASCII range */
5214 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5215 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5217 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5218 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5220 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5221 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5223 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5224 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5226 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5227 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5229 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5230 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5232 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5233 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5235 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5236 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5237 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5238 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5240 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5241 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5243 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5245 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5246 PL_L1Posix_ptrs[_CC_WORDCHAR]
5247 = _new_invlist_C_array(L1PosixWord_invlist);
5249 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5250 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5252 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5256 pRExC_state->code_blocks = NULL;
5257 pRExC_state->num_code_blocks = 0;
5260 *is_bare_re = FALSE;
5262 if (expr && (expr->op_type == OP_LIST ||
5263 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5265 /* is the source UTF8, and how many code blocks are there? */
5269 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5270 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5272 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5273 /* count of DO blocks */
5277 pRExC_state->num_code_blocks = ncode;
5278 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5283 /* handle a list of SVs */
5287 /* apply magic and RE overloading to each arg */
5288 for (svp = patternp; svp < patternp + pat_count; svp++) {
5291 if (SvROK(rx) && SvAMAGIC(rx)) {
5292 SV *sv = AMG_CALLunary(rx, regexp_amg);
5296 if (SvTYPE(sv) != SVt_REGEXP)
5297 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5303 if (pat_count > 1) {
5304 /* concat multiple args and find any code block indexes */
5309 STRLEN orig_patlen = 0;
5311 if (pRExC_state->num_code_blocks) {
5312 o = cLISTOPx(expr)->op_first;
5313 assert( o->op_type == OP_PUSHMARK
5314 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5315 || o->op_type == OP_PADRANGE);
5319 pat = newSVpvn("", 0);
5322 /* determine if the pattern is going to be utf8 (needed
5323 * in advance to align code block indices correctly).
5324 * XXX This could fail to be detected for an arg with
5325 * overloading but not concat overloading; but the main effect
5326 * in this obscure case is to need a 'use re eval' for a
5327 * literal code block */
5328 for (svp = patternp; svp < patternp + pat_count; svp++) {
5335 for (svp = patternp; svp < patternp + pat_count; svp++) {
5336 SV *sv, *msv = *svp;
5339 /* we make the assumption here that each op in the list of
5340 * op_siblings maps to one SV pushed onto the stack,
5341 * except for code blocks, with have both an OP_NULL and
5343 * This allows us to match up the list of SVs against the
5344 * list of OPs to find the next code block.
5346 * Note that PUSHMARK PADSV PADSV ..
5348 * PADRANGE NULL NULL ..
5349 * so the alignment still works. */
5351 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5352 assert(n < pRExC_state->num_code_blocks);
5353 pRExC_state->code_blocks[n].start = SvCUR(pat);
5354 pRExC_state->code_blocks[n].block = o;
5355 pRExC_state->code_blocks[n].src_regex = NULL;
5358 o = o->op_sibling; /* skip CONST */
5364 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5365 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5368 /* overloading involved: all bets are off over literal
5369 * code. Pretend we haven't seen it */
5370 pRExC_state->num_code_blocks -= n;
5376 while (SvAMAGIC(msv)
5377 && (sv = AMG_CALLunary(msv, string_amg))
5381 && SvRV(msv) == SvRV(sv))
5386 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5388 orig_patlen = SvCUR(pat);
5389 sv_catsv_nomg(pat, msv);
5392 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5395 /* extract any code blocks within any embedded qr//'s */
5396 if (rx && SvTYPE(rx) == SVt_REGEXP
5397 && RX_ENGINE((REGEXP*)rx)->op_comp)
5400 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5401 if (ri->num_code_blocks) {
5403 /* the presence of an embedded qr// with code means
5404 * we should always recompile: the text of the
5405 * qr// may not have changed, but it may be a
5406 * different closure than last time */
5408 Renew(pRExC_state->code_blocks,
5409 pRExC_state->num_code_blocks + ri->num_code_blocks,
5410 struct reg_code_block);
5411 pRExC_state->num_code_blocks += ri->num_code_blocks;
5412 for (i=0; i < ri->num_code_blocks; i++) {
5413 struct reg_code_block *src, *dst;
5414 STRLEN offset = orig_patlen
5415 + ReANY((REGEXP *)rx)->pre_prefix;
5416 assert(n < pRExC_state->num_code_blocks);
5417 src = &ri->code_blocks[i];
5418 dst = &pRExC_state->code_blocks[n];
5419 dst->start = src->start + offset;
5420 dst->end = src->end + offset;
5421 dst->block = src->block;
5422 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5436 while (SvAMAGIC(pat)
5437 && (sv = AMG_CALLunary(pat, string_amg))
5445 /* handle bare regex: foo =~ $re */
5450 if (SvTYPE(re) == SVt_REGEXP) {
5454 Safefree(pRExC_state->code_blocks);
5460 /* not a list of SVs, so must be a list of OPs */
5462 if (expr->op_type == OP_LIST) {
5467 pat = newSVpvn("", 0);
5472 /* given a list of CONSTs and DO blocks in expr, append all
5473 * the CONSTs to pat, and record the start and end of each
5474 * code block in code_blocks[] (each DO{} op is followed by an
5475 * OP_CONST containing the corresponding literal '(?{...})
5478 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5479 if (o->op_type == OP_CONST) {
5480 sv_catsv(pat, cSVOPo_sv);
5482 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5486 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5487 assert(i+1 < pRExC_state->num_code_blocks);
5488 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5489 pRExC_state->code_blocks[i].block = o;
5490 pRExC_state->code_blocks[i].src_regex = NULL;
5496 assert(expr->op_type == OP_CONST);
5497 pat = cSVOPx_sv(expr);
5501 exp = SvPV_nomg(pat, plen);
5503 if (!eng->op_comp) {
5504 if ((SvUTF8(pat) && IN_BYTES)
5505 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5507 /* make a temporary copy; either to convert to bytes,
5508 * or to avoid repeating get-magic / overloaded stringify */
5509 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5510 (IN_BYTES ? 0 : SvUTF8(pat)));
5512 Safefree(pRExC_state->code_blocks);
5513 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5516 /* ignore the utf8ness if the pattern is 0 length */
5517 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5518 RExC_uni_semantics = 0;
5519 RExC_contains_locale = 0;
5520 pRExC_state->runtime_code_qr = NULL;
5522 /****************** LONG JUMP TARGET HERE***********************/
5523 /* Longjmp back to here if have to switch in midstream to utf8 */
5524 if (! RExC_orig_utf8) {
5525 JMPENV_PUSH(jump_ret);
5526 used_setjump = TRUE;
5529 if (jump_ret == 0) { /* First time through */
5533 SV *dsv= sv_newmortal();
5534 RE_PV_QUOTED_DECL(s, RExC_utf8,
5535 dsv, exp, plen, 60);
5536 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5537 PL_colors[4],PL_colors[5],s);
5540 else { /* longjumped back */
5543 STRLEN s = 0, d = 0;
5546 /* If the cause for the longjmp was other than changing to utf8, pop
5547 * our own setjmp, and longjmp to the correct handler */
5548 if (jump_ret != UTF8_LONGJMP) {
5550 JMPENV_JUMP(jump_ret);
5555 /* It's possible to write a regexp in ascii that represents Unicode
5556 codepoints outside of the byte range, such as via \x{100}. If we
5557 detect such a sequence we have to convert the entire pattern to utf8
5558 and then recompile, as our sizing calculation will have been based
5559 on 1 byte == 1 character, but we will need to use utf8 to encode
5560 at least some part of the pattern, and therefore must convert the whole
5563 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5564 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5566 /* upgrade pattern to UTF8, and if there are code blocks,
5567 * recalculate the indices.
5568 * This is essentially an unrolled Perl_bytes_to_utf8() */
5570 src = (U8*)SvPV_nomg(pat, plen);
5571 Newx(dst, plen * 2 + 1, U8);
5574 const UV uv = NATIVE_TO_ASCII(src[s]);
5575 if (UNI_IS_INVARIANT(uv))
5576 dst[d] = (U8)UTF_TO_NATIVE(uv);
5578 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5579 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5581 if (n < pRExC_state->num_code_blocks) {
5582 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5583 pRExC_state->code_blocks[n].start = d;
5584 assert(dst[d] == '(');
5587 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5588 pRExC_state->code_blocks[n].end = d;
5589 assert(dst[d] == ')');
5602 RExC_orig_utf8 = RExC_utf8 = 1;
5605 /* return old regex if pattern hasn't changed */
5609 && !!RX_UTF8(old_re) == !!RExC_utf8
5610 && RX_PRECOMP(old_re)
5611 && RX_PRELEN(old_re) == plen
5612 && memEQ(RX_PRECOMP(old_re), exp, plen))
5614 /* with runtime code, always recompile */
5615 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5617 if (!runtime_code) {
5621 Safefree(pRExC_state->code_blocks);
5625 else if ((pm_flags & PMf_USE_RE_EVAL)
5626 /* this second condition covers the non-regex literal case,
5627 * i.e. $foo =~ '(?{})'. */
5628 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5629 && (PL_hints & HINT_RE_EVAL))
5631 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5634 #ifdef TRIE_STUDY_OPT
5638 rx_flags = orig_rx_flags;
5640 if (initial_charset == REGEX_LOCALE_CHARSET) {
5641 RExC_contains_locale = 1;
5643 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5645 /* Set to use unicode semantics if the pattern is in utf8 and has the
5646 * 'depends' charset specified, as it means unicode when utf8 */
5647 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5651 RExC_flags = rx_flags;
5652 RExC_pm_flags = pm_flags;
5655 if (TAINTING_get && TAINT_get)
5656 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5658 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5659 /* whoops, we have a non-utf8 pattern, whilst run-time code
5660 * got compiled as utf8. Try again with a utf8 pattern */
5661 JMPENV_JUMP(UTF8_LONGJMP);
5664 assert(!pRExC_state->runtime_code_qr);
5669 RExC_in_lookbehind = 0;
5670 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5672 RExC_override_recoding = 0;
5673 RExC_in_multi_char_class = 0;
5675 /* First pass: determine size, legality. */
5683 RExC_emit = &PL_regdummy;
5684 RExC_whilem_seen = 0;
5685 RExC_open_parens = NULL;
5686 RExC_close_parens = NULL;
5688 RExC_paren_names = NULL;
5690 RExC_paren_name_list = NULL;
5692 RExC_recurse = NULL;
5693 RExC_recurse_count = 0;
5694 pRExC_state->code_index = 0;
5696 #if 0 /* REGC() is (currently) a NOP at the first pass.
5697 * Clever compilers notice this and complain. --jhi */
5698 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5701 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5703 RExC_lastparse=NULL;
5705 /* reg may croak on us, not giving us a chance to free
5706 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5707 need it to survive as long as the regexp (qr/(?{})/).
5708 We must check that code_blocksv is not already set, because we may
5709 have longjmped back. */
5710 if (pRExC_state->code_blocks && !code_blocksv) {
5711 code_blocksv = newSV_type(SVt_PV);
5712 SAVEFREESV(code_blocksv);
5713 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5714 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5716 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5717 RExC_precomp = NULL;
5721 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5723 /* Here, finished first pass. Get rid of any added setjmp */
5729 PerlIO_printf(Perl_debug_log,
5730 "Required size %"IVdf" nodes\n"
5731 "Starting second pass (creation)\n",
5734 RExC_lastparse=NULL;
5737 /* The first pass could have found things that force Unicode semantics */
5738 if ((RExC_utf8 || RExC_uni_semantics)
5739 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5741 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5744 /* Small enough for pointer-storage convention?
5745 If extralen==0, this means that we will not need long jumps. */
5746 if (RExC_size >= 0x10000L && RExC_extralen)
5747 RExC_size += RExC_extralen;
5750 if (RExC_whilem_seen > 15)
5751 RExC_whilem_seen = 15;
5753 /* Allocate space and zero-initialize. Note, the two step process
5754 of zeroing when in debug mode, thus anything assigned has to
5755 happen after that */
5756 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5758 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5759 char, regexp_internal);
5760 if ( r == NULL || ri == NULL )
5761 FAIL("Regexp out of space");
5763 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5764 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5766 /* bulk initialize base fields with 0. */
5767 Zero(ri, sizeof(regexp_internal), char);
5770 /* non-zero initialization begins here */
5773 r->extflags = rx_flags;
5774 if (pm_flags & PMf_IS_QR) {
5775 ri->code_blocks = pRExC_state->code_blocks;
5776 ri->num_code_blocks = pRExC_state->num_code_blocks;
5781 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5782 if (pRExC_state->code_blocks[n].src_regex)
5783 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5784 SAVEFREEPV(pRExC_state->code_blocks);
5788 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5789 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5791 /* The caret is output if there are any defaults: if not all the STD
5792 * flags are set, or if no character set specifier is needed */
5794 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5796 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5797 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5798 >> RXf_PMf_STD_PMMOD_SHIFT);
5799 const char *fptr = STD_PAT_MODS; /*"msix"*/
5801 /* Allocate for the worst case, which is all the std flags are turned
5802 * on. If more precision is desired, we could do a population count of
5803 * the flags set. This could be done with a small lookup table, or by
5804 * shifting, masking and adding, or even, when available, assembly
5805 * language for a machine-language population count.
5806 * We never output a minus, as all those are defaults, so are
5807 * covered by the caret */
5808 const STRLEN wraplen = plen + has_p + has_runon
5809 + has_default /* If needs a caret */
5811 /* If needs a character set specifier */
5812 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5813 + (sizeof(STD_PAT_MODS) - 1)
5814 + (sizeof("(?:)") - 1);
5816 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5817 r->xpv_len_u.xpvlenu_pv = p;
5819 SvFLAGS(rx) |= SVf_UTF8;
5822 /* If a default, cover it using the caret */
5824 *p++= DEFAULT_PAT_MOD;
5828 const char* const name = get_regex_charset_name(r->extflags, &len);
5829 Copy(name, p, len, char);
5833 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5836 while((ch = *fptr++)) {
5844 Copy(RExC_precomp, p, plen, char);
5845 assert ((RX_WRAPPED(rx) - p) < 16);
5846 r->pre_prefix = p - RX_WRAPPED(rx);
5852 SvCUR_set(rx, p - RX_WRAPPED(rx));
5856 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5858 if (RExC_seen & REG_SEEN_RECURSE) {
5859 Newxz(RExC_open_parens, RExC_npar,regnode *);
5860 SAVEFREEPV(RExC_open_parens);
5861 Newxz(RExC_close_parens,RExC_npar,regnode *);
5862 SAVEFREEPV(RExC_close_parens);
5865 /* Useful during FAIL. */
5866 #ifdef RE_TRACK_PATTERN_OFFSETS
5867 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5868 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5869 "%s %"UVuf" bytes for offset annotations.\n",
5870 ri->u.offsets ? "Got" : "Couldn't get",
5871 (UV)((2*RExC_size+1) * sizeof(U32))));
5873 SetProgLen(ri,RExC_size);
5878 /* Second pass: emit code. */
5879 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5880 RExC_pm_flags = pm_flags;
5885 RExC_emit_start = ri->program;
5886 RExC_emit = ri->program;
5887 RExC_emit_bound = ri->program + RExC_size + 1;
5888 pRExC_state->code_index = 0;
5890 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5891 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5895 /* XXXX To minimize changes to RE engine we always allocate
5896 3-units-long substrs field. */
5897 Newx(r->substrs, 1, struct reg_substr_data);
5898 if (RExC_recurse_count) {
5899 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5900 SAVEFREEPV(RExC_recurse);
5904 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5905 Zero(r->substrs, 1, struct reg_substr_data);
5907 #ifdef TRIE_STUDY_OPT
5909 StructCopy(&zero_scan_data, &data, scan_data_t);
5910 copyRExC_state = RExC_state;
5913 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5915 RExC_state = copyRExC_state;
5916 if (seen & REG_TOP_LEVEL_BRANCHES)
5917 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5919 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5920 StructCopy(&zero_scan_data, &data, scan_data_t);
5923 StructCopy(&zero_scan_data, &data, scan_data_t);
5926 /* Dig out information for optimizations. */
5927 r->extflags = RExC_flags; /* was pm_op */
5928 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5931 SvUTF8_on(rx); /* Unicode in it? */
5932 ri->regstclass = NULL;
5933 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5934 r->intflags |= PREGf_NAUGHTY;
5935 scan = ri->program + 1; /* First BRANCH. */
5937 /* testing for BRANCH here tells us whether there is "must appear"
5938 data in the pattern. If there is then we can use it for optimisations */
5939 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5941 STRLEN longest_float_length, longest_fixed_length;
5942 struct regnode_charclass_class ch_class; /* pointed to by data */
5944 I32 last_close = 0; /* pointed to by data */
5945 regnode *first= scan;
5946 regnode *first_next= regnext(first);
5948 * Skip introductions and multiplicators >= 1
5949 * so that we can extract the 'meat' of the pattern that must
5950 * match in the large if() sequence following.
5951 * NOTE that EXACT is NOT covered here, as it is normally
5952 * picked up by the optimiser separately.
5954 * This is unfortunate as the optimiser isnt handling lookahead
5955 * properly currently.
5958 while ((OP(first) == OPEN && (sawopen = 1)) ||
5959 /* An OR of *one* alternative - should not happen now. */
5960 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5961 /* for now we can't handle lookbehind IFMATCH*/
5962 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5963 (OP(first) == PLUS) ||
5964 (OP(first) == MINMOD) ||
5965 /* An {n,m} with n>0 */
5966 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5967 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5970 * the only op that could be a regnode is PLUS, all the rest
5971 * will be regnode_1 or regnode_2.
5974 if (OP(first) == PLUS)
5977 first += regarglen[OP(first)];
5979 first = NEXTOPER(first);
5980 first_next= regnext(first);
5983 /* Starting-point info. */
5985 DEBUG_PEEP("first:",first,0);
5986 /* Ignore EXACT as we deal with it later. */
5987 if (PL_regkind[OP(first)] == EXACT) {
5988 if (OP(first) == EXACT)
5989 NOOP; /* Empty, get anchored substr later. */
5991 ri->regstclass = first;
5994 else if (PL_regkind[OP(first)] == TRIE &&
5995 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5998 /* this can happen only on restudy */
5999 if ( OP(first) == TRIE ) {
6000 struct regnode_1 *trieop = (struct regnode_1 *)
6001 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6002 StructCopy(first,trieop,struct regnode_1);
6003 trie_op=(regnode *)trieop;
6005 struct regnode_charclass *trieop = (struct regnode_charclass *)
6006 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6007 StructCopy(first,trieop,struct regnode_charclass);
6008 trie_op=(regnode *)trieop;
6011 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6012 ri->regstclass = trie_op;
6015 else if (REGNODE_SIMPLE(OP(first)))
6016 ri->regstclass = first;
6017 else if (PL_regkind[OP(first)] == BOUND ||
6018 PL_regkind[OP(first)] == NBOUND)
6019 ri->regstclass = first;
6020 else if (PL_regkind[OP(first)] == BOL) {
6021 r->extflags |= (OP(first) == MBOL
6023 : (OP(first) == SBOL
6026 first = NEXTOPER(first);
6029 else if (OP(first) == GPOS) {
6030 r->extflags |= RXf_ANCH_GPOS;
6031 first = NEXTOPER(first);
6034 else if ((!sawopen || !RExC_sawback) &&
6035 (OP(first) == STAR &&
6036 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6037 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6039 /* turn .* into ^.* with an implied $*=1 */
6041 (OP(NEXTOPER(first)) == REG_ANY)
6044 r->extflags |= type;
6045 r->intflags |= PREGf_IMPLICIT;
6046 first = NEXTOPER(first);
6049 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6050 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6051 /* x+ must match at the 1st pos of run of x's */
6052 r->intflags |= PREGf_SKIP;
6054 /* Scan is after the zeroth branch, first is atomic matcher. */
6055 #ifdef TRIE_STUDY_OPT
6058 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6059 (IV)(first - scan + 1))
6063 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6064 (IV)(first - scan + 1))
6070 * If there's something expensive in the r.e., find the
6071 * longest literal string that must appear and make it the
6072 * regmust. Resolve ties in favor of later strings, since
6073 * the regstart check works with the beginning of the r.e.
6074 * and avoiding duplication strengthens checking. Not a
6075 * strong reason, but sufficient in the absence of others.
6076 * [Now we resolve ties in favor of the earlier string if
6077 * it happens that c_offset_min has been invalidated, since the
6078 * earlier string may buy us something the later one won't.]
6081 data.longest_fixed = newSVpvs("");
6082 data.longest_float = newSVpvs("");
6083 data.last_found = newSVpvs("");
6084 data.longest = &(data.longest_fixed);
6085 ENTER_with_name("study_chunk");
6086 SAVEFREESV(data.longest_fixed);
6087 SAVEFREESV(data.longest_float);
6088 SAVEFREESV(data.last_found);
6090 if (!ri->regstclass) {
6091 cl_init(pRExC_state, &ch_class);
6092 data.start_class = &ch_class;
6093 stclass_flag = SCF_DO_STCLASS_AND;
6094 } else /* XXXX Check for BOUND? */
6096 data.last_closep = &last_close;
6098 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6099 &data, -1, NULL, NULL,
6100 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6103 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6106 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6107 && data.last_start_min == 0 && data.last_end > 0
6108 && !RExC_seen_zerolen
6109 && !(RExC_seen & REG_SEEN_VERBARG)
6110 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6111 r->extflags |= RXf_CHECK_ALL;
6112 scan_commit(pRExC_state, &data,&minlen,0);
6114 longest_float_length = CHR_SVLEN(data.longest_float);
6116 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6117 && data.offset_fixed == data.offset_float_min
6118 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6119 && S_setup_longest (aTHX_ pRExC_state,
6123 &(r->float_end_shift),
6124 data.lookbehind_float,
6125 data.offset_float_min,
6127 longest_float_length,
6128 data.flags & SF_FL_BEFORE_EOL,
6129 data.flags & SF_FL_BEFORE_MEOL))
6131 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6132 r->float_max_offset = data.offset_float_max;
6133 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6134 r->float_max_offset -= data.lookbehind_float;
6135 SvREFCNT_inc_simple_void_NN(data.longest_float);
6138 r->float_substr = r->float_utf8 = NULL;
6139 longest_float_length = 0;
6142 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6144 if (S_setup_longest (aTHX_ pRExC_state,
6146 &(r->anchored_utf8),
6147 &(r->anchored_substr),
6148 &(r->anchored_end_shift),
6149 data.lookbehind_fixed,
6152 longest_fixed_length,
6153 data.flags & SF_FIX_BEFORE_EOL,
6154 data.flags & SF_FIX_BEFORE_MEOL))
6156 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6157 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6160 r->anchored_substr = r->anchored_utf8 = NULL;
6161 longest_fixed_length = 0;
6163 LEAVE_with_name("study_chunk");
6166 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6167 ri->regstclass = NULL;
6169 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6171 && ! TEST_SSC_EOS(data.start_class)
6172 && !cl_is_anything(data.start_class))
6174 const U32 n = add_data(pRExC_state, 1, "f");
6175 OP(data.start_class) = ANYOF_SYNTHETIC;
6177 Newx(RExC_rxi->data->data[n], 1,
6178 struct regnode_charclass_class);
6179 StructCopy(data.start_class,
6180 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6181 struct regnode_charclass_class);
6182 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6183 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6184 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6185 regprop(r, sv, (regnode*)data.start_class);
6186 PerlIO_printf(Perl_debug_log,
6187 "synthetic stclass \"%s\".\n",
6188 SvPVX_const(sv));});
6191 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6192 if (longest_fixed_length > longest_float_length) {
6193 r->check_end_shift = r->anchored_end_shift;
6194 r->check_substr = r->anchored_substr;
6195 r->check_utf8 = r->anchored_utf8;
6196 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6197 if (r->extflags & RXf_ANCH_SINGLE)
6198 r->extflags |= RXf_NOSCAN;
6201 r->check_end_shift = r->float_end_shift;
6202 r->check_substr = r->float_substr;
6203 r->check_utf8 = r->float_utf8;
6204 r->check_offset_min = r->float_min_offset;
6205 r->check_offset_max = r->float_max_offset;
6207 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6208 This should be changed ASAP! */
6209 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6210 r->extflags |= RXf_USE_INTUIT;
6211 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6212 r->extflags |= RXf_INTUIT_TAIL;
6214 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6215 if ( (STRLEN)minlen < longest_float_length )
6216 minlen= longest_float_length;
6217 if ( (STRLEN)minlen < longest_fixed_length )
6218 minlen= longest_fixed_length;
6222 /* Several toplevels. Best we can is to set minlen. */
6224 struct regnode_charclass_class ch_class;
6227 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6229 scan = ri->program + 1;
6230 cl_init(pRExC_state, &ch_class);
6231 data.start_class = &ch_class;
6232 data.last_closep = &last_close;
6235 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6236 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6238 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6240 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6241 = r->float_substr = r->float_utf8 = NULL;
6243 if (! TEST_SSC_EOS(data.start_class)
6244 && !cl_is_anything(data.start_class))
6246 const U32 n = add_data(pRExC_state, 1, "f");
6247 OP(data.start_class) = ANYOF_SYNTHETIC;
6249 Newx(RExC_rxi->data->data[n], 1,
6250 struct regnode_charclass_class);
6251 StructCopy(data.start_class,
6252 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6253 struct regnode_charclass_class);
6254 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6255 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6256 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6257 regprop(r, sv, (regnode*)data.start_class);
6258 PerlIO_printf(Perl_debug_log,
6259 "synthetic stclass \"%s\".\n",
6260 SvPVX_const(sv));});
6264 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6265 the "real" pattern. */
6267 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6268 (IV)minlen, (IV)r->minlen);
6270 r->minlenret = minlen;
6271 if (r->minlen < minlen)
6274 if (RExC_seen & REG_SEEN_GPOS)
6275 r->extflags |= RXf_GPOS_SEEN;
6276 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6277 r->extflags |= RXf_LOOKBEHIND_SEEN;
6278 if (pRExC_state->num_code_blocks)
6279 r->extflags |= RXf_EVAL_SEEN;
6280 if (RExC_seen & REG_SEEN_CANY)
6281 r->extflags |= RXf_CANY_SEEN;
6282 if (RExC_seen & REG_SEEN_VERBARG)
6284 r->intflags |= PREGf_VERBARG_SEEN;
6285 r->extflags |= RXf_MODIFIES_VARS;
6287 if (RExC_seen & REG_SEEN_CUTGROUP)
6288 r->intflags |= PREGf_CUTGROUP_SEEN;
6289 if (pm_flags & PMf_USE_RE_EVAL)
6290 r->intflags |= PREGf_USE_RE_EVAL;
6291 if (RExC_paren_names)
6292 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6294 RXp_PAREN_NAMES(r) = NULL;
6296 #ifdef STUPID_PATTERN_CHECKS
6297 if (RX_PRELEN(rx) == 0)
6298 r->extflags |= RXf_NULL;
6299 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6300 r->extflags |= RXf_WHITE;
6301 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6302 r->extflags |= RXf_START_ONLY;
6305 regnode *first = ri->program + 1;
6308 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6309 r->extflags |= RXf_NULL;
6310 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6311 r->extflags |= RXf_START_ONLY;
6312 else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
6313 && OP(regnext(first)) == END)
6314 r->extflags |= RXf_WHITE;
6318 if (RExC_paren_names) {
6319 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6320 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6323 ri->name_list_idx = 0;
6325 if (RExC_recurse_count) {
6326 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6327 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6328 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6331 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6332 /* assume we don't need to swap parens around before we match */
6335 PerlIO_printf(Perl_debug_log,"Final program:\n");
6338 #ifdef RE_TRACK_PATTERN_OFFSETS
6339 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6340 const U32 len = ri->u.offsets[0];
6342 GET_RE_DEBUG_FLAGS_DECL;
6343 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6344 for (i = 1; i <= len; i++) {
6345 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6346 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6347 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6349 PerlIO_printf(Perl_debug_log, "\n");
6357 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6360 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6362 PERL_UNUSED_ARG(value);
6364 if (flags & RXapif_FETCH) {
6365 return reg_named_buff_fetch(rx, key, flags);
6366 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6367 Perl_croak_no_modify();
6369 } else if (flags & RXapif_EXISTS) {
6370 return reg_named_buff_exists(rx, key, flags)
6373 } else if (flags & RXapif_REGNAMES) {
6374 return reg_named_buff_all(rx, flags);
6375 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6376 return reg_named_buff_scalar(rx, flags);
6378 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6384 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6387 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6388 PERL_UNUSED_ARG(lastkey);
6390 if (flags & RXapif_FIRSTKEY)
6391 return reg_named_buff_firstkey(rx, flags);
6392 else if (flags & RXapif_NEXTKEY)
6393 return reg_named_buff_nextkey(rx, flags);
6395 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6401 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6404 AV *retarray = NULL;
6406 struct regexp *const rx = ReANY(r);
6408 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6410 if (flags & RXapif_ALL)
6413 if (rx && RXp_PAREN_NAMES(rx)) {
6414 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6417 SV* sv_dat=HeVAL(he_str);
6418 I32 *nums=(I32*)SvPVX(sv_dat);
6419 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6420 if ((I32)(rx->nparens) >= nums[i]
6421 && rx->offs[nums[i]].start != -1
6422 && rx->offs[nums[i]].end != -1)
6425 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6430 ret = newSVsv(&PL_sv_undef);
6433 av_push(retarray, ret);
6436 return newRV_noinc(MUTABLE_SV(retarray));
6443 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6446 struct regexp *const rx = ReANY(r);
6448 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6450 if (rx && RXp_PAREN_NAMES(rx)) {
6451 if (flags & RXapif_ALL) {
6452 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6454 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6456 SvREFCNT_dec_NN(sv);
6468 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6470 struct regexp *const rx = ReANY(r);
6472 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6474 if ( rx && RXp_PAREN_NAMES(rx) ) {
6475 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6477 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6484 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6486 struct regexp *const rx = ReANY(r);
6487 GET_RE_DEBUG_FLAGS_DECL;
6489 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6491 if (rx && RXp_PAREN_NAMES(rx)) {
6492 HV *hv = RXp_PAREN_NAMES(rx);
6494 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6497 SV* sv_dat = HeVAL(temphe);
6498 I32 *nums = (I32*)SvPVX(sv_dat);
6499 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6500 if ((I32)(rx->lastparen) >= nums[i] &&
6501 rx->offs[nums[i]].start != -1 &&
6502 rx->offs[nums[i]].end != -1)
6508 if (parno || flags & RXapif_ALL) {
6509 return newSVhek(HeKEY_hek(temphe));
6517 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6522 struct regexp *const rx = ReANY(r);
6524 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6526 if (rx && RXp_PAREN_NAMES(rx)) {
6527 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6528 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6529 } else if (flags & RXapif_ONE) {
6530 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6531 av = MUTABLE_AV(SvRV(ret));
6532 length = av_len(av);
6533 SvREFCNT_dec_NN(ret);
6534 return newSViv(length + 1);
6536 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6540 return &PL_sv_undef;
6544 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6546 struct regexp *const rx = ReANY(r);
6549 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6551 if (rx && RXp_PAREN_NAMES(rx)) {
6552 HV *hv= RXp_PAREN_NAMES(rx);
6554 (void)hv_iterinit(hv);
6555 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6558 SV* sv_dat = HeVAL(temphe);
6559 I32 *nums = (I32*)SvPVX(sv_dat);
6560 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6561 if ((I32)(rx->lastparen) >= nums[i] &&
6562 rx->offs[nums[i]].start != -1 &&
6563 rx->offs[nums[i]].end != -1)
6569 if (parno || flags & RXapif_ALL) {
6570 av_push(av, newSVhek(HeKEY_hek(temphe)));
6575 return newRV_noinc(MUTABLE_SV(av));
6579 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6582 struct regexp *const rx = ReANY(r);
6588 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6590 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6591 || n == RX_BUFF_IDX_CARET_FULLMATCH
6592 || n == RX_BUFF_IDX_CARET_POSTMATCH
6594 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6601 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6602 /* no need to distinguish between them any more */
6603 n = RX_BUFF_IDX_FULLMATCH;
6605 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6606 && rx->offs[0].start != -1)
6608 /* $`, ${^PREMATCH} */
6609 i = rx->offs[0].start;
6613 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6614 && rx->offs[0].end != -1)
6616 /* $', ${^POSTMATCH} */
6617 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6618 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6621 if ( 0 <= n && n <= (I32)rx->nparens &&
6622 (s1 = rx->offs[n].start) != -1 &&
6623 (t1 = rx->offs[n].end) != -1)
6625 /* $&, ${^MATCH}, $1 ... */
6627 s = rx->subbeg + s1 - rx->suboffset;
6632 assert(s >= rx->subbeg);
6633 assert(rx->sublen >= (s - rx->subbeg) + i );
6635 #if NO_TAINT_SUPPORT
6636 sv_setpvn(sv, s, i);
6638 const int oldtainted = TAINT_get;
6640 sv_setpvn(sv, s, i);
6641 TAINT_set(oldtainted);
6643 if ( (rx->extflags & RXf_CANY_SEEN)
6644 ? (RXp_MATCH_UTF8(rx)
6645 && (!i || is_utf8_string((U8*)s, i)))
6646 : (RXp_MATCH_UTF8(rx)) )
6653 if (RXp_MATCH_TAINTED(rx)) {
6654 if (SvTYPE(sv) >= SVt_PVMG) {
6655 MAGIC* const mg = SvMAGIC(sv);
6658 SvMAGIC_set(sv, mg->mg_moremagic);
6660 if ((mgt = SvMAGIC(sv))) {
6661 mg->mg_moremagic = mgt;
6662 SvMAGIC_set(sv, mg);
6673 sv_setsv(sv,&PL_sv_undef);
6679 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6680 SV const * const value)
6682 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6684 PERL_UNUSED_ARG(rx);
6685 PERL_UNUSED_ARG(paren);
6686 PERL_UNUSED_ARG(value);
6689 Perl_croak_no_modify();
6693 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6696 struct regexp *const rx = ReANY(r);
6700 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6702 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6704 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6705 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6709 case RX_BUFF_IDX_PREMATCH: /* $` */
6710 if (rx->offs[0].start != -1) {
6711 i = rx->offs[0].start;
6720 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6721 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6723 case RX_BUFF_IDX_POSTMATCH: /* $' */
6724 if (rx->offs[0].end != -1) {
6725 i = rx->sublen - rx->offs[0].end;
6727 s1 = rx->offs[0].end;
6734 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6735 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6739 /* $& / ${^MATCH}, $1, $2, ... */
6741 if (paren <= (I32)rx->nparens &&
6742 (s1 = rx->offs[paren].start) != -1 &&
6743 (t1 = rx->offs[paren].end) != -1)
6749 if (ckWARN(WARN_UNINITIALIZED))
6750 report_uninit((const SV *)sv);
6755 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6756 const char * const s = rx->subbeg - rx->suboffset + s1;
6761 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6768 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6770 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6771 PERL_UNUSED_ARG(rx);
6775 return newSVpvs("Regexp");
6778 /* Scans the name of a named buffer from the pattern.
6779 * If flags is REG_RSN_RETURN_NULL returns null.
6780 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6781 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6782 * to the parsed name as looked up in the RExC_paren_names hash.
6783 * If there is an error throws a vFAIL().. type exception.
6786 #define REG_RSN_RETURN_NULL 0
6787 #define REG_RSN_RETURN_NAME 1
6788 #define REG_RSN_RETURN_DATA 2
6791 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6793 char *name_start = RExC_parse;
6795 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6797 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6798 /* skip IDFIRST by using do...while */
6801 RExC_parse += UTF8SKIP(RExC_parse);
6802 } while (isALNUM_utf8((U8*)RExC_parse));
6806 } while (isALNUM(*RExC_parse));
6808 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6809 vFAIL("Group name must start with a non-digit word character");
6813 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6814 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6815 if ( flags == REG_RSN_RETURN_NAME)
6817 else if (flags==REG_RSN_RETURN_DATA) {
6820 if ( ! sv_name ) /* should not happen*/
6821 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6822 if (RExC_paren_names)
6823 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6825 sv_dat = HeVAL(he_str);
6827 vFAIL("Reference to nonexistent named group");
6831 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6832 (unsigned long) flags);
6834 assert(0); /* NOT REACHED */
6839 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6840 int rem=(int)(RExC_end - RExC_parse); \
6849 if (RExC_lastparse!=RExC_parse) \
6850 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6853 iscut ? "..." : "<" \
6856 PerlIO_printf(Perl_debug_log,"%16s",""); \
6859 num = RExC_size + 1; \
6861 num=REG_NODE_NUM(RExC_emit); \
6862 if (RExC_lastnum!=num) \
6863 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6865 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6866 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6867 (int)((depth*2)), "", \
6871 RExC_lastparse=RExC_parse; \
6876 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6877 DEBUG_PARSE_MSG((funcname)); \
6878 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6880 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6881 DEBUG_PARSE_MSG((funcname)); \
6882 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6885 /* This section of code defines the inversion list object and its methods. The
6886 * interfaces are highly subject to change, so as much as possible is static to
6887 * this file. An inversion list is here implemented as a malloc'd C UV array
6888 * with some added info that is placed as UVs at the beginning in a header
6889 * portion. An inversion list for Unicode is an array of code points, sorted
6890 * by ordinal number. The zeroth element is the first code point in the list.
6891 * The 1th element is the first element beyond that not in the list. In other
6892 * words, the first range is
6893 * invlist[0]..(invlist[1]-1)
6894 * The other ranges follow. Thus every element whose index is divisible by two
6895 * marks the beginning of a range that is in the list, and every element not
6896 * divisible by two marks the beginning of a range not in the list. A single
6897 * element inversion list that contains the single code point N generally
6898 * consists of two elements
6901 * (The exception is when N is the highest representable value on the
6902 * machine, in which case the list containing just it would be a single
6903 * element, itself. By extension, if the last range in the list extends to
6904 * infinity, then the first element of that range will be in the inversion list
6905 * at a position that is divisible by two, and is the final element in the
6907 * Taking the complement (inverting) an inversion list is quite simple, if the
6908 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6909 * This implementation reserves an element at the beginning of each inversion
6910 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6911 * actual beginning of the list is either that element if 0, or the next one if
6914 * More about inversion lists can be found in "Unicode Demystified"
6915 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6916 * More will be coming when functionality is added later.
6918 * The inversion list data structure is currently implemented as an SV pointing
6919 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6920 * array of UV whose memory management is automatically handled by the existing
6921 * facilities for SV's.
6923 * Some of the methods should always be private to the implementation, and some
6924 * should eventually be made public */
6926 /* The header definitions are in F<inline_invlist.c> */
6927 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6928 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6930 #define INVLIST_INITIAL_LEN 10
6932 PERL_STATIC_INLINE UV*
6933 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6935 /* Returns a pointer to the first element in the inversion list's array.
6936 * This is called upon initialization of an inversion list. Where the
6937 * array begins depends on whether the list has the code point U+0000
6938 * in it or not. The other parameter tells it whether the code that
6939 * follows this call is about to put a 0 in the inversion list or not.
6940 * The first element is either the element with 0, if 0, or the next one,
6943 UV* zero = get_invlist_zero_addr(invlist);
6945 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6948 assert(! *_get_invlist_len_addr(invlist));
6950 /* 1^1 = 0; 1^0 = 1 */
6951 *zero = 1 ^ will_have_0;
6952 return zero + *zero;
6955 PERL_STATIC_INLINE UV*
6956 S_invlist_array(pTHX_ SV* const invlist)
6958 /* Returns the pointer to the inversion list's array. Every time the
6959 * length changes, this needs to be called in case malloc or realloc moved
6962 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6964 /* Must not be empty. If these fail, you probably didn't check for <len>
6965 * being non-zero before trying to get the array */
6966 assert(*_get_invlist_len_addr(invlist));
6967 assert(*get_invlist_zero_addr(invlist) == 0
6968 || *get_invlist_zero_addr(invlist) == 1);
6970 /* The array begins either at the element reserved for zero if the
6971 * list contains 0 (that element will be set to 0), or otherwise the next
6972 * element (in which case the reserved element will be set to 1). */
6973 return (UV *) (get_invlist_zero_addr(invlist)
6974 + *get_invlist_zero_addr(invlist));
6977 PERL_STATIC_INLINE void
6978 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6980 /* Sets the current number of elements stored in the inversion list */
6982 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6984 *_get_invlist_len_addr(invlist) = len;
6986 assert(len <= SvLEN(invlist));
6988 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6989 /* If the list contains U+0000, that element is part of the header,
6990 * and should not be counted as part of the array. It will contain
6991 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6993 * SvCUR_set(invlist,
6994 * TO_INTERNAL_SIZE(len
6995 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6996 * But, this is only valid if len is not 0. The consequences of not doing
6997 * this is that the memory allocation code may think that 1 more UV is
6998 * being used than actually is, and so might do an unnecessary grow. That
6999 * seems worth not bothering to make this the precise amount.
7001 * Note that when inverting, SvCUR shouldn't change */
7004 PERL_STATIC_INLINE IV*
7005 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7007 /* Return the address of the UV that is reserved to hold the cached index
7010 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7012 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7015 PERL_STATIC_INLINE IV
7016 S_invlist_previous_index(pTHX_ SV* const invlist)
7018 /* Returns cached index of previous search */
7020 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7022 return *get_invlist_previous_index_addr(invlist);
7025 PERL_STATIC_INLINE void
7026 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7028 /* Caches <index> for later retrieval */
7030 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7032 assert(index == 0 || index < (int) _invlist_len(invlist));
7034 *get_invlist_previous_index_addr(invlist) = index;
7037 PERL_STATIC_INLINE UV
7038 S_invlist_max(pTHX_ SV* const invlist)
7040 /* Returns the maximum number of elements storable in the inversion list's
7041 * array, without having to realloc() */
7043 PERL_ARGS_ASSERT_INVLIST_MAX;
7045 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7046 ? _invlist_len(invlist)
7047 : FROM_INTERNAL_SIZE(SvLEN(invlist));
7050 PERL_STATIC_INLINE UV*
7051 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7053 /* Return the address of the UV that is reserved to hold 0 if the inversion
7054 * list contains 0. This has to be the last element of the heading, as the
7055 * list proper starts with either it if 0, or the next element if not.
7056 * (But we force it to contain either 0 or 1) */
7058 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7060 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7063 #ifndef PERL_IN_XSUB_RE
7065 Perl__new_invlist(pTHX_ IV initial_size)
7068 /* Return a pointer to a newly constructed inversion list, with enough
7069 * space to store 'initial_size' elements. If that number is negative, a
7070 * system default is used instead */
7074 if (initial_size < 0) {
7075 initial_size = INVLIST_INITIAL_LEN;
7078 /* Allocate the initial space */
7079 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7080 invlist_set_len(new_list, 0);
7082 /* Force iterinit() to be used to get iteration to work */
7083 *get_invlist_iter_addr(new_list) = UV_MAX;
7085 /* This should force a segfault if a method doesn't initialize this
7087 *get_invlist_zero_addr(new_list) = UV_MAX;
7089 *get_invlist_previous_index_addr(new_list) = 0;
7090 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7091 #if HEADER_LENGTH != 5
7092 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7100 S__new_invlist_C_array(pTHX_ UV* list)
7102 /* Return a pointer to a newly constructed inversion list, initialized to
7103 * point to <list>, which has to be in the exact correct inversion list
7104 * form, including internal fields. Thus this is a dangerous routine that
7105 * should not be used in the wrong hands */
7107 SV* invlist = newSV_type(SVt_PV);
7109 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7111 SvPV_set(invlist, (char *) list);
7112 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7113 shouldn't touch it */
7114 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7116 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7117 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7120 /* Initialize the iteration pointer.
7121 * XXX This could be done at compile time in charclass_invlists.h, but I
7122 * (khw) am not confident that the suffixes for specifying the C constant
7123 * UV_MAX are portable, e.g. 'ull' on a 32 bit machine that is configured
7124 * to use 64 bits; might need a Configure probe */
7125 invlist_iterfinish(invlist);
7131 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7133 /* Grow the maximum size of an inversion list */
7135 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7137 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7140 PERL_STATIC_INLINE void
7141 S_invlist_trim(pTHX_ SV* const invlist)
7143 PERL_ARGS_ASSERT_INVLIST_TRIM;
7145 /* Change the length of the inversion list to how many entries it currently
7148 SvPV_shrink_to_cur((SV *) invlist);
7151 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7154 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7156 /* Subject to change or removal. Append the range from 'start' to 'end' at
7157 * the end of the inversion list. The range must be above any existing
7161 UV max = invlist_max(invlist);
7162 UV len = _invlist_len(invlist);
7164 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7166 if (len == 0) { /* Empty lists must be initialized */
7167 array = _invlist_array_init(invlist, start == 0);
7170 /* Here, the existing list is non-empty. The current max entry in the
7171 * list is generally the first value not in the set, except when the
7172 * set extends to the end of permissible values, in which case it is
7173 * the first entry in that final set, and so this call is an attempt to
7174 * append out-of-order */
7176 UV final_element = len - 1;
7177 array = invlist_array(invlist);
7178 if (array[final_element] > start
7179 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7181 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7182 array[final_element], start,
7183 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7186 /* Here, it is a legal append. If the new range begins with the first
7187 * value not in the set, it is extending the set, so the new first
7188 * value not in the set is one greater than the newly extended range.
7190 if (array[final_element] == start) {
7191 if (end != UV_MAX) {
7192 array[final_element] = end + 1;
7195 /* But if the end is the maximum representable on the machine,
7196 * just let the range that this would extend to have no end */
7197 invlist_set_len(invlist, len - 1);
7203 /* Here the new range doesn't extend any existing set. Add it */
7205 len += 2; /* Includes an element each for the start and end of range */
7207 /* If overflows the existing space, extend, which may cause the array to be
7210 invlist_extend(invlist, len);
7211 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7212 failure in invlist_array() */
7213 array = invlist_array(invlist);
7216 invlist_set_len(invlist, len);
7219 /* The next item on the list starts the range, the one after that is
7220 * one past the new range. */
7221 array[len - 2] = start;
7222 if (end != UV_MAX) {
7223 array[len - 1] = end + 1;
7226 /* But if the end is the maximum representable on the machine, just let
7227 * the range have no end */
7228 invlist_set_len(invlist, len - 1);
7232 #ifndef PERL_IN_XSUB_RE
7235 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7237 /* Searches the inversion list for the entry that contains the input code
7238 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7239 * return value is the index into the list's array of the range that
7244 IV high = _invlist_len(invlist);
7245 const IV highest_element = high - 1;
7248 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7250 /* If list is empty, return failure. */
7255 /* (We can't get the array unless we know the list is non-empty) */
7256 array = invlist_array(invlist);
7258 mid = invlist_previous_index(invlist);
7259 assert(mid >=0 && mid <= highest_element);
7261 /* <mid> contains the cache of the result of the previous call to this
7262 * function (0 the first time). See if this call is for the same result,
7263 * or if it is for mid-1. This is under the theory that calls to this
7264 * function will often be for related code points that are near each other.
7265 * And benchmarks show that caching gives better results. We also test
7266 * here if the code point is within the bounds of the list. These tests
7267 * replace others that would have had to be made anyway to make sure that
7268 * the array bounds were not exceeded, and these give us extra information
7269 * at the same time */
7270 if (cp >= array[mid]) {
7271 if (cp >= array[highest_element]) {
7272 return highest_element;
7275 /* Here, array[mid] <= cp < array[highest_element]. This means that
7276 * the final element is not the answer, so can exclude it; it also
7277 * means that <mid> is not the final element, so can refer to 'mid + 1'
7279 if (cp < array[mid + 1]) {
7285 else { /* cp < aray[mid] */
7286 if (cp < array[0]) { /* Fail if outside the array */
7290 if (cp >= array[mid - 1]) {
7295 /* Binary search. What we are looking for is <i> such that
7296 * array[i] <= cp < array[i+1]
7297 * The loop below converges on the i+1. Note that there may not be an
7298 * (i+1)th element in the array, and things work nonetheless */
7299 while (low < high) {
7300 mid = (low + high) / 2;
7301 assert(mid <= highest_element);
7302 if (array[mid] <= cp) { /* cp >= array[mid] */
7305 /* We could do this extra test to exit the loop early.
7306 if (cp < array[low]) {
7311 else { /* cp < array[mid] */
7318 invlist_set_previous_index(invlist, high);
7323 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7325 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7326 * but is used when the swash has an inversion list. This makes this much
7327 * faster, as it uses a binary search instead of a linear one. This is
7328 * intimately tied to that function, and perhaps should be in utf8.c,
7329 * except it is intimately tied to inversion lists as well. It assumes
7330 * that <swatch> is all 0's on input */
7333 const IV len = _invlist_len(invlist);
7337 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7339 if (len == 0) { /* Empty inversion list */
7343 array = invlist_array(invlist);
7345 /* Find which element it is */
7346 i = _invlist_search(invlist, start);
7348 /* We populate from <start> to <end> */
7349 while (current < end) {
7352 /* The inversion list gives the results for every possible code point
7353 * after the first one in the list. Only those ranges whose index is
7354 * even are ones that the inversion list matches. For the odd ones,
7355 * and if the initial code point is not in the list, we have to skip
7356 * forward to the next element */
7357 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7359 if (i >= len) { /* Finished if beyond the end of the array */
7363 if (current >= end) { /* Finished if beyond the end of what we
7365 if (LIKELY(end < UV_MAX)) {
7369 /* We get here when the upper bound is the maximum
7370 * representable on the machine, and we are looking for just
7371 * that code point. Have to special case it */
7373 goto join_end_of_list;
7376 assert(current >= start);
7378 /* The current range ends one below the next one, except don't go past
7381 upper = (i < len && array[i] < end) ? array[i] : end;
7383 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7384 * for each code point in it */
7385 for (; current < upper; current++) {
7386 const STRLEN offset = (STRLEN)(current - start);
7387 swatch[offset >> 3] |= 1 << (offset & 7);
7392 /* Quit if at the end of the list */
7395 /* But first, have to deal with the highest possible code point on
7396 * the platform. The previous code assumes that <end> is one
7397 * beyond where we want to populate, but that is impossible at the
7398 * platform's infinity, so have to handle it specially */
7399 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7401 const STRLEN offset = (STRLEN)(end - start);
7402 swatch[offset >> 3] |= 1 << (offset & 7);
7407 /* Advance to the next range, which will be for code points not in the
7416 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7418 /* Take the union of two inversion lists and point <output> to it. *output
7419 * should be defined upon input, and if it points to one of the two lists,
7420 * the reference count to that list will be decremented. The first list,
7421 * <a>, may be NULL, in which case a copy of the second list is returned.
7422 * If <complement_b> is TRUE, the union is taken of the complement
7423 * (inversion) of <b> instead of b itself.
7425 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7426 * Richard Gillam, published by Addison-Wesley, and explained at some
7427 * length there. The preface says to incorporate its examples into your
7428 * code at your own risk.
7430 * The algorithm is like a merge sort.
7432 * XXX A potential performance improvement is to keep track as we go along
7433 * if only one of the inputs contributes to the result, meaning the other
7434 * is a subset of that one. In that case, we can skip the final copy and
7435 * return the larger of the input lists, but then outside code might need
7436 * to keep track of whether to free the input list or not */
7438 UV* array_a; /* a's array */
7440 UV len_a; /* length of a's array */
7443 SV* u; /* the resulting union */
7447 UV i_a = 0; /* current index into a's array */
7451 /* running count, as explained in the algorithm source book; items are
7452 * stopped accumulating and are output when the count changes to/from 0.
7453 * The count is incremented when we start a range that's in the set, and
7454 * decremented when we start a range that's not in the set. So its range
7455 * is 0 to 2. Only when the count is zero is something not in the set.
7459 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7462 /* If either one is empty, the union is the other one */
7463 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7470 *output = invlist_clone(b);
7472 _invlist_invert(*output);
7474 } /* else *output already = b; */
7477 else if ((len_b = _invlist_len(b)) == 0) {
7482 /* The complement of an empty list is a list that has everything in it,
7483 * so the union with <a> includes everything too */
7488 *output = _new_invlist(1);
7489 _append_range_to_invlist(*output, 0, UV_MAX);
7491 else if (*output != a) {
7492 *output = invlist_clone(a);
7494 /* else *output already = a; */
7498 /* Here both lists exist and are non-empty */
7499 array_a = invlist_array(a);
7500 array_b = invlist_array(b);
7502 /* If are to take the union of 'a' with the complement of b, set it
7503 * up so are looking at b's complement. */
7506 /* To complement, we invert: if the first element is 0, remove it. To
7507 * do this, we just pretend the array starts one later, and clear the
7508 * flag as we don't have to do anything else later */
7509 if (array_b[0] == 0) {
7512 complement_b = FALSE;
7516 /* But if the first element is not zero, we unshift a 0 before the
7517 * array. The data structure reserves a space for that 0 (which
7518 * should be a '1' right now), so physical shifting is unneeded,
7519 * but temporarily change that element to 0. Before exiting the
7520 * routine, we must restore the element to '1' */
7527 /* Size the union for the worst case: that the sets are completely
7529 u = _new_invlist(len_a + len_b);
7531 /* Will contain U+0000 if either component does */
7532 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7533 || (len_b > 0 && array_b[0] == 0));
7535 /* Go through each list item by item, stopping when exhausted one of
7537 while (i_a < len_a && i_b < len_b) {
7538 UV cp; /* The element to potentially add to the union's array */
7539 bool cp_in_set; /* is it in the the input list's set or not */
7541 /* We need to take one or the other of the two inputs for the union.
7542 * Since we are merging two sorted lists, we take the smaller of the
7543 * next items. In case of a tie, we take the one that is in its set
7544 * first. If we took one not in the set first, it would decrement the
7545 * count, possibly to 0 which would cause it to be output as ending the
7546 * range, and the next time through we would take the same number, and
7547 * output it again as beginning the next range. By doing it the
7548 * opposite way, there is no possibility that the count will be
7549 * momentarily decremented to 0, and thus the two adjoining ranges will
7550 * be seamlessly merged. (In a tie and both are in the set or both not
7551 * in the set, it doesn't matter which we take first.) */
7552 if (array_a[i_a] < array_b[i_b]
7553 || (array_a[i_a] == array_b[i_b]
7554 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7556 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7560 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7564 /* Here, have chosen which of the two inputs to look at. Only output
7565 * if the running count changes to/from 0, which marks the
7566 * beginning/end of a range in that's in the set */
7569 array_u[i_u++] = cp;
7576 array_u[i_u++] = cp;
7581 /* Here, we are finished going through at least one of the lists, which
7582 * means there is something remaining in at most one. We check if the list
7583 * that hasn't been exhausted is positioned such that we are in the middle
7584 * of a range in its set or not. (i_a and i_b point to the element beyond
7585 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7586 * is potentially more to output.
7587 * There are four cases:
7588 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7589 * in the union is entirely from the non-exhausted set.
7590 * 2) Both were in their sets, count is 2. Nothing further should
7591 * be output, as everything that remains will be in the exhausted
7592 * list's set, hence in the union; decrementing to 1 but not 0 insures
7594 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7595 * Nothing further should be output because the union includes
7596 * everything from the exhausted set. Not decrementing ensures that.
7597 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7598 * decrementing to 0 insures that we look at the remainder of the
7599 * non-exhausted set */
7600 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7601 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7606 /* The final length is what we've output so far, plus what else is about to
7607 * be output. (If 'count' is non-zero, then the input list we exhausted
7608 * has everything remaining up to the machine's limit in its set, and hence
7609 * in the union, so there will be no further output. */
7612 /* At most one of the subexpressions will be non-zero */
7613 len_u += (len_a - i_a) + (len_b - i_b);
7616 /* Set result to final length, which can change the pointer to array_u, so
7618 if (len_u != _invlist_len(u)) {
7619 invlist_set_len(u, len_u);
7621 array_u = invlist_array(u);
7624 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7625 * the other) ended with everything above it not in its set. That means
7626 * that the remaining part of the union is precisely the same as the
7627 * non-exhausted list, so can just copy it unchanged. (If both list were
7628 * exhausted at the same time, then the operations below will be both 0.)
7631 IV copy_count; /* At most one will have a non-zero copy count */
7632 if ((copy_count = len_a - i_a) > 0) {
7633 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7635 else if ((copy_count = len_b - i_b) > 0) {
7636 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7640 /* We may be removing a reference to one of the inputs */
7641 if (a == *output || b == *output) {
7642 assert(! invlist_is_iterating(*output));
7643 SvREFCNT_dec_NN(*output);
7646 /* If we've changed b, restore it */
7656 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7658 /* Take the intersection of two inversion lists and point <i> to it. *i
7659 * should be defined upon input, and if it points to one of the two lists,
7660 * the reference count to that list will be decremented.
7661 * If <complement_b> is TRUE, the result will be the intersection of <a>
7662 * and the complement (or inversion) of <b> instead of <b> directly.
7664 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7665 * Richard Gillam, published by Addison-Wesley, and explained at some
7666 * length there. The preface says to incorporate its examples into your
7667 * code at your own risk. In fact, it had bugs
7669 * The algorithm is like a merge sort, and is essentially the same as the
7673 UV* array_a; /* a's array */
7675 UV len_a; /* length of a's array */
7678 SV* r; /* the resulting intersection */
7682 UV i_a = 0; /* current index into a's array */
7686 /* running count, as explained in the algorithm source book; items are
7687 * stopped accumulating and are output when the count changes to/from 2.
7688 * The count is incremented when we start a range that's in the set, and
7689 * decremented when we start a range that's not in the set. So its range
7690 * is 0 to 2. Only when the count is 2 is something in the intersection.
7694 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7697 /* Special case if either one is empty */
7698 len_a = _invlist_len(a);
7699 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7701 if (len_a != 0 && complement_b) {
7703 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7704 * be empty. Here, also we are using 'b's complement, which hence
7705 * must be every possible code point. Thus the intersection is
7708 *i = invlist_clone(a);
7714 /* else *i is already 'a' */
7718 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7719 * intersection must be empty */
7726 *i = _new_invlist(0);
7730 /* Here both lists exist and are non-empty */
7731 array_a = invlist_array(a);
7732 array_b = invlist_array(b);
7734 /* If are to take the intersection of 'a' with the complement of b, set it
7735 * up so are looking at b's complement. */
7738 /* To complement, we invert: if the first element is 0, remove it. To
7739 * do this, we just pretend the array starts one later, and clear the
7740 * flag as we don't have to do anything else later */
7741 if (array_b[0] == 0) {
7744 complement_b = FALSE;
7748 /* But if the first element is not zero, we unshift a 0 before the
7749 * array. The data structure reserves a space for that 0 (which
7750 * should be a '1' right now), so physical shifting is unneeded,
7751 * but temporarily change that element to 0. Before exiting the
7752 * routine, we must restore the element to '1' */
7759 /* Size the intersection for the worst case: that the intersection ends up
7760 * fragmenting everything to be completely disjoint */
7761 r= _new_invlist(len_a + len_b);
7763 /* Will contain U+0000 iff both components do */
7764 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7765 && len_b > 0 && array_b[0] == 0);
7767 /* Go through each list item by item, stopping when exhausted one of
7769 while (i_a < len_a && i_b < len_b) {
7770 UV cp; /* The element to potentially add to the intersection's
7772 bool cp_in_set; /* Is it in the input list's set or not */
7774 /* We need to take one or the other of the two inputs for the
7775 * intersection. Since we are merging two sorted lists, we take the
7776 * smaller of the next items. In case of a tie, we take the one that
7777 * is not in its set first (a difference from the union algorithm). If
7778 * we took one in the set first, it would increment the count, possibly
7779 * to 2 which would cause it to be output as starting a range in the
7780 * intersection, and the next time through we would take that same
7781 * number, and output it again as ending the set. By doing it the
7782 * opposite of this, there is no possibility that the count will be
7783 * momentarily incremented to 2. (In a tie and both are in the set or
7784 * both not in the set, it doesn't matter which we take first.) */
7785 if (array_a[i_a] < array_b[i_b]
7786 || (array_a[i_a] == array_b[i_b]
7787 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7789 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7793 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7797 /* Here, have chosen which of the two inputs to look at. Only output
7798 * if the running count changes to/from 2, which marks the
7799 * beginning/end of a range that's in the intersection */
7803 array_r[i_r++] = cp;
7808 array_r[i_r++] = cp;
7814 /* Here, we are finished going through at least one of the lists, which
7815 * means there is something remaining in at most one. We check if the list
7816 * that has been exhausted is positioned such that we are in the middle
7817 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7818 * the ones we care about.) There are four cases:
7819 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7820 * nothing left in the intersection.
7821 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7822 * above 2. What should be output is exactly that which is in the
7823 * non-exhausted set, as everything it has is also in the intersection
7824 * set, and everything it doesn't have can't be in the intersection
7825 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7826 * gets incremented to 2. Like the previous case, the intersection is
7827 * everything that remains in the non-exhausted set.
7828 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7829 * remains 1. And the intersection has nothing more. */
7830 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7831 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7836 /* The final length is what we've output so far plus what else is in the
7837 * intersection. At most one of the subexpressions below will be non-zero */
7840 len_r += (len_a - i_a) + (len_b - i_b);
7843 /* Set result to final length, which can change the pointer to array_r, so
7845 if (len_r != _invlist_len(r)) {
7846 invlist_set_len(r, len_r);
7848 array_r = invlist_array(r);
7851 /* Finish outputting any remaining */
7852 if (count >= 2) { /* At most one will have a non-zero copy count */
7854 if ((copy_count = len_a - i_a) > 0) {
7855 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7857 else if ((copy_count = len_b - i_b) > 0) {
7858 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7862 /* We may be removing a reference to one of the inputs */
7863 if (a == *i || b == *i) {
7864 assert(! invlist_is_iterating(*i));
7865 SvREFCNT_dec_NN(*i);
7868 /* If we've changed b, restore it */
7878 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7880 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7881 * set. A pointer to the inversion list is returned. This may actually be
7882 * a new list, in which case the passed in one has been destroyed. The
7883 * passed in inversion list can be NULL, in which case a new one is created
7884 * with just the one range in it */
7889 if (invlist == NULL) {
7890 invlist = _new_invlist(2);
7894 len = _invlist_len(invlist);
7897 /* If comes after the final entry, can just append it to the end */
7899 || start >= invlist_array(invlist)
7900 [_invlist_len(invlist) - 1])
7902 _append_range_to_invlist(invlist, start, end);
7906 /* Here, can't just append things, create and return a new inversion list
7907 * which is the union of this range and the existing inversion list */
7908 range_invlist = _new_invlist(2);
7909 _append_range_to_invlist(range_invlist, start, end);
7911 _invlist_union(invlist, range_invlist, &invlist);
7913 /* The temporary can be freed */
7914 SvREFCNT_dec_NN(range_invlist);
7921 PERL_STATIC_INLINE SV*
7922 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7923 return _add_range_to_invlist(invlist, cp, cp);
7926 #ifndef PERL_IN_XSUB_RE
7928 Perl__invlist_invert(pTHX_ SV* const invlist)
7930 /* Complement the input inversion list. This adds a 0 if the list didn't
7931 * have a zero; removes it otherwise. As described above, the data
7932 * structure is set up so that this is very efficient */
7934 UV* len_pos = _get_invlist_len_addr(invlist);
7936 PERL_ARGS_ASSERT__INVLIST_INVERT;
7938 assert(! invlist_is_iterating(invlist));
7940 /* The inverse of matching nothing is matching everything */
7941 if (*len_pos == 0) {
7942 _append_range_to_invlist(invlist, 0, UV_MAX);
7946 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7947 * zero element was a 0, so it is being removed, so the length decrements
7948 * by 1; and vice-versa. SvCUR is unaffected */
7949 if (*get_invlist_zero_addr(invlist) ^= 1) {
7958 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7960 /* Complement the input inversion list (which must be a Unicode property,
7961 * all of which don't match above the Unicode maximum code point.) And
7962 * Perl has chosen to not have the inversion match above that either. This
7963 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7969 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7971 _invlist_invert(invlist);
7973 len = _invlist_len(invlist);
7975 if (len != 0) { /* If empty do nothing */
7976 array = invlist_array(invlist);
7977 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7978 /* Add 0x110000. First, grow if necessary */
7980 if (invlist_max(invlist) < len) {
7981 invlist_extend(invlist, len);
7982 array = invlist_array(invlist);
7984 invlist_set_len(invlist, len);
7985 array[len - 1] = PERL_UNICODE_MAX + 1;
7987 else { /* Remove the 0x110000 */
7988 invlist_set_len(invlist, len - 1);
7996 PERL_STATIC_INLINE SV*
7997 S_invlist_clone(pTHX_ SV* const invlist)
8000 /* Return a new inversion list that is a copy of the input one, which is
8003 /* Need to allocate extra space to accommodate Perl's addition of a
8004 * trailing NUL to SvPV's, since it thinks they are always strings */
8005 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8006 STRLEN length = SvCUR(invlist);
8008 PERL_ARGS_ASSERT_INVLIST_CLONE;
8010 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8011 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8016 PERL_STATIC_INLINE UV*
8017 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8019 /* Return the address of the UV that contains the current iteration
8022 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8024 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8027 PERL_STATIC_INLINE UV*
8028 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8030 /* Return the address of the UV that contains the version id. */
8032 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8034 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8037 PERL_STATIC_INLINE void
8038 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8040 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8042 *get_invlist_iter_addr(invlist) = 0;
8045 PERL_STATIC_INLINE void
8046 S_invlist_iterfinish(pTHX_ SV* invlist)
8048 /* Terminate iterator for invlist. This is to catch development errors.
8049 * Any iteration that is interrupted before completed should call this
8050 * function. Functions that add code points anywhere else but to the end
8051 * of an inversion list assert that they are not in the middle of an
8052 * iteration. If they were, the addition would make the iteration
8053 * problematical: if the iteration hadn't reached the place where things
8054 * were being added, it would be ok */
8056 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8058 *get_invlist_iter_addr(invlist) = UV_MAX;
8062 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8064 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8065 * This call sets in <*start> and <*end>, the next range in <invlist>.
8066 * Returns <TRUE> if successful and the next call will return the next
8067 * range; <FALSE> if was already at the end of the list. If the latter,
8068 * <*start> and <*end> are unchanged, and the next call to this function
8069 * will start over at the beginning of the list */
8071 UV* pos = get_invlist_iter_addr(invlist);
8072 UV len = _invlist_len(invlist);
8075 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8078 *pos = UV_MAX; /* Force iterinit() to be required next time */
8082 array = invlist_array(invlist);
8084 *start = array[(*pos)++];
8090 *end = array[(*pos)++] - 1;
8096 PERL_STATIC_INLINE bool
8097 S_invlist_is_iterating(pTHX_ SV* const invlist)
8099 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8101 return *(get_invlist_iter_addr(invlist)) < UV_MAX;
8104 PERL_STATIC_INLINE UV
8105 S_invlist_highest(pTHX_ SV* const invlist)
8107 /* Returns the highest code point that matches an inversion list. This API
8108 * has an ambiguity, as it returns 0 under either the highest is actually
8109 * 0, or if the list is empty. If this distinction matters to you, check
8110 * for emptiness before calling this function */
8112 UV len = _invlist_len(invlist);
8115 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8121 array = invlist_array(invlist);
8123 /* The last element in the array in the inversion list always starts a
8124 * range that goes to infinity. That range may be for code points that are
8125 * matched in the inversion list, or it may be for ones that aren't
8126 * matched. In the latter case, the highest code point in the set is one
8127 * less than the beginning of this range; otherwise it is the final element
8128 * of this range: infinity */
8129 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8131 : array[len - 1] - 1;
8134 #ifndef PERL_IN_XSUB_RE
8136 Perl__invlist_contents(pTHX_ SV* const invlist)
8138 /* Get the contents of an inversion list into a string SV so that they can
8139 * be printed out. It uses the format traditionally done for debug tracing
8143 SV* output = newSVpvs("\n");
8145 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8147 assert(! invlist_is_iterating(invlist));
8149 invlist_iterinit(invlist);
8150 while (invlist_iternext(invlist, &start, &end)) {
8151 if (end == UV_MAX) {
8152 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8154 else if (end != start) {
8155 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8159 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8167 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8169 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8171 /* Dumps out the ranges in an inversion list. The string 'header'
8172 * if present is output on a line before the first range */
8176 PERL_ARGS_ASSERT__INVLIST_DUMP;
8178 if (header && strlen(header)) {
8179 PerlIO_printf(Perl_debug_log, "%s\n", header);
8181 if (invlist_is_iterating(invlist)) {
8182 PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
8186 invlist_iterinit(invlist);
8187 while (invlist_iternext(invlist, &start, &end)) {
8188 if (end == UV_MAX) {
8189 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8191 else if (end != start) {
8192 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8196 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8204 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8206 /* Return a boolean as to if the two passed in inversion lists are
8207 * identical. The final argument, if TRUE, says to take the complement of
8208 * the second inversion list before doing the comparison */
8210 UV* array_a = invlist_array(a);
8211 UV* array_b = invlist_array(b);
8212 UV len_a = _invlist_len(a);
8213 UV len_b = _invlist_len(b);
8215 UV i = 0; /* current index into the arrays */
8216 bool retval = TRUE; /* Assume are identical until proven otherwise */
8218 PERL_ARGS_ASSERT__INVLISTEQ;
8220 /* If are to compare 'a' with the complement of b, set it
8221 * up so are looking at b's complement. */
8224 /* The complement of nothing is everything, so <a> would have to have
8225 * just one element, starting at zero (ending at infinity) */
8227 return (len_a == 1 && array_a[0] == 0);
8229 else if (array_b[0] == 0) {
8231 /* Otherwise, to complement, we invert. Here, the first element is
8232 * 0, just remove it. To do this, we just pretend the array starts
8233 * one later, and clear the flag as we don't have to do anything
8238 complement_b = FALSE;
8242 /* But if the first element is not zero, we unshift a 0 before the
8243 * array. The data structure reserves a space for that 0 (which
8244 * should be a '1' right now), so physical shifting is unneeded,
8245 * but temporarily change that element to 0. Before exiting the
8246 * routine, we must restore the element to '1' */
8253 /* Make sure that the lengths are the same, as well as the final element
8254 * before looping through the remainder. (Thus we test the length, final,
8255 * and first elements right off the bat) */
8256 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8259 else for (i = 0; i < len_a - 1; i++) {
8260 if (array_a[i] != array_b[i]) {
8273 #undef HEADER_LENGTH
8274 #undef INVLIST_INITIAL_LENGTH
8275 #undef TO_INTERNAL_SIZE
8276 #undef FROM_INTERNAL_SIZE
8277 #undef INVLIST_LEN_OFFSET
8278 #undef INVLIST_ZERO_OFFSET
8279 #undef INVLIST_ITER_OFFSET
8280 #undef INVLIST_VERSION_ID
8281 #undef INVLIST_PREVIOUS_INDEX_OFFSET
8283 /* End of inversion list object */
8286 - reg - regular expression, i.e. main body or parenthesized thing
8288 * Caller must absorb opening parenthesis.
8290 * Combining parenthesis handling with the base level of regular expression
8291 * is a trifle forced, but the need to tie the tails of the branches to what
8292 * follows makes it hard to avoid.
8294 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8296 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8298 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8302 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8303 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8306 regnode *ret; /* Will be the head of the group. */
8309 regnode *ender = NULL;
8312 U32 oregflags = RExC_flags;
8313 bool have_branch = 0;
8315 I32 freeze_paren = 0;
8316 I32 after_freeze = 0;
8318 /* for (?g), (?gc), and (?o) warnings; warning
8319 about (?c) will warn about (?g) -- japhy */
8321 #define WASTED_O 0x01
8322 #define WASTED_G 0x02
8323 #define WASTED_C 0x04
8324 #define WASTED_GC (0x02|0x04)
8325 I32 wastedflags = 0x00;
8327 char * parse_start = RExC_parse; /* MJD */
8328 char * const oregcomp_parse = RExC_parse;
8330 GET_RE_DEBUG_FLAGS_DECL;
8332 PERL_ARGS_ASSERT_REG;
8333 DEBUG_PARSE("reg ");
8335 *flagp = 0; /* Tentatively. */
8338 /* Make an OPEN node, if parenthesized. */
8340 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8341 char *start_verb = RExC_parse;
8342 STRLEN verb_len = 0;
8343 char *start_arg = NULL;
8344 unsigned char op = 0;
8346 int internal_argval = 0; /* internal_argval is only useful if !argok */
8347 while ( *RExC_parse && *RExC_parse != ')' ) {
8348 if ( *RExC_parse == ':' ) {
8349 start_arg = RExC_parse + 1;
8355 verb_len = RExC_parse - start_verb;
8358 while ( *RExC_parse && *RExC_parse != ')' )
8360 if ( *RExC_parse != ')' )
8361 vFAIL("Unterminated verb pattern argument");
8362 if ( RExC_parse == start_arg )
8365 if ( *RExC_parse != ')' )
8366 vFAIL("Unterminated verb pattern");
8369 switch ( *start_verb ) {
8370 case 'A': /* (*ACCEPT) */
8371 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8373 internal_argval = RExC_nestroot;
8376 case 'C': /* (*COMMIT) */
8377 if ( memEQs(start_verb,verb_len,"COMMIT") )
8380 case 'F': /* (*FAIL) */
8381 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8386 case ':': /* (*:NAME) */
8387 case 'M': /* (*MARK:NAME) */
8388 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8393 case 'P': /* (*PRUNE) */
8394 if ( memEQs(start_verb,verb_len,"PRUNE") )
8397 case 'S': /* (*SKIP) */
8398 if ( memEQs(start_verb,verb_len,"SKIP") )
8401 case 'T': /* (*THEN) */
8402 /* [19:06] <TimToady> :: is then */
8403 if ( memEQs(start_verb,verb_len,"THEN") ) {
8405 RExC_seen |= REG_SEEN_CUTGROUP;
8411 vFAIL3("Unknown verb pattern '%.*s'",
8412 verb_len, start_verb);
8415 if ( start_arg && internal_argval ) {
8416 vFAIL3("Verb pattern '%.*s' may not have an argument",
8417 verb_len, start_verb);
8418 } else if ( argok < 0 && !start_arg ) {
8419 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8420 verb_len, start_verb);
8422 ret = reganode(pRExC_state, op, internal_argval);
8423 if ( ! internal_argval && ! SIZE_ONLY ) {
8425 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8426 ARG(ret) = add_data( pRExC_state, 1, "S" );
8427 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8434 if (!internal_argval)
8435 RExC_seen |= REG_SEEN_VERBARG;
8436 } else if ( start_arg ) {
8437 vFAIL3("Verb pattern '%.*s' may not have an argument",
8438 verb_len, start_verb);
8440 ret = reg_node(pRExC_state, op);
8442 nextchar(pRExC_state);
8445 if (*RExC_parse == '?') { /* (?...) */
8446 bool is_logical = 0;
8447 const char * const seqstart = RExC_parse;
8448 bool has_use_defaults = FALSE;
8451 paren = *RExC_parse++;
8452 ret = NULL; /* For look-ahead/behind. */
8455 case 'P': /* (?P...) variants for those used to PCRE/Python */
8456 paren = *RExC_parse++;
8457 if ( paren == '<') /* (?P<...>) named capture */
8459 else if (paren == '>') { /* (?P>name) named recursion */
8460 goto named_recursion;
8462 else if (paren == '=') { /* (?P=...) named backref */
8463 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8464 you change this make sure you change that */
8465 char* name_start = RExC_parse;
8467 SV *sv_dat = reg_scan_name(pRExC_state,
8468 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8469 if (RExC_parse == name_start || *RExC_parse != ')')
8470 vFAIL2("Sequence %.3s... not terminated",parse_start);
8473 num = add_data( pRExC_state, 1, "S" );
8474 RExC_rxi->data->data[num]=(void*)sv_dat;
8475 SvREFCNT_inc_simple_void(sv_dat);
8478 ret = reganode(pRExC_state,
8481 : (ASCII_FOLD_RESTRICTED)
8483 : (AT_LEAST_UNI_SEMANTICS)
8491 Set_Node_Offset(ret, parse_start+1);
8492 Set_Node_Cur_Length(ret); /* MJD */
8494 nextchar(pRExC_state);
8498 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8500 case '<': /* (?<...) */
8501 if (*RExC_parse == '!')
8503 else if (*RExC_parse != '=')
8509 case '\'': /* (?'...') */
8510 name_start= RExC_parse;
8511 svname = reg_scan_name(pRExC_state,
8512 SIZE_ONLY ? /* reverse test from the others */
8513 REG_RSN_RETURN_NAME :
8514 REG_RSN_RETURN_NULL);
8515 if (RExC_parse == name_start) {
8517 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8520 if (*RExC_parse != paren)
8521 vFAIL2("Sequence (?%c... not terminated",
8522 paren=='>' ? '<' : paren);
8526 if (!svname) /* shouldn't happen */
8528 "panic: reg_scan_name returned NULL");
8529 if (!RExC_paren_names) {
8530 RExC_paren_names= newHV();
8531 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8533 RExC_paren_name_list= newAV();
8534 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8537 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8539 sv_dat = HeVAL(he_str);
8541 /* croak baby croak */
8543 "panic: paren_name hash element allocation failed");
8544 } else if ( SvPOK(sv_dat) ) {
8545 /* (?|...) can mean we have dupes so scan to check
8546 its already been stored. Maybe a flag indicating
8547 we are inside such a construct would be useful,
8548 but the arrays are likely to be quite small, so
8549 for now we punt -- dmq */
8550 IV count = SvIV(sv_dat);
8551 I32 *pv = (I32*)SvPVX(sv_dat);
8553 for ( i = 0 ; i < count ; i++ ) {
8554 if ( pv[i] == RExC_npar ) {
8560 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8561 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8562 pv[count] = RExC_npar;
8563 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8566 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8567 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8569 SvIV_set(sv_dat, 1);
8572 /* Yes this does cause a memory leak in debugging Perls */
8573 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8574 SvREFCNT_dec_NN(svname);
8577 /*sv_dump(sv_dat);*/
8579 nextchar(pRExC_state);
8581 goto capturing_parens;
8583 RExC_seen |= REG_SEEN_LOOKBEHIND;
8584 RExC_in_lookbehind++;
8586 case '=': /* (?=...) */
8587 RExC_seen_zerolen++;
8589 case '!': /* (?!...) */
8590 RExC_seen_zerolen++;
8591 if (*RExC_parse == ')') {
8592 ret=reg_node(pRExC_state, OPFAIL);
8593 nextchar(pRExC_state);
8597 case '|': /* (?|...) */
8598 /* branch reset, behave like a (?:...) except that
8599 buffers in alternations share the same numbers */
8601 after_freeze = freeze_paren = RExC_npar;
8603 case ':': /* (?:...) */
8604 case '>': /* (?>...) */
8606 case '$': /* (?$...) */
8607 case '@': /* (?@...) */
8608 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8610 case '#': /* (?#...) */
8611 while (*RExC_parse && *RExC_parse != ')')
8613 if (*RExC_parse != ')')
8614 FAIL("Sequence (?#... not terminated");
8615 nextchar(pRExC_state);
8618 case '0' : /* (?0) */
8619 case 'R' : /* (?R) */
8620 if (*RExC_parse != ')')
8621 FAIL("Sequence (?R) not terminated");
8622 ret = reg_node(pRExC_state, GOSTART);
8623 *flagp |= POSTPONED;
8624 nextchar(pRExC_state);
8627 { /* named and numeric backreferences */
8629 case '&': /* (?&NAME) */
8630 parse_start = RExC_parse - 1;
8633 SV *sv_dat = reg_scan_name(pRExC_state,
8634 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8635 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8637 goto gen_recurse_regop;
8638 assert(0); /* NOT REACHED */
8640 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8642 vFAIL("Illegal pattern");
8644 goto parse_recursion;
8646 case '-': /* (?-1) */
8647 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8648 RExC_parse--; /* rewind to let it be handled later */
8652 case '1': case '2': case '3': case '4': /* (?1) */
8653 case '5': case '6': case '7': case '8': case '9':
8656 num = atoi(RExC_parse);
8657 parse_start = RExC_parse - 1; /* MJD */
8658 if (*RExC_parse == '-')
8660 while (isDIGIT(*RExC_parse))
8662 if (*RExC_parse!=')')
8663 vFAIL("Expecting close bracket");
8666 if ( paren == '-' ) {
8668 Diagram of capture buffer numbering.
8669 Top line is the normal capture buffer numbers
8670 Bottom line is the negative indexing as from
8674 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8678 num = RExC_npar + num;
8681 vFAIL("Reference to nonexistent group");
8683 } else if ( paren == '+' ) {
8684 num = RExC_npar + num - 1;
8687 ret = reganode(pRExC_state, GOSUB, num);
8689 if (num > (I32)RExC_rx->nparens) {
8691 vFAIL("Reference to nonexistent group");
8693 ARG2L_SET( ret, RExC_recurse_count++);
8695 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8696 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8700 RExC_seen |= REG_SEEN_RECURSE;
8701 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8702 Set_Node_Offset(ret, parse_start); /* MJD */
8704 *flagp |= POSTPONED;
8705 nextchar(pRExC_state);
8707 } /* named and numeric backreferences */
8708 assert(0); /* NOT REACHED */
8710 case '?': /* (??...) */
8712 if (*RExC_parse != '{') {
8714 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8717 *flagp |= POSTPONED;
8718 paren = *RExC_parse++;
8720 case '{': /* (?{...}) */
8723 struct reg_code_block *cb;
8725 RExC_seen_zerolen++;
8727 if ( !pRExC_state->num_code_blocks
8728 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8729 || pRExC_state->code_blocks[pRExC_state->code_index].start
8730 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8733 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8734 FAIL("panic: Sequence (?{...}): no code block found\n");
8735 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8737 /* this is a pre-compiled code block (?{...}) */
8738 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8739 RExC_parse = RExC_start + cb->end;
8742 if (cb->src_regex) {
8743 n = add_data(pRExC_state, 2, "rl");
8744 RExC_rxi->data->data[n] =
8745 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8746 RExC_rxi->data->data[n+1] = (void*)o;
8749 n = add_data(pRExC_state, 1,
8750 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8751 RExC_rxi->data->data[n] = (void*)o;
8754 pRExC_state->code_index++;
8755 nextchar(pRExC_state);
8759 ret = reg_node(pRExC_state, LOGICAL);
8760 eval = reganode(pRExC_state, EVAL, n);
8763 /* for later propagation into (??{}) return value */
8764 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8766 REGTAIL(pRExC_state, ret, eval);
8767 /* deal with the length of this later - MJD */
8770 ret = reganode(pRExC_state, EVAL, n);
8771 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8772 Set_Node_Offset(ret, parse_start);
8775 case '(': /* (?(?{...})...) and (?(?=...)...) */
8778 if (RExC_parse[0] == '?') { /* (?(?...)) */
8779 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8780 || RExC_parse[1] == '<'
8781 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8784 ret = reg_node(pRExC_state, LOGICAL);
8787 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8791 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8792 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8794 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8795 char *name_start= RExC_parse++;
8797 SV *sv_dat=reg_scan_name(pRExC_state,
8798 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8799 if (RExC_parse == name_start || *RExC_parse != ch)
8800 vFAIL2("Sequence (?(%c... not terminated",
8801 (ch == '>' ? '<' : ch));
8804 num = add_data( pRExC_state, 1, "S" );
8805 RExC_rxi->data->data[num]=(void*)sv_dat;
8806 SvREFCNT_inc_simple_void(sv_dat);
8808 ret = reganode(pRExC_state,NGROUPP,num);
8809 goto insert_if_check_paren;
8811 else if (RExC_parse[0] == 'D' &&
8812 RExC_parse[1] == 'E' &&
8813 RExC_parse[2] == 'F' &&
8814 RExC_parse[3] == 'I' &&
8815 RExC_parse[4] == 'N' &&
8816 RExC_parse[5] == 'E')
8818 ret = reganode(pRExC_state,DEFINEP,0);
8821 goto insert_if_check_paren;
8823 else if (RExC_parse[0] == 'R') {
8826 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8827 parno = atoi(RExC_parse++);
8828 while (isDIGIT(*RExC_parse))
8830 } else if (RExC_parse[0] == '&') {
8833 sv_dat = reg_scan_name(pRExC_state,
8834 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8835 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8837 ret = reganode(pRExC_state,INSUBP,parno);
8838 goto insert_if_check_paren;
8840 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8843 parno = atoi(RExC_parse++);
8845 while (isDIGIT(*RExC_parse))
8847 ret = reganode(pRExC_state, GROUPP, parno);
8849 insert_if_check_paren:
8850 if ((c = *nextchar(pRExC_state)) != ')')
8851 vFAIL("Switch condition not recognized");
8853 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8854 br = regbranch(pRExC_state, &flags, 1,depth+1);
8856 br = reganode(pRExC_state, LONGJMP, 0);
8858 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8859 c = *nextchar(pRExC_state);
8864 vFAIL("(?(DEFINE)....) does not allow branches");
8865 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8866 regbranch(pRExC_state, &flags, 1,depth+1);
8867 REGTAIL(pRExC_state, ret, lastbr);
8870 c = *nextchar(pRExC_state);
8875 vFAIL("Switch (?(condition)... contains too many branches");
8876 ender = reg_node(pRExC_state, TAIL);
8877 REGTAIL(pRExC_state, br, ender);
8879 REGTAIL(pRExC_state, lastbr, ender);
8880 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8883 REGTAIL(pRExC_state, ret, ender);
8884 RExC_size++; /* XXX WHY do we need this?!!
8885 For large programs it seems to be required
8886 but I can't figure out why. -- dmq*/
8890 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8894 RExC_parse--; /* for vFAIL to print correctly */
8895 vFAIL("Sequence (? incomplete");
8897 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8899 has_use_defaults = TRUE;
8900 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8901 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8902 ? REGEX_UNICODE_CHARSET
8903 : REGEX_DEPENDS_CHARSET);
8907 parse_flags: /* (?i) */
8909 U32 posflags = 0, negflags = 0;
8910 U32 *flagsp = &posflags;
8911 char has_charset_modifier = '\0';
8912 regex_charset cs = get_regex_charset(RExC_flags);
8913 if (cs == REGEX_DEPENDS_CHARSET
8914 && (RExC_utf8 || RExC_uni_semantics))
8916 cs = REGEX_UNICODE_CHARSET;
8919 while (*RExC_parse) {
8920 /* && strchr("iogcmsx", *RExC_parse) */
8921 /* (?g), (?gc) and (?o) are useless here
8922 and must be globally applied -- japhy */
8923 switch (*RExC_parse) {
8924 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8925 case LOCALE_PAT_MOD:
8926 if (has_charset_modifier) {
8927 goto excess_modifier;
8929 else if (flagsp == &negflags) {
8932 cs = REGEX_LOCALE_CHARSET;
8933 has_charset_modifier = LOCALE_PAT_MOD;
8934 RExC_contains_locale = 1;
8936 case UNICODE_PAT_MOD:
8937 if (has_charset_modifier) {
8938 goto excess_modifier;
8940 else if (flagsp == &negflags) {
8943 cs = REGEX_UNICODE_CHARSET;
8944 has_charset_modifier = UNICODE_PAT_MOD;
8946 case ASCII_RESTRICT_PAT_MOD:
8947 if (flagsp == &negflags) {
8950 if (has_charset_modifier) {
8951 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8952 goto excess_modifier;
8954 /* Doubled modifier implies more restricted */
8955 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8958 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8960 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8962 case DEPENDS_PAT_MOD:
8963 if (has_use_defaults) {
8964 goto fail_modifiers;
8966 else if (flagsp == &negflags) {
8969 else if (has_charset_modifier) {
8970 goto excess_modifier;
8973 /* The dual charset means unicode semantics if the
8974 * pattern (or target, not known until runtime) are
8975 * utf8, or something in the pattern indicates unicode
8977 cs = (RExC_utf8 || RExC_uni_semantics)
8978 ? REGEX_UNICODE_CHARSET
8979 : REGEX_DEPENDS_CHARSET;
8980 has_charset_modifier = DEPENDS_PAT_MOD;
8984 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8985 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8987 else if (has_charset_modifier == *(RExC_parse - 1)) {
8988 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8991 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8996 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8998 case ONCE_PAT_MOD: /* 'o' */
8999 case GLOBAL_PAT_MOD: /* 'g' */
9000 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9001 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9002 if (! (wastedflags & wflagbit) ) {
9003 wastedflags |= wflagbit;
9006 "Useless (%s%c) - %suse /%c modifier",
9007 flagsp == &negflags ? "?-" : "?",
9009 flagsp == &negflags ? "don't " : "",
9016 case CONTINUE_PAT_MOD: /* 'c' */
9017 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9018 if (! (wastedflags & WASTED_C) ) {
9019 wastedflags |= WASTED_GC;
9022 "Useless (%sc) - %suse /gc modifier",
9023 flagsp == &negflags ? "?-" : "?",
9024 flagsp == &negflags ? "don't " : ""
9029 case KEEPCOPY_PAT_MOD: /* 'p' */
9030 if (flagsp == &negflags) {
9032 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9034 *flagsp |= RXf_PMf_KEEPCOPY;
9038 /* A flag is a default iff it is following a minus, so
9039 * if there is a minus, it means will be trying to
9040 * re-specify a default which is an error */
9041 if (has_use_defaults || flagsp == &negflags) {
9044 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9048 wastedflags = 0; /* reset so (?g-c) warns twice */
9054 RExC_flags |= posflags;
9055 RExC_flags &= ~negflags;
9056 set_regex_charset(&RExC_flags, cs);
9058 oregflags |= posflags;
9059 oregflags &= ~negflags;
9060 set_regex_charset(&oregflags, cs);
9062 nextchar(pRExC_state);
9073 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9078 }} /* one for the default block, one for the switch */
9085 ret = reganode(pRExC_state, OPEN, parno);
9088 RExC_nestroot = parno;
9089 if (RExC_seen & REG_SEEN_RECURSE
9090 && !RExC_open_parens[parno-1])
9092 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9093 "Setting open paren #%"IVdf" to %d\n",
9094 (IV)parno, REG_NODE_NUM(ret)));
9095 RExC_open_parens[parno-1]= ret;
9098 Set_Node_Length(ret, 1); /* MJD */
9099 Set_Node_Offset(ret, RExC_parse); /* MJD */
9107 /* Pick up the branches, linking them together. */
9108 parse_start = RExC_parse; /* MJD */
9109 br = regbranch(pRExC_state, &flags, 1,depth+1);
9111 /* branch_len = (paren != 0); */
9115 if (*RExC_parse == '|') {
9116 if (!SIZE_ONLY && RExC_extralen) {
9117 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9120 reginsert(pRExC_state, BRANCH, br, depth+1);
9121 Set_Node_Length(br, paren != 0);
9122 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9126 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9128 else if (paren == ':') {
9129 *flagp |= flags&SIMPLE;
9131 if (is_open) { /* Starts with OPEN. */
9132 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9134 else if (paren != '?') /* Not Conditional */
9136 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9138 while (*RExC_parse == '|') {
9139 if (!SIZE_ONLY && RExC_extralen) {
9140 ender = reganode(pRExC_state, LONGJMP,0);
9141 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9144 RExC_extralen += 2; /* Account for LONGJMP. */
9145 nextchar(pRExC_state);
9147 if (RExC_npar > after_freeze)
9148 after_freeze = RExC_npar;
9149 RExC_npar = freeze_paren;
9151 br = regbranch(pRExC_state, &flags, 0, depth+1);
9155 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9157 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9160 if (have_branch || paren != ':') {
9161 /* Make a closing node, and hook it on the end. */
9164 ender = reg_node(pRExC_state, TAIL);
9167 ender = reganode(pRExC_state, CLOSE, parno);
9168 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9169 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9170 "Setting close paren #%"IVdf" to %d\n",
9171 (IV)parno, REG_NODE_NUM(ender)));
9172 RExC_close_parens[parno-1]= ender;
9173 if (RExC_nestroot == parno)
9176 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9177 Set_Node_Length(ender,1); /* MJD */
9183 *flagp &= ~HASWIDTH;
9186 ender = reg_node(pRExC_state, SUCCEED);
9189 ender = reg_node(pRExC_state, END);
9191 assert(!RExC_opend); /* there can only be one! */
9196 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9197 SV * const mysv_val1=sv_newmortal();
9198 SV * const mysv_val2=sv_newmortal();
9199 DEBUG_PARSE_MSG("lsbr");
9200 regprop(RExC_rx, mysv_val1, lastbr);
9201 regprop(RExC_rx, mysv_val2, ender);
9202 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9203 SvPV_nolen_const(mysv_val1),
9204 (IV)REG_NODE_NUM(lastbr),
9205 SvPV_nolen_const(mysv_val2),
9206 (IV)REG_NODE_NUM(ender),
9207 (IV)(ender - lastbr)
9210 REGTAIL(pRExC_state, lastbr, ender);
9212 if (have_branch && !SIZE_ONLY) {
9215 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9217 /* Hook the tails of the branches to the closing node. */
9218 for (br = ret; br; br = regnext(br)) {
9219 const U8 op = PL_regkind[OP(br)];
9221 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9222 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9225 else if (op == BRANCHJ) {
9226 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9227 /* for now we always disable this optimisation * /
9228 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9234 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9235 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9236 SV * const mysv_val1=sv_newmortal();
9237 SV * const mysv_val2=sv_newmortal();
9238 DEBUG_PARSE_MSG("NADA");
9239 regprop(RExC_rx, mysv_val1, ret);
9240 regprop(RExC_rx, mysv_val2, ender);
9241 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9242 SvPV_nolen_const(mysv_val1),
9243 (IV)REG_NODE_NUM(ret),
9244 SvPV_nolen_const(mysv_val2),
9245 (IV)REG_NODE_NUM(ender),
9250 if (OP(ender) == TAIL) {
9255 for ( opt= br + 1; opt < ender ; opt++ )
9257 NEXT_OFF(br)= ender - br;
9265 static const char parens[] = "=!<,>";
9267 if (paren && (p = strchr(parens, paren))) {
9268 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9269 int flag = (p - parens) > 1;
9272 node = SUSPEND, flag = 0;
9273 reginsert(pRExC_state, node,ret, depth+1);
9274 Set_Node_Cur_Length(ret);
9275 Set_Node_Offset(ret, parse_start + 1);
9277 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9281 /* Check for proper termination. */
9283 RExC_flags = oregflags;
9284 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9285 RExC_parse = oregcomp_parse;
9286 vFAIL("Unmatched (");
9289 else if (!paren && RExC_parse < RExC_end) {
9290 if (*RExC_parse == ')') {
9292 vFAIL("Unmatched )");
9295 FAIL("Junk on end of regexp"); /* "Can't happen". */
9296 assert(0); /* NOTREACHED */
9299 if (RExC_in_lookbehind) {
9300 RExC_in_lookbehind--;
9302 if (after_freeze > RExC_npar)
9303 RExC_npar = after_freeze;
9308 - regbranch - one alternative of an | operator
9310 * Implements the concatenation operator.
9313 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9317 regnode *chain = NULL;
9319 I32 flags = 0, c = 0;
9320 GET_RE_DEBUG_FLAGS_DECL;
9322 PERL_ARGS_ASSERT_REGBRANCH;
9324 DEBUG_PARSE("brnc");
9329 if (!SIZE_ONLY && RExC_extralen)
9330 ret = reganode(pRExC_state, BRANCHJ,0);
9332 ret = reg_node(pRExC_state, BRANCH);
9333 Set_Node_Length(ret, 1);
9337 if (!first && SIZE_ONLY)
9338 RExC_extralen += 1; /* BRANCHJ */
9340 *flagp = WORST; /* Tentatively. */
9343 nextchar(pRExC_state);
9344 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9346 latest = regpiece(pRExC_state, &flags,depth+1);
9347 if (latest == NULL) {
9348 if (flags & TRYAGAIN)
9352 else if (ret == NULL)
9354 *flagp |= flags&(HASWIDTH|POSTPONED);
9355 if (chain == NULL) /* First piece. */
9356 *flagp |= flags&SPSTART;
9359 REGTAIL(pRExC_state, chain, latest);
9364 if (chain == NULL) { /* Loop ran zero times. */
9365 chain = reg_node(pRExC_state, NOTHING);
9370 *flagp |= flags&SIMPLE;
9377 - regpiece - something followed by possible [*+?]
9379 * Note that the branching code sequences used for ? and the general cases
9380 * of * and + are somewhat optimized: they use the same NOTHING node as
9381 * both the endmarker for their branch list and the body of the last branch.
9382 * It might seem that this node could be dispensed with entirely, but the
9383 * endmarker role is not redundant.
9386 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9393 const char * const origparse = RExC_parse;
9395 I32 max = REG_INFTY;
9396 #ifdef RE_TRACK_PATTERN_OFFSETS
9399 const char *maxpos = NULL;
9401 /* Save the original in case we change the emitted regop to a FAIL. */
9402 regnode * const orig_emit = RExC_emit;
9404 GET_RE_DEBUG_FLAGS_DECL;
9406 PERL_ARGS_ASSERT_REGPIECE;
9408 DEBUG_PARSE("piec");
9410 ret = regatom(pRExC_state, &flags,depth+1);
9412 if (flags & TRYAGAIN)
9419 if (op == '{' && regcurly(RExC_parse)) {
9421 #ifdef RE_TRACK_PATTERN_OFFSETS
9422 parse_start = RExC_parse; /* MJD */
9424 next = RExC_parse + 1;
9425 while (isDIGIT(*next) || *next == ',') {
9434 if (*next == '}') { /* got one */
9438 min = atoi(RExC_parse);
9442 maxpos = RExC_parse;
9444 if (!max && *maxpos != '0')
9445 max = REG_INFTY; /* meaning "infinity" */
9446 else if (max >= REG_INFTY)
9447 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9449 nextchar(pRExC_state);
9450 if (max < min) { /* If can't match, warn and optimize to fail
9453 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9455 /* We can't back off the size because we have to reserve
9456 * enough space for all the things we are about to throw
9457 * away, but we can shrink it by the ammount we are about
9459 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9462 RExC_emit = orig_emit;
9464 ret = reg_node(pRExC_state, OPFAIL);
9467 else if (max == 0) { /* replace {0} with a nothing node */
9469 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)NOTHING];
9472 RExC_emit = orig_emit;
9474 ret = reg_node(pRExC_state, NOTHING);
9479 if ((flags&SIMPLE)) {
9480 RExC_naughty += 2 + RExC_naughty / 2;
9481 reginsert(pRExC_state, CURLY, ret, depth+1);
9482 Set_Node_Offset(ret, parse_start+1); /* MJD */
9483 Set_Node_Cur_Length(ret);
9486 regnode * const w = reg_node(pRExC_state, WHILEM);
9489 REGTAIL(pRExC_state, ret, w);
9490 if (!SIZE_ONLY && RExC_extralen) {
9491 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9492 reginsert(pRExC_state, NOTHING,ret, depth+1);
9493 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9495 reginsert(pRExC_state, CURLYX,ret, depth+1);
9497 Set_Node_Offset(ret, parse_start+1);
9498 Set_Node_Length(ret,
9499 op == '{' ? (RExC_parse - parse_start) : 1);
9501 if (!SIZE_ONLY && RExC_extralen)
9502 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9503 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9505 RExC_whilem_seen++, RExC_extralen += 3;
9506 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9515 ARG1_SET(ret, (U16)min);
9516 ARG2_SET(ret, (U16)max);
9528 #if 0 /* Now runtime fix should be reliable. */
9530 /* if this is reinstated, don't forget to put this back into perldiag:
9532 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9534 (F) The part of the regexp subject to either the * or + quantifier
9535 could match an empty string. The {#} shows in the regular
9536 expression about where the problem was discovered.
9540 if (!(flags&HASWIDTH) && op != '?')
9541 vFAIL("Regexp *+ operand could be empty");
9544 #ifdef RE_TRACK_PATTERN_OFFSETS
9545 parse_start = RExC_parse;
9547 nextchar(pRExC_state);
9549 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9551 if (op == '*' && (flags&SIMPLE)) {
9552 reginsert(pRExC_state, STAR, ret, depth+1);
9556 else if (op == '*') {
9560 else if (op == '+' && (flags&SIMPLE)) {
9561 reginsert(pRExC_state, PLUS, ret, depth+1);
9565 else if (op == '+') {
9569 else if (op == '?') {
9574 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9575 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9576 ckWARN3reg(RExC_parse,
9577 "%.*s matches null string many times",
9578 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9580 (void)ReREFCNT_inc(RExC_rx_sv);
9583 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9584 nextchar(pRExC_state);
9585 reginsert(pRExC_state, MINMOD, ret, depth+1);
9586 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9588 #ifndef REG_ALLOW_MINMOD_SUSPEND
9591 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9593 nextchar(pRExC_state);
9594 ender = reg_node(pRExC_state, SUCCEED);
9595 REGTAIL(pRExC_state, ret, ender);
9596 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9598 ender = reg_node(pRExC_state, TAIL);
9599 REGTAIL(pRExC_state, ret, ender);
9603 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9605 vFAIL("Nested quantifiers");
9612 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9615 /* This is expected to be called by a parser routine that has recognized '\N'
9616 and needs to handle the rest. RExC_parse is expected to point at the first
9617 char following the N at the time of the call. On successful return,
9618 RExC_parse has been updated to point to just after the sequence identified
9619 by this routine, and <*flagp> has been updated.
9621 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9624 \N may begin either a named sequence, or if outside a character class, mean
9625 to match a non-newline. For non single-quoted regexes, the tokenizer has
9626 attempted to decide which, and in the case of a named sequence, converted it
9627 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9628 where c1... are the characters in the sequence. For single-quoted regexes,
9629 the tokenizer passes the \N sequence through unchanged; this code will not
9630 attempt to determine this nor expand those, instead raising a syntax error.
9631 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9632 or there is no '}', it signals that this \N occurrence means to match a
9635 Only the \N{U+...} form should occur in a character class, for the same
9636 reason that '.' inside a character class means to just match a period: it
9637 just doesn't make sense.
9639 The function raises an error (via vFAIL), and doesn't return for various
9640 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9641 success; it returns FALSE otherwise.
9643 If <valuep> is non-null, it means the caller can accept an input sequence
9644 consisting of a just a single code point; <*valuep> is set to that value
9645 if the input is such.
9647 If <node_p> is non-null it signifies that the caller can accept any other
9648 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9650 1) \N means not-a-NL: points to a newly created REG_ANY node;
9651 2) \N{}: points to a new NOTHING node;
9652 3) otherwise: points to a new EXACT node containing the resolved
9654 Note that FALSE is returned for single code point sequences if <valuep> is
9658 char * endbrace; /* '}' following the name */
9660 char *endchar; /* Points to '.' or '}' ending cur char in the input
9662 bool has_multiple_chars; /* true if the input stream contains a sequence of
9663 more than one character */
9665 GET_RE_DEBUG_FLAGS_DECL;
9667 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9671 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9673 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9674 * modifier. The other meaning does not */
9675 p = (RExC_flags & RXf_PMf_EXTENDED)
9676 ? regwhite( pRExC_state, RExC_parse )
9679 /* Disambiguate between \N meaning a named character versus \N meaning
9680 * [^\n]. The former is assumed when it can't be the latter. */
9681 if (*p != '{' || regcurly(p)) {
9684 /* no bare \N in a charclass */
9685 if (in_char_class) {
9686 vFAIL("\\N in a character class must be a named character: \\N{...}");
9690 nextchar(pRExC_state);
9691 *node_p = reg_node(pRExC_state, REG_ANY);
9692 *flagp |= HASWIDTH|SIMPLE;
9695 Set_Node_Length(*node_p, 1); /* MJD */
9699 /* Here, we have decided it should be a named character or sequence */
9701 /* The test above made sure that the next real character is a '{', but
9702 * under the /x modifier, it could be separated by space (or a comment and
9703 * \n) and this is not allowed (for consistency with \x{...} and the
9704 * tokenizer handling of \N{NAME}). */
9705 if (*RExC_parse != '{') {
9706 vFAIL("Missing braces on \\N{}");
9709 RExC_parse++; /* Skip past the '{' */
9711 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9712 || ! (endbrace == RExC_parse /* nothing between the {} */
9713 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9714 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9716 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9717 vFAIL("\\N{NAME} must be resolved by the lexer");
9720 if (endbrace == RExC_parse) { /* empty: \N{} */
9723 *node_p = reg_node(pRExC_state,NOTHING);
9725 else if (in_char_class) {
9726 if (SIZE_ONLY && in_char_class) {
9727 ckWARNreg(RExC_parse,
9728 "Ignoring zero length \\N{} in character class"
9736 nextchar(pRExC_state);
9740 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9741 RExC_parse += 2; /* Skip past the 'U+' */
9743 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9745 /* Code points are separated by dots. If none, there is only one code
9746 * point, and is terminated by the brace */
9747 has_multiple_chars = (endchar < endbrace);
9749 if (valuep && (! has_multiple_chars || in_char_class)) {
9750 /* We only pay attention to the first char of
9751 multichar strings being returned in char classes. I kinda wonder
9752 if this makes sense as it does change the behaviour
9753 from earlier versions, OTOH that behaviour was broken
9754 as well. XXX Solution is to recharacterize as
9755 [rest-of-class]|multi1|multi2... */
9757 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9758 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9759 | PERL_SCAN_DISALLOW_PREFIX
9760 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9762 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9764 /* The tokenizer should have guaranteed validity, but it's possible to
9765 * bypass it by using single quoting, so check */
9766 if (length_of_hex == 0
9767 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9769 RExC_parse += length_of_hex; /* Includes all the valid */
9770 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9771 ? UTF8SKIP(RExC_parse)
9773 /* Guard against malformed utf8 */
9774 if (RExC_parse >= endchar) {
9775 RExC_parse = endchar;
9777 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9780 if (in_char_class && has_multiple_chars) {
9781 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9784 RExC_parse = endbrace + 1;
9786 else if (! node_p || ! has_multiple_chars) {
9788 /* Here, the input is legal, but not according to the caller's
9789 * options. We fail without advancing the parse, so that the
9790 * caller can try again */
9796 /* What is done here is to convert this to a sub-pattern of the form
9797 * (?:\x{char1}\x{char2}...)
9798 * and then call reg recursively. That way, it retains its atomicness,
9799 * while not having to worry about special handling that some code
9800 * points may have. toke.c has converted the original Unicode values
9801 * to native, so that we can just pass on the hex values unchanged. We
9802 * do have to set a flag to keep recoding from happening in the
9805 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9807 char *orig_end = RExC_end;
9810 while (RExC_parse < endbrace) {
9812 /* Convert to notation the rest of the code understands */
9813 sv_catpv(substitute_parse, "\\x{");
9814 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9815 sv_catpv(substitute_parse, "}");
9817 /* Point to the beginning of the next character in the sequence. */
9818 RExC_parse = endchar + 1;
9819 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9821 sv_catpv(substitute_parse, ")");
9823 RExC_parse = SvPV(substitute_parse, len);
9825 /* Don't allow empty number */
9827 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9829 RExC_end = RExC_parse + len;
9831 /* The values are Unicode, and therefore not subject to recoding */
9832 RExC_override_recoding = 1;
9834 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9835 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9837 RExC_parse = endbrace;
9838 RExC_end = orig_end;
9839 RExC_override_recoding = 0;
9841 nextchar(pRExC_state);
9851 * It returns the code point in utf8 for the value in *encp.
9852 * value: a code value in the source encoding
9853 * encp: a pointer to an Encode object
9855 * If the result from Encode is not a single character,
9856 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9859 S_reg_recode(pTHX_ const char value, SV **encp)
9862 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9863 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9864 const STRLEN newlen = SvCUR(sv);
9865 UV uv = UNICODE_REPLACEMENT;
9867 PERL_ARGS_ASSERT_REG_RECODE;
9871 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9874 if (!newlen || numlen != newlen) {
9875 uv = UNICODE_REPLACEMENT;
9881 PERL_STATIC_INLINE U8
9882 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9886 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9892 op = get_regex_charset(RExC_flags);
9893 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9894 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9895 been, so there is no hole */
9901 PERL_STATIC_INLINE void
9902 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9904 /* This knows the details about sizing an EXACTish node, setting flags for
9905 * it (by setting <*flagp>, and potentially populating it with a single
9908 * If <len> (the length in bytes) is non-zero, this function assumes that
9909 * the node has already been populated, and just does the sizing. In this
9910 * case <code_point> should be the final code point that has already been
9911 * placed into the node. This value will be ignored except that under some
9912 * circumstances <*flagp> is set based on it.
9914 * If <len> is zero, the function assumes that the node is to contain only
9915 * the single character given by <code_point> and calculates what <len>
9916 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9917 * additionally will populate the node's STRING with <code_point>, if <len>
9918 * is 0. In both cases <*flagp> is appropriately set
9920 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9921 * folded (the latter only when the rules indicate it can match 'ss') */
9923 bool len_passed_in = cBOOL(len != 0);
9924 U8 character[UTF8_MAXBYTES_CASE+1];
9926 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9928 if (! len_passed_in) {
9931 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9934 uvchr_to_utf8( character, code_point);
9935 len = UTF8SKIP(character);
9939 || code_point != LATIN_SMALL_LETTER_SHARP_S
9940 || ASCII_FOLD_RESTRICTED
9941 || ! AT_LEAST_UNI_SEMANTICS)
9943 *character = (U8) code_point;
9948 *(character + 1) = 's';
9954 RExC_size += STR_SZ(len);
9957 RExC_emit += STR_SZ(len);
9958 STR_LEN(node) = len;
9959 if (! len_passed_in) {
9960 Copy((char *) character, STRING(node), len, char);
9966 /* A single character node is SIMPLE, except for the special-cased SHARP S
9968 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9969 && (code_point != LATIN_SMALL_LETTER_SHARP_S
9970 || ! FOLD || ! DEPENDS_SEMANTICS))
9977 - regatom - the lowest level
9979 Try to identify anything special at the start of the pattern. If there
9980 is, then handle it as required. This may involve generating a single regop,
9981 such as for an assertion; or it may involve recursing, such as to
9982 handle a () structure.
9984 If the string doesn't start with something special then we gobble up
9985 as much literal text as we can.
9987 Once we have been able to handle whatever type of thing started the
9988 sequence, we return.
9990 Note: we have to be careful with escapes, as they can be both literal
9991 and special, and in the case of \10 and friends, context determines which.
9993 A summary of the code structure is:
9995 switch (first_byte) {
9996 cases for each special:
9997 handle this special;
10000 switch (2nd byte) {
10001 cases for each unambiguous special:
10002 handle this special;
10004 cases for each ambigous special/literal:
10006 if (special) handle here
10008 default: // unambiguously literal:
10011 default: // is a literal char
10014 create EXACTish node for literal;
10015 while (more input and node isn't full) {
10016 switch (input_byte) {
10017 cases for each special;
10018 make sure parse pointer is set so that the next call to
10019 regatom will see this special first
10020 goto loopdone; // EXACTish node terminated by prev. char
10022 append char to EXACTISH node;
10024 get next input byte;
10028 return the generated node;
10030 Specifically there are two separate switches for handling
10031 escape sequences, with the one for handling literal escapes requiring
10032 a dummy entry for all of the special escapes that are actually handled
10037 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10040 regnode *ret = NULL;
10042 char *parse_start = RExC_parse;
10046 GET_RE_DEBUG_FLAGS_DECL;
10048 *flagp = WORST; /* Tentatively. */
10050 DEBUG_PARSE("atom");
10052 PERL_ARGS_ASSERT_REGATOM;
10055 switch ((U8)*RExC_parse) {
10057 RExC_seen_zerolen++;
10058 nextchar(pRExC_state);
10059 if (RExC_flags & RXf_PMf_MULTILINE)
10060 ret = reg_node(pRExC_state, MBOL);
10061 else if (RExC_flags & RXf_PMf_SINGLELINE)
10062 ret = reg_node(pRExC_state, SBOL);
10064 ret = reg_node(pRExC_state, BOL);
10065 Set_Node_Length(ret, 1); /* MJD */
10068 nextchar(pRExC_state);
10070 RExC_seen_zerolen++;
10071 if (RExC_flags & RXf_PMf_MULTILINE)
10072 ret = reg_node(pRExC_state, MEOL);
10073 else if (RExC_flags & RXf_PMf_SINGLELINE)
10074 ret = reg_node(pRExC_state, SEOL);
10076 ret = reg_node(pRExC_state, EOL);
10077 Set_Node_Length(ret, 1); /* MJD */
10080 nextchar(pRExC_state);
10081 if (RExC_flags & RXf_PMf_SINGLELINE)
10082 ret = reg_node(pRExC_state, SANY);
10084 ret = reg_node(pRExC_state, REG_ANY);
10085 *flagp |= HASWIDTH|SIMPLE;
10087 Set_Node_Length(ret, 1); /* MJD */
10091 char * const oregcomp_parse = ++RExC_parse;
10092 ret = regclass(pRExC_state, flagp,depth+1);
10093 if (*RExC_parse != ']') {
10094 RExC_parse = oregcomp_parse;
10095 vFAIL("Unmatched [");
10097 nextchar(pRExC_state);
10098 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10102 nextchar(pRExC_state);
10103 ret = reg(pRExC_state, 1, &flags,depth+1);
10105 if (flags & TRYAGAIN) {
10106 if (RExC_parse == RExC_end) {
10107 /* Make parent create an empty node if needed. */
10108 *flagp |= TRYAGAIN;
10115 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10119 if (flags & TRYAGAIN) {
10120 *flagp |= TRYAGAIN;
10123 vFAIL("Internal urp");
10124 /* Supposed to be caught earlier. */
10130 vFAIL("Quantifier follows nothing");
10135 This switch handles escape sequences that resolve to some kind
10136 of special regop and not to literal text. Escape sequnces that
10137 resolve to literal text are handled below in the switch marked
10140 Every entry in this switch *must* have a corresponding entry
10141 in the literal escape switch. However, the opposite is not
10142 required, as the default for this switch is to jump to the
10143 literal text handling code.
10145 switch ((U8)*++RExC_parse) {
10147 /* Special Escapes */
10149 RExC_seen_zerolen++;
10150 ret = reg_node(pRExC_state, SBOL);
10152 goto finish_meta_pat;
10154 ret = reg_node(pRExC_state, GPOS);
10155 RExC_seen |= REG_SEEN_GPOS;
10157 goto finish_meta_pat;
10159 RExC_seen_zerolen++;
10160 ret = reg_node(pRExC_state, KEEPS);
10162 /* XXX:dmq : disabling in-place substitution seems to
10163 * be necessary here to avoid cases of memory corruption, as
10164 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10166 RExC_seen |= REG_SEEN_LOOKBEHIND;
10167 goto finish_meta_pat;
10169 ret = reg_node(pRExC_state, SEOL);
10171 RExC_seen_zerolen++; /* Do not optimize RE away */
10172 goto finish_meta_pat;
10174 ret = reg_node(pRExC_state, EOS);
10176 RExC_seen_zerolen++; /* Do not optimize RE away */
10177 goto finish_meta_pat;
10179 ret = reg_node(pRExC_state, CANY);
10180 RExC_seen |= REG_SEEN_CANY;
10181 *flagp |= HASWIDTH|SIMPLE;
10182 goto finish_meta_pat;
10184 ret = reg_node(pRExC_state, CLUMP);
10185 *flagp |= HASWIDTH;
10186 goto finish_meta_pat;
10192 arg = ANYOF_WORDCHAR;
10196 RExC_seen_zerolen++;
10197 RExC_seen |= REG_SEEN_LOOKBEHIND;
10198 op = BOUND + get_regex_charset(RExC_flags);
10199 if (op > BOUNDA) { /* /aa is same as /a */
10202 ret = reg_node(pRExC_state, op);
10203 FLAGS(ret) = get_regex_charset(RExC_flags);
10205 goto finish_meta_pat;
10207 RExC_seen_zerolen++;
10208 RExC_seen |= REG_SEEN_LOOKBEHIND;
10209 op = NBOUND + get_regex_charset(RExC_flags);
10210 if (op > NBOUNDA) { /* /aa is same as /a */
10213 ret = reg_node(pRExC_state, op);
10214 FLAGS(ret) = get_regex_charset(RExC_flags);
10216 goto finish_meta_pat;
10226 ret = reg_node(pRExC_state, LNBREAK);
10227 *flagp |= HASWIDTH|SIMPLE;
10228 goto finish_meta_pat;
10236 goto join_posix_op_known;
10242 arg = ANYOF_VERTWS;
10244 goto join_posix_op_known;
10254 op = POSIXD + get_regex_charset(RExC_flags);
10255 if (op > POSIXA) { /* /aa is same as /a */
10259 join_posix_op_known:
10262 op += NPOSIXD - POSIXD;
10265 ret = reg_node(pRExC_state, op);
10267 FLAGS(ret) = namedclass_to_classnum(arg);
10270 *flagp |= HASWIDTH|SIMPLE;
10274 nextchar(pRExC_state);
10275 Set_Node_Length(ret, 2); /* MJD */
10280 char* const oldregxend = RExC_end;
10282 char* parse_start = RExC_parse - 2;
10285 if (RExC_parse[1] == '{') {
10286 /* a lovely hack--pretend we saw [\pX] instead */
10287 RExC_end = strchr(RExC_parse, '}');
10289 const U8 c = (U8)*RExC_parse;
10291 RExC_end = oldregxend;
10292 vFAIL2("Missing right brace on \\%c{}", c);
10297 RExC_end = RExC_parse + 2;
10298 if (RExC_end > oldregxend)
10299 RExC_end = oldregxend;
10303 ret = regclass(pRExC_state, flagp,depth+1);
10305 RExC_end = oldregxend;
10308 Set_Node_Offset(ret, parse_start + 2);
10309 Set_Node_Cur_Length(ret);
10310 nextchar(pRExC_state);
10314 /* Handle \N and \N{NAME} with multiple code points here and not
10315 * below because it can be multicharacter. join_exact() will join
10316 * them up later on. Also this makes sure that things like
10317 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10318 * The options to the grok function call causes it to fail if the
10319 * sequence is just a single code point. We then go treat it as
10320 * just another character in the current EXACT node, and hence it
10321 * gets uniform treatment with all the other characters. The
10322 * special treatment for quantifiers is not needed for such single
10323 * character sequences */
10325 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10330 case 'k': /* Handle \k<NAME> and \k'NAME' */
10333 char ch= RExC_parse[1];
10334 if (ch != '<' && ch != '\'' && ch != '{') {
10336 vFAIL2("Sequence %.2s... not terminated",parse_start);
10338 /* this pretty much dupes the code for (?P=...) in reg(), if
10339 you change this make sure you change that */
10340 char* name_start = (RExC_parse += 2);
10342 SV *sv_dat = reg_scan_name(pRExC_state,
10343 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10344 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10345 if (RExC_parse == name_start || *RExC_parse != ch)
10346 vFAIL2("Sequence %.3s... not terminated",parse_start);
10349 num = add_data( pRExC_state, 1, "S" );
10350 RExC_rxi->data->data[num]=(void*)sv_dat;
10351 SvREFCNT_inc_simple_void(sv_dat);
10355 ret = reganode(pRExC_state,
10358 : (ASCII_FOLD_RESTRICTED)
10360 : (AT_LEAST_UNI_SEMANTICS)
10366 *flagp |= HASWIDTH;
10368 /* override incorrect value set in reganode MJD */
10369 Set_Node_Offset(ret, parse_start+1);
10370 Set_Node_Cur_Length(ret); /* MJD */
10371 nextchar(pRExC_state);
10377 case '1': case '2': case '3': case '4':
10378 case '5': case '6': case '7': case '8': case '9':
10381 bool isg = *RExC_parse == 'g';
10386 if (*RExC_parse == '{') {
10390 if (*RExC_parse == '-') {
10394 if (hasbrace && !isDIGIT(*RExC_parse)) {
10395 if (isrel) RExC_parse--;
10397 goto parse_named_seq;
10399 num = atoi(RExC_parse);
10400 if (isg && num == 0)
10401 vFAIL("Reference to invalid group 0");
10403 num = RExC_npar - num;
10405 vFAIL("Reference to nonexistent or unclosed group");
10407 if (!isg && num > 9 && num >= RExC_npar)
10408 /* Probably a character specified in octal, e.g. \35 */
10411 char * const parse_start = RExC_parse - 1; /* MJD */
10412 while (isDIGIT(*RExC_parse))
10414 if (parse_start == RExC_parse - 1)
10415 vFAIL("Unterminated \\g... pattern");
10417 if (*RExC_parse != '}')
10418 vFAIL("Unterminated \\g{...} pattern");
10422 if (num > (I32)RExC_rx->nparens)
10423 vFAIL("Reference to nonexistent group");
10426 ret = reganode(pRExC_state,
10429 : (ASCII_FOLD_RESTRICTED)
10431 : (AT_LEAST_UNI_SEMANTICS)
10437 *flagp |= HASWIDTH;
10439 /* override incorrect value set in reganode MJD */
10440 Set_Node_Offset(ret, parse_start+1);
10441 Set_Node_Cur_Length(ret); /* MJD */
10443 nextchar(pRExC_state);
10448 if (RExC_parse >= RExC_end)
10449 FAIL("Trailing \\");
10452 /* Do not generate "unrecognized" warnings here, we fall
10453 back into the quick-grab loop below */
10460 if (RExC_flags & RXf_PMf_EXTENDED) {
10461 if ( reg_skipcomment( pRExC_state ) )
10468 parse_start = RExC_parse - 1;
10477 #define MAX_NODE_STRING_SIZE 127
10478 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10480 U8 upper_parse = MAX_NODE_STRING_SIZE;
10483 bool next_is_quantifier;
10484 char * oldp = NULL;
10486 /* If a folding node contains only code points that don't
10487 * participate in folds, it can be changed into an EXACT node,
10488 * which allows the optimizer more things to look for */
10492 node_type = compute_EXACTish(pRExC_state);
10493 ret = reg_node(pRExC_state, node_type);
10495 /* In pass1, folded, we use a temporary buffer instead of the
10496 * actual node, as the node doesn't exist yet */
10497 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10503 /* We do the EXACTFish to EXACT node only if folding, and not if in
10504 * locale, as whether a character folds or not isn't known until
10506 maybe_exact = FOLD && ! LOC;
10508 /* XXX The node can hold up to 255 bytes, yet this only goes to
10509 * 127. I (khw) do not know why. Keeping it somewhat less than
10510 * 255 allows us to not have to worry about overflow due to
10511 * converting to utf8 and fold expansion, but that value is
10512 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10513 * split up by this limit into a single one using the real max of
10514 * 255. Even at 127, this breaks under rare circumstances. If
10515 * folding, we do not want to split a node at a character that is a
10516 * non-final in a multi-char fold, as an input string could just
10517 * happen to want to match across the node boundary. The join
10518 * would solve that problem if the join actually happens. But a
10519 * series of more than two nodes in a row each of 127 would cause
10520 * the first join to succeed to get to 254, but then there wouldn't
10521 * be room for the next one, which could at be one of those split
10522 * multi-char folds. I don't know of any fool-proof solution. One
10523 * could back off to end with only a code point that isn't such a
10524 * non-final, but it is possible for there not to be any in the
10526 for (p = RExC_parse - 1;
10527 len < upper_parse && p < RExC_end;
10532 if (RExC_flags & RXf_PMf_EXTENDED)
10533 p = regwhite( pRExC_state, p );
10544 /* Literal Escapes Switch
10546 This switch is meant to handle escape sequences that
10547 resolve to a literal character.
10549 Every escape sequence that represents something
10550 else, like an assertion or a char class, is handled
10551 in the switch marked 'Special Escapes' above in this
10552 routine, but also has an entry here as anything that
10553 isn't explicitly mentioned here will be treated as
10554 an unescaped equivalent literal.
10557 switch ((U8)*++p) {
10558 /* These are all the special escapes. */
10559 case 'A': /* Start assertion */
10560 case 'b': case 'B': /* Word-boundary assertion*/
10561 case 'C': /* Single char !DANGEROUS! */
10562 case 'd': case 'D': /* digit class */
10563 case 'g': case 'G': /* generic-backref, pos assertion */
10564 case 'h': case 'H': /* HORIZWS */
10565 case 'k': case 'K': /* named backref, keep marker */
10566 case 'p': case 'P': /* Unicode property */
10567 case 'R': /* LNBREAK */
10568 case 's': case 'S': /* space class */
10569 case 'v': case 'V': /* VERTWS */
10570 case 'w': case 'W': /* word class */
10571 case 'X': /* eXtended Unicode "combining character sequence" */
10572 case 'z': case 'Z': /* End of line/string assertion */
10576 /* Anything after here is an escape that resolves to a
10577 literal. (Except digits, which may or may not)
10583 case 'N': /* Handle a single-code point named character. */
10584 /* The options cause it to fail if a multiple code
10585 * point sequence. Handle those in the switch() above
10587 RExC_parse = p + 1;
10588 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10589 flagp, depth, FALSE))
10591 RExC_parse = p = oldp;
10595 if (ender > 0xff) {
10612 ender = ASCII_TO_NATIVE('\033');
10616 ender = ASCII_TO_NATIVE('\007');
10621 STRLEN brace_len = len;
10623 const char* error_msg;
10625 bool valid = grok_bslash_o(p,
10632 RExC_parse = p; /* going to die anyway; point
10633 to exact spot of failure */
10640 if (PL_encoding && ender < 0x100) {
10641 goto recode_encoding;
10643 if (ender > 0xff) {
10650 STRLEN brace_len = len;
10652 const char* error_msg;
10654 bool valid = grok_bslash_x(p,
10661 RExC_parse = p; /* going to die anyway; point
10662 to exact spot of failure */
10668 if (PL_encoding && ender < 0x100) {
10669 goto recode_encoding;
10671 if (ender > 0xff) {
10678 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10680 case '0': case '1': case '2': case '3':case '4':
10681 case '5': case '6': case '7':
10683 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10685 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10687 ender = grok_oct(p, &numlen, &flags, NULL);
10688 if (ender > 0xff) {
10697 if (PL_encoding && ender < 0x100)
10698 goto recode_encoding;
10701 if (! RExC_override_recoding) {
10702 SV* enc = PL_encoding;
10703 ender = reg_recode((const char)(U8)ender, &enc);
10704 if (!enc && SIZE_ONLY)
10705 ckWARNreg(p, "Invalid escape in the specified encoding");
10711 FAIL("Trailing \\");
10714 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
10715 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10717 goto normal_default;
10721 /* Currently we don't warn when the lbrace is at the start
10722 * of a construct. This catches it in the middle of a
10723 * literal string, or when its the first thing after
10724 * something like "\b" */
10726 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10728 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10733 if (UTF8_IS_START(*p) && UTF) {
10735 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10736 &numlen, UTF8_ALLOW_DEFAULT);
10742 } /* End of switch on the literal */
10744 /* Here, have looked at the literal character and <ender>
10745 * contains its ordinal, <p> points to the character after it
10748 if ( RExC_flags & RXf_PMf_EXTENDED)
10749 p = regwhite( pRExC_state, p );
10751 /* If the next thing is a quantifier, it applies to this
10752 * character only, which means that this character has to be in
10753 * its own node and can't just be appended to the string in an
10754 * existing node, so if there are already other characters in
10755 * the node, close the node with just them, and set up to do
10756 * this character again next time through, when it will be the
10757 * only thing in its new node */
10758 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10766 /* See comments for join_exact() as to why we fold
10767 * this non-UTF at compile time */
10768 || (node_type == EXACTFU
10769 && ender == LATIN_SMALL_LETTER_SHARP_S))
10773 /* Prime the casefolded buffer. Locale rules, which
10774 * apply only to code points < 256, aren't known until
10775 * execution, so for them, just output the original
10776 * character using utf8. If we start to fold non-UTF
10777 * patterns, be sure to update join_exact() */
10778 if (LOC && ender < 256) {
10779 if (UNI_IS_INVARIANT(ender)) {
10783 *s = UTF8_TWO_BYTE_HI(ender);
10784 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10789 UV folded = _to_uni_fold_flags(
10794 | ((LOC) ? FOLD_FLAGS_LOCALE
10795 : (ASCII_FOLD_RESTRICTED)
10796 ? FOLD_FLAGS_NOMIX_ASCII
10800 /* If this node only contains non-folding code
10801 * points so far, see if this new one is also
10804 if (folded != ender) {
10805 maybe_exact = FALSE;
10808 /* Here the fold is the original; we have
10809 * to check further to see if anything
10811 if (! PL_utf8_foldable) {
10812 SV* swash = swash_init("utf8",
10814 &PL_sv_undef, 1, 0);
10816 _get_swash_invlist(swash);
10817 SvREFCNT_dec_NN(swash);
10819 if (_invlist_contains_cp(PL_utf8_foldable,
10822 maybe_exact = FALSE;
10830 /* The loop increments <len> each time, as all but this
10831 * path (and the one just below for UTF) through it add
10832 * a single byte to the EXACTish node. But this one
10833 * has changed len to be the correct final value, so
10834 * subtract one to cancel out the increment that
10836 len += foldlen - 1;
10840 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10844 const STRLEN unilen = reguni(pRExC_state, ender, s);
10850 /* See comment just above for - 1 */
10854 REGC((char)ender, s++);
10857 if (next_is_quantifier) {
10859 /* Here, the next input is a quantifier, and to get here,
10860 * the current character is the only one in the node.
10861 * Also, here <len> doesn't include the final byte for this
10867 } /* End of loop through literal characters */
10869 /* Here we have either exhausted the input or ran out of room in
10870 * the node. (If we encountered a character that can't be in the
10871 * node, transfer is made directly to <loopdone>, and so we
10872 * wouldn't have fallen off the end of the loop.) In the latter
10873 * case, we artificially have to split the node into two, because
10874 * we just don't have enough space to hold everything. This
10875 * creates a problem if the final character participates in a
10876 * multi-character fold in the non-final position, as a match that
10877 * should have occurred won't, due to the way nodes are matched,
10878 * and our artificial boundary. So back off until we find a non-
10879 * problematic character -- one that isn't at the beginning or
10880 * middle of such a fold. (Either it doesn't participate in any
10881 * folds, or appears only in the final position of all the folds it
10882 * does participate in.) A better solution with far fewer false
10883 * positives, and that would fill the nodes more completely, would
10884 * be to actually have available all the multi-character folds to
10885 * test against, and to back-off only far enough to be sure that
10886 * this node isn't ending with a partial one. <upper_parse> is set
10887 * further below (if we need to reparse the node) to include just
10888 * up through that final non-problematic character that this code
10889 * identifies, so when it is set to less than the full node, we can
10890 * skip the rest of this */
10891 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10893 const STRLEN full_len = len;
10895 assert(len >= MAX_NODE_STRING_SIZE);
10897 /* Here, <s> points to the final byte of the final character.
10898 * Look backwards through the string until find a non-
10899 * problematic character */
10903 /* These two have no multi-char folds to non-UTF characters
10905 if (ASCII_FOLD_RESTRICTED || LOC) {
10909 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10913 if (! PL_NonL1NonFinalFold) {
10914 PL_NonL1NonFinalFold = _new_invlist_C_array(
10915 NonL1_Perl_Non_Final_Folds_invlist);
10918 /* Point to the first byte of the final character */
10919 s = (char *) utf8_hop((U8 *) s, -1);
10921 while (s >= s0) { /* Search backwards until find
10922 non-problematic char */
10923 if (UTF8_IS_INVARIANT(*s)) {
10925 /* There are no ascii characters that participate
10926 * in multi-char folds under /aa. In EBCDIC, the
10927 * non-ascii invariants are all control characters,
10928 * so don't ever participate in any folds. */
10929 if (ASCII_FOLD_RESTRICTED
10930 || ! IS_NON_FINAL_FOLD(*s))
10935 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10937 /* No Latin1 characters participate in multi-char
10938 * folds under /l */
10940 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10946 else if (! _invlist_contains_cp(
10947 PL_NonL1NonFinalFold,
10948 valid_utf8_to_uvchr((U8 *) s, NULL)))
10953 /* Here, the current character is problematic in that
10954 * it does occur in the non-final position of some
10955 * fold, so try the character before it, but have to
10956 * special case the very first byte in the string, so
10957 * we don't read outside the string */
10958 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10959 } /* End of loop backwards through the string */
10961 /* If there were only problematic characters in the string,
10962 * <s> will point to before s0, in which case the length
10963 * should be 0, otherwise include the length of the
10964 * non-problematic character just found */
10965 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10968 /* Here, have found the final character, if any, that is
10969 * non-problematic as far as ending the node without splitting
10970 * it across a potential multi-char fold. <len> contains the
10971 * number of bytes in the node up-to and including that
10972 * character, or is 0 if there is no such character, meaning
10973 * the whole node contains only problematic characters. In
10974 * this case, give up and just take the node as-is. We can't
10980 /* Here, the node does contain some characters that aren't
10981 * problematic. If one such is the final character in the
10982 * node, we are done */
10983 if (len == full_len) {
10986 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
10988 /* If the final character is problematic, but the
10989 * penultimate is not, back-off that last character to
10990 * later start a new node with it */
10995 /* Here, the final non-problematic character is earlier
10996 * in the input than the penultimate character. What we do
10997 * is reparse from the beginning, going up only as far as
10998 * this final ok one, thus guaranteeing that the node ends
10999 * in an acceptable character. The reason we reparse is
11000 * that we know how far in the character is, but we don't
11001 * know how to correlate its position with the input parse.
11002 * An alternate implementation would be to build that
11003 * correlation as we go along during the original parse,
11004 * but that would entail extra work for every node, whereas
11005 * this code gets executed only when the string is too
11006 * large for the node, and the final two characters are
11007 * problematic, an infrequent occurrence. Yet another
11008 * possible strategy would be to save the tail of the
11009 * string, and the next time regatom is called, initialize
11010 * with that. The problem with this is that unless you
11011 * back off one more character, you won't be guaranteed
11012 * regatom will get called again, unless regbranch,
11013 * regpiece ... are also changed. If you do back off that
11014 * extra character, so that there is input guaranteed to
11015 * force calling regatom, you can't handle the case where
11016 * just the first character in the node is acceptable. I
11017 * (khw) decided to try this method which doesn't have that
11018 * pitfall; if performance issues are found, we can do a
11019 * combination of the current approach plus that one */
11025 } /* End of verifying node ends with an appropriate char */
11027 loopdone: /* Jumped to when encounters something that shouldn't be in
11030 /* If 'maybe_exact' is still set here, means there are no
11031 * code points in the node that participate in folds */
11032 if (FOLD && maybe_exact) {
11036 /* I (khw) don't know if you can get here with zero length, but the
11037 * old code handled this situation by creating a zero-length EXACT
11038 * node. Might as well be NOTHING instead */
11043 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11046 RExC_parse = p - 1;
11047 Set_Node_Cur_Length(ret); /* MJD */
11048 nextchar(pRExC_state);
11050 /* len is STRLEN which is unsigned, need to copy to signed */
11053 vFAIL("Internal disaster");
11056 } /* End of label 'defchar:' */
11058 } /* End of giant switch on input character */
11064 S_regwhite( RExC_state_t *pRExC_state, char *p )
11066 const char *e = RExC_end;
11068 PERL_ARGS_ASSERT_REGWHITE;
11073 else if (*p == '#') {
11076 if (*p++ == '\n') {
11082 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11090 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11091 Character classes ([:foo:]) can also be negated ([:^foo:]).
11092 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11093 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11094 but trigger failures because they are currently unimplemented. */
11096 #define POSIXCC_DONE(c) ((c) == ':')
11097 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11098 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11100 PERL_STATIC_INLINE I32
11101 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11104 I32 namedclass = OOB_NAMEDCLASS;
11106 PERL_ARGS_ASSERT_REGPPOSIXCC;
11108 if (value == '[' && RExC_parse + 1 < RExC_end &&
11109 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11110 POSIXCC(UCHARAT(RExC_parse))) {
11111 const char c = UCHARAT(RExC_parse);
11112 char* const s = RExC_parse++;
11114 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11116 if (RExC_parse == RExC_end)
11117 /* Grandfather lone [:, [=, [. */
11120 const char* const t = RExC_parse++; /* skip over the c */
11123 if (UCHARAT(RExC_parse) == ']') {
11124 const char *posixcc = s + 1;
11125 RExC_parse++; /* skip over the ending ] */
11128 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11129 const I32 skip = t - posixcc;
11131 /* Initially switch on the length of the name. */
11134 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11135 namedclass = ANYOF_WORDCHAR;
11138 /* Names all of length 5. */
11139 /* alnum alpha ascii blank cntrl digit graph lower
11140 print punct space upper */
11141 /* Offset 4 gives the best switch position. */
11142 switch (posixcc[4]) {
11144 if (memEQ(posixcc, "alph", 4)) /* alpha */
11145 namedclass = ANYOF_ALPHA;
11148 if (memEQ(posixcc, "spac", 4)) /* space */
11149 namedclass = ANYOF_PSXSPC;
11152 if (memEQ(posixcc, "grap", 4)) /* graph */
11153 namedclass = ANYOF_GRAPH;
11156 if (memEQ(posixcc, "asci", 4)) /* ascii */
11157 namedclass = ANYOF_ASCII;
11160 if (memEQ(posixcc, "blan", 4)) /* blank */
11161 namedclass = ANYOF_BLANK;
11164 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11165 namedclass = ANYOF_CNTRL;
11168 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11169 namedclass = ANYOF_ALPHANUMERIC;
11172 if (memEQ(posixcc, "lowe", 4)) /* lower */
11173 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11174 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11175 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11178 if (memEQ(posixcc, "digi", 4)) /* digit */
11179 namedclass = ANYOF_DIGIT;
11180 else if (memEQ(posixcc, "prin", 4)) /* print */
11181 namedclass = ANYOF_PRINT;
11182 else if (memEQ(posixcc, "punc", 4)) /* punct */
11183 namedclass = ANYOF_PUNCT;
11188 if (memEQ(posixcc, "xdigit", 6))
11189 namedclass = ANYOF_XDIGIT;
11193 if (namedclass == OOB_NAMEDCLASS)
11194 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11197 /* The #defines are structured so each complement is +1 to
11198 * the normal one */
11202 assert (posixcc[skip] == ':');
11203 assert (posixcc[skip+1] == ']');
11204 } else if (!SIZE_ONLY) {
11205 /* [[=foo=]] and [[.foo.]] are still future. */
11207 /* adjust RExC_parse so the warning shows after
11208 the class closes */
11209 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11211 SvREFCNT_dec(free_me);
11212 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11215 /* Maternal grandfather:
11216 * "[:" ending in ":" but not in ":]" */
11226 /* The names of properties whose definitions are not known at compile time are
11227 * stored in this SV, after a constant heading. So if the length has been
11228 * changed since initialization, then there is a run-time definition. */
11229 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11232 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11234 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11235 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11236 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11237 * multi-character folds: it will be rewritten following the paradigm of
11238 * this example, where the <multi-fold>s are characters which fold to
11239 * multiple character sequences:
11240 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11241 * gets effectively rewritten as:
11242 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11243 * reg() gets called (recursively) on the rewritten version, and this
11244 * function will return what it constructs. (Actually the <multi-fold>s
11245 * aren't physically removed from the [abcdefghi], it's just that they are
11246 * ignored in the recursion by means of a flag:
11247 * <RExC_in_multi_char_class>.)
11249 * ANYOF nodes contain a bit map for the first 256 characters, with the
11250 * corresponding bit set if that character is in the list. For characters
11251 * above 255, a range list or swash is used. There are extra bits for \w,
11252 * etc. in locale ANYOFs, as what these match is not determinable at
11257 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11259 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11262 IV namedclass = OOB_NAMEDCLASS;
11263 char *rangebegin = NULL;
11264 bool need_class = 0;
11266 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11267 than just initialized. */
11268 SV* properties = NULL; /* Code points that match \p{} \P{} */
11269 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11270 extended beyond the Latin1 range */
11271 UV element_count = 0; /* Number of distinct elements in the class.
11272 Optimizations may be possible if this is tiny */
11273 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11274 character; used under /i */
11277 /* Unicode properties are stored in a swash; this holds the current one
11278 * being parsed. If this swash is the only above-latin1 component of the
11279 * character class, an optimization is to pass it directly on to the
11280 * execution engine. Otherwise, it is set to NULL to indicate that there
11281 * are other things in the class that have to be dealt with at execution
11283 SV* swash = NULL; /* Code points that match \p{} \P{} */
11285 /* Set if a component of this character class is user-defined; just passed
11286 * on to the engine */
11287 bool has_user_defined_property = FALSE;
11289 /* inversion list of code points this node matches only when the target
11290 * string is in UTF-8. (Because is under /d) */
11291 SV* depends_list = NULL;
11293 /* inversion list of code points this node matches. For much of the
11294 * function, it includes only those that match regardless of the utf8ness
11295 * of the target string */
11296 SV* cp_list = NULL;
11299 /* In a range, counts how many 0-2 of the ends of it came from literals,
11300 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11301 UV literal_endpoint = 0;
11303 bool invert = FALSE; /* Is this class to be complemented */
11305 /* Is there any thing like \W or [:^digit:] that matches above the legal
11306 * Unicode range? */
11307 bool runtime_posix_matches_above_Unicode = FALSE;
11309 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11310 case we need to change the emitted regop to an EXACT. */
11311 const char * orig_parse = RExC_parse;
11312 const I32 orig_size = RExC_size;
11313 GET_RE_DEBUG_FLAGS_DECL;
11315 PERL_ARGS_ASSERT_REGCLASS;
11317 PERL_UNUSED_ARG(depth);
11320 DEBUG_PARSE("clas");
11322 /* Assume we are going to generate an ANYOF node. */
11323 ret = reganode(pRExC_state, ANYOF, 0);
11326 ANYOF_FLAGS(ret) = 0;
11329 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11336 RExC_size += ANYOF_SKIP;
11337 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11340 RExC_emit += ANYOF_SKIP;
11342 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11344 listsv = newSVpvs("# comment\n");
11345 initial_listsv_len = SvCUR(listsv);
11348 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11350 if (!SIZE_ONLY && POSIXCC(nextvalue))
11352 const char *s = RExC_parse;
11353 const char c = *s++;
11355 while (isALNUM(*s))
11357 if (*s && c == *s && s[1] == ']') {
11358 SAVEFREESV(RExC_rx_sv);
11359 SAVEFREESV(listsv);
11361 "POSIX syntax [%c %c] belongs inside character classes",
11363 (void)ReREFCNT_inc(RExC_rx_sv);
11364 SvREFCNT_inc_simple_void_NN(listsv);
11368 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11369 if (UCHARAT(RExC_parse) == ']')
11370 goto charclassloop;
11373 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11377 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11378 save_value = value;
11379 save_prevvalue = prevvalue;
11382 rangebegin = RExC_parse;
11386 value = utf8n_to_uvchr((U8*)RExC_parse,
11387 RExC_end - RExC_parse,
11388 &numlen, UTF8_ALLOW_DEFAULT);
11389 RExC_parse += numlen;
11392 value = UCHARAT(RExC_parse++);
11394 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11395 if (value == '[' && POSIXCC(nextvalue))
11396 namedclass = regpposixcc(pRExC_state, value, listsv);
11397 else if (value == '\\') {
11399 value = utf8n_to_uvchr((U8*)RExC_parse,
11400 RExC_end - RExC_parse,
11401 &numlen, UTF8_ALLOW_DEFAULT);
11402 RExC_parse += numlen;
11405 value = UCHARAT(RExC_parse++);
11406 /* Some compilers cannot handle switching on 64-bit integer
11407 * values, therefore value cannot be an UV. Yes, this will
11408 * be a problem later if we want switch on Unicode.
11409 * A similar issue a little bit later when switching on
11410 * namedclass. --jhi */
11411 switch ((I32)value) {
11412 case 'w': namedclass = ANYOF_WORDCHAR; break;
11413 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11414 case 's': namedclass = ANYOF_SPACE; break;
11415 case 'S': namedclass = ANYOF_NSPACE; break;
11416 case 'd': namedclass = ANYOF_DIGIT; break;
11417 case 'D': namedclass = ANYOF_NDIGIT; break;
11418 case 'v': namedclass = ANYOF_VERTWS; break;
11419 case 'V': namedclass = ANYOF_NVERTWS; break;
11420 case 'h': namedclass = ANYOF_HORIZWS; break;
11421 case 'H': namedclass = ANYOF_NHORIZWS; break;
11422 case 'N': /* Handle \N{NAME} in class */
11424 /* We only pay attention to the first char of
11425 multichar strings being returned. I kinda wonder
11426 if this makes sense as it does change the behaviour
11427 from earlier versions, OTOH that behaviour was broken
11429 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11430 TRUE /* => charclass */))
11441 /* This routine will handle any undefined properties */
11442 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11444 if (RExC_parse >= RExC_end)
11445 vFAIL2("Empty \\%c{}", (U8)value);
11446 if (*RExC_parse == '{') {
11447 const U8 c = (U8)value;
11448 e = strchr(RExC_parse++, '}');
11450 vFAIL2("Missing right brace on \\%c{}", c);
11451 while (isSPACE(UCHARAT(RExC_parse)))
11453 if (e == RExC_parse)
11454 vFAIL2("Empty \\%c{}", c);
11455 n = e - RExC_parse;
11456 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11467 if (UCHARAT(RExC_parse) == '^') {
11470 /* toggle. (The rhs xor gets the single bit that
11471 * differs between P and p; the other xor inverts just
11473 value ^= 'P' ^ 'p';
11475 while (isSPACE(UCHARAT(RExC_parse))) {
11480 /* Try to get the definition of the property into
11481 * <invlist>. If /i is in effect, the effective property
11482 * will have its name be <__NAME_i>. The design is
11483 * discussed in commit
11484 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11485 Newx(name, n + sizeof("_i__\n"), char);
11487 sprintf(name, "%s%.*s%s\n",
11488 (FOLD) ? "__" : "",
11494 /* Look up the property name, and get its swash and
11495 * inversion list, if the property is found */
11497 SvREFCNT_dec_NN(swash);
11499 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11502 NULL, /* No inversion list */
11505 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11507 SvREFCNT_dec_NN(swash);
11511 /* Here didn't find it. It could be a user-defined
11512 * property that will be available at run-time. Add it
11513 * to the list to look up then */
11514 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11515 (value == 'p' ? '+' : '!'),
11517 has_user_defined_property = TRUE;
11519 /* We don't know yet, so have to assume that the
11520 * property could match something in the Latin1 range,
11521 * hence something that isn't utf8. Note that this
11522 * would cause things in <depends_list> to match
11523 * inappropriately, except that any \p{}, including
11524 * this one forces Unicode semantics, which means there
11525 * is <no depends_list> */
11526 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11530 /* Here, did get the swash and its inversion list. If
11531 * the swash is from a user-defined property, then this
11532 * whole character class should be regarded as such */
11533 has_user_defined_property =
11535 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11537 /* Invert if asking for the complement */
11538 if (value == 'P') {
11539 _invlist_union_complement_2nd(properties,
11543 /* The swash can't be used as-is, because we've
11544 * inverted things; delay removing it to here after
11545 * have copied its invlist above */
11546 SvREFCNT_dec_NN(swash);
11550 _invlist_union(properties, invlist, &properties);
11555 RExC_parse = e + 1;
11556 namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
11558 /* \p means they want Unicode semantics */
11559 RExC_uni_semantics = 1;
11562 case 'n': value = '\n'; break;
11563 case 'r': value = '\r'; break;
11564 case 't': value = '\t'; break;
11565 case 'f': value = '\f'; break;
11566 case 'b': value = '\b'; break;
11567 case 'e': value = ASCII_TO_NATIVE('\033');break;
11568 case 'a': value = ASCII_TO_NATIVE('\007');break;
11570 RExC_parse--; /* function expects to be pointed at the 'o' */
11572 const char* error_msg;
11573 bool valid = grok_bslash_o(RExC_parse,
11578 RExC_parse += numlen;
11583 if (PL_encoding && value < 0x100) {
11584 goto recode_encoding;
11588 RExC_parse--; /* function expects to be pointed at the 'x' */
11590 const char* error_msg;
11591 bool valid = grok_bslash_x(RExC_parse,
11596 RExC_parse += numlen;
11601 if (PL_encoding && value < 0x100)
11602 goto recode_encoding;
11605 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11607 case '0': case '1': case '2': case '3': case '4':
11608 case '5': case '6': case '7':
11610 /* Take 1-3 octal digits */
11611 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11613 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11614 RExC_parse += numlen;
11615 if (PL_encoding && value < 0x100)
11616 goto recode_encoding;
11620 if (! RExC_override_recoding) {
11621 SV* enc = PL_encoding;
11622 value = reg_recode((const char)(U8)value, &enc);
11623 if (!enc && SIZE_ONLY)
11624 ckWARNreg(RExC_parse,
11625 "Invalid escape in the specified encoding");
11629 /* Allow \_ to not give an error */
11630 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11631 SAVEFREESV(RExC_rx_sv);
11632 SAVEFREESV(listsv);
11633 ckWARN2reg(RExC_parse,
11634 "Unrecognized escape \\%c in character class passed through",
11636 (void)ReREFCNT_inc(RExC_rx_sv);
11637 SvREFCNT_inc_simple_void_NN(listsv);
11641 } /* end of \blah */
11644 literal_endpoint++;
11647 /* What matches in a locale is not known until runtime. This
11648 * includes what the Posix classes (like \w, [:space:]) match.
11649 * Room must be reserved (one time per class) to store such
11650 * classes, either if Perl is compiled so that locale nodes always
11651 * should have this space, or if there is such class info to be
11652 * stored. The space will contain a bit for each named class that
11653 * is to be matched against. This isn't needed for \p{} and
11654 * pseudo-classes, as they are not affected by locale, and hence
11655 * are dealt with separately */
11658 && (ANYOF_LOCALE == ANYOF_CLASS
11659 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11663 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11666 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11667 ANYOF_CLASS_ZERO(ret);
11669 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11672 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11674 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11675 * literal, as is the character that began the false range, i.e.
11676 * the 'a' in the examples */
11680 RExC_parse >= rangebegin ?
11681 RExC_parse - rangebegin : 0;
11682 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11683 SAVEFREESV(listsv);
11684 ckWARN4reg(RExC_parse,
11685 "False [] range \"%*.*s\"",
11687 (void)ReREFCNT_inc(RExC_rx_sv);
11688 SvREFCNT_inc_simple_void_NN(listsv);
11689 cp_list = add_cp_to_invlist(cp_list, '-');
11690 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11693 range = 0; /* this was not a true range */
11694 element_count += 2; /* So counts for three values */
11698 U8 classnum = namedclass_to_classnum(namedclass);
11700 /* The ascii range inversion list */
11701 SV* ascii_source = PL_Posix_ptrs[classnum];
11703 /* The full Latin1 range inversion list */
11704 SV* l1_source = PL_L1Posix_ptrs[classnum];
11706 /* The name of the property to use to match the full eXtended
11707 * Unicode range swash fo this character class */
11708 const char *Xname = swash_property_names[classnum];
11710 switch ((I32)namedclass) {
11712 case ANYOF_ALPHANUMERIC: /* C's alnum, in contrast to \w */
11720 case ANYOF_WORDCHAR:
11721 if ( ! PL_utf8_swash_ptrs[classnum]) {
11723 /* If not /a matching, there are code points we don't
11724 * know at compile time. Arrange for the unknown
11725 * matches to be loaded at run-time, if needed */
11726 if (! AT_LEAST_ASCII_RESTRICTED) {
11727 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n", Xname);
11729 if (LOC) { /* Under locale, set run-time lookup */
11730 ANYOF_CLASS_SET(ret, namedclass);
11733 /* Add the current class's code points to the
11735 _invlist_union(posixes,
11736 (AT_LEAST_ASCII_RESTRICTED)
11743 if (! PL_XPosix_ptrs[classnum]) {
11744 PL_XPosix_ptrs[classnum]
11745 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
11755 /* For non-locale, just add it to any existing list */
11756 _invlist_union(posixes,
11757 (AT_LEAST_ASCII_RESTRICTED)
11759 : PL_XPosix_ptrs[classnum],
11762 else { /* Locale */
11763 SV* scratch_list = NULL;
11765 /* For above Latin1 code points, we use the full
11767 _invlist_intersection(PL_AboveLatin1,
11768 PL_XPosix_ptrs[classnum],
11770 /* And set the output to it, adding instead if there
11771 * already is an output. Checking if 'posixes' is NULL
11772 * first saves an extra clone. Its reference count
11773 * will be decremented at the next union, etc, or if
11774 * this is the only instance, at the end of the routine
11777 posixes = scratch_list;
11780 _invlist_union(posixes, scratch_list, &posixes);
11781 SvREFCNT_dec_NN(scratch_list);
11784 #ifndef HAS_ISBLANK
11785 if (namedclass != ANYOF_BLANK) {
11787 /* Set this class in the node for runtime
11789 ANYOF_CLASS_SET(ret, namedclass);
11790 #ifndef HAS_ISBLANK
11793 /* No isblank(), use the hard-coded ASCII-range
11794 * blanks, adding them to the running total. */
11796 _invlist_union(posixes, ascii_source, &posixes);
11804 case ANYOF_NALPHANUMERIC:
11811 case ANYOF_NWORDCHAR:
11812 if ( ! PL_utf8_swash_ptrs[classnum]) {
11813 if (AT_LEAST_ASCII_RESTRICTED) {
11814 /* Under /a should match everything above ASCII,
11815 * and the complement of the set's ASCII matches */
11816 _invlist_union_complement_2nd(posixes, ascii_source,
11820 /* Arrange for the unknown matches to be loaded at
11821 * run-time, if needed */
11822 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n", Xname);
11823 runtime_posix_matches_above_Unicode = TRUE;
11825 ANYOF_CLASS_SET(ret, namedclass);
11829 /* We want to match everything in Latin1,
11830 * except those things that l1_source matches
11832 SV* scratch_list = NULL;
11833 _invlist_subtract(PL_Latin1, l1_source,
11836 /* Add the list from this class to the running
11839 posixes = scratch_list;
11842 _invlist_union(posixes, scratch_list,
11844 SvREFCNT_dec_NN(scratch_list);
11846 if (DEPENDS_SEMANTICS) {
11848 |= ANYOF_NON_UTF8_LATIN1_ALL;
11854 if (! PL_XPosix_ptrs[classnum]) {
11855 PL_XPosix_ptrs[classnum]
11856 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
11862 case ANYOF_NPSXSPC:
11864 case ANYOF_NXDIGIT:
11866 _invlist_union_complement_2nd(
11868 (AT_LEAST_ASCII_RESTRICTED)
11870 : PL_XPosix_ptrs[classnum],
11872 /* Under /d, everything in the upper half of the Latin1
11873 * range matches this complement */
11874 if (DEPENDS_SEMANTICS) {
11875 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11878 else { /* Locale */
11879 SV* scratch_list = NULL;
11880 _invlist_subtract(PL_AboveLatin1,
11881 PL_XPosix_ptrs[classnum],
11884 posixes = scratch_list;
11887 _invlist_union(posixes, scratch_list, &posixes);
11888 SvREFCNT_dec_NN(scratch_list);
11890 #ifndef HAS_ISBLANK
11891 if (namedclass != ANYOF_NBLANK) {
11893 ANYOF_CLASS_SET(ret, namedclass);
11894 #ifndef HAS_ISBLANK
11897 /* Get the list of all code points in Latin1 that
11898 * are not ASCII blanks, and add them to the
11900 _invlist_subtract(PL_Latin1, ascii_source,
11902 _invlist_union(posixes, scratch_list, &posixes);
11903 SvREFCNT_dec_NN(scratch_list);
11912 ANYOF_CLASS_SET(ret, namedclass);
11915 #endif /* Not isascii(); just use the hard-coded definition for it */
11916 _invlist_union(posixes, PL_ASCII, &posixes);
11921 ANYOF_CLASS_SET(ret, namedclass);
11925 _invlist_union_complement_2nd(posixes,
11926 PL_ASCII, &posixes);
11927 if (DEPENDS_SEMANTICS) {
11928 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11935 case ANYOF_HORIZWS:
11936 /* For these, we use the cp_list, as neither /d nor /l make
11937 * a difference in what these match. There would be
11938 * problems if these characters had folds other than
11939 * themselves, as cp_list is subject to folding.
11941 * It turns out that \h is just a synonym for XPosixBlank */
11942 classnum = _CC_BLANK;
11946 _invlist_union(cp_list, PL_XPosix_ptrs[classnum], &cp_list);
11949 case ANYOF_NHORIZWS:
11950 classnum = _CC_BLANK;
11953 case ANYOF_NVERTWS:
11954 _invlist_union_complement_2nd(cp_list,
11955 PL_XPosix_ptrs[classnum],
11959 case ANYOF_UNIPROP: /* this is to handle \p and \P */
11963 vFAIL("Invalid [::] class");
11967 continue; /* Go get next character */
11969 } /* end of namedclass \blah */
11972 if (prevvalue > value) /* b-a */ {
11973 const int w = RExC_parse - rangebegin;
11974 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11975 range = 0; /* not a valid range */
11979 prevvalue = value; /* save the beginning of the potential range */
11980 if (RExC_parse+1 < RExC_end
11981 && *RExC_parse == '-'
11982 && RExC_parse[1] != ']')
11986 /* a bad range like \w-, [:word:]- ? */
11987 if (namedclass > OOB_NAMEDCLASS) {
11988 if (ckWARN(WARN_REGEXP)) {
11990 RExC_parse >= rangebegin ?
11991 RExC_parse - rangebegin : 0;
11993 "False [] range \"%*.*s\"",
11997 cp_list = add_cp_to_invlist(cp_list, '-');
12001 range = 1; /* yeah, it's a range! */
12002 continue; /* but do it the next time */
12006 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12009 /* non-Latin1 code point implies unicode semantics. Must be set in
12010 * pass1 so is there for the whole of pass 2 */
12012 RExC_uni_semantics = 1;
12015 /* Ready to process either the single value, or the completed range.
12016 * For single-valued non-inverted ranges, we consider the possibility
12017 * of multi-char folds. (We made a conscious decision to not do this
12018 * for the other cases because it can often lead to non-intuitive
12019 * results. For example, you have the peculiar case that:
12020 * "s s" =~ /^[^\xDF]+$/i => Y
12021 * "ss" =~ /^[^\xDF]+$/i => N
12023 * See [perl #89750] */
12024 if (FOLD && ! invert && value == prevvalue) {
12025 if (value == LATIN_SMALL_LETTER_SHARP_S
12026 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12029 /* Here <value> is indeed a multi-char fold. Get what it is */
12031 U8 foldbuf[UTF8_MAXBYTES_CASE];
12034 UV folded = _to_uni_fold_flags(
12039 | ((LOC) ? FOLD_FLAGS_LOCALE
12040 : (ASCII_FOLD_RESTRICTED)
12041 ? FOLD_FLAGS_NOMIX_ASCII
12045 /* Here, <folded> should be the first character of the
12046 * multi-char fold of <value>, with <foldbuf> containing the
12047 * whole thing. But, if this fold is not allowed (because of
12048 * the flags), <fold> will be the same as <value>, and should
12049 * be processed like any other character, so skip the special
12051 if (folded != value) {
12053 /* Skip if we are recursed, currently parsing the class
12054 * again. Otherwise add this character to the list of
12055 * multi-char folds. */
12056 if (! RExC_in_multi_char_class) {
12057 AV** this_array_ptr;
12059 STRLEN cp_count = utf8_length(foldbuf,
12060 foldbuf + foldlen);
12061 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12063 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12066 if (! multi_char_matches) {
12067 multi_char_matches = newAV();
12070 /* <multi_char_matches> is actually an array of arrays.
12071 * There will be one or two top-level elements: [2],
12072 * and/or [3]. The [2] element is an array, each
12073 * element thereof is a character which folds to two
12074 * characters; likewise for [3]. (Unicode guarantees a
12075 * maximum of 3 characters in any fold.) When we
12076 * rewrite the character class below, we will do so
12077 * such that the longest folds are written first, so
12078 * that it prefers the longest matching strings first.
12079 * This is done even if it turns out that any
12080 * quantifier is non-greedy, out of programmer
12081 * laziness. Tom Christiansen has agreed that this is
12082 * ok. This makes the test for the ligature 'ffi' come
12083 * before the test for 'ff' */
12084 if (av_exists(multi_char_matches, cp_count)) {
12085 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12087 this_array = *this_array_ptr;
12090 this_array = newAV();
12091 av_store(multi_char_matches, cp_count,
12094 av_push(this_array, multi_fold);
12097 /* This element should not be processed further in this
12100 value = save_value;
12101 prevvalue = save_prevvalue;
12107 /* Deal with this element of the class */
12110 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12112 UV* this_range = _new_invlist(1);
12113 _append_range_to_invlist(this_range, prevvalue, value);
12115 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12116 * If this range was specified using something like 'i-j', we want
12117 * to include only the 'i' and the 'j', and not anything in
12118 * between, so exclude non-ASCII, non-alphabetics from it.
12119 * However, if the range was specified with something like
12120 * [\x89-\x91] or [\x89-j], all code points within it should be
12121 * included. literal_endpoint==2 means both ends of the range used
12122 * a literal character, not \x{foo} */
12123 if (literal_endpoint == 2
12124 && (prevvalue >= 'a' && value <= 'z')
12125 || (prevvalue >= 'A' && value <= 'Z'))
12127 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12128 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12130 _invlist_union(cp_list, this_range, &cp_list);
12131 literal_endpoint = 0;
12135 range = 0; /* this range (if it was one) is done now */
12136 } /* End of loop through all the text within the brackets */
12138 /* If anything in the class expands to more than one character, we have to
12139 * deal with them by building up a substitute parse string, and recursively
12140 * calling reg() on it, instead of proceeding */
12141 if (multi_char_matches) {
12142 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12145 char *save_end = RExC_end;
12146 char *save_parse = RExC_parse;
12147 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12152 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12153 because too confusing */
12155 sv_catpv(substitute_parse, "(?:");
12159 /* Look at the longest folds first */
12160 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12162 if (av_exists(multi_char_matches, cp_count)) {
12163 AV** this_array_ptr;
12166 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12168 while ((this_sequence = av_pop(*this_array_ptr)) !=
12171 if (! first_time) {
12172 sv_catpv(substitute_parse, "|");
12174 first_time = FALSE;
12176 sv_catpv(substitute_parse, SvPVX(this_sequence));
12181 /* If the character class contains anything else besides these
12182 * multi-character folds, have to include it in recursive parsing */
12183 if (element_count) {
12184 sv_catpv(substitute_parse, "|[");
12185 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12186 sv_catpv(substitute_parse, "]");
12189 sv_catpv(substitute_parse, ")");
12192 /* This is a way to get the parse to skip forward a whole named
12193 * sequence instead of matching the 2nd character when it fails the
12195 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12199 RExC_parse = SvPV(substitute_parse, len);
12200 RExC_end = RExC_parse + len;
12201 RExC_in_multi_char_class = 1;
12202 RExC_emit = (regnode *)orig_emit;
12204 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12206 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12208 RExC_parse = save_parse;
12209 RExC_end = save_end;
12210 RExC_in_multi_char_class = 0;
12211 SvREFCNT_dec_NN(multi_char_matches);
12212 SvREFCNT_dec_NN(listsv);
12216 /* If the character class contains only a single element, it may be
12217 * optimizable into another node type which is smaller and runs faster.
12218 * Check if this is the case for this class */
12219 if (element_count == 1) {
12223 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12224 [:digit:] or \p{foo} */
12226 /* All named classes are mapped into POSIXish nodes, with its FLAG
12227 * argument giving which class it is */
12228 switch ((I32)namedclass) {
12229 case ANYOF_UNIPROP:
12232 /* These don't depend on the charset modifiers. They always
12233 * match under /u rules */
12234 case ANYOF_NHORIZWS:
12235 case ANYOF_HORIZWS:
12236 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
12239 case ANYOF_NVERTWS:
12244 /* The actual POSIXish node for all the rest depends on the
12245 * charset modifier. The ones in the first set depend only on
12246 * ASCII or, if available on this platform, locale */
12250 op = (LOC) ? POSIXL : POSIXA;
12261 /* under /a could be alpha */
12263 if (ASCII_RESTRICTED) {
12264 namedclass = ANYOF_ALPHA + (namedclass % 2);
12272 /* The rest have more possibilities depending on the charset. We
12273 * take advantage of the enum ordering of the charset modifiers to
12274 * get the exact node type, */
12276 op = POSIXD + get_regex_charset(RExC_flags);
12277 if (op > POSIXA) { /* /aa is same as /a */
12280 #ifndef HAS_ISBLANK
12282 && (namedclass == ANYOF_BLANK
12283 || namedclass == ANYOF_NBLANK))
12290 /* The odd numbered ones are the complements of the
12291 * next-lower even number one */
12292 if (namedclass % 2 == 1) {
12296 arg = namedclass_to_classnum(namedclass);
12300 else if (value == prevvalue) {
12302 /* Here, the class consists of just a single code point */
12305 if (! LOC && value == '\n') {
12306 op = REG_ANY; /* Optimize [^\n] */
12307 *flagp |= HASWIDTH|SIMPLE;
12311 else if (value < 256 || UTF) {
12313 /* Optimize a single value into an EXACTish node, but not if it
12314 * would require converting the pattern to UTF-8. */
12315 op = compute_EXACTish(pRExC_state);
12317 } /* Otherwise is a range */
12318 else if (! LOC) { /* locale could vary these */
12319 if (prevvalue == '0') {
12320 if (value == '9') {
12327 /* Here, we have changed <op> away from its initial value iff we found
12328 * an optimization */
12331 /* Throw away this ANYOF regnode, and emit the calculated one,
12332 * which should correspond to the beginning, not current, state of
12334 const char * cur_parse = RExC_parse;
12335 RExC_parse = (char *)orig_parse;
12339 /* To get locale nodes to not use the full ANYOF size would
12340 * require moving the code above that writes the portions
12341 * of it that aren't in other nodes to after this point.
12342 * e.g. ANYOF_CLASS_SET */
12343 RExC_size = orig_size;
12347 RExC_emit = (regnode *)orig_emit;
12348 if (PL_regkind[op] == POSIXD) {
12350 op += NPOSIXD - POSIXD;
12355 ret = reg_node(pRExC_state, op);
12357 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12361 *flagp |= HASWIDTH|SIMPLE;
12363 else if (PL_regkind[op] == EXACT) {
12364 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12367 RExC_parse = (char *) cur_parse;
12369 SvREFCNT_dec(posixes);
12370 SvREFCNT_dec_NN(listsv);
12371 SvREFCNT_dec(cp_list);
12378 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12380 /* If folding, we calculate all characters that could fold to or from the
12381 * ones already on the list */
12382 if (FOLD && cp_list) {
12383 UV start, end; /* End points of code point ranges */
12385 SV* fold_intersection = NULL;
12387 /* If the highest code point is within Latin1, we can use the
12388 * compiled-in Alphas list, and not have to go out to disk. This
12389 * yields two false positives, the masculine and feminine ordinal
12390 * indicators, which are weeded out below using the
12391 * IS_IN_SOME_FOLD_L1() macro */
12392 if (invlist_highest(cp_list) < 256) {
12393 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, &fold_intersection);
12397 /* Here, there are non-Latin1 code points, so we will have to go
12398 * fetch the list of all the characters that participate in folds
12400 if (! PL_utf8_foldable) {
12401 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12402 &PL_sv_undef, 1, 0);
12403 PL_utf8_foldable = _get_swash_invlist(swash);
12404 SvREFCNT_dec_NN(swash);
12407 /* This is a hash that for a particular fold gives all characters
12408 * that are involved in it */
12409 if (! PL_utf8_foldclosures) {
12411 /* If we were unable to find any folds, then we likely won't be
12412 * able to find the closures. So just create an empty list.
12413 * Folding will effectively be restricted to the non-Unicode
12414 * rules hard-coded into Perl. (This case happens legitimately
12415 * during compilation of Perl itself before the Unicode tables
12416 * are generated) */
12417 if (_invlist_len(PL_utf8_foldable) == 0) {
12418 PL_utf8_foldclosures = newHV();
12421 /* If the folds haven't been read in, call a fold function
12423 if (! PL_utf8_tofold) {
12424 U8 dummy[UTF8_MAXBYTES+1];
12426 /* This string is just a short named one above \xff */
12427 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12428 assert(PL_utf8_tofold); /* Verify that worked */
12430 PL_utf8_foldclosures =
12431 _swash_inversion_hash(PL_utf8_tofold);
12435 /* Only the characters in this class that participate in folds need
12436 * be checked. Get the intersection of this class and all the
12437 * possible characters that are foldable. This can quickly narrow
12438 * down a large class */
12439 _invlist_intersection(PL_utf8_foldable, cp_list,
12440 &fold_intersection);
12443 /* Now look at the foldable characters in this class individually */
12444 invlist_iterinit(fold_intersection);
12445 while (invlist_iternext(fold_intersection, &start, &end)) {
12448 /* Locale folding for Latin1 characters is deferred until runtime */
12449 if (LOC && start < 256) {
12453 /* Look at every character in the range */
12454 for (j = start; j <= end; j++) {
12456 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12462 /* We have the latin1 folding rules hard-coded here so that
12463 * an innocent-looking character class, like /[ks]/i won't
12464 * have to go out to disk to find the possible matches.
12465 * XXX It would be better to generate these via regen, in
12466 * case a new version of the Unicode standard adds new
12467 * mappings, though that is not really likely, and may be
12468 * caught by the default: case of the switch below. */
12470 if (IS_IN_SOME_FOLD_L1(j)) {
12472 /* ASCII is always matched; non-ASCII is matched only
12473 * under Unicode rules */
12474 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12476 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12480 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12484 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12485 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12487 /* Certain Latin1 characters have matches outside
12488 * Latin1. To get here, <j> is one of those
12489 * characters. None of these matches is valid for
12490 * ASCII characters under /aa, which is why the 'if'
12491 * just above excludes those. These matches only
12492 * happen when the target string is utf8. The code
12493 * below adds the single fold closures for <j> to the
12494 * inversion list. */
12499 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12503 cp_list = add_cp_to_invlist(cp_list,
12504 LATIN_SMALL_LETTER_LONG_S);
12507 cp_list = add_cp_to_invlist(cp_list,
12508 GREEK_CAPITAL_LETTER_MU);
12509 cp_list = add_cp_to_invlist(cp_list,
12510 GREEK_SMALL_LETTER_MU);
12512 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12513 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12515 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12517 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12518 cp_list = add_cp_to_invlist(cp_list,
12519 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12521 case LATIN_SMALL_LETTER_SHARP_S:
12522 cp_list = add_cp_to_invlist(cp_list,
12523 LATIN_CAPITAL_LETTER_SHARP_S);
12525 case 'F': case 'f':
12526 case 'I': case 'i':
12527 case 'L': case 'l':
12528 case 'T': case 't':
12529 case 'A': case 'a':
12530 case 'H': case 'h':
12531 case 'J': case 'j':
12532 case 'N': case 'n':
12533 case 'W': case 'w':
12534 case 'Y': case 'y':
12535 /* These all are targets of multi-character
12536 * folds from code points that require UTF8 to
12537 * express, so they can't match unless the
12538 * target string is in UTF-8, so no action here
12539 * is necessary, as regexec.c properly handles
12540 * the general case for UTF-8 matching and
12541 * multi-char folds */
12544 /* Use deprecated warning to increase the
12545 * chances of this being output */
12546 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12553 /* Here is an above Latin1 character. We don't have the rules
12554 * hard-coded for it. First, get its fold. This is the simple
12555 * fold, as the multi-character folds have been handled earlier
12556 * and separated out */
12557 _to_uni_fold_flags(j, foldbuf, &foldlen,
12559 ? FOLD_FLAGS_LOCALE
12560 : (ASCII_FOLD_RESTRICTED)
12561 ? FOLD_FLAGS_NOMIX_ASCII
12564 /* Single character fold of above Latin1. Add everything in
12565 * its fold closure to the list that this node should match.
12566 * The fold closures data structure is a hash with the keys
12567 * being the UTF-8 of every character that is folded to, like
12568 * 'k', and the values each an array of all code points that
12569 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12570 * Multi-character folds are not included */
12571 if ((listp = hv_fetch(PL_utf8_foldclosures,
12572 (char *) foldbuf, foldlen, FALSE)))
12574 AV* list = (AV*) *listp;
12576 for (k = 0; k <= av_len(list); k++) {
12577 SV** c_p = av_fetch(list, k, FALSE);
12580 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12584 /* /aa doesn't allow folds between ASCII and non-; /l
12585 * doesn't allow them between above and below 256 */
12586 if ((ASCII_FOLD_RESTRICTED
12587 && (isASCII(c) != isASCII(j)))
12588 || (LOC && ((c < 256) != (j < 256))))
12593 /* Folds involving non-ascii Latin1 characters
12594 * under /d are added to a separate list */
12595 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12597 cp_list = add_cp_to_invlist(cp_list, c);
12600 depends_list = add_cp_to_invlist(depends_list, c);
12606 SvREFCNT_dec_NN(fold_intersection);
12609 /* And combine the result (if any) with any inversion list from posix
12610 * classes. The lists are kept separate up to now because we don't want to
12611 * fold the classes (folding of those is automatically handled by the swash
12612 * fetching code) */
12614 if (! DEPENDS_SEMANTICS) {
12616 _invlist_union(cp_list, posixes, &cp_list);
12617 SvREFCNT_dec_NN(posixes);
12624 /* Under /d, we put into a separate list the Latin1 things that
12625 * match only when the target string is utf8 */
12626 SV* nonascii_but_latin1_properties = NULL;
12627 _invlist_intersection(posixes, PL_Latin1,
12628 &nonascii_but_latin1_properties);
12629 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12630 &nonascii_but_latin1_properties);
12631 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12634 _invlist_union(cp_list, posixes, &cp_list);
12635 SvREFCNT_dec_NN(posixes);
12641 if (depends_list) {
12642 _invlist_union(depends_list, nonascii_but_latin1_properties,
12644 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
12647 depends_list = nonascii_but_latin1_properties;
12652 /* And combine the result (if any) with any inversion list from properties.
12653 * The lists are kept separate up to now so that we can distinguish the two
12654 * in regards to matching above-Unicode. A run-time warning is generated
12655 * if a Unicode property is matched against a non-Unicode code point. But,
12656 * we allow user-defined properties to match anything, without any warning,
12657 * and we also suppress the warning if there is a portion of the character
12658 * class that isn't a Unicode property, and which matches above Unicode, \W
12659 * or [\x{110000}] for example.
12660 * (Note that in this case, unlike the Posix one above, there is no
12661 * <depends_list>, because having a Unicode property forces Unicode
12664 bool warn_super = ! has_user_defined_property;
12667 /* If it matters to the final outcome, see if a non-property
12668 * component of the class matches above Unicode. If so, the
12669 * warning gets suppressed. This is true even if just a single
12670 * such code point is specified, as though not strictly correct if
12671 * another such code point is matched against, the fact that they
12672 * are using above-Unicode code points indicates they should know
12673 * the issues involved */
12675 bool non_prop_matches_above_Unicode =
12676 runtime_posix_matches_above_Unicode
12677 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12679 non_prop_matches_above_Unicode =
12680 ! non_prop_matches_above_Unicode;
12682 warn_super = ! non_prop_matches_above_Unicode;
12685 _invlist_union(properties, cp_list, &cp_list);
12686 SvREFCNT_dec_NN(properties);
12689 cp_list = properties;
12693 OP(ret) = ANYOF_WARN_SUPER;
12697 /* Here, we have calculated what code points should be in the character
12700 * Now we can see about various optimizations. Fold calculation (which we
12701 * did above) needs to take place before inversion. Otherwise /[^k]/i
12702 * would invert to include K, which under /i would match k, which it
12703 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12704 * folded until runtime */
12706 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12707 * at compile time. Besides not inverting folded locale now, we can't
12708 * invert if there are things such as \w, which aren't known until runtime
12711 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12713 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12715 _invlist_invert(cp_list);
12717 /* Any swash can't be used as-is, because we've inverted things */
12719 SvREFCNT_dec_NN(swash);
12723 /* Clear the invert flag since have just done it here */
12727 /* If we didn't do folding, it's because some information isn't available
12728 * until runtime; set the run-time fold flag for these. (We don't have to
12729 * worry about properties folding, as that is taken care of by the swash
12733 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12736 /* Some character classes are equivalent to other nodes. Such nodes take
12737 * up less room and generally fewer operations to execute than ANYOF nodes.
12738 * Above, we checked for and optimized into some such equivalents for
12739 * certain common classes that are easy to test. Getting to this point in
12740 * the code means that the class didn't get optimized there. Since this
12741 * code is only executed in Pass 2, it is too late to save space--it has
12742 * been allocated in Pass 1, and currently isn't given back. But turning
12743 * things into an EXACTish node can allow the optimizer to join it to any
12744 * adjacent such nodes. And if the class is equivalent to things like /./,
12745 * expensive run-time swashes can be avoided. Now that we have more
12746 * complete information, we can find things necessarily missed by the
12747 * earlier code. I (khw) am not sure how much to look for here. It would
12748 * be easy, but perhaps too slow, to check any candidates against all the
12749 * node types they could possibly match using _invlistEQ(). */
12754 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12755 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12758 U8 op = END; /* The optimzation node-type */
12759 const char * cur_parse= RExC_parse;
12761 invlist_iterinit(cp_list);
12762 if (! invlist_iternext(cp_list, &start, &end)) {
12764 /* Here, the list is empty. This happens, for example, when a
12765 * Unicode property is the only thing in the character class, and
12766 * it doesn't match anything. (perluniprops.pod notes such
12769 *flagp |= HASWIDTH|SIMPLE;
12771 else if (start == end) { /* The range is a single code point */
12772 if (! invlist_iternext(cp_list, &start, &end)
12774 /* Don't do this optimization if it would require changing
12775 * the pattern to UTF-8 */
12776 && (start < 256 || UTF))
12778 /* Here, the list contains a single code point. Can optimize
12779 * into an EXACT node */
12788 /* A locale node under folding with one code point can be
12789 * an EXACTFL, as its fold won't be calculated until
12795 /* Here, we are generally folding, but there is only one
12796 * code point to match. If we have to, we use an EXACT
12797 * node, but it would be better for joining with adjacent
12798 * nodes in the optimization pass if we used the same
12799 * EXACTFish node that any such are likely to be. We can
12800 * do this iff the code point doesn't participate in any
12801 * folds. For example, an EXACTF of a colon is the same as
12802 * an EXACT one, since nothing folds to or from a colon. */
12804 if (IS_IN_SOME_FOLD_L1(value)) {
12809 if (! PL_utf8_foldable) {
12810 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12811 &PL_sv_undef, 1, 0);
12812 PL_utf8_foldable = _get_swash_invlist(swash);
12813 SvREFCNT_dec_NN(swash);
12815 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
12820 /* If we haven't found the node type, above, it means we
12821 * can use the prevailing one */
12823 op = compute_EXACTish(pRExC_state);
12828 else if (start == 0) {
12829 if (end == UV_MAX) {
12831 *flagp |= HASWIDTH|SIMPLE;
12834 else if (end == '\n' - 1
12835 && invlist_iternext(cp_list, &start, &end)
12836 && start == '\n' + 1 && end == UV_MAX)
12839 *flagp |= HASWIDTH|SIMPLE;
12843 invlist_iterfinish(cp_list);
12846 RExC_parse = (char *)orig_parse;
12847 RExC_emit = (regnode *)orig_emit;
12849 ret = reg_node(pRExC_state, op);
12851 RExC_parse = (char *)cur_parse;
12853 if (PL_regkind[op] == EXACT) {
12854 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12857 SvREFCNT_dec_NN(cp_list);
12858 SvREFCNT_dec_NN(listsv);
12863 /* Here, <cp_list> contains all the code points we can determine at
12864 * compile time that match under all conditions. Go through it, and
12865 * for things that belong in the bitmap, put them there, and delete from
12866 * <cp_list>. While we are at it, see if everything above 255 is in the
12867 * list, and if so, set a flag to speed up execution */
12868 ANYOF_BITMAP_ZERO(ret);
12871 /* This gets set if we actually need to modify things */
12872 bool change_invlist = FALSE;
12876 /* Start looking through <cp_list> */
12877 invlist_iterinit(cp_list);
12878 while (invlist_iternext(cp_list, &start, &end)) {
12882 if (end == UV_MAX && start <= 256) {
12883 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12886 /* Quit if are above what we should change */
12891 change_invlist = TRUE;
12893 /* Set all the bits in the range, up to the max that we are doing */
12894 high = (end < 255) ? end : 255;
12895 for (i = start; i <= (int) high; i++) {
12896 if (! ANYOF_BITMAP_TEST(ret, i)) {
12897 ANYOF_BITMAP_SET(ret, i);
12903 invlist_iterfinish(cp_list);
12905 /* Done with loop; remove any code points that are in the bitmap from
12907 if (change_invlist) {
12908 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
12911 /* If have completely emptied it, remove it completely */
12912 if (_invlist_len(cp_list) == 0) {
12913 SvREFCNT_dec_NN(cp_list);
12919 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
12922 /* Here, the bitmap has been populated with all the Latin1 code points that
12923 * always match. Can now add to the overall list those that match only
12924 * when the target string is UTF-8 (<depends_list>). */
12925 if (depends_list) {
12927 _invlist_union(cp_list, depends_list, &cp_list);
12928 SvREFCNT_dec_NN(depends_list);
12931 cp_list = depends_list;
12935 /* If there is a swash and more than one element, we can't use the swash in
12936 * the optimization below. */
12937 if (swash && element_count > 1) {
12938 SvREFCNT_dec_NN(swash);
12943 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12945 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12946 SvREFCNT_dec_NN(listsv);
12949 /* av[0] stores the character class description in its textual form:
12950 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12951 * appropriate swash, and is also useful for dumping the regnode.
12952 * av[1] if NULL, is a placeholder to later contain the swash computed
12953 * from av[0]. But if no further computation need be done, the
12954 * swash is stored there now.
12955 * av[2] stores the cp_list inversion list for use in addition or
12956 * instead of av[0]; used only if av[1] is NULL
12957 * av[3] is set if any component of the class is from a user-defined
12958 * property; used only if av[1] is NULL */
12959 AV * const av = newAV();
12962 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12964 : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
12966 av_store(av, 1, swash);
12967 SvREFCNT_dec_NN(cp_list);
12970 av_store(av, 1, NULL);
12972 av_store(av, 2, cp_list);
12973 av_store(av, 3, newSVuv(has_user_defined_property));
12977 rv = newRV_noinc(MUTABLE_SV(av));
12978 n = add_data(pRExC_state, 1, "s");
12979 RExC_rxi->data->data[n] = (void*)rv;
12983 *flagp |= HASWIDTH|SIMPLE;
12986 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12989 /* reg_skipcomment()
12991 Absorbs an /x style # comments from the input stream.
12992 Returns true if there is more text remaining in the stream.
12993 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12994 terminates the pattern without including a newline.
12996 Note its the callers responsibility to ensure that we are
12997 actually in /x mode
13002 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13006 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13008 while (RExC_parse < RExC_end)
13009 if (*RExC_parse++ == '\n') {
13014 /* we ran off the end of the pattern without ending
13015 the comment, so we have to add an \n when wrapping */
13016 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13024 Advances the parse position, and optionally absorbs
13025 "whitespace" from the inputstream.
13027 Without /x "whitespace" means (?#...) style comments only,
13028 with /x this means (?#...) and # comments and whitespace proper.
13030 Returns the RExC_parse point from BEFORE the scan occurs.
13032 This is the /x friendly way of saying RExC_parse++.
13036 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13038 char* const retval = RExC_parse++;
13040 PERL_ARGS_ASSERT_NEXTCHAR;
13043 if (RExC_end - RExC_parse >= 3
13044 && *RExC_parse == '('
13045 && RExC_parse[1] == '?'
13046 && RExC_parse[2] == '#')
13048 while (*RExC_parse != ')') {
13049 if (RExC_parse == RExC_end)
13050 FAIL("Sequence (?#... not terminated");
13056 if (RExC_flags & RXf_PMf_EXTENDED) {
13057 if (isSPACE(*RExC_parse)) {
13061 else if (*RExC_parse == '#') {
13062 if ( reg_skipcomment( pRExC_state ) )
13071 - reg_node - emit a node
13073 STATIC regnode * /* Location. */
13074 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13078 regnode * const ret = RExC_emit;
13079 GET_RE_DEBUG_FLAGS_DECL;
13081 PERL_ARGS_ASSERT_REG_NODE;
13084 SIZE_ALIGN(RExC_size);
13088 if (RExC_emit >= RExC_emit_bound)
13089 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13090 op, RExC_emit, RExC_emit_bound);
13092 NODE_ALIGN_FILL(ret);
13094 FILL_ADVANCE_NODE(ptr, op);
13095 #ifdef RE_TRACK_PATTERN_OFFSETS
13096 if (RExC_offsets) { /* MJD */
13097 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13098 "reg_node", __LINE__,
13100 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13101 ? "Overwriting end of array!\n" : "OK",
13102 (UV)(RExC_emit - RExC_emit_start),
13103 (UV)(RExC_parse - RExC_start),
13104 (UV)RExC_offsets[0]));
13105 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13113 - reganode - emit a node with an argument
13115 STATIC regnode * /* Location. */
13116 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13120 regnode * const ret = RExC_emit;
13121 GET_RE_DEBUG_FLAGS_DECL;
13123 PERL_ARGS_ASSERT_REGANODE;
13126 SIZE_ALIGN(RExC_size);
13131 assert(2==regarglen[op]+1);
13133 Anything larger than this has to allocate the extra amount.
13134 If we changed this to be:
13136 RExC_size += (1 + regarglen[op]);
13138 then it wouldn't matter. Its not clear what side effect
13139 might come from that so its not done so far.
13144 if (RExC_emit >= RExC_emit_bound)
13145 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13146 op, RExC_emit, RExC_emit_bound);
13148 NODE_ALIGN_FILL(ret);
13150 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13151 #ifdef RE_TRACK_PATTERN_OFFSETS
13152 if (RExC_offsets) { /* MJD */
13153 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13157 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13158 "Overwriting end of array!\n" : "OK",
13159 (UV)(RExC_emit - RExC_emit_start),
13160 (UV)(RExC_parse - RExC_start),
13161 (UV)RExC_offsets[0]));
13162 Set_Cur_Node_Offset;
13170 - reguni - emit (if appropriate) a Unicode character
13173 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13177 PERL_ARGS_ASSERT_REGUNI;
13179 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13183 - reginsert - insert an operator in front of already-emitted operand
13185 * Means relocating the operand.
13188 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13194 const int offset = regarglen[(U8)op];
13195 const int size = NODE_STEP_REGNODE + offset;
13196 GET_RE_DEBUG_FLAGS_DECL;
13198 PERL_ARGS_ASSERT_REGINSERT;
13199 PERL_UNUSED_ARG(depth);
13200 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13201 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13210 if (RExC_open_parens) {
13212 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13213 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13214 if ( RExC_open_parens[paren] >= opnd ) {
13215 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13216 RExC_open_parens[paren] += size;
13218 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13220 if ( RExC_close_parens[paren] >= opnd ) {
13221 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13222 RExC_close_parens[paren] += size;
13224 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13229 while (src > opnd) {
13230 StructCopy(--src, --dst, regnode);
13231 #ifdef RE_TRACK_PATTERN_OFFSETS
13232 if (RExC_offsets) { /* MJD 20010112 */
13233 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13237 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13238 ? "Overwriting end of array!\n" : "OK",
13239 (UV)(src - RExC_emit_start),
13240 (UV)(dst - RExC_emit_start),
13241 (UV)RExC_offsets[0]));
13242 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13243 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13249 place = opnd; /* Op node, where operand used to be. */
13250 #ifdef RE_TRACK_PATTERN_OFFSETS
13251 if (RExC_offsets) { /* MJD */
13252 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13256 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13257 ? "Overwriting end of array!\n" : "OK",
13258 (UV)(place - RExC_emit_start),
13259 (UV)(RExC_parse - RExC_start),
13260 (UV)RExC_offsets[0]));
13261 Set_Node_Offset(place, RExC_parse);
13262 Set_Node_Length(place, 1);
13265 src = NEXTOPER(place);
13266 FILL_ADVANCE_NODE(place, op);
13267 Zero(src, offset, regnode);
13271 - regtail - set the next-pointer at the end of a node chain of p to val.
13272 - SEE ALSO: regtail_study
13274 /* TODO: All three parms should be const */
13276 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13280 GET_RE_DEBUG_FLAGS_DECL;
13282 PERL_ARGS_ASSERT_REGTAIL;
13284 PERL_UNUSED_ARG(depth);
13290 /* Find last node. */
13293 regnode * const temp = regnext(scan);
13295 SV * const mysv=sv_newmortal();
13296 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13297 regprop(RExC_rx, mysv, scan);
13298 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13299 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13300 (temp == NULL ? "->" : ""),
13301 (temp == NULL ? PL_reg_name[OP(val)] : "")
13309 if (reg_off_by_arg[OP(scan)]) {
13310 ARG_SET(scan, val - scan);
13313 NEXT_OFF(scan) = val - scan;
13319 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13320 - Look for optimizable sequences at the same time.
13321 - currently only looks for EXACT chains.
13323 This is experimental code. The idea is to use this routine to perform
13324 in place optimizations on branches and groups as they are constructed,
13325 with the long term intention of removing optimization from study_chunk so
13326 that it is purely analytical.
13328 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13329 to control which is which.
13332 /* TODO: All four parms should be const */
13335 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13340 #ifdef EXPERIMENTAL_INPLACESCAN
13343 GET_RE_DEBUG_FLAGS_DECL;
13345 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13351 /* Find last node. */
13355 regnode * const temp = regnext(scan);
13356 #ifdef EXPERIMENTAL_INPLACESCAN
13357 if (PL_regkind[OP(scan)] == EXACT) {
13358 bool has_exactf_sharp_s; /* Unexamined in this routine */
13359 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13364 switch (OP(scan)) {
13370 case EXACTFU_TRICKYFOLD:
13372 if( exact == PSEUDO )
13374 else if ( exact != OP(scan) )
13383 SV * const mysv=sv_newmortal();
13384 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13385 regprop(RExC_rx, mysv, scan);
13386 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13387 SvPV_nolen_const(mysv),
13388 REG_NODE_NUM(scan),
13389 PL_reg_name[exact]);
13396 SV * const mysv_val=sv_newmortal();
13397 DEBUG_PARSE_MSG("");
13398 regprop(RExC_rx, mysv_val, val);
13399 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13400 SvPV_nolen_const(mysv_val),
13401 (IV)REG_NODE_NUM(val),
13405 if (reg_off_by_arg[OP(scan)]) {
13406 ARG_SET(scan, val - scan);
13409 NEXT_OFF(scan) = val - scan;
13417 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13421 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13427 for (bit=0; bit<32; bit++) {
13428 if (flags & (1<<bit)) {
13429 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13432 if (!set++ && lead)
13433 PerlIO_printf(Perl_debug_log, "%s",lead);
13434 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13437 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13438 if (!set++ && lead) {
13439 PerlIO_printf(Perl_debug_log, "%s",lead);
13442 case REGEX_UNICODE_CHARSET:
13443 PerlIO_printf(Perl_debug_log, "UNICODE");
13445 case REGEX_LOCALE_CHARSET:
13446 PerlIO_printf(Perl_debug_log, "LOCALE");
13448 case REGEX_ASCII_RESTRICTED_CHARSET:
13449 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13451 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13452 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13455 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13461 PerlIO_printf(Perl_debug_log, "\n");
13463 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13469 Perl_regdump(pTHX_ const regexp *r)
13473 SV * const sv = sv_newmortal();
13474 SV *dsv= sv_newmortal();
13475 RXi_GET_DECL(r,ri);
13476 GET_RE_DEBUG_FLAGS_DECL;
13478 PERL_ARGS_ASSERT_REGDUMP;
13480 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13482 /* Header fields of interest. */
13483 if (r->anchored_substr) {
13484 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13485 RE_SV_DUMPLEN(r->anchored_substr), 30);
13486 PerlIO_printf(Perl_debug_log,
13487 "anchored %s%s at %"IVdf" ",
13488 s, RE_SV_TAIL(r->anchored_substr),
13489 (IV)r->anchored_offset);
13490 } else if (r->anchored_utf8) {
13491 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13492 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13493 PerlIO_printf(Perl_debug_log,
13494 "anchored utf8 %s%s at %"IVdf" ",
13495 s, RE_SV_TAIL(r->anchored_utf8),
13496 (IV)r->anchored_offset);
13498 if (r->float_substr) {
13499 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13500 RE_SV_DUMPLEN(r->float_substr), 30);
13501 PerlIO_printf(Perl_debug_log,
13502 "floating %s%s at %"IVdf"..%"UVuf" ",
13503 s, RE_SV_TAIL(r->float_substr),
13504 (IV)r->float_min_offset, (UV)r->float_max_offset);
13505 } else if (r->float_utf8) {
13506 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13507 RE_SV_DUMPLEN(r->float_utf8), 30);
13508 PerlIO_printf(Perl_debug_log,
13509 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13510 s, RE_SV_TAIL(r->float_utf8),
13511 (IV)r->float_min_offset, (UV)r->float_max_offset);
13513 if (r->check_substr || r->check_utf8)
13514 PerlIO_printf(Perl_debug_log,
13516 (r->check_substr == r->float_substr
13517 && r->check_utf8 == r->float_utf8
13518 ? "(checking floating" : "(checking anchored"));
13519 if (r->extflags & RXf_NOSCAN)
13520 PerlIO_printf(Perl_debug_log, " noscan");
13521 if (r->extflags & RXf_CHECK_ALL)
13522 PerlIO_printf(Perl_debug_log, " isall");
13523 if (r->check_substr || r->check_utf8)
13524 PerlIO_printf(Perl_debug_log, ") ");
13526 if (ri->regstclass) {
13527 regprop(r, sv, ri->regstclass);
13528 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13530 if (r->extflags & RXf_ANCH) {
13531 PerlIO_printf(Perl_debug_log, "anchored");
13532 if (r->extflags & RXf_ANCH_BOL)
13533 PerlIO_printf(Perl_debug_log, "(BOL)");
13534 if (r->extflags & RXf_ANCH_MBOL)
13535 PerlIO_printf(Perl_debug_log, "(MBOL)");
13536 if (r->extflags & RXf_ANCH_SBOL)
13537 PerlIO_printf(Perl_debug_log, "(SBOL)");
13538 if (r->extflags & RXf_ANCH_GPOS)
13539 PerlIO_printf(Perl_debug_log, "(GPOS)");
13540 PerlIO_putc(Perl_debug_log, ' ');
13542 if (r->extflags & RXf_GPOS_SEEN)
13543 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13544 if (r->intflags & PREGf_SKIP)
13545 PerlIO_printf(Perl_debug_log, "plus ");
13546 if (r->intflags & PREGf_IMPLICIT)
13547 PerlIO_printf(Perl_debug_log, "implicit ");
13548 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13549 if (r->extflags & RXf_EVAL_SEEN)
13550 PerlIO_printf(Perl_debug_log, "with eval ");
13551 PerlIO_printf(Perl_debug_log, "\n");
13552 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13554 PERL_ARGS_ASSERT_REGDUMP;
13555 PERL_UNUSED_CONTEXT;
13556 PERL_UNUSED_ARG(r);
13557 #endif /* DEBUGGING */
13561 - regprop - printable representation of opcode
13563 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13566 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13567 if (flags & ANYOF_INVERT) \
13568 /*make sure the invert info is in each */ \
13569 sv_catpvs(sv, "^"); \
13575 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13581 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13582 static const char * const anyofs[] = {
13583 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
13584 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
13585 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
13586 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
13587 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
13588 || _CC_VERTSPACE != 16
13589 #error Need to adjust order of anyofs[]
13626 RXi_GET_DECL(prog,progi);
13627 GET_RE_DEBUG_FLAGS_DECL;
13629 PERL_ARGS_ASSERT_REGPROP;
13633 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13634 /* It would be nice to FAIL() here, but this may be called from
13635 regexec.c, and it would be hard to supply pRExC_state. */
13636 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13637 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13639 k = PL_regkind[OP(o)];
13642 sv_catpvs(sv, " ");
13643 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13644 * is a crude hack but it may be the best for now since
13645 * we have no flag "this EXACTish node was UTF-8"
13647 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13648 PERL_PV_ESCAPE_UNI_DETECT |
13649 PERL_PV_ESCAPE_NONASCII |
13650 PERL_PV_PRETTY_ELLIPSES |
13651 PERL_PV_PRETTY_LTGT |
13652 PERL_PV_PRETTY_NOCLEAR
13654 } else if (k == TRIE) {
13655 /* print the details of the trie in dumpuntil instead, as
13656 * progi->data isn't available here */
13657 const char op = OP(o);
13658 const U32 n = ARG(o);
13659 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13660 (reg_ac_data *)progi->data->data[n] :
13662 const reg_trie_data * const trie
13663 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13665 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13666 DEBUG_TRIE_COMPILE_r(
13667 Perl_sv_catpvf(aTHX_ sv,
13668 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13669 (UV)trie->startstate,
13670 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13671 (UV)trie->wordcount,
13674 (UV)TRIE_CHARCOUNT(trie),
13675 (UV)trie->uniquecharcount
13678 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13680 int rangestart = -1;
13681 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13682 sv_catpvs(sv, "[");
13683 for (i = 0; i <= 256; i++) {
13684 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13685 if (rangestart == -1)
13687 } else if (rangestart != -1) {
13688 if (i <= rangestart + 3)
13689 for (; rangestart < i; rangestart++)
13690 put_byte(sv, rangestart);
13692 put_byte(sv, rangestart);
13693 sv_catpvs(sv, "-");
13694 put_byte(sv, i - 1);
13699 sv_catpvs(sv, "]");
13702 } else if (k == CURLY) {
13703 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13704 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13705 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13707 else if (k == WHILEM && o->flags) /* Ordinal/of */
13708 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13709 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13710 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13711 if ( RXp_PAREN_NAMES(prog) ) {
13712 if ( k != REF || (OP(o) < NREF)) {
13713 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13714 SV **name= av_fetch(list, ARG(o), 0 );
13716 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13719 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13720 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13721 I32 *nums=(I32*)SvPVX(sv_dat);
13722 SV **name= av_fetch(list, nums[0], 0 );
13725 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13726 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13727 (n ? "," : ""), (IV)nums[n]);
13729 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13733 } else if (k == GOSUB)
13734 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13735 else if (k == VERB) {
13737 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13738 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13739 } else if (k == LOGICAL)
13740 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13741 else if (k == ANYOF) {
13742 int i, rangestart = -1;
13743 const U8 flags = ANYOF_FLAGS(o);
13747 if (flags & ANYOF_LOCALE)
13748 sv_catpvs(sv, "{loc}");
13749 if (flags & ANYOF_LOC_FOLD)
13750 sv_catpvs(sv, "{i}");
13751 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13752 if (flags & ANYOF_INVERT)
13753 sv_catpvs(sv, "^");
13755 /* output what the standard cp 0-255 bitmap matches */
13756 for (i = 0; i <= 256; i++) {
13757 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13758 if (rangestart == -1)
13760 } else if (rangestart != -1) {
13761 if (i <= rangestart + 3)
13762 for (; rangestart < i; rangestart++)
13763 put_byte(sv, rangestart);
13765 put_byte(sv, rangestart);
13766 sv_catpvs(sv, "-");
13767 put_byte(sv, i - 1);
13774 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13775 /* output any special charclass tests (used entirely under use locale) */
13776 if (ANYOF_CLASS_TEST_ANY_SET(o))
13777 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13778 if (ANYOF_CLASS_TEST(o,i)) {
13779 sv_catpv(sv, anyofs[i]);
13783 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13785 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13786 sv_catpvs(sv, "{non-utf8-latin1-all}");
13789 /* output information about the unicode matching */
13790 if (flags & ANYOF_UNICODE_ALL)
13791 sv_catpvs(sv, "{unicode_all}");
13792 else if (ANYOF_NONBITMAP(o))
13793 sv_catpvs(sv, "{unicode}");
13794 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13795 sv_catpvs(sv, "{outside bitmap}");
13797 if (ANYOF_NONBITMAP(o)) {
13798 SV *lv; /* Set if there is something outside the bit map */
13799 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13800 bool byte_output = FALSE; /* If something in the bitmap has been
13803 if (lv && lv != &PL_sv_undef) {
13805 U8 s[UTF8_MAXBYTES_CASE+1];
13807 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13808 uvchr_to_utf8(s, i);
13811 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13815 && swash_fetch(sw, s, TRUE))
13817 if (rangestart == -1)
13819 } else if (rangestart != -1) {
13820 byte_output = TRUE;
13821 if (i <= rangestart + 3)
13822 for (; rangestart < i; rangestart++) {
13823 put_byte(sv, rangestart);
13826 put_byte(sv, rangestart);
13827 sv_catpvs(sv, "-");
13836 char *s = savesvpv(lv);
13837 char * const origs = s;
13839 while (*s && *s != '\n')
13843 const char * const t = ++s;
13846 sv_catpvs(sv, " ");
13852 /* Truncate very long output */
13853 if (s - origs > 256) {
13854 Perl_sv_catpvf(aTHX_ sv,
13856 (int) (s - origs - 1),
13862 else if (*s == '\t') {
13877 SvREFCNT_dec_NN(lv);
13881 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13883 else if (k == POSIXD || k == NPOSIXD) {
13884 U8 index = FLAGS(o) * 2;
13885 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
13886 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
13889 sv_catpv(sv, anyofs[index]);
13892 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13893 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13895 PERL_UNUSED_CONTEXT;
13896 PERL_UNUSED_ARG(sv);
13897 PERL_UNUSED_ARG(o);
13898 PERL_UNUSED_ARG(prog);
13899 #endif /* DEBUGGING */
13903 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13904 { /* Assume that RE_INTUIT is set */
13906 struct regexp *const prog = ReANY(r);
13907 GET_RE_DEBUG_FLAGS_DECL;
13909 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13910 PERL_UNUSED_CONTEXT;
13914 const char * const s = SvPV_nolen_const(prog->check_substr
13915 ? prog->check_substr : prog->check_utf8);
13917 if (!PL_colorset) reginitcolors();
13918 PerlIO_printf(Perl_debug_log,
13919 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13921 prog->check_substr ? "" : "utf8 ",
13922 PL_colors[5],PL_colors[0],
13925 (strlen(s) > 60 ? "..." : ""));
13928 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13934 handles refcounting and freeing the perl core regexp structure. When
13935 it is necessary to actually free the structure the first thing it
13936 does is call the 'free' method of the regexp_engine associated to
13937 the regexp, allowing the handling of the void *pprivate; member
13938 first. (This routine is not overridable by extensions, which is why
13939 the extensions free is called first.)
13941 See regdupe and regdupe_internal if you change anything here.
13943 #ifndef PERL_IN_XSUB_RE
13945 Perl_pregfree(pTHX_ REGEXP *r)
13951 Perl_pregfree2(pTHX_ REGEXP *rx)
13954 struct regexp *const r = ReANY(rx);
13955 GET_RE_DEBUG_FLAGS_DECL;
13957 PERL_ARGS_ASSERT_PREGFREE2;
13959 if (r->mother_re) {
13960 ReREFCNT_dec(r->mother_re);
13962 CALLREGFREE_PVT(rx); /* free the private data */
13963 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13964 Safefree(r->xpv_len_u.xpvlenu_pv);
13967 SvREFCNT_dec(r->anchored_substr);
13968 SvREFCNT_dec(r->anchored_utf8);
13969 SvREFCNT_dec(r->float_substr);
13970 SvREFCNT_dec(r->float_utf8);
13971 Safefree(r->substrs);
13973 RX_MATCH_COPY_FREE(rx);
13974 #ifdef PERL_ANY_COW
13975 SvREFCNT_dec(r->saved_copy);
13978 SvREFCNT_dec(r->qr_anoncv);
13979 rx->sv_u.svu_rx = 0;
13984 This is a hacky workaround to the structural issue of match results
13985 being stored in the regexp structure which is in turn stored in
13986 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13987 could be PL_curpm in multiple contexts, and could require multiple
13988 result sets being associated with the pattern simultaneously, such
13989 as when doing a recursive match with (??{$qr})
13991 The solution is to make a lightweight copy of the regexp structure
13992 when a qr// is returned from the code executed by (??{$qr}) this
13993 lightweight copy doesn't actually own any of its data except for
13994 the starp/end and the actual regexp structure itself.
14000 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14002 struct regexp *ret;
14003 struct regexp *const r = ReANY(rx);
14004 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14006 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14009 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14011 SvOK_off((SV *)ret_x);
14013 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14014 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14015 made both spots point to the same regexp body.) */
14016 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14017 assert(!SvPVX(ret_x));
14018 ret_x->sv_u.svu_rx = temp->sv_any;
14019 temp->sv_any = NULL;
14020 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14021 SvREFCNT_dec_NN(temp);
14022 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14023 ing below will not set it. */
14024 SvCUR_set(ret_x, SvCUR(rx));
14027 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14028 sv_force_normal(sv) is called. */
14030 ret = ReANY(ret_x);
14032 SvFLAGS(ret_x) |= SvUTF8(rx);
14033 /* We share the same string buffer as the original regexp, on which we
14034 hold a reference count, incremented when mother_re is set below.
14035 The string pointer is copied here, being part of the regexp struct.
14037 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14038 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14040 const I32 npar = r->nparens+1;
14041 Newx(ret->offs, npar, regexp_paren_pair);
14042 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14045 Newx(ret->substrs, 1, struct reg_substr_data);
14046 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14048 SvREFCNT_inc_void(ret->anchored_substr);
14049 SvREFCNT_inc_void(ret->anchored_utf8);
14050 SvREFCNT_inc_void(ret->float_substr);
14051 SvREFCNT_inc_void(ret->float_utf8);
14053 /* check_substr and check_utf8, if non-NULL, point to either their
14054 anchored or float namesakes, and don't hold a second reference. */
14056 RX_MATCH_COPIED_off(ret_x);
14057 #ifdef PERL_ANY_COW
14058 ret->saved_copy = NULL;
14060 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14061 SvREFCNT_inc_void(ret->qr_anoncv);
14067 /* regfree_internal()
14069 Free the private data in a regexp. This is overloadable by
14070 extensions. Perl takes care of the regexp structure in pregfree(),
14071 this covers the *pprivate pointer which technically perl doesn't
14072 know about, however of course we have to handle the
14073 regexp_internal structure when no extension is in use.
14075 Note this is called before freeing anything in the regexp
14080 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14083 struct regexp *const r = ReANY(rx);
14084 RXi_GET_DECL(r,ri);
14085 GET_RE_DEBUG_FLAGS_DECL;
14087 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14093 SV *dsv= sv_newmortal();
14094 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14095 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14096 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14097 PL_colors[4],PL_colors[5],s);
14100 #ifdef RE_TRACK_PATTERN_OFFSETS
14102 Safefree(ri->u.offsets); /* 20010421 MJD */
14104 if (ri->code_blocks) {
14106 for (n = 0; n < ri->num_code_blocks; n++)
14107 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14108 Safefree(ri->code_blocks);
14112 int n = ri->data->count;
14115 /* If you add a ->what type here, update the comment in regcomp.h */
14116 switch (ri->data->what[n]) {
14122 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14125 Safefree(ri->data->data[n]);
14131 { /* Aho Corasick add-on structure for a trie node.
14132 Used in stclass optimization only */
14134 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14136 refcount = --aho->refcount;
14139 PerlMemShared_free(aho->states);
14140 PerlMemShared_free(aho->fail);
14141 /* do this last!!!! */
14142 PerlMemShared_free(ri->data->data[n]);
14143 PerlMemShared_free(ri->regstclass);
14149 /* trie structure. */
14151 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14153 refcount = --trie->refcount;
14156 PerlMemShared_free(trie->charmap);
14157 PerlMemShared_free(trie->states);
14158 PerlMemShared_free(trie->trans);
14160 PerlMemShared_free(trie->bitmap);
14162 PerlMemShared_free(trie->jump);
14163 PerlMemShared_free(trie->wordinfo);
14164 /* do this last!!!! */
14165 PerlMemShared_free(ri->data->data[n]);
14170 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14173 Safefree(ri->data->what);
14174 Safefree(ri->data);
14180 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14181 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14182 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14185 re_dup - duplicate a regexp.
14187 This routine is expected to clone a given regexp structure. It is only
14188 compiled under USE_ITHREADS.
14190 After all of the core data stored in struct regexp is duplicated
14191 the regexp_engine.dupe method is used to copy any private data
14192 stored in the *pprivate pointer. This allows extensions to handle
14193 any duplication it needs to do.
14195 See pregfree() and regfree_internal() if you change anything here.
14197 #if defined(USE_ITHREADS)
14198 #ifndef PERL_IN_XSUB_RE
14200 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14204 const struct regexp *r = ReANY(sstr);
14205 struct regexp *ret = ReANY(dstr);
14207 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14209 npar = r->nparens+1;
14210 Newx(ret->offs, npar, regexp_paren_pair);
14211 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14213 /* no need to copy these */
14214 Newx(ret->swap, npar, regexp_paren_pair);
14217 if (ret->substrs) {
14218 /* Do it this way to avoid reading from *r after the StructCopy().
14219 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14220 cache, it doesn't matter. */
14221 const bool anchored = r->check_substr
14222 ? r->check_substr == r->anchored_substr
14223 : r->check_utf8 == r->anchored_utf8;
14224 Newx(ret->substrs, 1, struct reg_substr_data);
14225 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14227 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14228 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14229 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14230 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14232 /* check_substr and check_utf8, if non-NULL, point to either their
14233 anchored or float namesakes, and don't hold a second reference. */
14235 if (ret->check_substr) {
14237 assert(r->check_utf8 == r->anchored_utf8);
14238 ret->check_substr = ret->anchored_substr;
14239 ret->check_utf8 = ret->anchored_utf8;
14241 assert(r->check_substr == r->float_substr);
14242 assert(r->check_utf8 == r->float_utf8);
14243 ret->check_substr = ret->float_substr;
14244 ret->check_utf8 = ret->float_utf8;
14246 } else if (ret->check_utf8) {
14248 ret->check_utf8 = ret->anchored_utf8;
14250 ret->check_utf8 = ret->float_utf8;
14255 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14256 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14259 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14261 if (RX_MATCH_COPIED(dstr))
14262 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14264 ret->subbeg = NULL;
14265 #ifdef PERL_ANY_COW
14266 ret->saved_copy = NULL;
14269 /* Whether mother_re be set or no, we need to copy the string. We
14270 cannot refrain from copying it when the storage points directly to
14271 our mother regexp, because that's
14272 1: a buffer in a different thread
14273 2: something we no longer hold a reference on
14274 so we need to copy it locally. */
14275 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14276 ret->mother_re = NULL;
14279 #endif /* PERL_IN_XSUB_RE */
14284 This is the internal complement to regdupe() which is used to copy
14285 the structure pointed to by the *pprivate pointer in the regexp.
14286 This is the core version of the extension overridable cloning hook.
14287 The regexp structure being duplicated will be copied by perl prior
14288 to this and will be provided as the regexp *r argument, however
14289 with the /old/ structures pprivate pointer value. Thus this routine
14290 may override any copying normally done by perl.
14292 It returns a pointer to the new regexp_internal structure.
14296 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14299 struct regexp *const r = ReANY(rx);
14300 regexp_internal *reti;
14302 RXi_GET_DECL(r,ri);
14304 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14308 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14309 Copy(ri->program, reti->program, len+1, regnode);
14311 reti->num_code_blocks = ri->num_code_blocks;
14312 if (ri->code_blocks) {
14314 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14315 struct reg_code_block);
14316 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14317 struct reg_code_block);
14318 for (n = 0; n < ri->num_code_blocks; n++)
14319 reti->code_blocks[n].src_regex = (REGEXP*)
14320 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14323 reti->code_blocks = NULL;
14325 reti->regstclass = NULL;
14328 struct reg_data *d;
14329 const int count = ri->data->count;
14332 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14333 char, struct reg_data);
14334 Newx(d->what, count, U8);
14337 for (i = 0; i < count; i++) {
14338 d->what[i] = ri->data->what[i];
14339 switch (d->what[i]) {
14340 /* see also regcomp.h and regfree_internal() */
14341 case 'a': /* actually an AV, but the dup function is identical. */
14345 case 'u': /* actually an HV, but the dup function is identical. */
14346 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14349 /* This is cheating. */
14350 Newx(d->data[i], 1, struct regnode_charclass_class);
14351 StructCopy(ri->data->data[i], d->data[i],
14352 struct regnode_charclass_class);
14353 reti->regstclass = (regnode*)d->data[i];
14356 /* Trie stclasses are readonly and can thus be shared
14357 * without duplication. We free the stclass in pregfree
14358 * when the corresponding reg_ac_data struct is freed.
14360 reti->regstclass= ri->regstclass;
14364 ((reg_trie_data*)ri->data->data[i])->refcount++;
14369 d->data[i] = ri->data->data[i];
14372 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14381 reti->name_list_idx = ri->name_list_idx;
14383 #ifdef RE_TRACK_PATTERN_OFFSETS
14384 if (ri->u.offsets) {
14385 Newx(reti->u.offsets, 2*len+1, U32);
14386 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14389 SetProgLen(reti,len);
14392 return (void*)reti;
14395 #endif /* USE_ITHREADS */
14397 #ifndef PERL_IN_XSUB_RE
14400 - regnext - dig the "next" pointer out of a node
14403 Perl_regnext(pTHX_ regnode *p)
14411 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14412 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14415 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14424 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14427 STRLEN l1 = strlen(pat1);
14428 STRLEN l2 = strlen(pat2);
14431 const char *message;
14433 PERL_ARGS_ASSERT_RE_CROAK2;
14439 Copy(pat1, buf, l1 , char);
14440 Copy(pat2, buf + l1, l2 , char);
14441 buf[l1 + l2] = '\n';
14442 buf[l1 + l2 + 1] = '\0';
14444 /* ANSI variant takes additional second argument */
14445 va_start(args, pat2);
14449 msv = vmess(buf, &args);
14451 message = SvPV_const(msv,l1);
14454 Copy(message, buf, l1 , char);
14455 buf[l1-1] = '\0'; /* Overwrite \n */
14456 Perl_croak(aTHX_ "%s", buf);
14459 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14461 #ifndef PERL_IN_XSUB_RE
14463 Perl_save_re_context(pTHX)
14467 struct re_save_state *state;
14469 SAVEVPTR(PL_curcop);
14470 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14472 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14473 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14474 SSPUSHUV(SAVEt_RE_STATE);
14476 Copy(&PL_reg_state, state, 1, struct re_save_state);
14478 PL_reg_oldsaved = NULL;
14479 PL_reg_oldsavedlen = 0;
14480 PL_reg_oldsavedoffset = 0;
14481 PL_reg_oldsavedcoffset = 0;
14482 PL_reg_maxiter = 0;
14483 PL_reg_leftiter = 0;
14484 PL_reg_poscache = NULL;
14485 PL_reg_poscache_size = 0;
14486 #ifdef PERL_ANY_COW
14490 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14492 const REGEXP * const rx = PM_GETRE(PL_curpm);
14495 for (i = 1; i <= RX_NPARENS(rx); i++) {
14496 char digits[TYPE_CHARS(long)];
14497 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14498 GV *const *const gvp
14499 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14502 GV * const gv = *gvp;
14503 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14515 S_put_byte(pTHX_ SV *sv, int c)
14517 PERL_ARGS_ASSERT_PUT_BYTE;
14519 /* Our definition of isPRINT() ignores locales, so only bytes that are
14520 not part of UTF-8 are considered printable. I assume that the same
14521 holds for UTF-EBCDIC.
14522 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14523 which Wikipedia says:
14525 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14526 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14527 identical, to the ASCII delete (DEL) or rubout control character.
14528 ) So the old condition can be simplified to !isPRINT(c) */
14531 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14534 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14538 const char string = c;
14539 if (c == '-' || c == ']' || c == '\\' || c == '^')
14540 sv_catpvs(sv, "\\");
14541 sv_catpvn(sv, &string, 1);
14546 #define CLEAR_OPTSTART \
14547 if (optstart) STMT_START { \
14548 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14552 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14554 STATIC const regnode *
14555 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14556 const regnode *last, const regnode *plast,
14557 SV* sv, I32 indent, U32 depth)
14560 U8 op = PSEUDO; /* Arbitrary non-END op. */
14561 const regnode *next;
14562 const regnode *optstart= NULL;
14564 RXi_GET_DECL(r,ri);
14565 GET_RE_DEBUG_FLAGS_DECL;
14567 PERL_ARGS_ASSERT_DUMPUNTIL;
14569 #ifdef DEBUG_DUMPUNTIL
14570 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14571 last ? last-start : 0,plast ? plast-start : 0);
14574 if (plast && plast < last)
14577 while (PL_regkind[op] != END && (!last || node < last)) {
14578 /* While that wasn't END last time... */
14581 if (op == CLOSE || op == WHILEM)
14583 next = regnext((regnode *)node);
14586 if (OP(node) == OPTIMIZED) {
14587 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14594 regprop(r, sv, node);
14595 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14596 (int)(2*indent + 1), "", SvPVX_const(sv));
14598 if (OP(node) != OPTIMIZED) {
14599 if (next == NULL) /* Next ptr. */
14600 PerlIO_printf(Perl_debug_log, " (0)");
14601 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14602 PerlIO_printf(Perl_debug_log, " (FAIL)");
14604 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14605 (void)PerlIO_putc(Perl_debug_log, '\n');
14609 if (PL_regkind[(U8)op] == BRANCHJ) {
14612 const regnode *nnode = (OP(next) == LONGJMP
14613 ? regnext((regnode *)next)
14615 if (last && nnode > last)
14617 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14620 else if (PL_regkind[(U8)op] == BRANCH) {
14622 DUMPUNTIL(NEXTOPER(node), next);
14624 else if ( PL_regkind[(U8)op] == TRIE ) {
14625 const regnode *this_trie = node;
14626 const char op = OP(node);
14627 const U32 n = ARG(node);
14628 const reg_ac_data * const ac = op>=AHOCORASICK ?
14629 (reg_ac_data *)ri->data->data[n] :
14631 const reg_trie_data * const trie =
14632 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14634 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14636 const regnode *nextbranch= NULL;
14639 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14640 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14642 PerlIO_printf(Perl_debug_log, "%*s%s ",
14643 (int)(2*(indent+3)), "",
14644 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14645 PL_colors[0], PL_colors[1],
14646 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14647 PERL_PV_PRETTY_ELLIPSES |
14648 PERL_PV_PRETTY_LTGT
14653 U16 dist= trie->jump[word_idx+1];
14654 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14655 (UV)((dist ? this_trie + dist : next) - start));
14658 nextbranch= this_trie + trie->jump[0];
14659 DUMPUNTIL(this_trie + dist, nextbranch);
14661 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14662 nextbranch= regnext((regnode *)nextbranch);
14664 PerlIO_printf(Perl_debug_log, "\n");
14667 if (last && next > last)
14672 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14673 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14674 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14676 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14678 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14680 else if ( op == PLUS || op == STAR) {
14681 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14683 else if (PL_regkind[(U8)op] == ANYOF) {
14684 /* arglen 1 + class block */
14685 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14686 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14687 node = NEXTOPER(node);
14689 else if (PL_regkind[(U8)op] == EXACT) {
14690 /* Literal string, where present. */
14691 node += NODE_SZ_STR(node) - 1;
14692 node = NEXTOPER(node);
14695 node = NEXTOPER(node);
14696 node += regarglen[(U8)op];
14698 if (op == CURLYX || op == OPEN)
14702 #ifdef DEBUG_DUMPUNTIL
14703 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14708 #endif /* DEBUGGING */
14712 * c-indentation-style: bsd
14713 * c-basic-offset: 4
14714 * indent-tabs-mode: nil
14717 * ex: set ts=8 sts=4 sw=4 et: