5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235 * character. (There needs to be a case: in the switch statement in regexec.c
236 * for any node marked SIMPLE.) Note that this is not the same thing as
239 #define SPSTART 0x04 /* Starts with * or + */
240 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
241 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
245 /* whether trie related optimizations are enabled */
246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
247 #define TRIE_STUDY_OPT
248 #define FULL_TRIE_STUDY
254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
255 #define PBITVAL(paren) (1 << ((paren) & 7))
256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
260 /* If not already in utf8, do a longjmp back to the beginning */
261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
262 #define REQUIRE_UTF8 STMT_START { \
263 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
266 /* About scan_data_t.
268 During optimisation we recurse through the regexp program performing
269 various inplace (keyhole style) optimisations. In addition study_chunk
270 and scan_commit populate this data structure with information about
271 what strings MUST appear in the pattern. We look for the longest
272 string that must appear at a fixed location, and we look for the
273 longest string that may appear at a floating location. So for instance
278 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
279 strings (because they follow a .* construct). study_chunk will identify
280 both FOO and BAR as being the longest fixed and floating strings respectively.
282 The strings can be composites, for instance
286 will result in a composite fixed substring 'foo'.
288 For each string some basic information is maintained:
290 - offset or min_offset
291 This is the position the string must appear at, or not before.
292 It also implicitly (when combined with minlenp) tells us how many
293 characters must match before the string we are searching for.
294 Likewise when combined with minlenp and the length of the string it
295 tells us how many characters must appear after the string we have
299 Only used for floating strings. This is the rightmost point that
300 the string can appear at. If set to I32 max it indicates that the
301 string can occur infinitely far to the right.
304 A pointer to the minimum number of characters of the pattern that the
305 string was found inside. This is important as in the case of positive
306 lookahead or positive lookbehind we can have multiple patterns
311 The minimum length of the pattern overall is 3, the minimum length
312 of the lookahead part is 3, but the minimum length of the part that
313 will actually match is 1. So 'FOO's minimum length is 3, but the
314 minimum length for the F is 1. This is important as the minimum length
315 is used to determine offsets in front of and behind the string being
316 looked for. Since strings can be composites this is the length of the
317 pattern at the time it was committed with a scan_commit. Note that
318 the length is calculated by study_chunk, so that the minimum lengths
319 are not known until the full pattern has been compiled, thus the
320 pointer to the value.
324 In the case of lookbehind the string being searched for can be
325 offset past the start point of the final matching string.
326 If this value was just blithely removed from the min_offset it would
327 invalidate some of the calculations for how many chars must match
328 before or after (as they are derived from min_offset and minlen and
329 the length of the string being searched for).
330 When the final pattern is compiled and the data is moved from the
331 scan_data_t structure into the regexp structure the information
332 about lookbehind is factored in, with the information that would
333 have been lost precalculated in the end_shift field for the
336 The fields pos_min and pos_delta are used to store the minimum offset
337 and the delta to the maximum offset at the current point in the pattern.
341 typedef struct scan_data_t {
342 /*I32 len_min; unused */
343 /*I32 len_delta; unused */
347 I32 last_end; /* min value, <0 unless valid. */
350 SV **longest; /* Either &l_fixed, or &l_float. */
351 SV *longest_fixed; /* longest fixed string found in pattern */
352 I32 offset_fixed; /* offset where it starts */
353 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
354 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
355 SV *longest_float; /* longest floating string found in pattern */
356 I32 offset_float_min; /* earliest point in string it can appear */
357 I32 offset_float_max; /* latest point in string it can appear */
358 I32 *minlen_float; /* pointer to the minlen relevant to the string */
359 I32 lookbehind_float; /* is the position of the string modified by LB */
363 struct regnode_charclass_class *start_class;
367 * Forward declarations for pregcomp()'s friends.
370 static const scan_data_t zero_scan_data =
371 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
373 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
374 #define SF_BEFORE_SEOL 0x0001
375 #define SF_BEFORE_MEOL 0x0002
376 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
377 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
380 # define SF_FIX_SHIFT_EOL (0+2)
381 # define SF_FL_SHIFT_EOL (0+4)
383 # define SF_FIX_SHIFT_EOL (+2)
384 # define SF_FL_SHIFT_EOL (+4)
387 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
390 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
391 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
392 #define SF_IS_INF 0x0040
393 #define SF_HAS_PAR 0x0080
394 #define SF_IN_PAR 0x0100
395 #define SF_HAS_EVAL 0x0200
396 #define SCF_DO_SUBSTR 0x0400
397 #define SCF_DO_STCLASS_AND 0x0800
398 #define SCF_DO_STCLASS_OR 0x1000
399 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
400 #define SCF_WHILEM_VISITED_POS 0x2000
402 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
403 #define SCF_SEEN_ACCEPT 0x8000
405 #define UTF cBOOL(RExC_utf8)
407 /* The enums for all these are ordered so things work out correctly */
408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
418 #define OOB_NAMEDCLASS -1
420 /* There is no code point that is out-of-bounds, so this is problematic. But
421 * its only current use is to initialize a variable that is always set before
423 #define OOB_UNICODE 0xDEADBEEF
425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
429 /* length of regex to show in messages that don't mark a position within */
430 #define RegexLengthToShowInErrorMessages 127
433 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
434 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
435 * op/pragma/warn/regcomp.
437 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
438 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
443 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
444 * arg. Show regex, up to a maximum length. If it's too long, chop and add
447 #define _FAIL(code) STMT_START { \
448 const char *ellipses = ""; \
449 IV len = RExC_end - RExC_precomp; \
452 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
453 if (len > RegexLengthToShowInErrorMessages) { \
454 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
455 len = RegexLengthToShowInErrorMessages - 10; \
461 #define FAIL(msg) _FAIL( \
462 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
463 msg, (int)len, RExC_precomp, ellipses))
465 #define FAIL2(msg,arg) _FAIL( \
466 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
467 arg, (int)len, RExC_precomp, ellipses))
470 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
472 #define Simple_vFAIL(m) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
475 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
481 #define vFAIL(m) STMT_START { \
483 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
488 * Like Simple_vFAIL(), but accepts two arguments.
490 #define Simple_vFAIL2(m,a1) STMT_START { \
491 const IV offset = RExC_parse - RExC_precomp; \
492 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
493 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
499 #define vFAIL2(m,a1) STMT_START { \
501 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
502 Simple_vFAIL2(m, a1); \
507 * Like Simple_vFAIL(), but accepts three arguments.
509 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
512 (int)offset, RExC_precomp, RExC_precomp + offset); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
518 #define vFAIL3(m,a1,a2) STMT_START { \
520 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
521 Simple_vFAIL3(m, a1, a2); \
525 * Like Simple_vFAIL(), but accepts four arguments.
527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
528 const IV offset = RExC_parse - RExC_precomp; \
529 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
530 (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARNreg(loc,m) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define ckWARNregdep(loc,m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
543 (int)offset, RExC_precomp, RExC_precomp + offset); \
546 #define ckWARN2regdep(loc,m, a1) STMT_START { \
547 const IV offset = loc - RExC_precomp; \
548 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
550 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN2reg(loc, m, a1) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN3(loc, m, a1, a2) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
580 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 /* Allow for side effects in s */
591 #define REGC(c,s) STMT_START { \
592 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
595 /* Macros for recording node offsets. 20001227 mjd@plover.com
596 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
597 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
598 * Element 0 holds the number n.
599 * Position is 1 indexed.
601 #ifndef RE_TRACK_PATTERN_OFFSETS
602 #define Set_Node_Offset_To_R(node,byte)
603 #define Set_Node_Offset(node,byte)
604 #define Set_Cur_Node_Offset
605 #define Set_Node_Length_To_R(node,len)
606 #define Set_Node_Length(node,len)
607 #define Set_Node_Cur_Length(node)
608 #define Node_Offset(n)
609 #define Node_Length(n)
610 #define Set_Node_Offset_Length(node,offset,len)
611 #define ProgLen(ri) ri->u.proglen
612 #define SetProgLen(ri,x) ri->u.proglen = x
614 #define ProgLen(ri) ri->u.offsets[0]
615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
616 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(byte))); \
621 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
623 RExC_offsets[2*(node)-1] = (byte); \
628 #define Set_Node_Offset(node,byte) \
629 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
632 #define Set_Node_Length_To_R(node,len) STMT_START { \
634 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
635 __LINE__, (int)(node), (int)(len))); \
637 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
639 RExC_offsets[2*(node)] = (len); \
644 #define Set_Node_Length(node,len) \
645 Set_Node_Length_To_R((node)-RExC_emit_start, len)
646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
647 #define Set_Node_Cur_Length(node) \
648 Set_Node_Length(node, RExC_parse - parse_start)
650 /* Get offsets and lengths */
651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
654 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
655 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
656 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
661 #define EXPERIMENTAL_INPLACESCAN
662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
664 #define DEBUG_STUDYDATA(str,data,depth) \
665 DEBUG_OPTIMISE_MORE_r(if(data){ \
666 PerlIO_printf(Perl_debug_log, \
667 "%*s" str "Pos:%"IVdf"/%"IVdf \
668 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
669 (int)(depth)*2, "", \
670 (IV)((data)->pos_min), \
671 (IV)((data)->pos_delta), \
672 (UV)((data)->flags), \
673 (IV)((data)->whilem_c), \
674 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
675 is_inf ? "INF " : "" \
677 if ((data)->last_found) \
678 PerlIO_printf(Perl_debug_log, \
679 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
680 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
681 SvPVX_const((data)->last_found), \
682 (IV)((data)->last_end), \
683 (IV)((data)->last_start_min), \
684 (IV)((data)->last_start_max), \
685 ((data)->longest && \
686 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
687 SvPVX_const((data)->longest_fixed), \
688 (IV)((data)->offset_fixed), \
689 ((data)->longest && \
690 (data)->longest==&((data)->longest_float)) ? "*" : "", \
691 SvPVX_const((data)->longest_float), \
692 (IV)((data)->offset_float_min), \
693 (IV)((data)->offset_float_max) \
695 PerlIO_printf(Perl_debug_log,"\n"); \
698 static void clear_re(pTHX_ void *r);
700 /* Mark that we cannot extend a found fixed substring at this point.
701 Update the longest found anchored substring and the longest found
702 floating substrings if needed. */
705 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
707 const STRLEN l = CHR_SVLEN(data->last_found);
708 const STRLEN old_l = CHR_SVLEN(*data->longest);
709 GET_RE_DEBUG_FLAGS_DECL;
711 PERL_ARGS_ASSERT_SCAN_COMMIT;
713 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
714 SvSetMagicSV(*data->longest, data->last_found);
715 if (*data->longest == data->longest_fixed) {
716 data->offset_fixed = l ? data->last_start_min : data->pos_min;
717 if (data->flags & SF_BEFORE_EOL)
719 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
721 data->flags &= ~SF_FIX_BEFORE_EOL;
722 data->minlen_fixed=minlenp;
723 data->lookbehind_fixed=0;
725 else { /* *data->longest == data->longest_float */
726 data->offset_float_min = l ? data->last_start_min : data->pos_min;
727 data->offset_float_max = (l
728 ? data->last_start_max
729 : data->pos_min + data->pos_delta);
730 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
731 data->offset_float_max = I32_MAX;
732 if (data->flags & SF_BEFORE_EOL)
734 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
736 data->flags &= ~SF_FL_BEFORE_EOL;
737 data->minlen_float=minlenp;
738 data->lookbehind_float=0;
741 SvCUR_set(data->last_found, 0);
743 SV * const sv = data->last_found;
744 if (SvUTF8(sv) && SvMAGICAL(sv)) {
745 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
751 data->flags &= ~SF_BEFORE_EOL;
752 DEBUG_STUDYDATA("commit: ",data,0);
755 /* Can match anything (initialization) */
757 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
759 PERL_ARGS_ASSERT_CL_ANYTHING;
761 ANYOF_BITMAP_SETALL(cl);
762 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
763 |ANYOF_NON_UTF8_LATIN1_ALL;
765 /* If any portion of the regex is to operate under locale rules,
766 * initialization includes it. The reason this isn't done for all regexes
767 * is that the optimizer was written under the assumption that locale was
768 * all-or-nothing. Given the complexity and lack of documentation in the
769 * optimizer, and that there are inadequate test cases for locale, so many
770 * parts of it may not work properly, it is safest to avoid locale unless
772 if (RExC_contains_locale) {
773 ANYOF_CLASS_SETALL(cl); /* /l uses class */
774 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
777 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
781 /* Can match anything (initialization) */
783 S_cl_is_anything(const struct regnode_charclass_class *cl)
787 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
789 for (value = 0; value <= ANYOF_MAX; value += 2)
790 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
792 if (!(cl->flags & ANYOF_UNICODE_ALL))
794 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
799 /* Can match anything (initialization) */
801 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
803 PERL_ARGS_ASSERT_CL_INIT;
805 Zero(cl, 1, struct regnode_charclass_class);
807 cl_anything(pRExC_state, cl);
808 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
811 /* These two functions currently do the exact same thing */
812 #define cl_init_zero S_cl_init
814 /* 'AND' a given class with another one. Can create false positives. 'cl'
815 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
816 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
818 S_cl_and(struct regnode_charclass_class *cl,
819 const struct regnode_charclass_class *and_with)
821 PERL_ARGS_ASSERT_CL_AND;
823 assert(and_with->type == ANYOF);
825 /* I (khw) am not sure all these restrictions are necessary XXX */
826 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
827 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
828 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
829 && !(and_with->flags & ANYOF_LOC_FOLD)
830 && !(cl->flags & ANYOF_LOC_FOLD)) {
833 if (and_with->flags & ANYOF_INVERT)
834 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
835 cl->bitmap[i] &= ~and_with->bitmap[i];
837 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
838 cl->bitmap[i] &= and_with->bitmap[i];
839 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
841 if (and_with->flags & ANYOF_INVERT) {
843 /* Here, the and'ed node is inverted. Get the AND of the flags that
844 * aren't affected by the inversion. Those that are affected are
845 * handled individually below */
846 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
847 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
848 cl->flags |= affected_flags;
850 /* We currently don't know how to deal with things that aren't in the
851 * bitmap, but we know that the intersection is no greater than what
852 * is already in cl, so let there be false positives that get sorted
853 * out after the synthetic start class succeeds, and the node is
854 * matched for real. */
856 /* The inversion of these two flags indicate that the resulting
857 * intersection doesn't have them */
858 if (and_with->flags & ANYOF_UNICODE_ALL) {
859 cl->flags &= ~ANYOF_UNICODE_ALL;
861 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
862 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
865 else { /* and'd node is not inverted */
866 U8 outside_bitmap_but_not_utf8; /* Temp variable */
868 if (! ANYOF_NONBITMAP(and_with)) {
870 /* Here 'and_with' doesn't match anything outside the bitmap
871 * (except possibly ANYOF_UNICODE_ALL), which means the
872 * intersection can't either, except for ANYOF_UNICODE_ALL, in
873 * which case we don't know what the intersection is, but it's no
874 * greater than what cl already has, so can just leave it alone,
875 * with possible false positives */
876 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
877 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
878 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
881 else if (! ANYOF_NONBITMAP(cl)) {
883 /* Here, 'and_with' does match something outside the bitmap, and cl
884 * doesn't have a list of things to match outside the bitmap. If
885 * cl can match all code points above 255, the intersection will
886 * be those above-255 code points that 'and_with' matches. If cl
887 * can't match all Unicode code points, it means that it can't
888 * match anything outside the bitmap (since the 'if' that got us
889 * into this block tested for that), so we leave the bitmap empty.
891 if (cl->flags & ANYOF_UNICODE_ALL) {
892 ARG_SET(cl, ARG(and_with));
894 /* and_with's ARG may match things that don't require UTF8.
895 * And now cl's will too, in spite of this being an 'and'. See
896 * the comments below about the kludge */
897 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
901 /* Here, both 'and_with' and cl match something outside the
902 * bitmap. Currently we do not do the intersection, so just match
903 * whatever cl had at the beginning. */
907 /* Take the intersection of the two sets of flags. However, the
908 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
909 * kludge around the fact that this flag is not treated like the others
910 * which are initialized in cl_anything(). The way the optimizer works
911 * is that the synthetic start class (SSC) is initialized to match
912 * anything, and then the first time a real node is encountered, its
913 * values are AND'd with the SSC's with the result being the values of
914 * the real node. However, there are paths through the optimizer where
915 * the AND never gets called, so those initialized bits are set
916 * inappropriately, which is not usually a big deal, as they just cause
917 * false positives in the SSC, which will just mean a probably
918 * imperceptible slow down in execution. However this bit has a
919 * higher false positive consequence in that it can cause utf8.pm,
920 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
921 * bigger slowdown and also causes significant extra memory to be used.
922 * In order to prevent this, the code now takes a different tack. The
923 * bit isn't set unless some part of the regular expression needs it,
924 * but once set it won't get cleared. This means that these extra
925 * modules won't get loaded unless there was some path through the
926 * pattern that would have required them anyway, and so any false
927 * positives that occur by not ANDing them out when they could be
928 * aren't as severe as they would be if we treated this bit like all
930 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
931 & ANYOF_NONBITMAP_NON_UTF8;
932 cl->flags &= and_with->flags;
933 cl->flags |= outside_bitmap_but_not_utf8;
937 /* 'OR' a given class with another one. Can create false positives. 'cl'
938 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
939 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
941 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
943 PERL_ARGS_ASSERT_CL_OR;
945 if (or_with->flags & ANYOF_INVERT) {
947 /* Here, the or'd node is to be inverted. This means we take the
948 * complement of everything not in the bitmap, but currently we don't
949 * know what that is, so give up and match anything */
950 if (ANYOF_NONBITMAP(or_with)) {
951 cl_anything(pRExC_state, cl);
954 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
955 * <= (B1 | !B2) | (CL1 | !CL2)
956 * which is wasteful if CL2 is small, but we ignore CL2:
957 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
958 * XXXX Can we handle case-fold? Unclear:
959 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
960 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
962 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
963 && !(or_with->flags & ANYOF_LOC_FOLD)
964 && !(cl->flags & ANYOF_LOC_FOLD) ) {
967 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
968 cl->bitmap[i] |= ~or_with->bitmap[i];
969 } /* XXXX: logic is complicated otherwise */
971 cl_anything(pRExC_state, cl);
974 /* And, we can just take the union of the flags that aren't affected
975 * by the inversion */
976 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
978 /* For the remaining flags:
979 ANYOF_UNICODE_ALL and inverted means to not match anything above
980 255, which means that the union with cl should just be
981 what cl has in it, so can ignore this flag
982 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
983 is 127-255 to match them, but then invert that, so the
984 union with cl should just be what cl has in it, so can
987 } else { /* 'or_with' is not inverted */
988 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
989 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
990 && (!(or_with->flags & ANYOF_LOC_FOLD)
991 || (cl->flags & ANYOF_LOC_FOLD)) ) {
994 /* OR char bitmap and class bitmap separately */
995 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
996 cl->bitmap[i] |= or_with->bitmap[i];
997 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
998 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
999 cl->classflags[i] |= or_with->classflags[i];
1000 cl->flags |= ANYOF_CLASS;
1003 else { /* XXXX: logic is complicated, leave it along for a moment. */
1004 cl_anything(pRExC_state, cl);
1007 if (ANYOF_NONBITMAP(or_with)) {
1009 /* Use the added node's outside-the-bit-map match if there isn't a
1010 * conflict. If there is a conflict (both nodes match something
1011 * outside the bitmap, but what they match outside is not the same
1012 * pointer, and hence not easily compared until XXX we extend
1013 * inversion lists this far), give up and allow the start class to
1014 * match everything outside the bitmap. If that stuff is all above
1015 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1016 if (! ANYOF_NONBITMAP(cl)) {
1017 ARG_SET(cl, ARG(or_with));
1019 else if (ARG(cl) != ARG(or_with)) {
1021 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1022 cl_anything(pRExC_state, cl);
1025 cl->flags |= ANYOF_UNICODE_ALL;
1030 /* Take the union */
1031 cl->flags |= or_with->flags;
1035 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1036 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1037 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1038 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1043 dump_trie(trie,widecharmap,revcharmap)
1044 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1045 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1047 These routines dump out a trie in a somewhat readable format.
1048 The _interim_ variants are used for debugging the interim
1049 tables that are used to generate the final compressed
1050 representation which is what dump_trie expects.
1052 Part of the reason for their existence is to provide a form
1053 of documentation as to how the different representations function.
1058 Dumps the final compressed table form of the trie to Perl_debug_log.
1059 Used for debugging make_trie().
1063 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1064 AV *revcharmap, U32 depth)
1067 SV *sv=sv_newmortal();
1068 int colwidth= widecharmap ? 6 : 4;
1070 GET_RE_DEBUG_FLAGS_DECL;
1072 PERL_ARGS_ASSERT_DUMP_TRIE;
1074 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1075 (int)depth * 2 + 2,"",
1076 "Match","Base","Ofs" );
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1079 SV ** const tmp = av_fetch( revcharmap, state, 0);
1081 PerlIO_printf( Perl_debug_log, "%*s",
1083 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1084 PL_colors[0], PL_colors[1],
1085 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1086 PERL_PV_ESCAPE_FIRSTCHAR
1091 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1092 (int)depth * 2 + 2,"");
1094 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1095 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1096 PerlIO_printf( Perl_debug_log, "\n");
1098 for( state = 1 ; state < trie->statecount ; state++ ) {
1099 const U32 base = trie->states[ state ].trans.base;
1101 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1103 if ( trie->states[ state ].wordnum ) {
1104 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1106 PerlIO_printf( Perl_debug_log, "%6s", "" );
1109 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1114 while( ( base + ofs < trie->uniquecharcount ) ||
1115 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1116 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1119 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1121 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1122 if ( ( base + ofs >= trie->uniquecharcount ) &&
1123 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1124 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1126 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1128 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1130 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1134 PerlIO_printf( Perl_debug_log, "]");
1137 PerlIO_printf( Perl_debug_log, "\n" );
1139 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1140 for (word=1; word <= trie->wordcount; word++) {
1141 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1142 (int)word, (int)(trie->wordinfo[word].prev),
1143 (int)(trie->wordinfo[word].len));
1145 PerlIO_printf(Perl_debug_log, "\n" );
1148 Dumps a fully constructed but uncompressed trie in list form.
1149 List tries normally only are used for construction when the number of
1150 possible chars (trie->uniquecharcount) is very high.
1151 Used for debugging make_trie().
1154 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1155 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1159 SV *sv=sv_newmortal();
1160 int colwidth= widecharmap ? 6 : 4;
1161 GET_RE_DEBUG_FLAGS_DECL;
1163 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1165 /* print out the table precompression. */
1166 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1167 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1168 "------:-----+-----------------\n" );
1170 for( state=1 ; state < next_alloc ; state ++ ) {
1173 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1174 (int)depth * 2 + 2,"", (UV)state );
1175 if ( ! trie->states[ state ].wordnum ) {
1176 PerlIO_printf( Perl_debug_log, "%5s| ","");
1178 PerlIO_printf( Perl_debug_log, "W%4x| ",
1179 trie->states[ state ].wordnum
1182 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1183 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1185 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1187 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1188 PL_colors[0], PL_colors[1],
1189 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1190 PERL_PV_ESCAPE_FIRSTCHAR
1192 TRIE_LIST_ITEM(state,charid).forid,
1193 (UV)TRIE_LIST_ITEM(state,charid).newstate
1196 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1197 (int)((depth * 2) + 14), "");
1200 PerlIO_printf( Perl_debug_log, "\n");
1205 Dumps a fully constructed but uncompressed trie in table form.
1206 This is the normal DFA style state transition table, with a few
1207 twists to facilitate compression later.
1208 Used for debugging make_trie().
1211 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1212 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1217 SV *sv=sv_newmortal();
1218 int colwidth= widecharmap ? 6 : 4;
1219 GET_RE_DEBUG_FLAGS_DECL;
1221 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1224 print out the table precompression so that we can do a visual check
1225 that they are identical.
1228 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1230 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1233 PerlIO_printf( Perl_debug_log, "%*s",
1235 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1236 PL_colors[0], PL_colors[1],
1237 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1238 PERL_PV_ESCAPE_FIRSTCHAR
1244 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1246 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1247 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1250 PerlIO_printf( Perl_debug_log, "\n" );
1252 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1254 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1255 (int)depth * 2 + 2,"",
1256 (UV)TRIE_NODENUM( state ) );
1258 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1259 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1261 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1263 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1265 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1266 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1268 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1269 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1277 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1278 startbranch: the first branch in the whole branch sequence
1279 first : start branch of sequence of branch-exact nodes.
1280 May be the same as startbranch
1281 last : Thing following the last branch.
1282 May be the same as tail.
1283 tail : item following the branch sequence
1284 count : words in the sequence
1285 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1286 depth : indent depth
1288 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1290 A trie is an N'ary tree where the branches are determined by digital
1291 decomposition of the key. IE, at the root node you look up the 1st character and
1292 follow that branch repeat until you find the end of the branches. Nodes can be
1293 marked as "accepting" meaning they represent a complete word. Eg:
1297 would convert into the following structure. Numbers represent states, letters
1298 following numbers represent valid transitions on the letter from that state, if
1299 the number is in square brackets it represents an accepting state, otherwise it
1300 will be in parenthesis.
1302 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1306 (1) +-i->(6)-+-s->[7]
1308 +-s->(3)-+-h->(4)-+-e->[5]
1310 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1312 This shows that when matching against the string 'hers' we will begin at state 1
1313 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1314 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1315 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1316 single traverse. We store a mapping from accepting to state to which word was
1317 matched, and then when we have multiple possibilities we try to complete the
1318 rest of the regex in the order in which they occured in the alternation.
1320 The only prior NFA like behaviour that would be changed by the TRIE support is
1321 the silent ignoring of duplicate alternations which are of the form:
1323 / (DUPE|DUPE) X? (?{ ... }) Y /x
1325 Thus EVAL blocks following a trie may be called a different number of times with
1326 and without the optimisation. With the optimisations dupes will be silently
1327 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1328 the following demonstrates:
1330 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1332 which prints out 'word' three times, but
1334 'words'=~/(word|word|word)(?{ print $1 })S/
1336 which doesnt print it out at all. This is due to other optimisations kicking in.
1338 Example of what happens on a structural level:
1340 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1342 1: CURLYM[1] {1,32767}(18)
1353 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1354 and should turn into:
1356 1: CURLYM[1] {1,32767}(18)
1358 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1366 Cases where tail != last would be like /(?foo|bar)baz/:
1376 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1377 and would end up looking like:
1380 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1387 d = uvuni_to_utf8_flags(d, uv, 0);
1389 is the recommended Unicode-aware way of saying
1394 #define TRIE_STORE_REVCHAR(val) \
1397 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1398 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1399 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1400 SvCUR_set(zlopp, kapow - flrbbbbb); \
1403 av_push(revcharmap, zlopp); \
1405 char ooooff = (char)val; \
1406 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1410 #define TRIE_READ_CHAR STMT_START { \
1413 /* if it is UTF then it is either already folded, or does not need folding */ \
1414 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1416 else if (folder == PL_fold_latin1) { \
1417 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1418 if ( foldlen > 0 ) { \
1419 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1425 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1426 skiplen = UNISKIP(uvc); \
1427 foldlen -= skiplen; \
1428 scan = foldbuf + skiplen; \
1431 /* raw data, will be folded later if needed */ \
1439 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1440 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1441 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1442 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1444 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1445 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1446 TRIE_LIST_CUR( state )++; \
1449 #define TRIE_LIST_NEW(state) STMT_START { \
1450 Newxz( trie->states[ state ].trans.list, \
1451 4, reg_trie_trans_le ); \
1452 TRIE_LIST_CUR( state ) = 1; \
1453 TRIE_LIST_LEN( state ) = 4; \
1456 #define TRIE_HANDLE_WORD(state) STMT_START { \
1457 U16 dupe= trie->states[ state ].wordnum; \
1458 regnode * const noper_next = regnext( noper ); \
1461 /* store the word for dumping */ \
1463 if (OP(noper) != NOTHING) \
1464 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1466 tmp = newSVpvn_utf8( "", 0, UTF ); \
1467 av_push( trie_words, tmp ); \
1471 trie->wordinfo[curword].prev = 0; \
1472 trie->wordinfo[curword].len = wordlen; \
1473 trie->wordinfo[curword].accept = state; \
1475 if ( noper_next < tail ) { \
1477 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1478 trie->jump[curword] = (U16)(noper_next - convert); \
1480 jumper = noper_next; \
1482 nextbranch= regnext(cur); \
1486 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1487 /* chain, so that when the bits of chain are later */\
1488 /* linked together, the dups appear in the chain */\
1489 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1490 trie->wordinfo[dupe].prev = curword; \
1492 /* we haven't inserted this word yet. */ \
1493 trie->states[ state ].wordnum = curword; \
1498 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1499 ( ( base + charid >= ucharcount \
1500 && base + charid < ubound \
1501 && state == trie->trans[ base - ucharcount + charid ].check \
1502 && trie->trans[ base - ucharcount + charid ].next ) \
1503 ? trie->trans[ base - ucharcount + charid ].next \
1504 : ( state==1 ? special : 0 ) \
1508 #define MADE_JUMP_TRIE 2
1509 #define MADE_EXACT_TRIE 4
1512 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1515 /* first pass, loop through and scan words */
1516 reg_trie_data *trie;
1517 HV *widecharmap = NULL;
1518 AV *revcharmap = newAV();
1520 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1525 regnode *jumper = NULL;
1526 regnode *nextbranch = NULL;
1527 regnode *convert = NULL;
1528 U32 *prev_states; /* temp array mapping each state to previous one */
1529 /* we just use folder as a flag in utf8 */
1530 const U8 * folder = NULL;
1533 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1534 AV *trie_words = NULL;
1535 /* along with revcharmap, this only used during construction but both are
1536 * useful during debugging so we store them in the struct when debugging.
1539 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1540 STRLEN trie_charcount=0;
1542 SV *re_trie_maxbuff;
1543 GET_RE_DEBUG_FLAGS_DECL;
1545 PERL_ARGS_ASSERT_MAKE_TRIE;
1547 PERL_UNUSED_ARG(depth);
1554 case EXACTFU_TRICKYFOLD:
1555 case EXACTFU: folder = PL_fold_latin1; break;
1556 case EXACTF: folder = PL_fold; break;
1557 case EXACTFL: folder = PL_fold_locale; break;
1558 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1561 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1563 trie->startstate = 1;
1564 trie->wordcount = word_count;
1565 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1566 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1568 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1569 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1570 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1573 trie_words = newAV();
1576 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1577 if (!SvIOK(re_trie_maxbuff)) {
1578 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1580 DEBUG_TRIE_COMPILE_r({
1581 PerlIO_printf( Perl_debug_log,
1582 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1583 (int)depth * 2 + 2, "",
1584 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1585 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1589 /* Find the node we are going to overwrite */
1590 if ( first == startbranch && OP( last ) != BRANCH ) {
1591 /* whole branch chain */
1594 /* branch sub-chain */
1595 convert = NEXTOPER( first );
1598 /* -- First loop and Setup --
1600 We first traverse the branches and scan each word to determine if it
1601 contains widechars, and how many unique chars there are, this is
1602 important as we have to build a table with at least as many columns as we
1605 We use an array of integers to represent the character codes 0..255
1606 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1607 native representation of the character value as the key and IV's for the
1610 *TODO* If we keep track of how many times each character is used we can
1611 remap the columns so that the table compression later on is more
1612 efficient in terms of memory by ensuring the most common value is in the
1613 middle and the least common are on the outside. IMO this would be better
1614 than a most to least common mapping as theres a decent chance the most
1615 common letter will share a node with the least common, meaning the node
1616 will not be compressible. With a middle is most common approach the worst
1617 case is when we have the least common nodes twice.
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622 regnode *noper = NEXTOPER( cur );
1623 const U8 *uc = (U8*)STRING( noper );
1624 const U8 *e = uc + STR_LEN( noper );
1626 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1628 const U8 *scan = (U8*)NULL;
1629 U32 wordlen = 0; /* required init */
1631 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1633 if (OP(noper) == NOTHING) {
1634 regnode *noper_next= regnext(noper);
1635 if (noper_next != tail && OP(noper_next) == flags) {
1637 uc= (U8*)STRING(noper);
1638 e= uc + STR_LEN(noper);
1639 trie->minlen= STR_LEN(noper);
1646 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1647 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1648 regardless of encoding */
1649 if (OP( noper ) == EXACTFU_SS) {
1650 /* false positives are ok, so just set this */
1651 TRIE_BITMAP_SET(trie,0xDF);
1654 for ( ; uc < e ; uc += len ) {
1655 TRIE_CHARCOUNT(trie)++;
1660 U8 folded= folder[ (U8) uvc ];
1661 if ( !trie->charmap[ folded ] ) {
1662 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1663 TRIE_STORE_REVCHAR( folded );
1666 if ( !trie->charmap[ uvc ] ) {
1667 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1668 TRIE_STORE_REVCHAR( uvc );
1671 /* store the codepoint in the bitmap, and its folded
1673 TRIE_BITMAP_SET(trie, uvc);
1675 /* store the folded codepoint */
1676 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1679 /* store first byte of utf8 representation of
1680 variant codepoints */
1681 if (! UNI_IS_INVARIANT(uvc)) {
1682 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1685 set_bit = 0; /* We've done our bit :-) */
1690 widecharmap = newHV();
1692 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1695 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1697 if ( !SvTRUE( *svpp ) ) {
1698 sv_setiv( *svpp, ++trie->uniquecharcount );
1699 TRIE_STORE_REVCHAR(uvc);
1703 if( cur == first ) {
1704 trie->minlen = chars;
1705 trie->maxlen = chars;
1706 } else if (chars < trie->minlen) {
1707 trie->minlen = chars;
1708 } else if (chars > trie->maxlen) {
1709 trie->maxlen = chars;
1711 if (OP( noper ) == EXACTFU_SS) {
1712 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1713 if (trie->minlen > 1)
1716 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1717 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1718 * - We assume that any such sequence might match a 2 byte string */
1719 if (trie->minlen > 2 )
1723 } /* end first pass */
1724 DEBUG_TRIE_COMPILE_r(
1725 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1726 (int)depth * 2 + 2,"",
1727 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1728 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1729 (int)trie->minlen, (int)trie->maxlen )
1733 We now know what we are dealing with in terms of unique chars and
1734 string sizes so we can calculate how much memory a naive
1735 representation using a flat table will take. If it's over a reasonable
1736 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1737 conservative but potentially much slower representation using an array
1740 At the end we convert both representations into the same compressed
1741 form that will be used in regexec.c for matching with. The latter
1742 is a form that cannot be used to construct with but has memory
1743 properties similar to the list form and access properties similar
1744 to the table form making it both suitable for fast searches and
1745 small enough that its feasable to store for the duration of a program.
1747 See the comment in the code where the compressed table is produced
1748 inplace from the flat tabe representation for an explanation of how
1749 the compression works.
1754 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1757 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1759 Second Pass -- Array Of Lists Representation
1761 Each state will be represented by a list of charid:state records
1762 (reg_trie_trans_le) the first such element holds the CUR and LEN
1763 points of the allocated array. (See defines above).
1765 We build the initial structure using the lists, and then convert
1766 it into the compressed table form which allows faster lookups
1767 (but cant be modified once converted).
1770 STRLEN transcount = 1;
1772 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1773 "%*sCompiling trie using list compiler\n",
1774 (int)depth * 2 + 2, ""));
1776 trie->states = (reg_trie_state *)
1777 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1778 sizeof(reg_trie_state) );
1782 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1784 regnode *noper = NEXTOPER( cur );
1785 U8 *uc = (U8*)STRING( noper );
1786 const U8 *e = uc + STR_LEN( noper );
1787 U32 state = 1; /* required init */
1788 U16 charid = 0; /* sanity init */
1789 U8 *scan = (U8*)NULL; /* sanity init */
1790 STRLEN foldlen = 0; /* required init */
1791 U32 wordlen = 0; /* required init */
1792 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1795 if (OP(noper) == NOTHING) {
1796 regnode *noper_next= regnext(noper);
1797 if (noper_next != tail && OP(noper_next) == flags) {
1799 uc= (U8*)STRING(noper);
1800 e= uc + STR_LEN(noper);
1804 if (OP(noper) != NOTHING) {
1805 for ( ; uc < e ; uc += len ) {
1810 charid = trie->charmap[ uvc ];
1812 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1816 charid=(U16)SvIV( *svpp );
1819 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1826 if ( !trie->states[ state ].trans.list ) {
1827 TRIE_LIST_NEW( state );
1829 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1830 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1831 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1836 newstate = next_alloc++;
1837 prev_states[newstate] = state;
1838 TRIE_LIST_PUSH( state, charid, newstate );
1843 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1847 TRIE_HANDLE_WORD(state);
1849 } /* end second pass */
1851 /* next alloc is the NEXT state to be allocated */
1852 trie->statecount = next_alloc;
1853 trie->states = (reg_trie_state *)
1854 PerlMemShared_realloc( trie->states,
1856 * sizeof(reg_trie_state) );
1858 /* and now dump it out before we compress it */
1859 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1860 revcharmap, next_alloc,
1864 trie->trans = (reg_trie_trans *)
1865 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1872 for( state=1 ; state < next_alloc ; state ++ ) {
1876 DEBUG_TRIE_COMPILE_MORE_r(
1877 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1881 if (trie->states[state].trans.list) {
1882 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1886 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1887 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1888 if ( forid < minid ) {
1890 } else if ( forid > maxid ) {
1894 if ( transcount < tp + maxid - minid + 1) {
1896 trie->trans = (reg_trie_trans *)
1897 PerlMemShared_realloc( trie->trans,
1899 * sizeof(reg_trie_trans) );
1900 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1902 base = trie->uniquecharcount + tp - minid;
1903 if ( maxid == minid ) {
1905 for ( ; zp < tp ; zp++ ) {
1906 if ( ! trie->trans[ zp ].next ) {
1907 base = trie->uniquecharcount + zp - minid;
1908 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1909 trie->trans[ zp ].check = state;
1915 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1916 trie->trans[ tp ].check = state;
1921 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1922 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1923 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1924 trie->trans[ tid ].check = state;
1926 tp += ( maxid - minid + 1 );
1928 Safefree(trie->states[ state ].trans.list);
1931 DEBUG_TRIE_COMPILE_MORE_r(
1932 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1935 trie->states[ state ].trans.base=base;
1937 trie->lasttrans = tp + 1;
1941 Second Pass -- Flat Table Representation.
1943 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1944 We know that we will need Charcount+1 trans at most to store the data
1945 (one row per char at worst case) So we preallocate both structures
1946 assuming worst case.
1948 We then construct the trie using only the .next slots of the entry
1951 We use the .check field of the first entry of the node temporarily to
1952 make compression both faster and easier by keeping track of how many non
1953 zero fields are in the node.
1955 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1958 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1959 number representing the first entry of the node, and state as a
1960 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1961 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1962 are 2 entrys per node. eg:
1970 The table is internally in the right hand, idx form. However as we also
1971 have to deal with the states array which is indexed by nodenum we have to
1972 use TRIE_NODENUM() to convert.
1975 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1976 "%*sCompiling trie using table compiler\n",
1977 (int)depth * 2 + 2, ""));
1979 trie->trans = (reg_trie_trans *)
1980 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1981 * trie->uniquecharcount + 1,
1982 sizeof(reg_trie_trans) );
1983 trie->states = (reg_trie_state *)
1984 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1985 sizeof(reg_trie_state) );
1986 next_alloc = trie->uniquecharcount + 1;
1989 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1991 regnode *noper = NEXTOPER( cur );
1992 const U8 *uc = (U8*)STRING( noper );
1993 const U8 *e = uc + STR_LEN( noper );
1995 U32 state = 1; /* required init */
1997 U16 charid = 0; /* sanity init */
1998 U32 accept_state = 0; /* sanity init */
1999 U8 *scan = (U8*)NULL; /* sanity init */
2001 STRLEN foldlen = 0; /* required init */
2002 U32 wordlen = 0; /* required init */
2004 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2006 if (OP(noper) == NOTHING) {
2007 regnode *noper_next= regnext(noper);
2008 if (noper_next != tail && OP(noper_next) == flags) {
2010 uc= (U8*)STRING(noper);
2011 e= uc + STR_LEN(noper);
2015 if ( OP(noper) != NOTHING ) {
2016 for ( ; uc < e ; uc += len ) {
2021 charid = trie->charmap[ uvc ];
2023 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2024 charid = svpp ? (U16)SvIV(*svpp) : 0;
2028 if ( !trie->trans[ state + charid ].next ) {
2029 trie->trans[ state + charid ].next = next_alloc;
2030 trie->trans[ state ].check++;
2031 prev_states[TRIE_NODENUM(next_alloc)]
2032 = TRIE_NODENUM(state);
2033 next_alloc += trie->uniquecharcount;
2035 state = trie->trans[ state + charid ].next;
2037 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2039 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2042 accept_state = TRIE_NODENUM( state );
2043 TRIE_HANDLE_WORD(accept_state);
2045 } /* end second pass */
2047 /* and now dump it out before we compress it */
2048 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2050 next_alloc, depth+1));
2054 * Inplace compress the table.*
2056 For sparse data sets the table constructed by the trie algorithm will
2057 be mostly 0/FAIL transitions or to put it another way mostly empty.
2058 (Note that leaf nodes will not contain any transitions.)
2060 This algorithm compresses the tables by eliminating most such
2061 transitions, at the cost of a modest bit of extra work during lookup:
2063 - Each states[] entry contains a .base field which indicates the
2064 index in the state[] array wheres its transition data is stored.
2066 - If .base is 0 there are no valid transitions from that node.
2068 - If .base is nonzero then charid is added to it to find an entry in
2071 -If trans[states[state].base+charid].check!=state then the
2072 transition is taken to be a 0/Fail transition. Thus if there are fail
2073 transitions at the front of the node then the .base offset will point
2074 somewhere inside the previous nodes data (or maybe even into a node
2075 even earlier), but the .check field determines if the transition is
2079 The following process inplace converts the table to the compressed
2080 table: We first do not compress the root node 1,and mark all its
2081 .check pointers as 1 and set its .base pointer as 1 as well. This
2082 allows us to do a DFA construction from the compressed table later,
2083 and ensures that any .base pointers we calculate later are greater
2086 - We set 'pos' to indicate the first entry of the second node.
2088 - We then iterate over the columns of the node, finding the first and
2089 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2090 and set the .check pointers accordingly, and advance pos
2091 appropriately and repreat for the next node. Note that when we copy
2092 the next pointers we have to convert them from the original
2093 NODEIDX form to NODENUM form as the former is not valid post
2096 - If a node has no transitions used we mark its base as 0 and do not
2097 advance the pos pointer.
2099 - If a node only has one transition we use a second pointer into the
2100 structure to fill in allocated fail transitions from other states.
2101 This pointer is independent of the main pointer and scans forward
2102 looking for null transitions that are allocated to a state. When it
2103 finds one it writes the single transition into the "hole". If the
2104 pointer doesnt find one the single transition is appended as normal.
2106 - Once compressed we can Renew/realloc the structures to release the
2109 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2110 specifically Fig 3.47 and the associated pseudocode.
2114 const U32 laststate = TRIE_NODENUM( next_alloc );
2117 trie->statecount = laststate;
2119 for ( state = 1 ; state < laststate ; state++ ) {
2121 const U32 stateidx = TRIE_NODEIDX( state );
2122 const U32 o_used = trie->trans[ stateidx ].check;
2123 U32 used = trie->trans[ stateidx ].check;
2124 trie->trans[ stateidx ].check = 0;
2126 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2127 if ( flag || trie->trans[ stateidx + charid ].next ) {
2128 if ( trie->trans[ stateidx + charid ].next ) {
2130 for ( ; zp < pos ; zp++ ) {
2131 if ( ! trie->trans[ zp ].next ) {
2135 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2136 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2137 trie->trans[ zp ].check = state;
2138 if ( ++zp > pos ) pos = zp;
2145 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2147 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2148 trie->trans[ pos ].check = state;
2153 trie->lasttrans = pos + 1;
2154 trie->states = (reg_trie_state *)
2155 PerlMemShared_realloc( trie->states, laststate
2156 * sizeof(reg_trie_state) );
2157 DEBUG_TRIE_COMPILE_MORE_r(
2158 PerlIO_printf( Perl_debug_log,
2159 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2160 (int)depth * 2 + 2,"",
2161 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2164 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2167 } /* end table compress */
2169 DEBUG_TRIE_COMPILE_MORE_r(
2170 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2171 (int)depth * 2 + 2, "",
2172 (UV)trie->statecount,
2173 (UV)trie->lasttrans)
2175 /* resize the trans array to remove unused space */
2176 trie->trans = (reg_trie_trans *)
2177 PerlMemShared_realloc( trie->trans, trie->lasttrans
2178 * sizeof(reg_trie_trans) );
2180 { /* Modify the program and insert the new TRIE node */
2181 U8 nodetype =(U8)(flags & 0xFF);
2185 regnode *optimize = NULL;
2186 #ifdef RE_TRACK_PATTERN_OFFSETS
2189 U32 mjd_nodelen = 0;
2190 #endif /* RE_TRACK_PATTERN_OFFSETS */
2191 #endif /* DEBUGGING */
2193 This means we convert either the first branch or the first Exact,
2194 depending on whether the thing following (in 'last') is a branch
2195 or not and whther first is the startbranch (ie is it a sub part of
2196 the alternation or is it the whole thing.)
2197 Assuming its a sub part we convert the EXACT otherwise we convert
2198 the whole branch sequence, including the first.
2200 /* Find the node we are going to overwrite */
2201 if ( first != startbranch || OP( last ) == BRANCH ) {
2202 /* branch sub-chain */
2203 NEXT_OFF( first ) = (U16)(last - first);
2204 #ifdef RE_TRACK_PATTERN_OFFSETS
2206 mjd_offset= Node_Offset((convert));
2207 mjd_nodelen= Node_Length((convert));
2210 /* whole branch chain */
2212 #ifdef RE_TRACK_PATTERN_OFFSETS
2215 const regnode *nop = NEXTOPER( convert );
2216 mjd_offset= Node_Offset((nop));
2217 mjd_nodelen= Node_Length((nop));
2221 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2222 (int)depth * 2 + 2, "",
2223 (UV)mjd_offset, (UV)mjd_nodelen)
2226 /* But first we check to see if there is a common prefix we can
2227 split out as an EXACT and put in front of the TRIE node. */
2228 trie->startstate= 1;
2229 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2231 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2235 const U32 base = trie->states[ state ].trans.base;
2237 if ( trie->states[state].wordnum )
2240 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2241 if ( ( base + ofs >= trie->uniquecharcount ) &&
2242 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2243 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2245 if ( ++count > 1 ) {
2246 SV **tmp = av_fetch( revcharmap, ofs, 0);
2247 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2248 if ( state == 1 ) break;
2250 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2252 PerlIO_printf(Perl_debug_log,
2253 "%*sNew Start State=%"UVuf" Class: [",
2254 (int)depth * 2 + 2, "",
2257 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2258 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2260 TRIE_BITMAP_SET(trie,*ch);
2262 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2264 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2268 TRIE_BITMAP_SET(trie,*ch);
2270 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2271 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2277 SV **tmp = av_fetch( revcharmap, idx, 0);
2279 char *ch = SvPV( *tmp, len );
2281 SV *sv=sv_newmortal();
2282 PerlIO_printf( Perl_debug_log,
2283 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2284 (int)depth * 2 + 2, "",
2286 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2287 PL_colors[0], PL_colors[1],
2288 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2289 PERL_PV_ESCAPE_FIRSTCHAR
2294 OP( convert ) = nodetype;
2295 str=STRING(convert);
2298 STR_LEN(convert) += len;
2304 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2309 trie->prefixlen = (state-1);
2311 regnode *n = convert+NODE_SZ_STR(convert);
2312 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2313 trie->startstate = state;
2314 trie->minlen -= (state - 1);
2315 trie->maxlen -= (state - 1);
2317 /* At least the UNICOS C compiler choked on this
2318 * being argument to DEBUG_r(), so let's just have
2321 #ifdef PERL_EXT_RE_BUILD
2327 regnode *fix = convert;
2328 U32 word = trie->wordcount;
2330 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2331 while( ++fix < n ) {
2332 Set_Node_Offset_Length(fix, 0, 0);
2335 SV ** const tmp = av_fetch( trie_words, word, 0 );
2337 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2338 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2340 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2348 NEXT_OFF(convert) = (U16)(tail - convert);
2349 DEBUG_r(optimize= n);
2355 if ( trie->maxlen ) {
2356 NEXT_OFF( convert ) = (U16)(tail - convert);
2357 ARG_SET( convert, data_slot );
2358 /* Store the offset to the first unabsorbed branch in
2359 jump[0], which is otherwise unused by the jump logic.
2360 We use this when dumping a trie and during optimisation. */
2362 trie->jump[0] = (U16)(nextbranch - convert);
2364 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2365 * and there is a bitmap
2366 * and the first "jump target" node we found leaves enough room
2367 * then convert the TRIE node into a TRIEC node, with the bitmap
2368 * embedded inline in the opcode - this is hypothetically faster.
2370 if ( !trie->states[trie->startstate].wordnum
2372 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2374 OP( convert ) = TRIEC;
2375 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2376 PerlMemShared_free(trie->bitmap);
2379 OP( convert ) = TRIE;
2381 /* store the type in the flags */
2382 convert->flags = nodetype;
2386 + regarglen[ OP( convert ) ];
2388 /* XXX We really should free up the resource in trie now,
2389 as we won't use them - (which resources?) dmq */
2391 /* needed for dumping*/
2392 DEBUG_r(if (optimize) {
2393 regnode *opt = convert;
2395 while ( ++opt < optimize) {
2396 Set_Node_Offset_Length(opt,0,0);
2399 Try to clean up some of the debris left after the
2402 while( optimize < jumper ) {
2403 mjd_nodelen += Node_Length((optimize));
2404 OP( optimize ) = OPTIMIZED;
2405 Set_Node_Offset_Length(optimize,0,0);
2408 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2410 } /* end node insert */
2412 /* Finish populating the prev field of the wordinfo array. Walk back
2413 * from each accept state until we find another accept state, and if
2414 * so, point the first word's .prev field at the second word. If the
2415 * second already has a .prev field set, stop now. This will be the
2416 * case either if we've already processed that word's accept state,
2417 * or that state had multiple words, and the overspill words were
2418 * already linked up earlier.
2425 for (word=1; word <= trie->wordcount; word++) {
2427 if (trie->wordinfo[word].prev)
2429 state = trie->wordinfo[word].accept;
2431 state = prev_states[state];
2434 prev = trie->states[state].wordnum;
2438 trie->wordinfo[word].prev = prev;
2440 Safefree(prev_states);
2444 /* and now dump out the compressed format */
2445 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2447 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2449 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2450 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2452 SvREFCNT_dec(revcharmap);
2456 : trie->startstate>1
2462 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2464 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2466 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2467 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2470 We find the fail state for each state in the trie, this state is the longest proper
2471 suffix of the current state's 'word' that is also a proper prefix of another word in our
2472 trie. State 1 represents the word '' and is thus the default fail state. This allows
2473 the DFA not to have to restart after its tried and failed a word at a given point, it
2474 simply continues as though it had been matching the other word in the first place.
2476 'abcdgu'=~/abcdefg|cdgu/
2477 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2478 fail, which would bring us to the state representing 'd' in the second word where we would
2479 try 'g' and succeed, proceeding to match 'cdgu'.
2481 /* add a fail transition */
2482 const U32 trie_offset = ARG(source);
2483 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2485 const U32 ucharcount = trie->uniquecharcount;
2486 const U32 numstates = trie->statecount;
2487 const U32 ubound = trie->lasttrans + ucharcount;
2491 U32 base = trie->states[ 1 ].trans.base;
2494 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2495 GET_RE_DEBUG_FLAGS_DECL;
2497 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2499 PERL_UNUSED_ARG(depth);
2503 ARG_SET( stclass, data_slot );
2504 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2505 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2506 aho->trie=trie_offset;
2507 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2508 Copy( trie->states, aho->states, numstates, reg_trie_state );
2509 Newxz( q, numstates, U32);
2510 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2513 /* initialize fail[0..1] to be 1 so that we always have
2514 a valid final fail state */
2515 fail[ 0 ] = fail[ 1 ] = 1;
2517 for ( charid = 0; charid < ucharcount ; charid++ ) {
2518 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2520 q[ q_write ] = newstate;
2521 /* set to point at the root */
2522 fail[ q[ q_write++ ] ]=1;
2525 while ( q_read < q_write) {
2526 const U32 cur = q[ q_read++ % numstates ];
2527 base = trie->states[ cur ].trans.base;
2529 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2530 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2532 U32 fail_state = cur;
2535 fail_state = fail[ fail_state ];
2536 fail_base = aho->states[ fail_state ].trans.base;
2537 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2539 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2540 fail[ ch_state ] = fail_state;
2541 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2543 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2545 q[ q_write++ % numstates] = ch_state;
2549 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2550 when we fail in state 1, this allows us to use the
2551 charclass scan to find a valid start char. This is based on the principle
2552 that theres a good chance the string being searched contains lots of stuff
2553 that cant be a start char.
2555 fail[ 0 ] = fail[ 1 ] = 0;
2556 DEBUG_TRIE_COMPILE_r({
2557 PerlIO_printf(Perl_debug_log,
2558 "%*sStclass Failtable (%"UVuf" states): 0",
2559 (int)(depth * 2), "", (UV)numstates
2561 for( q_read=1; q_read<numstates; q_read++ ) {
2562 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2564 PerlIO_printf(Perl_debug_log, "\n");
2567 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2572 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2573 * These need to be revisited when a newer toolchain becomes available.
2575 #if defined(__sparc64__) && defined(__GNUC__)
2576 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2577 # undef SPARC64_GCC_WORKAROUND
2578 # define SPARC64_GCC_WORKAROUND 1
2582 #define DEBUG_PEEP(str,scan,depth) \
2583 DEBUG_OPTIMISE_r({if (scan){ \
2584 SV * const mysv=sv_newmortal(); \
2585 regnode *Next = regnext(scan); \
2586 regprop(RExC_rx, mysv, scan); \
2587 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2588 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2589 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2593 /* The below joins as many adjacent EXACTish nodes as possible into a single
2594 * one. The regop may be changed if the node(s) contain certain sequences that
2595 * require special handling. The joining is only done if:
2596 * 1) there is room in the current conglomerated node to entirely contain the
2598 * 2) they are the exact same node type
2600 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2601 * these get optimized out
2603 * If a node is to match under /i (folded), the number of characters it matches
2604 * can be different than its character length if it contains a multi-character
2605 * fold. *min_subtract is set to the total delta of the input nodes.
2607 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2608 * and contains LATIN SMALL LETTER SHARP S
2610 * This is as good a place as any to discuss the design of handling these
2611 * multi-character fold sequences. It's been wrong in Perl for a very long
2612 * time. There are three code points in Unicode whose multi-character folds
2613 * were long ago discovered to mess things up. The previous designs for
2614 * dealing with these involved assigning a special node for them. This
2615 * approach doesn't work, as evidenced by this example:
2616 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2617 * Both these fold to "sss", but if the pattern is parsed to create a node that
2618 * would match just the \xDF, it won't be able to handle the case where a
2619 * successful match would have to cross the node's boundary. The new approach
2620 * that hopefully generally solves the problem generates an EXACTFU_SS node
2623 * It turns out that there are problems with all multi-character folds, and not
2624 * just these three. Now the code is general, for all such cases, but the
2625 * three still have some special handling. The approach taken is:
2626 * 1) This routine examines each EXACTFish node that could contain multi-
2627 * character fold sequences. It returns in *min_subtract how much to
2628 * subtract from the the actual length of the string to get a real minimum
2629 * match length; it is 0 if there are no multi-char folds. This delta is
2630 * used by the caller to adjust the min length of the match, and the delta
2631 * between min and max, so that the optimizer doesn't reject these
2632 * possibilities based on size constraints.
2633 * 2) Certain of these sequences require special handling by the trie code,
2634 * so, if found, this code changes the joined node type to special ops:
2635 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2636 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2637 * is used for an EXACTFU node that contains at least one "ss" sequence in
2638 * it. For non-UTF-8 patterns and strings, this is the only case where
2639 * there is a possible fold length change. That means that a regular
2640 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2641 * with length changes, and so can be processed faster. regexec.c takes
2642 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2643 * pre-folded by regcomp.c. This saves effort in regex matching.
2644 * However, the pre-folding isn't done for non-UTF8 patterns because the
2645 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2646 * down by forcing the pattern into UTF8 unless necessary. Also what
2647 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2648 * possibilities for the non-UTF8 patterns are quite simple, except for
2649 * the sharp s. All the ones that don't involve a UTF-8 target string are
2650 * members of a fold-pair, and arrays are set up for all of them so that
2651 * the other member of the pair can be found quickly. Code elsewhere in
2652 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2653 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2654 * described in the next item.
2655 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2656 * 'ss' or not is not knowable at compile time. It will match iff the
2657 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2658 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2659 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2660 * described in item 3). An assumption that the optimizer part of
2661 * regexec.c (probably unwittingly) makes is that a character in the
2662 * pattern corresponds to at most a single character in the target string.
2663 * (And I do mean character, and not byte here, unlike other parts of the
2664 * documentation that have never been updated to account for multibyte
2665 * Unicode.) This assumption is wrong only in this case, as all other
2666 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2667 * virtue of having this file pre-fold UTF-8 patterns. I'm
2668 * reluctant to try to change this assumption, so instead the code punts.
2669 * This routine examines EXACTF nodes for the sharp s, and returns a
2670 * boolean indicating whether or not the node is an EXACTF node that
2671 * contains a sharp s. When it is true, the caller sets a flag that later
2672 * causes the optimizer in this file to not set values for the floating
2673 * and fixed string lengths, and thus avoids the optimizer code in
2674 * regexec.c that makes the invalid assumption. Thus, there is no
2675 * optimization based on string lengths for EXACTF nodes that contain the
2676 * sharp s. This only happens for /id rules (which means the pattern
2680 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2681 if (PL_regkind[OP(scan)] == EXACT) \
2682 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2685 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2686 /* Merge several consecutive EXACTish nodes into one. */
2687 regnode *n = regnext(scan);
2689 regnode *next = scan + NODE_SZ_STR(scan);
2693 regnode *stop = scan;
2694 GET_RE_DEBUG_FLAGS_DECL;
2696 PERL_UNUSED_ARG(depth);
2699 PERL_ARGS_ASSERT_JOIN_EXACT;
2700 #ifndef EXPERIMENTAL_INPLACESCAN
2701 PERL_UNUSED_ARG(flags);
2702 PERL_UNUSED_ARG(val);
2704 DEBUG_PEEP("join",scan,depth);
2706 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2707 * EXACT ones that are mergeable to the current one. */
2709 && (PL_regkind[OP(n)] == NOTHING
2710 || (stringok && OP(n) == OP(scan)))
2712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2715 if (OP(n) == TAIL || n > next)
2717 if (PL_regkind[OP(n)] == NOTHING) {
2718 DEBUG_PEEP("skip:",n,depth);
2719 NEXT_OFF(scan) += NEXT_OFF(n);
2720 next = n + NODE_STEP_REGNODE;
2727 else if (stringok) {
2728 const unsigned int oldl = STR_LEN(scan);
2729 regnode * const nnext = regnext(n);
2731 /* XXX I (khw) kind of doubt that this works on platforms where
2732 * U8_MAX is above 255 because of lots of other assumptions */
2733 if (oldl + STR_LEN(n) > U8_MAX)
2736 DEBUG_PEEP("merg",n,depth);
2739 NEXT_OFF(scan) += NEXT_OFF(n);
2740 STR_LEN(scan) += STR_LEN(n);
2741 next = n + NODE_SZ_STR(n);
2742 /* Now we can overwrite *n : */
2743 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752 if (flags && !NEXT_OFF(n)) {
2753 DEBUG_PEEP("atch", val, depth);
2754 if (reg_off_by_arg[OP(n)]) {
2755 ARG_SET(n, val - n);
2758 NEXT_OFF(n) = val - n;
2766 *has_exactf_sharp_s = FALSE;
2768 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2769 * can now analyze for sequences of problematic code points. (Prior to
2770 * this final joining, sequences could have been split over boundaries, and
2771 * hence missed). The sequences only happen in folding, hence for any
2772 * non-EXACT EXACTish node */
2773 if (OP(scan) != EXACT) {
2774 const U8 * const s0 = (U8*) STRING(scan);
2776 const U8 * const s_end = s0 + STR_LEN(scan);
2778 /* One pass is made over the node's string looking for all the
2779 * possibilities. to avoid some tests in the loop, there are two main
2780 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2784 /* Examine the string for a multi-character fold sequence. UTF-8
2785 * patterns have all characters pre-folded by the time this code is
2787 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2788 length sequence we are looking for is 2 */
2791 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2792 if (! len) { /* Not a multi-char fold: get next char */
2797 /* Nodes with 'ss' require special handling, except for EXACTFL
2798 * and EXACTFA for which there is no multi-char fold to this */
2799 if (len == 2 && *s == 's' && *(s+1) == 's'
2800 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2803 OP(scan) = EXACTFU_SS;
2806 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2807 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2808 COMBINING_DIAERESIS_UTF8
2809 COMBINING_ACUTE_ACCENT_UTF8,
2811 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2812 COMBINING_DIAERESIS_UTF8
2813 COMBINING_ACUTE_ACCENT_UTF8,
2818 /* These two folds require special handling by trie's, so
2819 * change the node type to indicate this. If EXACTFA and
2820 * EXACTFL were ever to be handled by trie's, this would
2821 * have to be changed. If this node has already been
2822 * changed to EXACTFU_SS in this loop, leave it as is. (I
2823 * (khw) think it doesn't matter in regexec.c for UTF
2824 * patterns, but no need to change it */
2825 if (OP(scan) == EXACTFU) {
2826 OP(scan) = EXACTFU_TRICKYFOLD;
2830 else { /* Here is a generic multi-char fold. */
2831 const U8* multi_end = s + len;
2833 /* Count how many characters in it. In the case of /l and
2834 * /aa, no folds which contain ASCII code points are
2835 * allowed, so check for those, and skip if found. (In
2836 * EXACTFL, no folds are allowed to any Latin1 code point,
2837 * not just ASCII. But there aren't any of these
2838 * currently, nor ever likely, so don't take the time to
2839 * test for them. The code that generates the
2840 * is_MULTI_foo() macros croaks should one actually get put
2841 * into Unicode .) */
2842 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2843 count = utf8_length(s, multi_end);
2847 while (s < multi_end) {
2850 goto next_iteration;
2860 /* The delta is how long the sequence is minus 1 (1 is how long
2861 * the character that folds to the sequence is) */
2862 *min_subtract += count - 1;
2866 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2868 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2869 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2870 * nodes can't have multi-char folds to this range (and there are
2871 * no existing ones in the upper latin1 range). In the EXACTF
2872 * case we look also for the sharp s, which can be in the final
2873 * position. Otherwise we can stop looking 1 byte earlier because
2874 * have to find at least two characters for a multi-fold */
2875 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2877 /* The below is perhaps overboard, but this allows us to save a
2878 * test each time through the loop at the expense of a mask. This
2879 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2880 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2881 * are 64. This uses an exclusive 'or' to find that bit and then
2882 * inverts it to form a mask, with just a single 0, in the bit
2883 * position where 'S' and 's' differ. */
2884 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2885 const U8 s_masked = 's' & S_or_s_mask;
2888 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2889 if (! len) { /* Not a multi-char fold. */
2890 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2892 *has_exactf_sharp_s = TRUE;
2899 && ((*s & S_or_s_mask) == s_masked)
2900 && ((*(s+1) & S_or_s_mask) == s_masked))
2903 /* EXACTF nodes need to know that the minimum length
2904 * changed so that a sharp s in the string can match this
2905 * ss in the pattern, but they remain EXACTF nodes, as they
2906 * won't match this unless the target string is is UTF-8,
2907 * which we don't know until runtime */
2908 if (OP(scan) != EXACTF) {
2909 OP(scan) = EXACTFU_SS;
2913 *min_subtract += len - 1;
2920 /* Allow dumping but overwriting the collection of skipped
2921 * ops and/or strings with fake optimized ops */
2922 n = scan + NODE_SZ_STR(scan);
2930 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2934 /* REx optimizer. Converts nodes into quicker variants "in place".
2935 Finds fixed substrings. */
2937 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2938 to the position after last scanned or to NULL. */
2940 #define INIT_AND_WITHP \
2941 assert(!and_withp); \
2942 Newx(and_withp,1,struct regnode_charclass_class); \
2943 SAVEFREEPV(and_withp)
2945 /* this is a chain of data about sub patterns we are processing that
2946 need to be handled separately/specially in study_chunk. Its so
2947 we can simulate recursion without losing state. */
2949 typedef struct scan_frame {
2950 regnode *last; /* last node to process in this frame */
2951 regnode *next; /* next node to process when last is reached */
2952 struct scan_frame *prev; /*previous frame*/
2953 I32 stop; /* what stopparen do we use */
2957 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2959 #define CASE_SYNST_FNC(nAmE) \
2961 if (flags & SCF_DO_STCLASS_AND) { \
2962 for (value = 0; value < 256; value++) \
2963 if (!is_ ## nAmE ## _cp(value)) \
2964 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2967 for (value = 0; value < 256; value++) \
2968 if (is_ ## nAmE ## _cp(value)) \
2969 ANYOF_BITMAP_SET(data->start_class, value); \
2973 if (flags & SCF_DO_STCLASS_AND) { \
2974 for (value = 0; value < 256; value++) \
2975 if (is_ ## nAmE ## _cp(value)) \
2976 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2979 for (value = 0; value < 256; value++) \
2980 if (!is_ ## nAmE ## _cp(value)) \
2981 ANYOF_BITMAP_SET(data->start_class, value); \
2988 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2989 I32 *minlenp, I32 *deltap,
2994 struct regnode_charclass_class *and_withp,
2995 U32 flags, U32 depth)
2996 /* scanp: Start here (read-write). */
2997 /* deltap: Write maxlen-minlen here. */
2998 /* last: Stop before this one. */
2999 /* data: string data about the pattern */
3000 /* stopparen: treat close N as END */
3001 /* recursed: which subroutines have we recursed into */
3002 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3005 I32 min = 0; /* There must be at least this number of characters to match */
3007 regnode *scan = *scanp, *next;
3009 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3010 int is_inf_internal = 0; /* The studied chunk is infinite */
3011 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3012 scan_data_t data_fake;
3013 SV *re_trie_maxbuff = NULL;
3014 regnode *first_non_open = scan;
3015 I32 stopmin = I32_MAX;
3016 scan_frame *frame = NULL;
3017 GET_RE_DEBUG_FLAGS_DECL;
3019 PERL_ARGS_ASSERT_STUDY_CHUNK;
3022 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3026 while (first_non_open && OP(first_non_open) == OPEN)
3027 first_non_open=regnext(first_non_open);
3032 while ( scan && OP(scan) != END && scan < last ){
3033 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3034 node length to get a real minimum (because
3035 the folded version may be shorter) */
3036 bool has_exactf_sharp_s = FALSE;
3037 /* Peephole optimizer: */
3038 DEBUG_STUDYDATA("Peep:", data,depth);
3039 DEBUG_PEEP("Peep",scan,depth);
3041 /* Its not clear to khw or hv why this is done here, and not in the
3042 * clauses that deal with EXACT nodes. khw's guess is that it's
3043 * because of a previous design */
3044 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3046 /* Follow the next-chain of the current node and optimize
3047 away all the NOTHINGs from it. */
3048 if (OP(scan) != CURLYX) {
3049 const int max = (reg_off_by_arg[OP(scan)]
3051 /* I32 may be smaller than U16 on CRAYs! */
3052 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3053 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3057 /* Skip NOTHING and LONGJMP. */
3058 while ((n = regnext(n))
3059 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3060 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3061 && off + noff < max)
3063 if (reg_off_by_arg[OP(scan)])
3066 NEXT_OFF(scan) = off;
3071 /* The principal pseudo-switch. Cannot be a switch, since we
3072 look into several different things. */
3073 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3074 || OP(scan) == IFTHEN) {
3075 next = regnext(scan);
3077 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3079 if (OP(next) == code || code == IFTHEN) {
3080 /* NOTE - There is similar code to this block below for handling
3081 TRIE nodes on a re-study. If you change stuff here check there
3083 I32 max1 = 0, min1 = I32_MAX, num = 0;
3084 struct regnode_charclass_class accum;
3085 regnode * const startbranch=scan;
3087 if (flags & SCF_DO_SUBSTR)
3088 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3089 if (flags & SCF_DO_STCLASS)
3090 cl_init_zero(pRExC_state, &accum);
3092 while (OP(scan) == code) {
3093 I32 deltanext, minnext, f = 0, fake;
3094 struct regnode_charclass_class this_class;
3097 data_fake.flags = 0;
3099 data_fake.whilem_c = data->whilem_c;
3100 data_fake.last_closep = data->last_closep;
3103 data_fake.last_closep = &fake;
3105 data_fake.pos_delta = delta;
3106 next = regnext(scan);
3107 scan = NEXTOPER(scan);
3109 scan = NEXTOPER(scan);
3110 if (flags & SCF_DO_STCLASS) {
3111 cl_init(pRExC_state, &this_class);
3112 data_fake.start_class = &this_class;
3113 f = SCF_DO_STCLASS_AND;
3115 if (flags & SCF_WHILEM_VISITED_POS)
3116 f |= SCF_WHILEM_VISITED_POS;
3118 /* we suppose the run is continuous, last=next...*/
3119 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3121 stopparen, recursed, NULL, f,depth+1);
3124 if (max1 < minnext + deltanext)
3125 max1 = minnext + deltanext;
3126 if (deltanext == I32_MAX)
3127 is_inf = is_inf_internal = 1;
3129 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3131 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3132 if ( stopmin > minnext)
3133 stopmin = min + min1;
3134 flags &= ~SCF_DO_SUBSTR;
3136 data->flags |= SCF_SEEN_ACCEPT;
3139 if (data_fake.flags & SF_HAS_EVAL)
3140 data->flags |= SF_HAS_EVAL;
3141 data->whilem_c = data_fake.whilem_c;
3143 if (flags & SCF_DO_STCLASS)
3144 cl_or(pRExC_state, &accum, &this_class);
3146 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3148 if (flags & SCF_DO_SUBSTR) {
3149 data->pos_min += min1;
3150 data->pos_delta += max1 - min1;
3151 if (max1 != min1 || is_inf)
3152 data->longest = &(data->longest_float);
3155 delta += max1 - min1;
3156 if (flags & SCF_DO_STCLASS_OR) {
3157 cl_or(pRExC_state, data->start_class, &accum);
3159 cl_and(data->start_class, and_withp);
3160 flags &= ~SCF_DO_STCLASS;
3163 else if (flags & SCF_DO_STCLASS_AND) {
3165 cl_and(data->start_class, &accum);
3166 flags &= ~SCF_DO_STCLASS;
3169 /* Switch to OR mode: cache the old value of
3170 * data->start_class */
3172 StructCopy(data->start_class, and_withp,
3173 struct regnode_charclass_class);
3174 flags &= ~SCF_DO_STCLASS_AND;
3175 StructCopy(&accum, data->start_class,
3176 struct regnode_charclass_class);
3177 flags |= SCF_DO_STCLASS_OR;
3178 data->start_class->flags |= ANYOF_EOS;
3182 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3185 Assuming this was/is a branch we are dealing with: 'scan' now
3186 points at the item that follows the branch sequence, whatever
3187 it is. We now start at the beginning of the sequence and look
3194 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3196 If we can find such a subsequence we need to turn the first
3197 element into a trie and then add the subsequent branch exact
3198 strings to the trie.
3202 1. patterns where the whole set of branches can be converted.
3204 2. patterns where only a subset can be converted.
3206 In case 1 we can replace the whole set with a single regop
3207 for the trie. In case 2 we need to keep the start and end
3210 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3211 becomes BRANCH TRIE; BRANCH X;
3213 There is an additional case, that being where there is a
3214 common prefix, which gets split out into an EXACT like node
3215 preceding the TRIE node.
3217 If x(1..n)==tail then we can do a simple trie, if not we make
3218 a "jump" trie, such that when we match the appropriate word
3219 we "jump" to the appropriate tail node. Essentially we turn
3220 a nested if into a case structure of sorts.
3225 if (!re_trie_maxbuff) {
3226 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3227 if (!SvIOK(re_trie_maxbuff))
3228 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3230 if ( SvIV(re_trie_maxbuff)>=0 ) {
3232 regnode *first = (regnode *)NULL;
3233 regnode *last = (regnode *)NULL;
3234 regnode *tail = scan;
3239 SV * const mysv = sv_newmortal(); /* for dumping */
3241 /* var tail is used because there may be a TAIL
3242 regop in the way. Ie, the exacts will point to the
3243 thing following the TAIL, but the last branch will
3244 point at the TAIL. So we advance tail. If we
3245 have nested (?:) we may have to move through several
3249 while ( OP( tail ) == TAIL ) {
3250 /* this is the TAIL generated by (?:) */
3251 tail = regnext( tail );
3255 DEBUG_TRIE_COMPILE_r({
3256 regprop(RExC_rx, mysv, tail );
3257 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3258 (int)depth * 2 + 2, "",
3259 "Looking for TRIE'able sequences. Tail node is: ",
3260 SvPV_nolen_const( mysv )
3266 Step through the branches
3267 cur represents each branch,
3268 noper is the first thing to be matched as part of that branch
3269 noper_next is the regnext() of that node.
3271 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3272 via a "jump trie" but we also support building with NOJUMPTRIE,
3273 which restricts the trie logic to structures like /FOO|BAR/.
3275 If noper is a trieable nodetype then the branch is a possible optimization
3276 target. If we are building under NOJUMPTRIE then we require that noper_next
3277 is the same as scan (our current position in the regex program).
3279 Once we have two or more consecutive such branches we can create a
3280 trie of the EXACT's contents and stitch it in place into the program.
3282 If the sequence represents all of the branches in the alternation we
3283 replace the entire thing with a single TRIE node.
3285 Otherwise when it is a subsequence we need to stitch it in place and
3286 replace only the relevant branches. This means the first branch has
3287 to remain as it is used by the alternation logic, and its next pointer,
3288 and needs to be repointed at the item on the branch chain following
3289 the last branch we have optimized away.
3291 This could be either a BRANCH, in which case the subsequence is internal,
3292 or it could be the item following the branch sequence in which case the
3293 subsequence is at the end (which does not necessarily mean the first node
3294 is the start of the alternation).
3296 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3299 ----------------+-----------
3303 EXACTFU_SS | EXACTFU
3304 EXACTFU_TRICKYFOLD | EXACTFU
3309 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3310 ( EXACT == (X) ) ? EXACT : \
3311 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3314 /* dont use tail as the end marker for this traverse */
3315 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3316 regnode * const noper = NEXTOPER( cur );
3317 U8 noper_type = OP( noper );
3318 U8 noper_trietype = TRIE_TYPE( noper_type );
3319 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3320 regnode * const noper_next = regnext( noper );
3321 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3322 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3325 DEBUG_TRIE_COMPILE_r({
3326 regprop(RExC_rx, mysv, cur);
3327 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3328 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3330 regprop(RExC_rx, mysv, noper);
3331 PerlIO_printf( Perl_debug_log, " -> %s",
3332 SvPV_nolen_const(mysv));
3335 regprop(RExC_rx, mysv, noper_next );
3336 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3337 SvPV_nolen_const(mysv));
3339 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3340 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3341 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3345 /* Is noper a trieable nodetype that can be merged with the
3346 * current trie (if there is one)? */
3350 ( noper_trietype == NOTHING)
3351 || ( trietype == NOTHING )
3352 || ( trietype == noper_trietype )
3355 && noper_next == tail
3359 /* Handle mergable triable node
3360 * Either we are the first node in a new trieable sequence,
3361 * in which case we do some bookkeeping, otherwise we update
3362 * the end pointer. */
3365 if ( noper_trietype == NOTHING ) {
3366 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3367 regnode * const noper_next = regnext( noper );
3368 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3369 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3372 if ( noper_next_trietype ) {
3373 trietype = noper_next_trietype;
3374 } else if (noper_next_type) {
3375 /* a NOTHING regop is 1 regop wide. We need at least two
3376 * for a trie so we can't merge this in */
3380 trietype = noper_trietype;
3383 if ( trietype == NOTHING )
3384 trietype = noper_trietype;
3389 } /* end handle mergable triable node */
3391 /* handle unmergable node -
3392 * noper may either be a triable node which can not be tried
3393 * together with the current trie, or a non triable node */
3395 /* If last is set and trietype is not NOTHING then we have found
3396 * at least two triable branch sequences in a row of a similar
3397 * trietype so we can turn them into a trie. If/when we
3398 * allow NOTHING to start a trie sequence this condition will be
3399 * required, and it isn't expensive so we leave it in for now. */
3400 if ( trietype && trietype != NOTHING )
3401 make_trie( pRExC_state,
3402 startbranch, first, cur, tail, count,
3403 trietype, depth+1 );
3404 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3408 && noper_next == tail
3411 /* noper is triable, so we can start a new trie sequence */
3414 trietype = noper_trietype;
3416 /* if we already saw a first but the current node is not triable then we have
3417 * to reset the first information. */
3422 } /* end handle unmergable node */
3423 } /* loop over branches */
3424 DEBUG_TRIE_COMPILE_r({
3425 regprop(RExC_rx, mysv, cur);
3426 PerlIO_printf( Perl_debug_log,
3427 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3428 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3431 if ( last && trietype ) {
3432 if ( trietype != NOTHING ) {
3433 /* the last branch of the sequence was part of a trie,
3434 * so we have to construct it here outside of the loop
3436 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3437 #ifdef TRIE_STUDY_OPT
3438 if ( ((made == MADE_EXACT_TRIE &&
3439 startbranch == first)
3440 || ( first_non_open == first )) &&
3442 flags |= SCF_TRIE_RESTUDY;
3443 if ( startbranch == first
3446 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3451 /* at this point we know whatever we have is a NOTHING sequence/branch
3452 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3454 if ( startbranch == first ) {
3456 /* the entire thing is a NOTHING sequence, something like this:
3457 * (?:|) So we can turn it into a plain NOTHING op. */
3458 DEBUG_TRIE_COMPILE_r({
3459 regprop(RExC_rx, mysv, cur);
3460 PerlIO_printf( Perl_debug_log,
3461 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3462 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3465 OP(startbranch)= NOTHING;
3466 NEXT_OFF(startbranch)= tail - startbranch;
3467 for ( opt= startbranch + 1; opt < tail ; opt++ )
3471 } /* end if ( last) */
3472 } /* TRIE_MAXBUF is non zero */
3477 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3478 scan = NEXTOPER(NEXTOPER(scan));
3479 } else /* single branch is optimized. */
3480 scan = NEXTOPER(scan);
3482 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3483 scan_frame *newframe = NULL;
3488 if (OP(scan) != SUSPEND) {
3489 /* set the pointer */
3490 if (OP(scan) == GOSUB) {
3492 RExC_recurse[ARG2L(scan)] = scan;
3493 start = RExC_open_parens[paren-1];
3494 end = RExC_close_parens[paren-1];
3497 start = RExC_rxi->program + 1;
3501 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3502 SAVEFREEPV(recursed);
3504 if (!PAREN_TEST(recursed,paren+1)) {
3505 PAREN_SET(recursed,paren+1);
3506 Newx(newframe,1,scan_frame);
3508 if (flags & SCF_DO_SUBSTR) {
3509 SCAN_COMMIT(pRExC_state,data,minlenp);
3510 data->longest = &(data->longest_float);
3512 is_inf = is_inf_internal = 1;
3513 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3514 cl_anything(pRExC_state, data->start_class);
3515 flags &= ~SCF_DO_STCLASS;
3518 Newx(newframe,1,scan_frame);
3521 end = regnext(scan);
3526 SAVEFREEPV(newframe);
3527 newframe->next = regnext(scan);
3528 newframe->last = last;
3529 newframe->stop = stopparen;
3530 newframe->prev = frame;
3540 else if (OP(scan) == EXACT) {
3541 I32 l = STR_LEN(scan);
3544 const U8 * const s = (U8*)STRING(scan);
3545 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3546 l = utf8_length(s, s + l);
3548 uc = *((U8*)STRING(scan));
3551 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3552 /* The code below prefers earlier match for fixed
3553 offset, later match for variable offset. */
3554 if (data->last_end == -1) { /* Update the start info. */
3555 data->last_start_min = data->pos_min;
3556 data->last_start_max = is_inf
3557 ? I32_MAX : data->pos_min + data->pos_delta;
3559 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3561 SvUTF8_on(data->last_found);
3563 SV * const sv = data->last_found;
3564 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3565 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3566 if (mg && mg->mg_len >= 0)
3567 mg->mg_len += utf8_length((U8*)STRING(scan),
3568 (U8*)STRING(scan)+STR_LEN(scan));
3570 data->last_end = data->pos_min + l;
3571 data->pos_min += l; /* As in the first entry. */
3572 data->flags &= ~SF_BEFORE_EOL;
3574 if (flags & SCF_DO_STCLASS_AND) {
3575 /* Check whether it is compatible with what we know already! */
3579 /* If compatible, we or it in below. It is compatible if is
3580 * in the bitmp and either 1) its bit or its fold is set, or 2)
3581 * it's for a locale. Even if there isn't unicode semantics
3582 * here, at runtime there may be because of matching against a
3583 * utf8 string, so accept a possible false positive for
3584 * latin1-range folds */
3586 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3587 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3588 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3589 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3594 ANYOF_CLASS_ZERO(data->start_class);
3595 ANYOF_BITMAP_ZERO(data->start_class);
3597 ANYOF_BITMAP_SET(data->start_class, uc);
3598 else if (uc >= 0x100) {
3601 /* Some Unicode code points fold to the Latin1 range; as
3602 * XXX temporary code, instead of figuring out if this is
3603 * one, just assume it is and set all the start class bits
3604 * that could be some such above 255 code point's fold
3605 * which will generate fals positives. As the code
3606 * elsewhere that does compute the fold settles down, it
3607 * can be extracted out and re-used here */
3608 for (i = 0; i < 256; i++){
3609 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3610 ANYOF_BITMAP_SET(data->start_class, i);
3614 data->start_class->flags &= ~ANYOF_EOS;
3616 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3618 else if (flags & SCF_DO_STCLASS_OR) {
3619 /* false positive possible if the class is case-folded */
3621 ANYOF_BITMAP_SET(data->start_class, uc);
3623 data->start_class->flags |= ANYOF_UNICODE_ALL;
3624 data->start_class->flags &= ~ANYOF_EOS;
3625 cl_and(data->start_class, and_withp);
3627 flags &= ~SCF_DO_STCLASS;
3629 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3630 I32 l = STR_LEN(scan);
3631 UV uc = *((U8*)STRING(scan));
3633 /* Search for fixed substrings supports EXACT only. */
3634 if (flags & SCF_DO_SUBSTR) {
3636 SCAN_COMMIT(pRExC_state, data, minlenp);
3639 const U8 * const s = (U8 *)STRING(scan);
3640 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3641 l = utf8_length(s, s + l);
3643 if (has_exactf_sharp_s) {
3644 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3646 min += l - min_subtract;
3648 delta += min_subtract;
3649 if (flags & SCF_DO_SUBSTR) {
3650 data->pos_min += l - min_subtract;
3651 if (data->pos_min < 0) {
3654 data->pos_delta += min_subtract;
3656 data->longest = &(data->longest_float);
3659 if (flags & SCF_DO_STCLASS_AND) {
3660 /* Check whether it is compatible with what we know already! */
3663 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3664 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3665 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3669 ANYOF_CLASS_ZERO(data->start_class);
3670 ANYOF_BITMAP_ZERO(data->start_class);
3672 ANYOF_BITMAP_SET(data->start_class, uc);
3673 data->start_class->flags &= ~ANYOF_EOS;
3674 if (OP(scan) == EXACTFL) {
3675 /* XXX This set is probably no longer necessary, and
3676 * probably wrong as LOCALE now is on in the initial
3678 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3682 /* Also set the other member of the fold pair. In case
3683 * that unicode semantics is called for at runtime, use
3684 * the full latin1 fold. (Can't do this for locale,
3685 * because not known until runtime) */
3686 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3688 /* All other (EXACTFL handled above) folds except under
3689 * /iaa that include s, S, and sharp_s also may include
3691 if (OP(scan) != EXACTFA) {
3692 if (uc == 's' || uc == 'S') {
3693 ANYOF_BITMAP_SET(data->start_class,
3694 LATIN_SMALL_LETTER_SHARP_S);
3696 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3697 ANYOF_BITMAP_SET(data->start_class, 's');
3698 ANYOF_BITMAP_SET(data->start_class, 'S');
3703 else if (uc >= 0x100) {
3705 for (i = 0; i < 256; i++){
3706 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3707 ANYOF_BITMAP_SET(data->start_class, i);
3712 else if (flags & SCF_DO_STCLASS_OR) {
3713 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3714 /* false positive possible if the class is case-folded.
3715 Assume that the locale settings are the same... */
3717 ANYOF_BITMAP_SET(data->start_class, uc);
3718 if (OP(scan) != EXACTFL) {
3720 /* And set the other member of the fold pair, but
3721 * can't do that in locale because not known until
3723 ANYOF_BITMAP_SET(data->start_class,
3724 PL_fold_latin1[uc]);
3726 /* All folds except under /iaa that include s, S,
3727 * and sharp_s also may include the others */
3728 if (OP(scan) != EXACTFA) {
3729 if (uc == 's' || uc == 'S') {
3730 ANYOF_BITMAP_SET(data->start_class,
3731 LATIN_SMALL_LETTER_SHARP_S);
3733 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3734 ANYOF_BITMAP_SET(data->start_class, 's');
3735 ANYOF_BITMAP_SET(data->start_class, 'S');
3740 data->start_class->flags &= ~ANYOF_EOS;
3742 cl_and(data->start_class, and_withp);
3744 flags &= ~SCF_DO_STCLASS;
3746 else if (REGNODE_VARIES(OP(scan))) {
3747 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3748 I32 f = flags, pos_before = 0;
3749 regnode * const oscan = scan;
3750 struct regnode_charclass_class this_class;
3751 struct regnode_charclass_class *oclass = NULL;
3752 I32 next_is_eval = 0;
3754 switch (PL_regkind[OP(scan)]) {
3755 case WHILEM: /* End of (?:...)* . */
3756 scan = NEXTOPER(scan);
3759 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3760 next = NEXTOPER(scan);
3761 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3763 maxcount = REG_INFTY;
3764 next = regnext(scan);
3765 scan = NEXTOPER(scan);
3769 if (flags & SCF_DO_SUBSTR)
3774 if (flags & SCF_DO_STCLASS) {
3776 maxcount = REG_INFTY;
3777 next = regnext(scan);
3778 scan = NEXTOPER(scan);
3781 is_inf = is_inf_internal = 1;
3782 scan = regnext(scan);
3783 if (flags & SCF_DO_SUBSTR) {
3784 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3785 data->longest = &(data->longest_float);
3787 goto optimize_curly_tail;
3789 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3790 && (scan->flags == stopparen))
3795 mincount = ARG1(scan);
3796 maxcount = ARG2(scan);
3798 next = regnext(scan);
3799 if (OP(scan) == CURLYX) {
3800 I32 lp = (data ? *(data->last_closep) : 0);
3801 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3803 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3804 next_is_eval = (OP(scan) == EVAL);
3806 if (flags & SCF_DO_SUBSTR) {
3807 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3808 pos_before = data->pos_min;
3812 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3814 data->flags |= SF_IS_INF;
3816 if (flags & SCF_DO_STCLASS) {
3817 cl_init(pRExC_state, &this_class);
3818 oclass = data->start_class;
3819 data->start_class = &this_class;
3820 f |= SCF_DO_STCLASS_AND;
3821 f &= ~SCF_DO_STCLASS_OR;
3823 /* Exclude from super-linear cache processing any {n,m}
3824 regops for which the combination of input pos and regex
3825 pos is not enough information to determine if a match
3828 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3829 regex pos at the \s*, the prospects for a match depend not
3830 only on the input position but also on how many (bar\s*)
3831 repeats into the {4,8} we are. */
3832 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3833 f &= ~SCF_WHILEM_VISITED_POS;
3835 /* This will finish on WHILEM, setting scan, or on NULL: */
3836 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3837 last, data, stopparen, recursed, NULL,
3839 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3841 if (flags & SCF_DO_STCLASS)
3842 data->start_class = oclass;
3843 if (mincount == 0 || minnext == 0) {
3844 if (flags & SCF_DO_STCLASS_OR) {
3845 cl_or(pRExC_state, data->start_class, &this_class);
3847 else if (flags & SCF_DO_STCLASS_AND) {
3848 /* Switch to OR mode: cache the old value of
3849 * data->start_class */
3851 StructCopy(data->start_class, and_withp,
3852 struct regnode_charclass_class);
3853 flags &= ~SCF_DO_STCLASS_AND;
3854 StructCopy(&this_class, data->start_class,
3855 struct regnode_charclass_class);
3856 flags |= SCF_DO_STCLASS_OR;
3857 data->start_class->flags |= ANYOF_EOS;
3859 } else { /* Non-zero len */
3860 if (flags & SCF_DO_STCLASS_OR) {
3861 cl_or(pRExC_state, data->start_class, &this_class);
3862 cl_and(data->start_class, and_withp);
3864 else if (flags & SCF_DO_STCLASS_AND)
3865 cl_and(data->start_class, &this_class);
3866 flags &= ~SCF_DO_STCLASS;
3868 if (!scan) /* It was not CURLYX, but CURLY. */
3870 if ( /* ? quantifier ok, except for (?{ ... }) */
3871 (next_is_eval || !(mincount == 0 && maxcount == 1))
3872 && (minnext == 0) && (deltanext == 0)
3873 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3874 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3876 ckWARNreg(RExC_parse,
3877 "Quantifier unexpected on zero-length expression");
3880 min += minnext * mincount;
3881 is_inf_internal |= ((maxcount == REG_INFTY
3882 && (minnext + deltanext) > 0)
3883 || deltanext == I32_MAX);
3884 is_inf |= is_inf_internal;
3885 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3887 /* Try powerful optimization CURLYX => CURLYN. */
3888 if ( OP(oscan) == CURLYX && data
3889 && data->flags & SF_IN_PAR
3890 && !(data->flags & SF_HAS_EVAL)
3891 && !deltanext && minnext == 1 ) {
3892 /* Try to optimize to CURLYN. */
3893 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3894 regnode * const nxt1 = nxt;
3901 if (!REGNODE_SIMPLE(OP(nxt))
3902 && !(PL_regkind[OP(nxt)] == EXACT
3903 && STR_LEN(nxt) == 1))
3909 if (OP(nxt) != CLOSE)
3911 if (RExC_open_parens) {
3912 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3913 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3915 /* Now we know that nxt2 is the only contents: */
3916 oscan->flags = (U8)ARG(nxt);
3918 OP(nxt1) = NOTHING; /* was OPEN. */
3921 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3922 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3923 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3924 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3925 OP(nxt + 1) = OPTIMIZED; /* was count. */
3926 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3931 /* Try optimization CURLYX => CURLYM. */
3932 if ( OP(oscan) == CURLYX && data
3933 && !(data->flags & SF_HAS_PAR)
3934 && !(data->flags & SF_HAS_EVAL)
3935 && !deltanext /* atom is fixed width */
3936 && minnext != 0 /* CURLYM can't handle zero width */
3937 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3939 /* XXXX How to optimize if data == 0? */
3940 /* Optimize to a simpler form. */
3941 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3945 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3946 && (OP(nxt2) != WHILEM))
3948 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3949 /* Need to optimize away parenths. */
3950 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3951 /* Set the parenth number. */
3952 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3954 oscan->flags = (U8)ARG(nxt);
3955 if (RExC_open_parens) {
3956 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3957 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3959 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3960 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3963 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3964 OP(nxt + 1) = OPTIMIZED; /* was count. */
3965 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3966 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3969 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3970 regnode *nnxt = regnext(nxt1);
3972 if (reg_off_by_arg[OP(nxt1)])
3973 ARG_SET(nxt1, nxt2 - nxt1);
3974 else if (nxt2 - nxt1 < U16_MAX)
3975 NEXT_OFF(nxt1) = nxt2 - nxt1;
3977 OP(nxt) = NOTHING; /* Cannot beautify */
3982 /* Optimize again: */
3983 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3984 NULL, stopparen, recursed, NULL, 0,depth+1);
3989 else if ((OP(oscan) == CURLYX)
3990 && (flags & SCF_WHILEM_VISITED_POS)
3991 /* See the comment on a similar expression above.
3992 However, this time it's not a subexpression
3993 we care about, but the expression itself. */
3994 && (maxcount == REG_INFTY)
3995 && data && ++data->whilem_c < 16) {
3996 /* This stays as CURLYX, we can put the count/of pair. */
3997 /* Find WHILEM (as in regexec.c) */
3998 regnode *nxt = oscan + NEXT_OFF(oscan);
4000 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4002 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4003 | (RExC_whilem_seen << 4)); /* On WHILEM */
4005 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4007 if (flags & SCF_DO_SUBSTR) {
4008 SV *last_str = NULL;
4009 int counted = mincount != 0;
4011 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4012 #if defined(SPARC64_GCC_WORKAROUND)
4015 const char *s = NULL;
4018 if (pos_before >= data->last_start_min)
4021 b = data->last_start_min;
4024 s = SvPV_const(data->last_found, l);
4025 old = b - data->last_start_min;
4028 I32 b = pos_before >= data->last_start_min
4029 ? pos_before : data->last_start_min;
4031 const char * const s = SvPV_const(data->last_found, l);
4032 I32 old = b - data->last_start_min;
4036 old = utf8_hop((U8*)s, old) - (U8*)s;
4038 /* Get the added string: */
4039 last_str = newSVpvn_utf8(s + old, l, UTF);
4040 if (deltanext == 0 && pos_before == b) {
4041 /* What was added is a constant string */
4043 SvGROW(last_str, (mincount * l) + 1);
4044 repeatcpy(SvPVX(last_str) + l,
4045 SvPVX_const(last_str), l, mincount - 1);
4046 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4047 /* Add additional parts. */
4048 SvCUR_set(data->last_found,
4049 SvCUR(data->last_found) - l);
4050 sv_catsv(data->last_found, last_str);
4052 SV * sv = data->last_found;
4054 SvUTF8(sv) && SvMAGICAL(sv) ?
4055 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4056 if (mg && mg->mg_len >= 0)
4057 mg->mg_len += CHR_SVLEN(last_str) - l;
4059 data->last_end += l * (mincount - 1);
4062 /* start offset must point into the last copy */
4063 data->last_start_min += minnext * (mincount - 1);
4064 data->last_start_max += is_inf ? I32_MAX
4065 : (maxcount - 1) * (minnext + data->pos_delta);
4068 /* It is counted once already... */
4069 data->pos_min += minnext * (mincount - counted);
4070 data->pos_delta += - counted * deltanext +
4071 (minnext + deltanext) * maxcount - minnext * mincount;
4072 if (mincount != maxcount) {
4073 /* Cannot extend fixed substrings found inside
4075 SCAN_COMMIT(pRExC_state,data,minlenp);
4076 if (mincount && last_str) {
4077 SV * const sv = data->last_found;
4078 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4079 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4083 sv_setsv(sv, last_str);
4084 data->last_end = data->pos_min;
4085 data->last_start_min =
4086 data->pos_min - CHR_SVLEN(last_str);
4087 data->last_start_max = is_inf
4089 : data->pos_min + data->pos_delta
4090 - CHR_SVLEN(last_str);
4092 data->longest = &(data->longest_float);
4094 SvREFCNT_dec(last_str);
4096 if (data && (fl & SF_HAS_EVAL))
4097 data->flags |= SF_HAS_EVAL;
4098 optimize_curly_tail:
4099 if (OP(oscan) != CURLYX) {
4100 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4102 NEXT_OFF(oscan) += NEXT_OFF(next);
4105 default: /* REF, ANYOFV, and CLUMP only? */
4106 if (flags & SCF_DO_SUBSTR) {
4107 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4108 data->longest = &(data->longest_float);
4110 is_inf = is_inf_internal = 1;
4111 if (flags & SCF_DO_STCLASS_OR)
4112 cl_anything(pRExC_state, data->start_class);
4113 flags &= ~SCF_DO_STCLASS;
4117 else if (OP(scan) == LNBREAK) {
4118 if (flags & SCF_DO_STCLASS) {
4120 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4121 if (flags & SCF_DO_STCLASS_AND) {
4122 for (value = 0; value < 256; value++)
4123 if (!is_VERTWS_cp(value))
4124 ANYOF_BITMAP_CLEAR(data->start_class, value);
4127 for (value = 0; value < 256; value++)
4128 if (is_VERTWS_cp(value))
4129 ANYOF_BITMAP_SET(data->start_class, value);
4131 if (flags & SCF_DO_STCLASS_OR)
4132 cl_and(data->start_class, and_withp);
4133 flags &= ~SCF_DO_STCLASS;
4136 delta++; /* Because of the 2 char string cr-lf */
4137 if (flags & SCF_DO_SUBSTR) {
4138 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4140 data->pos_delta += 1;
4141 data->longest = &(data->longest_float);
4144 else if (REGNODE_SIMPLE(OP(scan))) {
4147 if (flags & SCF_DO_SUBSTR) {
4148 SCAN_COMMIT(pRExC_state,data,minlenp);
4152 if (flags & SCF_DO_STCLASS) {
4153 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4155 /* Some of the logic below assumes that switching
4156 locale on will only add false positives. */
4157 switch (PL_regkind[OP(scan)]) {
4161 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4162 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4163 cl_anything(pRExC_state, data->start_class);
4166 if (OP(scan) == SANY)
4168 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4169 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4170 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4171 cl_anything(pRExC_state, data->start_class);
4173 if (flags & SCF_DO_STCLASS_AND || !value)
4174 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4177 if (flags & SCF_DO_STCLASS_AND)
4178 cl_and(data->start_class,
4179 (struct regnode_charclass_class*)scan);
4181 cl_or(pRExC_state, data->start_class,
4182 (struct regnode_charclass_class*)scan);
4185 if (flags & SCF_DO_STCLASS_AND) {
4186 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4187 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4188 if (OP(scan) == ALNUMU) {
4189 for (value = 0; value < 256; value++) {
4190 if (!isWORDCHAR_L1(value)) {
4191 ANYOF_BITMAP_CLEAR(data->start_class, value);
4195 for (value = 0; value < 256; value++) {
4196 if (!isALNUM(value)) {
4197 ANYOF_BITMAP_CLEAR(data->start_class, value);
4204 if (data->start_class->flags & ANYOF_LOCALE)
4205 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4207 /* Even if under locale, set the bits for non-locale
4208 * in case it isn't a true locale-node. This will
4209 * create false positives if it truly is locale */
4210 if (OP(scan) == ALNUMU) {
4211 for (value = 0; value < 256; value++) {
4212 if (isWORDCHAR_L1(value)) {
4213 ANYOF_BITMAP_SET(data->start_class, value);
4217 for (value = 0; value < 256; value++) {
4218 if (isALNUM(value)) {
4219 ANYOF_BITMAP_SET(data->start_class, value);
4226 if (flags & SCF_DO_STCLASS_AND) {
4227 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4228 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4229 if (OP(scan) == NALNUMU) {
4230 for (value = 0; value < 256; value++) {
4231 if (isWORDCHAR_L1(value)) {
4232 ANYOF_BITMAP_CLEAR(data->start_class, value);
4236 for (value = 0; value < 256; value++) {
4237 if (isALNUM(value)) {
4238 ANYOF_BITMAP_CLEAR(data->start_class, value);
4245 if (data->start_class->flags & ANYOF_LOCALE)
4246 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4248 /* Even if under locale, set the bits for non-locale in
4249 * case it isn't a true locale-node. This will create
4250 * false positives if it truly is locale */
4251 if (OP(scan) == NALNUMU) {
4252 for (value = 0; value < 256; value++) {
4253 if (! isWORDCHAR_L1(value)) {
4254 ANYOF_BITMAP_SET(data->start_class, value);
4258 for (value = 0; value < 256; value++) {
4259 if (! isALNUM(value)) {
4260 ANYOF_BITMAP_SET(data->start_class, value);
4267 if (flags & SCF_DO_STCLASS_AND) {
4268 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4269 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4270 if (OP(scan) == SPACEU) {
4271 for (value = 0; value < 256; value++) {
4272 if (!isSPACE_L1(value)) {
4273 ANYOF_BITMAP_CLEAR(data->start_class, value);
4277 for (value = 0; value < 256; value++) {
4278 if (!isSPACE(value)) {
4279 ANYOF_BITMAP_CLEAR(data->start_class, value);
4286 if (data->start_class->flags & ANYOF_LOCALE) {
4287 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4289 if (OP(scan) == SPACEU) {
4290 for (value = 0; value < 256; value++) {
4291 if (isSPACE_L1(value)) {
4292 ANYOF_BITMAP_SET(data->start_class, value);
4296 for (value = 0; value < 256; value++) {
4297 if (isSPACE(value)) {
4298 ANYOF_BITMAP_SET(data->start_class, value);
4305 if (flags & SCF_DO_STCLASS_AND) {
4306 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4307 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4308 if (OP(scan) == NSPACEU) {
4309 for (value = 0; value < 256; value++) {
4310 if (isSPACE_L1(value)) {
4311 ANYOF_BITMAP_CLEAR(data->start_class, value);
4315 for (value = 0; value < 256; value++) {
4316 if (isSPACE(value)) {
4317 ANYOF_BITMAP_CLEAR(data->start_class, value);
4324 if (data->start_class->flags & ANYOF_LOCALE)
4325 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4326 if (OP(scan) == NSPACEU) {
4327 for (value = 0; value < 256; value++) {
4328 if (!isSPACE_L1(value)) {
4329 ANYOF_BITMAP_SET(data->start_class, value);
4334 for (value = 0; value < 256; value++) {
4335 if (!isSPACE(value)) {
4336 ANYOF_BITMAP_SET(data->start_class, value);
4343 if (flags & SCF_DO_STCLASS_AND) {
4344 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4345 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4346 for (value = 0; value < 256; value++)
4347 if (!isDIGIT(value))
4348 ANYOF_BITMAP_CLEAR(data->start_class, value);
4352 if (data->start_class->flags & ANYOF_LOCALE)
4353 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4354 for (value = 0; value < 256; value++)
4356 ANYOF_BITMAP_SET(data->start_class, value);
4360 if (flags & SCF_DO_STCLASS_AND) {
4361 if (!(data->start_class->flags & ANYOF_LOCALE))
4362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4363 for (value = 0; value < 256; value++)
4365 ANYOF_BITMAP_CLEAR(data->start_class, value);
4368 if (data->start_class->flags & ANYOF_LOCALE)
4369 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4370 for (value = 0; value < 256; value++)
4371 if (!isDIGIT(value))
4372 ANYOF_BITMAP_SET(data->start_class, value);
4375 CASE_SYNST_FNC(VERTWS);
4376 CASE_SYNST_FNC(HORIZWS);
4379 if (flags & SCF_DO_STCLASS_OR)
4380 cl_and(data->start_class, and_withp);
4381 flags &= ~SCF_DO_STCLASS;
4384 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4385 data->flags |= (OP(scan) == MEOL
4388 SCAN_COMMIT(pRExC_state, data, minlenp);
4391 else if ( PL_regkind[OP(scan)] == BRANCHJ
4392 /* Lookbehind, or need to calculate parens/evals/stclass: */
4393 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4394 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4395 if ( OP(scan) == UNLESSM &&
4397 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4398 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4401 regnode *upto= regnext(scan);
4403 SV * const mysv_val=sv_newmortal();
4404 DEBUG_STUDYDATA("OPFAIL",data,depth);
4406 /*DEBUG_PARSE_MSG("opfail");*/
4407 regprop(RExC_rx, mysv_val, upto);
4408 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4409 SvPV_nolen_const(mysv_val),
4410 (IV)REG_NODE_NUM(upto),
4415 NEXT_OFF(scan) = upto - scan;
4416 for (opt= scan + 1; opt < upto ; opt++)
4417 OP(opt) = OPTIMIZED;
4421 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4422 || OP(scan) == UNLESSM )
4424 /* Negative Lookahead/lookbehind
4425 In this case we can't do fixed string optimisation.
4428 I32 deltanext, minnext, fake = 0;
4430 struct regnode_charclass_class intrnl;
4433 data_fake.flags = 0;
4435 data_fake.whilem_c = data->whilem_c;
4436 data_fake.last_closep = data->last_closep;
4439 data_fake.last_closep = &fake;
4440 data_fake.pos_delta = delta;
4441 if ( flags & SCF_DO_STCLASS && !scan->flags
4442 && OP(scan) == IFMATCH ) { /* Lookahead */
4443 cl_init(pRExC_state, &intrnl);
4444 data_fake.start_class = &intrnl;
4445 f |= SCF_DO_STCLASS_AND;
4447 if (flags & SCF_WHILEM_VISITED_POS)
4448 f |= SCF_WHILEM_VISITED_POS;
4449 next = regnext(scan);
4450 nscan = NEXTOPER(NEXTOPER(scan));
4451 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4452 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4455 FAIL("Variable length lookbehind not implemented");
4457 else if (minnext > (I32)U8_MAX) {
4458 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4460 scan->flags = (U8)minnext;
4463 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4465 if (data_fake.flags & SF_HAS_EVAL)
4466 data->flags |= SF_HAS_EVAL;
4467 data->whilem_c = data_fake.whilem_c;
4469 if (f & SCF_DO_STCLASS_AND) {
4470 if (flags & SCF_DO_STCLASS_OR) {
4471 /* OR before, AND after: ideally we would recurse with
4472 * data_fake to get the AND applied by study of the
4473 * remainder of the pattern, and then derecurse;
4474 * *** HACK *** for now just treat as "no information".
4475 * See [perl #56690].
4477 cl_init(pRExC_state, data->start_class);
4479 /* AND before and after: combine and continue */
4480 const int was = (data->start_class->flags & ANYOF_EOS);
4482 cl_and(data->start_class, &intrnl);
4484 data->start_class->flags |= ANYOF_EOS;
4488 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4490 /* Positive Lookahead/lookbehind
4491 In this case we can do fixed string optimisation,
4492 but we must be careful about it. Note in the case of
4493 lookbehind the positions will be offset by the minimum
4494 length of the pattern, something we won't know about
4495 until after the recurse.
4497 I32 deltanext, fake = 0;
4499 struct regnode_charclass_class intrnl;
4501 /* We use SAVEFREEPV so that when the full compile
4502 is finished perl will clean up the allocated
4503 minlens when it's all done. This way we don't
4504 have to worry about freeing them when we know
4505 they wont be used, which would be a pain.
4508 Newx( minnextp, 1, I32 );
4509 SAVEFREEPV(minnextp);
4512 StructCopy(data, &data_fake, scan_data_t);
4513 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4516 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4517 data_fake.last_found=newSVsv(data->last_found);
4521 data_fake.last_closep = &fake;
4522 data_fake.flags = 0;
4523 data_fake.pos_delta = delta;
4525 data_fake.flags |= SF_IS_INF;
4526 if ( flags & SCF_DO_STCLASS && !scan->flags
4527 && OP(scan) == IFMATCH ) { /* Lookahead */
4528 cl_init(pRExC_state, &intrnl);
4529 data_fake.start_class = &intrnl;
4530 f |= SCF_DO_STCLASS_AND;
4532 if (flags & SCF_WHILEM_VISITED_POS)
4533 f |= SCF_WHILEM_VISITED_POS;
4534 next = regnext(scan);
4535 nscan = NEXTOPER(NEXTOPER(scan));
4537 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4538 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4541 FAIL("Variable length lookbehind not implemented");
4543 else if (*minnextp > (I32)U8_MAX) {
4544 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4546 scan->flags = (U8)*minnextp;
4551 if (f & SCF_DO_STCLASS_AND) {
4552 const int was = (data->start_class->flags & ANYOF_EOS);
4554 cl_and(data->start_class, &intrnl);
4556 data->start_class->flags |= ANYOF_EOS;
4559 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4561 if (data_fake.flags & SF_HAS_EVAL)
4562 data->flags |= SF_HAS_EVAL;
4563 data->whilem_c = data_fake.whilem_c;
4564 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4565 if (RExC_rx->minlen<*minnextp)
4566 RExC_rx->minlen=*minnextp;
4567 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4568 SvREFCNT_dec(data_fake.last_found);
4570 if ( data_fake.minlen_fixed != minlenp )
4572 data->offset_fixed= data_fake.offset_fixed;
4573 data->minlen_fixed= data_fake.minlen_fixed;
4574 data->lookbehind_fixed+= scan->flags;
4576 if ( data_fake.minlen_float != minlenp )
4578 data->minlen_float= data_fake.minlen_float;
4579 data->offset_float_min=data_fake.offset_float_min;
4580 data->offset_float_max=data_fake.offset_float_max;
4581 data->lookbehind_float+= scan->flags;
4588 else if (OP(scan) == OPEN) {
4589 if (stopparen != (I32)ARG(scan))
4592 else if (OP(scan) == CLOSE) {
4593 if (stopparen == (I32)ARG(scan)) {
4596 if ((I32)ARG(scan) == is_par) {
4597 next = regnext(scan);
4599 if ( next && (OP(next) != WHILEM) && next < last)
4600 is_par = 0; /* Disable optimization */
4603 *(data->last_closep) = ARG(scan);
4605 else if (OP(scan) == EVAL) {
4607 data->flags |= SF_HAS_EVAL;
4609 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4610 if (flags & SCF_DO_SUBSTR) {
4611 SCAN_COMMIT(pRExC_state,data,minlenp);
4612 flags &= ~SCF_DO_SUBSTR;
4614 if (data && OP(scan)==ACCEPT) {
4615 data->flags |= SCF_SEEN_ACCEPT;
4620 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4622 if (flags & SCF_DO_SUBSTR) {
4623 SCAN_COMMIT(pRExC_state,data,minlenp);
4624 data->longest = &(data->longest_float);
4626 is_inf = is_inf_internal = 1;
4627 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4628 cl_anything(pRExC_state, data->start_class);
4629 flags &= ~SCF_DO_STCLASS;
4631 else if (OP(scan) == GPOS) {
4632 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4633 !(delta || is_inf || (data && data->pos_delta)))
4635 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4636 RExC_rx->extflags |= RXf_ANCH_GPOS;
4637 if (RExC_rx->gofs < (U32)min)
4638 RExC_rx->gofs = min;
4640 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4644 #ifdef TRIE_STUDY_OPT
4645 #ifdef FULL_TRIE_STUDY
4646 else if (PL_regkind[OP(scan)] == TRIE) {
4647 /* NOTE - There is similar code to this block above for handling
4648 BRANCH nodes on the initial study. If you change stuff here
4650 regnode *trie_node= scan;
4651 regnode *tail= regnext(scan);
4652 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4653 I32 max1 = 0, min1 = I32_MAX;
4654 struct regnode_charclass_class accum;
4656 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4657 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4658 if (flags & SCF_DO_STCLASS)
4659 cl_init_zero(pRExC_state, &accum);
4665 const regnode *nextbranch= NULL;
4668 for ( word=1 ; word <= trie->wordcount ; word++)
4670 I32 deltanext=0, minnext=0, f = 0, fake;
4671 struct regnode_charclass_class this_class;
4673 data_fake.flags = 0;
4675 data_fake.whilem_c = data->whilem_c;
4676 data_fake.last_closep = data->last_closep;
4679 data_fake.last_closep = &fake;
4680 data_fake.pos_delta = delta;
4681 if (flags & SCF_DO_STCLASS) {
4682 cl_init(pRExC_state, &this_class);
4683 data_fake.start_class = &this_class;
4684 f = SCF_DO_STCLASS_AND;
4686 if (flags & SCF_WHILEM_VISITED_POS)
4687 f |= SCF_WHILEM_VISITED_POS;
4689 if (trie->jump[word]) {
4691 nextbranch = trie_node + trie->jump[0];
4692 scan= trie_node + trie->jump[word];
4693 /* We go from the jump point to the branch that follows
4694 it. Note this means we need the vestigal unused branches
4695 even though they arent otherwise used.
4697 minnext = study_chunk(pRExC_state, &scan, minlenp,
4698 &deltanext, (regnode *)nextbranch, &data_fake,
4699 stopparen, recursed, NULL, f,depth+1);
4701 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4702 nextbranch= regnext((regnode*)nextbranch);
4704 if (min1 > (I32)(minnext + trie->minlen))
4705 min1 = minnext + trie->minlen;
4706 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4707 max1 = minnext + deltanext + trie->maxlen;
4708 if (deltanext == I32_MAX)
4709 is_inf = is_inf_internal = 1;
4711 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4713 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4714 if ( stopmin > min + min1)
4715 stopmin = min + min1;
4716 flags &= ~SCF_DO_SUBSTR;
4718 data->flags |= SCF_SEEN_ACCEPT;
4721 if (data_fake.flags & SF_HAS_EVAL)
4722 data->flags |= SF_HAS_EVAL;
4723 data->whilem_c = data_fake.whilem_c;
4725 if (flags & SCF_DO_STCLASS)
4726 cl_or(pRExC_state, &accum, &this_class);
4729 if (flags & SCF_DO_SUBSTR) {
4730 data->pos_min += min1;
4731 data->pos_delta += max1 - min1;
4732 if (max1 != min1 || is_inf)
4733 data->longest = &(data->longest_float);
4736 delta += max1 - min1;
4737 if (flags & SCF_DO_STCLASS_OR) {
4738 cl_or(pRExC_state, data->start_class, &accum);
4740 cl_and(data->start_class, and_withp);
4741 flags &= ~SCF_DO_STCLASS;
4744 else if (flags & SCF_DO_STCLASS_AND) {
4746 cl_and(data->start_class, &accum);
4747 flags &= ~SCF_DO_STCLASS;
4750 /* Switch to OR mode: cache the old value of
4751 * data->start_class */
4753 StructCopy(data->start_class, and_withp,
4754 struct regnode_charclass_class);
4755 flags &= ~SCF_DO_STCLASS_AND;
4756 StructCopy(&accum, data->start_class,
4757 struct regnode_charclass_class);
4758 flags |= SCF_DO_STCLASS_OR;
4759 data->start_class->flags |= ANYOF_EOS;
4766 else if (PL_regkind[OP(scan)] == TRIE) {
4767 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4770 min += trie->minlen;
4771 delta += (trie->maxlen - trie->minlen);
4772 flags &= ~SCF_DO_STCLASS; /* xxx */
4773 if (flags & SCF_DO_SUBSTR) {
4774 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4775 data->pos_min += trie->minlen;
4776 data->pos_delta += (trie->maxlen - trie->minlen);
4777 if (trie->maxlen != trie->minlen)
4778 data->longest = &(data->longest_float);
4780 if (trie->jump) /* no more substrings -- for now /grr*/
4781 flags &= ~SCF_DO_SUBSTR;
4783 #endif /* old or new */
4784 #endif /* TRIE_STUDY_OPT */
4786 /* Else: zero-length, ignore. */
4787 scan = regnext(scan);
4792 stopparen = frame->stop;
4793 frame = frame->prev;
4794 goto fake_study_recurse;
4799 DEBUG_STUDYDATA("pre-fin:",data,depth);
4802 *deltap = is_inf_internal ? I32_MAX : delta;
4803 if (flags & SCF_DO_SUBSTR && is_inf)
4804 data->pos_delta = I32_MAX - data->pos_min;
4805 if (is_par > (I32)U8_MAX)
4807 if (is_par && pars==1 && data) {
4808 data->flags |= SF_IN_PAR;
4809 data->flags &= ~SF_HAS_PAR;
4811 else if (pars && data) {
4812 data->flags |= SF_HAS_PAR;
4813 data->flags &= ~SF_IN_PAR;
4815 if (flags & SCF_DO_STCLASS_OR)
4816 cl_and(data->start_class, and_withp);
4817 if (flags & SCF_TRIE_RESTUDY)
4818 data->flags |= SCF_TRIE_RESTUDY;
4820 DEBUG_STUDYDATA("post-fin:",data,depth);
4822 return min < stopmin ? min : stopmin;
4826 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4828 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4830 PERL_ARGS_ASSERT_ADD_DATA;
4832 Renewc(RExC_rxi->data,
4833 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4834 char, struct reg_data);
4836 Renew(RExC_rxi->data->what, count + n, U8);
4838 Newx(RExC_rxi->data->what, n, U8);
4839 RExC_rxi->data->count = count + n;
4840 Copy(s, RExC_rxi->data->what + count, n, U8);
4844 /*XXX: todo make this not included in a non debugging perl */
4845 #ifndef PERL_IN_XSUB_RE
4847 Perl_reginitcolors(pTHX)
4850 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4852 char *t = savepv(s);
4856 t = strchr(t, '\t');
4862 PL_colors[i] = t = (char *)"";
4867 PL_colors[i++] = (char *)"";
4874 #ifdef TRIE_STUDY_OPT
4875 #define CHECK_RESTUDY_GOTO \
4877 (data.flags & SCF_TRIE_RESTUDY) \
4881 #define CHECK_RESTUDY_GOTO
4885 * pregcomp - compile a regular expression into internal code
4887 * Decides which engine's compiler to call based on the hint currently in
4891 #ifndef PERL_IN_XSUB_RE
4893 /* return the currently in-scope regex engine (or the default if none) */
4895 regexp_engine const *
4896 Perl_current_re_engine(pTHX)
4900 if (IN_PERL_COMPILETIME) {
4901 HV * const table = GvHV(PL_hintgv);
4905 return &PL_core_reg_engine;
4906 ptr = hv_fetchs(table, "regcomp", FALSE);
4907 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4908 return &PL_core_reg_engine;
4909 return INT2PTR(regexp_engine*,SvIV(*ptr));
4913 if (!PL_curcop->cop_hints_hash)
4914 return &PL_core_reg_engine;
4915 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4916 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4917 return &PL_core_reg_engine;
4918 return INT2PTR(regexp_engine*,SvIV(ptr));
4924 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4927 regexp_engine const *eng = current_re_engine();
4928 GET_RE_DEBUG_FLAGS_DECL;
4930 PERL_ARGS_ASSERT_PREGCOMP;
4932 /* Dispatch a request to compile a regexp to correct regexp engine. */
4934 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4937 return CALLREGCOMP_ENG(eng, pattern, flags);
4941 /* public(ish) entry point for the perl core's own regex compiling code.
4942 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4943 * pattern rather than a list of OPs, and uses the internal engine rather
4944 * than the current one */
4947 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4949 SV *pat = pattern; /* defeat constness! */
4950 PERL_ARGS_ASSERT_RE_COMPILE;
4951 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4952 #ifdef PERL_IN_XSUB_RE
4955 &PL_core_reg_engine,
4957 NULL, NULL, rx_flags, 0);
4960 /* see if there are any run-time code blocks in the pattern.
4961 * False positives are allowed */
4964 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4965 U32 pm_flags, char *pat, STRLEN plen)
4970 /* avoid infinitely recursing when we recompile the pattern parcelled up
4971 * as qr'...'. A single constant qr// string can't have have any
4972 * run-time component in it, and thus, no runtime code. (A non-qr
4973 * string, however, can, e.g. $x =~ '(?{})') */
4974 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4977 for (s = 0; s < plen; s++) {
4978 if (n < pRExC_state->num_code_blocks
4979 && s == pRExC_state->code_blocks[n].start)
4981 s = pRExC_state->code_blocks[n].end;
4985 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4987 if (pat[s] == '(' && pat[s+1] == '?' &&
4988 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4995 /* Handle run-time code blocks. We will already have compiled any direct
4996 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4997 * copy of it, but with any literal code blocks blanked out and
4998 * appropriate chars escaped; then feed it into
5000 * eval "qr'modified_pattern'"
5004 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5008 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5010 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5011 * and merge them with any code blocks of the original regexp.
5013 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5014 * instead, just save the qr and return FALSE; this tells our caller that
5015 * the original pattern needs upgrading to utf8.
5019 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5020 char *pat, STRLEN plen)
5024 GET_RE_DEBUG_FLAGS_DECL;
5026 if (pRExC_state->runtime_code_qr) {
5027 /* this is the second time we've been called; this should
5028 * only happen if the main pattern got upgraded to utf8
5029 * during compilation; re-use the qr we compiled first time
5030 * round (which should be utf8 too)
5032 qr = pRExC_state->runtime_code_qr;
5033 pRExC_state->runtime_code_qr = NULL;
5034 assert(RExC_utf8 && SvUTF8(qr));
5040 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5044 /* determine how many extra chars we need for ' and \ escaping */
5045 for (s = 0; s < plen; s++) {
5046 if (pat[s] == '\'' || pat[s] == '\\')
5050 Newx(newpat, newlen, char);
5052 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5054 for (s = 0; s < plen; s++) {
5055 if (n < pRExC_state->num_code_blocks
5056 && s == pRExC_state->code_blocks[n].start)
5058 /* blank out literal code block */
5059 assert(pat[s] == '(');
5060 while (s <= pRExC_state->code_blocks[n].end) {
5068 if (pat[s] == '\'' || pat[s] == '\\')
5073 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5077 PerlIO_printf(Perl_debug_log,
5078 "%sre-parsing pattern for runtime code:%s %s\n",
5079 PL_colors[4],PL_colors[5],newpat);
5082 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5088 PUSHSTACKi(PERLSI_REQUIRE);
5089 /* this causes the toker to collapse \\ into \ when parsing
5090 * qr''; normally only q'' does this. It also alters hints
5092 PL_reg_state.re_reparsing = TRUE;
5093 eval_sv(sv, G_SCALAR);
5099 SV * const errsv = ERRSV;
5100 if (SvTRUE_NN(errsv))
5102 Safefree(pRExC_state->code_blocks);
5103 /* use croak_sv ? */
5104 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5107 assert(SvROK(qr_ref));
5109 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5110 /* the leaving below frees the tmp qr_ref.
5111 * Give qr a life of its own */
5119 if (!RExC_utf8 && SvUTF8(qr)) {
5120 /* first time through; the pattern got upgraded; save the
5121 * qr for the next time through */
5122 assert(!pRExC_state->runtime_code_qr);
5123 pRExC_state->runtime_code_qr = qr;
5128 /* extract any code blocks within the returned qr// */
5131 /* merge the main (r1) and run-time (r2) code blocks into one */
5133 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5134 struct reg_code_block *new_block, *dst;
5135 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5138 if (!r2->num_code_blocks) /* we guessed wrong */
5145 r1->num_code_blocks + r2->num_code_blocks,
5146 struct reg_code_block);
5149 while ( i1 < r1->num_code_blocks
5150 || i2 < r2->num_code_blocks)
5152 struct reg_code_block *src;
5155 if (i1 == r1->num_code_blocks) {
5156 src = &r2->code_blocks[i2++];
5159 else if (i2 == r2->num_code_blocks)
5160 src = &r1->code_blocks[i1++];
5161 else if ( r1->code_blocks[i1].start
5162 < r2->code_blocks[i2].start)
5164 src = &r1->code_blocks[i1++];
5165 assert(src->end < r2->code_blocks[i2].start);
5168 assert( r1->code_blocks[i1].start
5169 > r2->code_blocks[i2].start);
5170 src = &r2->code_blocks[i2++];
5172 assert(src->end < r1->code_blocks[i1].start);
5175 assert(pat[src->start] == '(');
5176 assert(pat[src->end] == ')');
5177 dst->start = src->start;
5178 dst->end = src->end;
5179 dst->block = src->block;
5180 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5184 r1->num_code_blocks += r2->num_code_blocks;
5185 Safefree(r1->code_blocks);
5186 r1->code_blocks = new_block;
5195 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5197 /* This is the common code for setting up the floating and fixed length
5198 * string data extracted from Perlre_op_compile() below. Returns a boolean
5199 * as to whether succeeded or not */
5203 if (! (longest_length
5204 || (eol /* Can't have SEOL and MULTI */
5205 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5207 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5208 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5213 /* copy the information about the longest from the reg_scan_data
5214 over to the program. */
5215 if (SvUTF8(sv_longest)) {
5216 *rx_utf8 = sv_longest;
5219 *rx_substr = sv_longest;
5222 /* end_shift is how many chars that must be matched that
5223 follow this item. We calculate it ahead of time as once the
5224 lookbehind offset is added in we lose the ability to correctly
5226 ml = minlen ? *(minlen) : (I32)longest_length;
5227 *rx_end_shift = ml - offset
5228 - longest_length + (SvTAIL(sv_longest) != 0)
5231 t = (eol/* Can't have SEOL and MULTI */
5232 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5233 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5239 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5240 * regular expression into internal code.
5241 * The pattern may be passed either as:
5242 * a list of SVs (patternp plus pat_count)
5243 * a list of OPs (expr)
5244 * If both are passed, the SV list is used, but the OP list indicates
5245 * which SVs are actually pre-compiled code blocks
5247 * The SVs in the list have magic and qr overloading applied to them (and
5248 * the list may be modified in-place with replacement SVs in the latter
5251 * If the pattern hasn't changed from old_re, then old_re will be
5254 * eng is the current engine. If that engine has an op_comp method, then
5255 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5256 * do the initial concatenation of arguments and pass on to the external
5259 * If is_bare_re is not null, set it to a boolean indicating whether the
5260 * arg list reduced (after overloading) to a single bare regex which has
5261 * been returned (i.e. /$qr/).
5263 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5265 * pm_flags contains the PMf_* flags, typically based on those from the
5266 * pm_flags field of the related PMOP. Currently we're only interested in
5267 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5269 * We can't allocate space until we know how big the compiled form will be,
5270 * but we can't compile it (and thus know how big it is) until we've got a
5271 * place to put the code. So we cheat: we compile it twice, once with code
5272 * generation turned off and size counting turned on, and once "for real".
5273 * This also means that we don't allocate space until we are sure that the
5274 * thing really will compile successfully, and we never have to move the
5275 * code and thus invalidate pointers into it. (Note that it has to be in
5276 * one piece because free() must be able to free it all.) [NB: not true in perl]
5278 * Beware that the optimization-preparation code in here knows about some
5279 * of the structure of the compiled regexp. [I'll say.]
5283 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5284 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5285 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5290 regexp_internal *ri;
5299 SV * VOL code_blocksv = NULL;
5301 /* these are all flags - maybe they should be turned
5302 * into a single int with different bit masks */
5303 I32 sawlookahead = 0;
5306 bool used_setjump = FALSE;
5307 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5308 bool code_is_utf8 = 0;
5309 bool VOL recompile = 0;
5310 bool runtime_code = 0;
5314 RExC_state_t RExC_state;
5315 RExC_state_t * const pRExC_state = &RExC_state;
5316 #ifdef TRIE_STUDY_OPT
5318 RExC_state_t copyRExC_state;
5320 GET_RE_DEBUG_FLAGS_DECL;
5322 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5324 DEBUG_r(if (!PL_colorset) reginitcolors());
5326 #ifndef PERL_IN_XSUB_RE
5327 /* Initialize these here instead of as-needed, as is quick and avoids
5328 * having to test them each time otherwise */
5329 if (! PL_AboveLatin1) {
5330 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5331 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5332 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5334 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5335 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5337 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5338 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5340 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5341 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5343 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5345 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5346 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5348 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5350 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5351 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5353 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5354 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5356 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5357 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5359 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5360 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5362 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5363 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5365 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5366 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5368 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5369 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5371 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5373 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5374 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5376 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5377 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5379 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5383 pRExC_state->code_blocks = NULL;
5384 pRExC_state->num_code_blocks = 0;
5387 *is_bare_re = FALSE;
5389 if (expr && (expr->op_type == OP_LIST ||
5390 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5392 /* is the source UTF8, and how many code blocks are there? */
5396 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5397 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5399 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5400 /* count of DO blocks */
5404 pRExC_state->num_code_blocks = ncode;
5405 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5410 /* handle a list of SVs */
5414 /* apply magic and RE overloading to each arg */
5415 for (svp = patternp; svp < patternp + pat_count; svp++) {
5418 if (SvROK(rx) && SvAMAGIC(rx)) {
5419 SV *sv = AMG_CALLunary(rx, regexp_amg);
5423 if (SvTYPE(sv) != SVt_REGEXP)
5424 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5430 if (pat_count > 1) {
5431 /* concat multiple args and find any code block indexes */
5436 STRLEN orig_patlen = 0;
5438 if (pRExC_state->num_code_blocks) {
5439 o = cLISTOPx(expr)->op_first;
5440 assert( o->op_type == OP_PUSHMARK
5441 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5442 || o->op_type == OP_PADRANGE);
5446 pat = newSVpvn("", 0);
5449 /* determine if the pattern is going to be utf8 (needed
5450 * in advance to align code block indices correctly).
5451 * XXX This could fail to be detected for an arg with
5452 * overloading but not concat overloading; but the main effect
5453 * in this obscure case is to need a 'use re eval' for a
5454 * literal code block */
5455 for (svp = patternp; svp < patternp + pat_count; svp++) {
5462 for (svp = patternp; svp < patternp + pat_count; svp++) {
5463 SV *sv, *msv = *svp;
5466 /* we make the assumption here that each op in the list of
5467 * op_siblings maps to one SV pushed onto the stack,
5468 * except for code blocks, with have both an OP_NULL and
5470 * This allows us to match up the list of SVs against the
5471 * list of OPs to find the next code block.
5473 * Note that PUSHMARK PADSV PADSV ..
5475 * PADRANGE NULL NULL ..
5476 * so the alignment still works. */
5478 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5479 assert(n < pRExC_state->num_code_blocks);
5480 pRExC_state->code_blocks[n].start = SvCUR(pat);
5481 pRExC_state->code_blocks[n].block = o;
5482 pRExC_state->code_blocks[n].src_regex = NULL;
5485 o = o->op_sibling; /* skip CONST */
5491 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5492 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5495 /* overloading involved: all bets are off over literal
5496 * code. Pretend we haven't seen it */
5497 pRExC_state->num_code_blocks -= n;
5503 while (SvAMAGIC(msv)
5504 && (sv = AMG_CALLunary(msv, string_amg))
5508 && SvRV(msv) == SvRV(sv))
5513 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5515 orig_patlen = SvCUR(pat);
5516 sv_catsv_nomg(pat, msv);
5519 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5522 /* extract any code blocks within any embedded qr//'s */
5523 if (rx && SvTYPE(rx) == SVt_REGEXP
5524 && RX_ENGINE((REGEXP*)rx)->op_comp)
5527 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5528 if (ri->num_code_blocks) {
5530 /* the presence of an embedded qr// with code means
5531 * we should always recompile: the text of the
5532 * qr// may not have changed, but it may be a
5533 * different closure than last time */
5535 Renew(pRExC_state->code_blocks,
5536 pRExC_state->num_code_blocks + ri->num_code_blocks,
5537 struct reg_code_block);
5538 pRExC_state->num_code_blocks += ri->num_code_blocks;
5539 for (i=0; i < ri->num_code_blocks; i++) {
5540 struct reg_code_block *src, *dst;
5541 STRLEN offset = orig_patlen
5542 + ReANY((REGEXP *)rx)->pre_prefix;
5543 assert(n < pRExC_state->num_code_blocks);
5544 src = &ri->code_blocks[i];
5545 dst = &pRExC_state->code_blocks[n];
5546 dst->start = src->start + offset;
5547 dst->end = src->end + offset;
5548 dst->block = src->block;
5549 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5563 while (SvAMAGIC(pat)
5564 && (sv = AMG_CALLunary(pat, string_amg))
5572 /* handle bare regex: foo =~ $re */
5577 if (SvTYPE(re) == SVt_REGEXP) {
5581 Safefree(pRExC_state->code_blocks);
5587 /* not a list of SVs, so must be a list of OPs */
5589 if (expr->op_type == OP_LIST) {
5594 pat = newSVpvn("", 0);
5599 /* given a list of CONSTs and DO blocks in expr, append all
5600 * the CONSTs to pat, and record the start and end of each
5601 * code block in code_blocks[] (each DO{} op is followed by an
5602 * OP_CONST containing the corresponding literal '(?{...})
5605 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5606 if (o->op_type == OP_CONST) {
5607 sv_catsv(pat, cSVOPo_sv);
5609 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5613 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5614 assert(i+1 < pRExC_state->num_code_blocks);
5615 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5616 pRExC_state->code_blocks[i].block = o;
5617 pRExC_state->code_blocks[i].src_regex = NULL;
5623 assert(expr->op_type == OP_CONST);
5624 pat = cSVOPx_sv(expr);
5628 exp = SvPV_nomg(pat, plen);
5630 if (!eng->op_comp) {
5631 if ((SvUTF8(pat) && IN_BYTES)
5632 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5634 /* make a temporary copy; either to convert to bytes,
5635 * or to avoid repeating get-magic / overloaded stringify */
5636 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5637 (IN_BYTES ? 0 : SvUTF8(pat)));
5639 Safefree(pRExC_state->code_blocks);
5640 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5643 /* ignore the utf8ness if the pattern is 0 length */
5644 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5645 RExC_uni_semantics = 0;
5646 RExC_contains_locale = 0;
5647 pRExC_state->runtime_code_qr = NULL;
5649 /****************** LONG JUMP TARGET HERE***********************/
5650 /* Longjmp back to here if have to switch in midstream to utf8 */
5651 if (! RExC_orig_utf8) {
5652 JMPENV_PUSH(jump_ret);
5653 used_setjump = TRUE;
5656 if (jump_ret == 0) { /* First time through */
5660 SV *dsv= sv_newmortal();
5661 RE_PV_QUOTED_DECL(s, RExC_utf8,
5662 dsv, exp, plen, 60);
5663 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5664 PL_colors[4],PL_colors[5],s);
5667 else { /* longjumped back */
5670 STRLEN s = 0, d = 0;
5673 /* If the cause for the longjmp was other than changing to utf8, pop
5674 * our own setjmp, and longjmp to the correct handler */
5675 if (jump_ret != UTF8_LONGJMP) {
5677 JMPENV_JUMP(jump_ret);
5682 /* It's possible to write a regexp in ascii that represents Unicode
5683 codepoints outside of the byte range, such as via \x{100}. If we
5684 detect such a sequence we have to convert the entire pattern to utf8
5685 and then recompile, as our sizing calculation will have been based
5686 on 1 byte == 1 character, but we will need to use utf8 to encode
5687 at least some part of the pattern, and therefore must convert the whole
5690 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5691 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5693 /* upgrade pattern to UTF8, and if there are code blocks,
5694 * recalculate the indices.
5695 * This is essentially an unrolled Perl_bytes_to_utf8() */
5697 src = (U8*)SvPV_nomg(pat, plen);
5698 Newx(dst, plen * 2 + 1, U8);
5701 const UV uv = NATIVE_TO_ASCII(src[s]);
5702 if (UNI_IS_INVARIANT(uv))
5703 dst[d] = (U8)UTF_TO_NATIVE(uv);
5705 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5706 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5708 if (n < pRExC_state->num_code_blocks) {
5709 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5710 pRExC_state->code_blocks[n].start = d;
5711 assert(dst[d] == '(');
5714 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5715 pRExC_state->code_blocks[n].end = d;
5716 assert(dst[d] == ')');
5729 RExC_orig_utf8 = RExC_utf8 = 1;
5732 /* return old regex if pattern hasn't changed */
5736 && !!RX_UTF8(old_re) == !!RExC_utf8
5737 && RX_PRECOMP(old_re)
5738 && RX_PRELEN(old_re) == plen
5739 && memEQ(RX_PRECOMP(old_re), exp, plen))
5741 /* with runtime code, always recompile */
5742 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5744 if (!runtime_code) {
5748 Safefree(pRExC_state->code_blocks);
5752 else if ((pm_flags & PMf_USE_RE_EVAL)
5753 /* this second condition covers the non-regex literal case,
5754 * i.e. $foo =~ '(?{})'. */
5755 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5756 && (PL_hints & HINT_RE_EVAL))
5758 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5761 #ifdef TRIE_STUDY_OPT
5765 rx_flags = orig_rx_flags;
5767 if (initial_charset == REGEX_LOCALE_CHARSET) {
5768 RExC_contains_locale = 1;
5770 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5772 /* Set to use unicode semantics if the pattern is in utf8 and has the
5773 * 'depends' charset specified, as it means unicode when utf8 */
5774 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5778 RExC_flags = rx_flags;
5779 RExC_pm_flags = pm_flags;
5782 if (TAINTING_get && TAINT_get)
5783 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5785 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5786 /* whoops, we have a non-utf8 pattern, whilst run-time code
5787 * got compiled as utf8. Try again with a utf8 pattern */
5788 JMPENV_JUMP(UTF8_LONGJMP);
5791 assert(!pRExC_state->runtime_code_qr);
5796 RExC_in_lookbehind = 0;
5797 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5799 RExC_override_recoding = 0;
5800 RExC_in_multi_char_class = 0;
5802 /* First pass: determine size, legality. */
5810 RExC_emit = &PL_regdummy;
5811 RExC_whilem_seen = 0;
5812 RExC_open_parens = NULL;
5813 RExC_close_parens = NULL;
5815 RExC_paren_names = NULL;
5817 RExC_paren_name_list = NULL;
5819 RExC_recurse = NULL;
5820 RExC_recurse_count = 0;
5821 pRExC_state->code_index = 0;
5823 #if 0 /* REGC() is (currently) a NOP at the first pass.
5824 * Clever compilers notice this and complain. --jhi */
5825 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5828 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5830 RExC_lastparse=NULL;
5832 /* reg may croak on us, not giving us a chance to free
5833 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5834 need it to survive as long as the regexp (qr/(?{})/).
5835 We must check that code_blocksv is not already set, because we may
5836 have longjmped back. */
5837 if (pRExC_state->code_blocks && !code_blocksv) {
5838 code_blocksv = newSV_type(SVt_PV);
5839 SAVEFREESV(code_blocksv);
5840 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5841 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5843 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5844 RExC_precomp = NULL;
5848 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5850 /* Here, finished first pass. Get rid of any added setjmp */
5856 PerlIO_printf(Perl_debug_log,
5857 "Required size %"IVdf" nodes\n"
5858 "Starting second pass (creation)\n",
5861 RExC_lastparse=NULL;
5864 /* The first pass could have found things that force Unicode semantics */
5865 if ((RExC_utf8 || RExC_uni_semantics)
5866 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5868 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5871 /* Small enough for pointer-storage convention?
5872 If extralen==0, this means that we will not need long jumps. */
5873 if (RExC_size >= 0x10000L && RExC_extralen)
5874 RExC_size += RExC_extralen;
5877 if (RExC_whilem_seen > 15)
5878 RExC_whilem_seen = 15;
5880 /* Allocate space and zero-initialize. Note, the two step process
5881 of zeroing when in debug mode, thus anything assigned has to
5882 happen after that */
5883 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5885 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5886 char, regexp_internal);
5887 if ( r == NULL || ri == NULL )
5888 FAIL("Regexp out of space");
5890 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5891 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5893 /* bulk initialize base fields with 0. */
5894 Zero(ri, sizeof(regexp_internal), char);
5897 /* non-zero initialization begins here */
5900 r->extflags = rx_flags;
5901 if (pm_flags & PMf_IS_QR) {
5902 ri->code_blocks = pRExC_state->code_blocks;
5903 ri->num_code_blocks = pRExC_state->num_code_blocks;
5908 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5909 if (pRExC_state->code_blocks[n].src_regex)
5910 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5911 SAVEFREEPV(pRExC_state->code_blocks);
5915 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5916 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5918 /* The caret is output if there are any defaults: if not all the STD
5919 * flags are set, or if no character set specifier is needed */
5921 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5923 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5924 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5925 >> RXf_PMf_STD_PMMOD_SHIFT);
5926 const char *fptr = STD_PAT_MODS; /*"msix"*/
5928 /* Allocate for the worst case, which is all the std flags are turned
5929 * on. If more precision is desired, we could do a population count of
5930 * the flags set. This could be done with a small lookup table, or by
5931 * shifting, masking and adding, or even, when available, assembly
5932 * language for a machine-language population count.
5933 * We never output a minus, as all those are defaults, so are
5934 * covered by the caret */
5935 const STRLEN wraplen = plen + has_p + has_runon
5936 + has_default /* If needs a caret */
5938 /* If needs a character set specifier */
5939 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5940 + (sizeof(STD_PAT_MODS) - 1)
5941 + (sizeof("(?:)") - 1);
5943 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5944 r->xpv_len_u.xpvlenu_pv = p;
5946 SvFLAGS(rx) |= SVf_UTF8;
5949 /* If a default, cover it using the caret */
5951 *p++= DEFAULT_PAT_MOD;
5955 const char* const name = get_regex_charset_name(r->extflags, &len);
5956 Copy(name, p, len, char);
5960 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5963 while((ch = *fptr++)) {
5971 Copy(RExC_precomp, p, plen, char);
5972 assert ((RX_WRAPPED(rx) - p) < 16);
5973 r->pre_prefix = p - RX_WRAPPED(rx);
5979 SvCUR_set(rx, p - RX_WRAPPED(rx));
5983 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5985 if (RExC_seen & REG_SEEN_RECURSE) {
5986 Newxz(RExC_open_parens, RExC_npar,regnode *);
5987 SAVEFREEPV(RExC_open_parens);
5988 Newxz(RExC_close_parens,RExC_npar,regnode *);
5989 SAVEFREEPV(RExC_close_parens);
5992 /* Useful during FAIL. */
5993 #ifdef RE_TRACK_PATTERN_OFFSETS
5994 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5995 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5996 "%s %"UVuf" bytes for offset annotations.\n",
5997 ri->u.offsets ? "Got" : "Couldn't get",
5998 (UV)((2*RExC_size+1) * sizeof(U32))));
6000 SetProgLen(ri,RExC_size);
6005 /* Second pass: emit code. */
6006 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6007 RExC_pm_flags = pm_flags;
6012 RExC_emit_start = ri->program;
6013 RExC_emit = ri->program;
6014 RExC_emit_bound = ri->program + RExC_size + 1;
6015 pRExC_state->code_index = 0;
6017 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6018 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6022 /* XXXX To minimize changes to RE engine we always allocate
6023 3-units-long substrs field. */
6024 Newx(r->substrs, 1, struct reg_substr_data);
6025 if (RExC_recurse_count) {
6026 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6027 SAVEFREEPV(RExC_recurse);
6031 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6032 Zero(r->substrs, 1, struct reg_substr_data);
6034 #ifdef TRIE_STUDY_OPT
6036 StructCopy(&zero_scan_data, &data, scan_data_t);
6037 copyRExC_state = RExC_state;
6040 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6042 RExC_state = copyRExC_state;
6043 if (seen & REG_TOP_LEVEL_BRANCHES)
6044 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6046 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6047 if (data.last_found) {
6048 SvREFCNT_dec(data.longest_fixed);
6049 SvREFCNT_dec(data.longest_float);
6050 SvREFCNT_dec(data.last_found);
6052 StructCopy(&zero_scan_data, &data, scan_data_t);
6055 StructCopy(&zero_scan_data, &data, scan_data_t);
6058 /* Dig out information for optimizations. */
6059 r->extflags = RExC_flags; /* was pm_op */
6060 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6063 SvUTF8_on(rx); /* Unicode in it? */
6064 ri->regstclass = NULL;
6065 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6066 r->intflags |= PREGf_NAUGHTY;
6067 scan = ri->program + 1; /* First BRANCH. */
6069 /* testing for BRANCH here tells us whether there is "must appear"
6070 data in the pattern. If there is then we can use it for optimisations */
6071 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6073 STRLEN longest_float_length, longest_fixed_length;
6074 struct regnode_charclass_class ch_class; /* pointed to by data */
6076 I32 last_close = 0; /* pointed to by data */
6077 regnode *first= scan;
6078 regnode *first_next= regnext(first);
6080 * Skip introductions and multiplicators >= 1
6081 * so that we can extract the 'meat' of the pattern that must
6082 * match in the large if() sequence following.
6083 * NOTE that EXACT is NOT covered here, as it is normally
6084 * picked up by the optimiser separately.
6086 * This is unfortunate as the optimiser isnt handling lookahead
6087 * properly currently.
6090 while ((OP(first) == OPEN && (sawopen = 1)) ||
6091 /* An OR of *one* alternative - should not happen now. */
6092 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6093 /* for now we can't handle lookbehind IFMATCH*/
6094 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6095 (OP(first) == PLUS) ||
6096 (OP(first) == MINMOD) ||
6097 /* An {n,m} with n>0 */
6098 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6099 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6102 * the only op that could be a regnode is PLUS, all the rest
6103 * will be regnode_1 or regnode_2.
6106 if (OP(first) == PLUS)
6109 first += regarglen[OP(first)];
6111 first = NEXTOPER(first);
6112 first_next= regnext(first);
6115 /* Starting-point info. */
6117 DEBUG_PEEP("first:",first,0);
6118 /* Ignore EXACT as we deal with it later. */
6119 if (PL_regkind[OP(first)] == EXACT) {
6120 if (OP(first) == EXACT)
6121 NOOP; /* Empty, get anchored substr later. */
6123 ri->regstclass = first;
6126 else if (PL_regkind[OP(first)] == TRIE &&
6127 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6130 /* this can happen only on restudy */
6131 if ( OP(first) == TRIE ) {
6132 struct regnode_1 *trieop = (struct regnode_1 *)
6133 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6134 StructCopy(first,trieop,struct regnode_1);
6135 trie_op=(regnode *)trieop;
6137 struct regnode_charclass *trieop = (struct regnode_charclass *)
6138 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6139 StructCopy(first,trieop,struct regnode_charclass);
6140 trie_op=(regnode *)trieop;
6143 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6144 ri->regstclass = trie_op;
6147 else if (REGNODE_SIMPLE(OP(first)))
6148 ri->regstclass = first;
6149 else if (PL_regkind[OP(first)] == BOUND ||
6150 PL_regkind[OP(first)] == NBOUND)
6151 ri->regstclass = first;
6152 else if (PL_regkind[OP(first)] == BOL) {
6153 r->extflags |= (OP(first) == MBOL
6155 : (OP(first) == SBOL
6158 first = NEXTOPER(first);
6161 else if (OP(first) == GPOS) {
6162 r->extflags |= RXf_ANCH_GPOS;
6163 first = NEXTOPER(first);
6166 else if ((!sawopen || !RExC_sawback) &&
6167 (OP(first) == STAR &&
6168 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6169 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6171 /* turn .* into ^.* with an implied $*=1 */
6173 (OP(NEXTOPER(first)) == REG_ANY)
6176 r->extflags |= type;
6177 r->intflags |= PREGf_IMPLICIT;
6178 first = NEXTOPER(first);
6181 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6182 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6183 /* x+ must match at the 1st pos of run of x's */
6184 r->intflags |= PREGf_SKIP;
6186 /* Scan is after the zeroth branch, first is atomic matcher. */
6187 #ifdef TRIE_STUDY_OPT
6190 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6191 (IV)(first - scan + 1))
6195 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6196 (IV)(first - scan + 1))
6202 * If there's something expensive in the r.e., find the
6203 * longest literal string that must appear and make it the
6204 * regmust. Resolve ties in favor of later strings, since
6205 * the regstart check works with the beginning of the r.e.
6206 * and avoiding duplication strengthens checking. Not a
6207 * strong reason, but sufficient in the absence of others.
6208 * [Now we resolve ties in favor of the earlier string if
6209 * it happens that c_offset_min has been invalidated, since the
6210 * earlier string may buy us something the later one won't.]
6213 data.longest_fixed = newSVpvs("");
6214 data.longest_float = newSVpvs("");
6215 data.last_found = newSVpvs("");
6216 data.longest = &(data.longest_fixed);
6218 if (!ri->regstclass) {
6219 cl_init(pRExC_state, &ch_class);
6220 data.start_class = &ch_class;
6221 stclass_flag = SCF_DO_STCLASS_AND;
6222 } else /* XXXX Check for BOUND? */
6224 data.last_closep = &last_close;
6226 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6227 &data, -1, NULL, NULL,
6228 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6234 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6235 && data.last_start_min == 0 && data.last_end > 0
6236 && !RExC_seen_zerolen
6237 && !(RExC_seen & REG_SEEN_VERBARG)
6238 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6239 r->extflags |= RXf_CHECK_ALL;
6240 scan_commit(pRExC_state, &data,&minlen,0);
6241 SvREFCNT_dec(data.last_found);
6243 longest_float_length = CHR_SVLEN(data.longest_float);
6245 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6246 && data.offset_fixed == data.offset_float_min
6247 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6248 && S_setup_longest (aTHX_ pRExC_state,
6252 &(r->float_end_shift),
6253 data.lookbehind_float,
6254 data.offset_float_min,
6256 longest_float_length,
6257 data.flags & SF_FL_BEFORE_EOL,
6258 data.flags & SF_FL_BEFORE_MEOL))
6260 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6261 r->float_max_offset = data.offset_float_max;
6262 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6263 r->float_max_offset -= data.lookbehind_float;
6266 r->float_substr = r->float_utf8 = NULL;
6267 SvREFCNT_dec(data.longest_float);
6268 longest_float_length = 0;
6271 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6273 if (S_setup_longest (aTHX_ pRExC_state,
6275 &(r->anchored_utf8),
6276 &(r->anchored_substr),
6277 &(r->anchored_end_shift),
6278 data.lookbehind_fixed,
6281 longest_fixed_length,
6282 data.flags & SF_FIX_BEFORE_EOL,
6283 data.flags & SF_FIX_BEFORE_MEOL))
6285 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6288 r->anchored_substr = r->anchored_utf8 = NULL;
6289 SvREFCNT_dec(data.longest_fixed);
6290 longest_fixed_length = 0;
6294 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6295 ri->regstclass = NULL;
6297 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6299 && !(data.start_class->flags & ANYOF_EOS)
6300 && !cl_is_anything(data.start_class))
6302 const U32 n = add_data(pRExC_state, 1, "f");
6303 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6305 Newx(RExC_rxi->data->data[n], 1,
6306 struct regnode_charclass_class);
6307 StructCopy(data.start_class,
6308 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6309 struct regnode_charclass_class);
6310 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6311 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6312 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6313 regprop(r, sv, (regnode*)data.start_class);
6314 PerlIO_printf(Perl_debug_log,
6315 "synthetic stclass \"%s\".\n",
6316 SvPVX_const(sv));});
6319 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6320 if (longest_fixed_length > longest_float_length) {
6321 r->check_end_shift = r->anchored_end_shift;
6322 r->check_substr = r->anchored_substr;
6323 r->check_utf8 = r->anchored_utf8;
6324 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6325 if (r->extflags & RXf_ANCH_SINGLE)
6326 r->extflags |= RXf_NOSCAN;
6329 r->check_end_shift = r->float_end_shift;
6330 r->check_substr = r->float_substr;
6331 r->check_utf8 = r->float_utf8;
6332 r->check_offset_min = r->float_min_offset;
6333 r->check_offset_max = r->float_max_offset;
6335 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6336 This should be changed ASAP! */
6337 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6338 r->extflags |= RXf_USE_INTUIT;
6339 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6340 r->extflags |= RXf_INTUIT_TAIL;
6342 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6343 if ( (STRLEN)minlen < longest_float_length )
6344 minlen= longest_float_length;
6345 if ( (STRLEN)minlen < longest_fixed_length )
6346 minlen= longest_fixed_length;
6350 /* Several toplevels. Best we can is to set minlen. */
6352 struct regnode_charclass_class ch_class;
6355 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6357 scan = ri->program + 1;
6358 cl_init(pRExC_state, &ch_class);
6359 data.start_class = &ch_class;
6360 data.last_closep = &last_close;
6363 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6364 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6368 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6369 = r->float_substr = r->float_utf8 = NULL;
6371 if (!(data.start_class->flags & ANYOF_EOS)
6372 && !cl_is_anything(data.start_class))
6374 const U32 n = add_data(pRExC_state, 1, "f");
6375 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6377 Newx(RExC_rxi->data->data[n], 1,
6378 struct regnode_charclass_class);
6379 StructCopy(data.start_class,
6380 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6381 struct regnode_charclass_class);
6382 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6383 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6384 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6385 regprop(r, sv, (regnode*)data.start_class);
6386 PerlIO_printf(Perl_debug_log,
6387 "synthetic stclass \"%s\".\n",
6388 SvPVX_const(sv));});
6392 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6393 the "real" pattern. */
6395 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6396 (IV)minlen, (IV)r->minlen);
6398 r->minlenret = minlen;
6399 if (r->minlen < minlen)
6402 if (RExC_seen & REG_SEEN_GPOS)
6403 r->extflags |= RXf_GPOS_SEEN;
6404 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6405 r->extflags |= RXf_LOOKBEHIND_SEEN;
6406 if (pRExC_state->num_code_blocks)
6407 r->extflags |= RXf_EVAL_SEEN;
6408 if (RExC_seen & REG_SEEN_CANY)
6409 r->extflags |= RXf_CANY_SEEN;
6410 if (RExC_seen & REG_SEEN_VERBARG)
6412 r->intflags |= PREGf_VERBARG_SEEN;
6413 r->extflags |= RXf_MODIFIES_VARS;
6415 if (RExC_seen & REG_SEEN_CUTGROUP)
6416 r->intflags |= PREGf_CUTGROUP_SEEN;
6417 if (pm_flags & PMf_USE_RE_EVAL)
6418 r->intflags |= PREGf_USE_RE_EVAL;
6419 if (RExC_paren_names)
6420 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6422 RXp_PAREN_NAMES(r) = NULL;
6424 #ifdef STUPID_PATTERN_CHECKS
6425 if (RX_PRELEN(rx) == 0)
6426 r->extflags |= RXf_NULL;
6427 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6428 r->extflags |= RXf_WHITE;
6429 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6430 r->extflags |= RXf_START_ONLY;
6433 regnode *first = ri->program + 1;
6436 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6437 r->extflags |= RXf_NULL;
6438 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6439 r->extflags |= RXf_START_ONLY;
6440 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6441 && OP(regnext(first)) == END)
6442 r->extflags |= RXf_WHITE;
6446 if (RExC_paren_names) {
6447 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6448 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6451 ri->name_list_idx = 0;
6453 if (RExC_recurse_count) {
6454 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6455 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6456 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6459 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6460 /* assume we don't need to swap parens around before we match */
6463 PerlIO_printf(Perl_debug_log,"Final program:\n");
6466 #ifdef RE_TRACK_PATTERN_OFFSETS
6467 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6468 const U32 len = ri->u.offsets[0];
6470 GET_RE_DEBUG_FLAGS_DECL;
6471 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6472 for (i = 1; i <= len; i++) {
6473 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6474 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6475 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6477 PerlIO_printf(Perl_debug_log, "\n");
6485 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6488 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6490 PERL_UNUSED_ARG(value);
6492 if (flags & RXapif_FETCH) {
6493 return reg_named_buff_fetch(rx, key, flags);
6494 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6495 Perl_croak_no_modify();
6497 } else if (flags & RXapif_EXISTS) {
6498 return reg_named_buff_exists(rx, key, flags)
6501 } else if (flags & RXapif_REGNAMES) {
6502 return reg_named_buff_all(rx, flags);
6503 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6504 return reg_named_buff_scalar(rx, flags);
6506 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6512 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6515 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6516 PERL_UNUSED_ARG(lastkey);
6518 if (flags & RXapif_FIRSTKEY)
6519 return reg_named_buff_firstkey(rx, flags);
6520 else if (flags & RXapif_NEXTKEY)
6521 return reg_named_buff_nextkey(rx, flags);
6523 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6529 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6532 AV *retarray = NULL;
6534 struct regexp *const rx = ReANY(r);
6536 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6538 if (flags & RXapif_ALL)
6541 if (rx && RXp_PAREN_NAMES(rx)) {
6542 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6545 SV* sv_dat=HeVAL(he_str);
6546 I32 *nums=(I32*)SvPVX(sv_dat);
6547 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6548 if ((I32)(rx->nparens) >= nums[i]
6549 && rx->offs[nums[i]].start != -1
6550 && rx->offs[nums[i]].end != -1)
6553 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6558 ret = newSVsv(&PL_sv_undef);
6561 av_push(retarray, ret);
6564 return newRV_noinc(MUTABLE_SV(retarray));
6571 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6574 struct regexp *const rx = ReANY(r);
6576 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6578 if (rx && RXp_PAREN_NAMES(rx)) {
6579 if (flags & RXapif_ALL) {
6580 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6582 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6596 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6598 struct regexp *const rx = ReANY(r);
6600 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6602 if ( rx && RXp_PAREN_NAMES(rx) ) {
6603 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6605 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6612 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6614 struct regexp *const rx = ReANY(r);
6615 GET_RE_DEBUG_FLAGS_DECL;
6617 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6619 if (rx && RXp_PAREN_NAMES(rx)) {
6620 HV *hv = RXp_PAREN_NAMES(rx);
6622 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6625 SV* sv_dat = HeVAL(temphe);
6626 I32 *nums = (I32*)SvPVX(sv_dat);
6627 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6628 if ((I32)(rx->lastparen) >= nums[i] &&
6629 rx->offs[nums[i]].start != -1 &&
6630 rx->offs[nums[i]].end != -1)
6636 if (parno || flags & RXapif_ALL) {
6637 return newSVhek(HeKEY_hek(temphe));
6645 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6650 struct regexp *const rx = ReANY(r);
6652 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6654 if (rx && RXp_PAREN_NAMES(rx)) {
6655 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6656 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6657 } else if (flags & RXapif_ONE) {
6658 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6659 av = MUTABLE_AV(SvRV(ret));
6660 length = av_len(av);
6662 return newSViv(length + 1);
6664 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6668 return &PL_sv_undef;
6672 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6674 struct regexp *const rx = ReANY(r);
6677 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6679 if (rx && RXp_PAREN_NAMES(rx)) {
6680 HV *hv= RXp_PAREN_NAMES(rx);
6682 (void)hv_iterinit(hv);
6683 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6686 SV* sv_dat = HeVAL(temphe);
6687 I32 *nums = (I32*)SvPVX(sv_dat);
6688 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6689 if ((I32)(rx->lastparen) >= nums[i] &&
6690 rx->offs[nums[i]].start != -1 &&
6691 rx->offs[nums[i]].end != -1)
6697 if (parno || flags & RXapif_ALL) {
6698 av_push(av, newSVhek(HeKEY_hek(temphe)));
6703 return newRV_noinc(MUTABLE_SV(av));
6707 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6710 struct regexp *const rx = ReANY(r);
6716 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6718 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6719 || n == RX_BUFF_IDX_CARET_FULLMATCH
6720 || n == RX_BUFF_IDX_CARET_POSTMATCH
6722 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6729 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6730 /* no need to distinguish between them any more */
6731 n = RX_BUFF_IDX_FULLMATCH;
6733 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6734 && rx->offs[0].start != -1)
6736 /* $`, ${^PREMATCH} */
6737 i = rx->offs[0].start;
6741 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6742 && rx->offs[0].end != -1)
6744 /* $', ${^POSTMATCH} */
6745 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6746 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6749 if ( 0 <= n && n <= (I32)rx->nparens &&
6750 (s1 = rx->offs[n].start) != -1 &&
6751 (t1 = rx->offs[n].end) != -1)
6753 /* $&, ${^MATCH}, $1 ... */
6755 s = rx->subbeg + s1 - rx->suboffset;
6760 assert(s >= rx->subbeg);
6761 assert(rx->sublen >= (s - rx->subbeg) + i );
6763 #if NO_TAINT_SUPPORT
6764 sv_setpvn(sv, s, i);
6766 const int oldtainted = TAINT_get;
6768 sv_setpvn(sv, s, i);
6769 TAINT_set(oldtainted);
6771 if ( (rx->extflags & RXf_CANY_SEEN)
6772 ? (RXp_MATCH_UTF8(rx)
6773 && (!i || is_utf8_string((U8*)s, i)))
6774 : (RXp_MATCH_UTF8(rx)) )
6781 if (RXp_MATCH_TAINTED(rx)) {
6782 if (SvTYPE(sv) >= SVt_PVMG) {
6783 MAGIC* const mg = SvMAGIC(sv);
6786 SvMAGIC_set(sv, mg->mg_moremagic);
6788 if ((mgt = SvMAGIC(sv))) {
6789 mg->mg_moremagic = mgt;
6790 SvMAGIC_set(sv, mg);
6801 sv_setsv(sv,&PL_sv_undef);
6807 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6808 SV const * const value)
6810 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6812 PERL_UNUSED_ARG(rx);
6813 PERL_UNUSED_ARG(paren);
6814 PERL_UNUSED_ARG(value);
6817 Perl_croak_no_modify();
6821 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6824 struct regexp *const rx = ReANY(r);
6828 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6830 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6832 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6833 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6837 case RX_BUFF_IDX_PREMATCH: /* $` */
6838 if (rx->offs[0].start != -1) {
6839 i = rx->offs[0].start;
6848 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6849 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6851 case RX_BUFF_IDX_POSTMATCH: /* $' */
6852 if (rx->offs[0].end != -1) {
6853 i = rx->sublen - rx->offs[0].end;
6855 s1 = rx->offs[0].end;
6862 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6863 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6867 /* $& / ${^MATCH}, $1, $2, ... */
6869 if (paren <= (I32)rx->nparens &&
6870 (s1 = rx->offs[paren].start) != -1 &&
6871 (t1 = rx->offs[paren].end) != -1)
6877 if (ckWARN(WARN_UNINITIALIZED))
6878 report_uninit((const SV *)sv);
6883 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6884 const char * const s = rx->subbeg - rx->suboffset + s1;
6889 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6896 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6898 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6899 PERL_UNUSED_ARG(rx);
6903 return newSVpvs("Regexp");
6906 /* Scans the name of a named buffer from the pattern.
6907 * If flags is REG_RSN_RETURN_NULL returns null.
6908 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6909 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6910 * to the parsed name as looked up in the RExC_paren_names hash.
6911 * If there is an error throws a vFAIL().. type exception.
6914 #define REG_RSN_RETURN_NULL 0
6915 #define REG_RSN_RETURN_NAME 1
6916 #define REG_RSN_RETURN_DATA 2
6919 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6921 char *name_start = RExC_parse;
6923 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6925 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6926 /* skip IDFIRST by using do...while */
6929 RExC_parse += UTF8SKIP(RExC_parse);
6930 } while (isALNUM_utf8((U8*)RExC_parse));
6934 } while (isALNUM(*RExC_parse));
6936 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6937 vFAIL("Group name must start with a non-digit word character");
6941 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6942 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6943 if ( flags == REG_RSN_RETURN_NAME)
6945 else if (flags==REG_RSN_RETURN_DATA) {
6948 if ( ! sv_name ) /* should not happen*/
6949 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6950 if (RExC_paren_names)
6951 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6953 sv_dat = HeVAL(he_str);
6955 vFAIL("Reference to nonexistent named group");
6959 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6960 (unsigned long) flags);
6962 assert(0); /* NOT REACHED */
6967 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6968 int rem=(int)(RExC_end - RExC_parse); \
6977 if (RExC_lastparse!=RExC_parse) \
6978 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6981 iscut ? "..." : "<" \
6984 PerlIO_printf(Perl_debug_log,"%16s",""); \
6987 num = RExC_size + 1; \
6989 num=REG_NODE_NUM(RExC_emit); \
6990 if (RExC_lastnum!=num) \
6991 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6993 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6994 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6995 (int)((depth*2)), "", \
6999 RExC_lastparse=RExC_parse; \
7004 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7005 DEBUG_PARSE_MSG((funcname)); \
7006 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7008 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7009 DEBUG_PARSE_MSG((funcname)); \
7010 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7013 /* This section of code defines the inversion list object and its methods. The
7014 * interfaces are highly subject to change, so as much as possible is static to
7015 * this file. An inversion list is here implemented as a malloc'd C UV array
7016 * with some added info that is placed as UVs at the beginning in a header
7017 * portion. An inversion list for Unicode is an array of code points, sorted
7018 * by ordinal number. The zeroth element is the first code point in the list.
7019 * The 1th element is the first element beyond that not in the list. In other
7020 * words, the first range is
7021 * invlist[0]..(invlist[1]-1)
7022 * The other ranges follow. Thus every element whose index is divisible by two
7023 * marks the beginning of a range that is in the list, and every element not
7024 * divisible by two marks the beginning of a range not in the list. A single
7025 * element inversion list that contains the single code point N generally
7026 * consists of two elements
7029 * (The exception is when N is the highest representable value on the
7030 * machine, in which case the list containing just it would be a single
7031 * element, itself. By extension, if the last range in the list extends to
7032 * infinity, then the first element of that range will be in the inversion list
7033 * at a position that is divisible by two, and is the final element in the
7035 * Taking the complement (inverting) an inversion list is quite simple, if the
7036 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7037 * This implementation reserves an element at the beginning of each inversion
7038 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
7039 * actual beginning of the list is either that element if 0, or the next one if
7042 * More about inversion lists can be found in "Unicode Demystified"
7043 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7044 * More will be coming when functionality is added later.
7046 * The inversion list data structure is currently implemented as an SV pointing
7047 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7048 * array of UV whose memory management is automatically handled by the existing
7049 * facilities for SV's.
7051 * Some of the methods should always be private to the implementation, and some
7052 * should eventually be made public */
7054 /* The header definitions are in F<inline_invlist.c> */
7056 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7057 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7059 #define INVLIST_INITIAL_LEN 10
7061 PERL_STATIC_INLINE UV*
7062 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7064 /* Returns a pointer to the first element in the inversion list's array.
7065 * This is called upon initialization of an inversion list. Where the
7066 * array begins depends on whether the list has the code point U+0000
7067 * in it or not. The other parameter tells it whether the code that
7068 * follows this call is about to put a 0 in the inversion list or not.
7069 * The first element is either the element with 0, if 0, or the next one,
7072 UV* zero = get_invlist_zero_addr(invlist);
7074 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7077 assert(! *_get_invlist_len_addr(invlist));
7079 /* 1^1 = 0; 1^0 = 1 */
7080 *zero = 1 ^ will_have_0;
7081 return zero + *zero;
7084 PERL_STATIC_INLINE UV*
7085 S_invlist_array(pTHX_ SV* const invlist)
7087 /* Returns the pointer to the inversion list's array. Every time the
7088 * length changes, this needs to be called in case malloc or realloc moved
7091 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7093 /* Must not be empty. If these fail, you probably didn't check for <len>
7094 * being non-zero before trying to get the array */
7095 assert(*_get_invlist_len_addr(invlist));
7096 assert(*get_invlist_zero_addr(invlist) == 0
7097 || *get_invlist_zero_addr(invlist) == 1);
7099 /* The array begins either at the element reserved for zero if the
7100 * list contains 0 (that element will be set to 0), or otherwise the next
7101 * element (in which case the reserved element will be set to 1). */
7102 return (UV *) (get_invlist_zero_addr(invlist)
7103 + *get_invlist_zero_addr(invlist));
7106 PERL_STATIC_INLINE void
7107 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7109 /* Sets the current number of elements stored in the inversion list */
7111 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7113 *_get_invlist_len_addr(invlist) = len;
7115 assert(len <= SvLEN(invlist));
7117 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7118 /* If the list contains U+0000, that element is part of the header,
7119 * and should not be counted as part of the array. It will contain
7120 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7122 * SvCUR_set(invlist,
7123 * TO_INTERNAL_SIZE(len
7124 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7125 * But, this is only valid if len is not 0. The consequences of not doing
7126 * this is that the memory allocation code may think that 1 more UV is
7127 * being used than actually is, and so might do an unnecessary grow. That
7128 * seems worth not bothering to make this the precise amount.
7130 * Note that when inverting, SvCUR shouldn't change */
7133 PERL_STATIC_INLINE IV*
7134 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7136 /* Return the address of the UV that is reserved to hold the cached index
7139 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7141 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7144 PERL_STATIC_INLINE IV
7145 S_invlist_previous_index(pTHX_ SV* const invlist)
7147 /* Returns cached index of previous search */
7149 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7151 return *get_invlist_previous_index_addr(invlist);
7154 PERL_STATIC_INLINE void
7155 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7157 /* Caches <index> for later retrieval */
7159 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7161 assert(index == 0 || index < (int) _invlist_len(invlist));
7163 *get_invlist_previous_index_addr(invlist) = index;
7166 PERL_STATIC_INLINE UV
7167 S_invlist_max(pTHX_ SV* const invlist)
7169 /* Returns the maximum number of elements storable in the inversion list's
7170 * array, without having to realloc() */
7172 PERL_ARGS_ASSERT_INVLIST_MAX;
7174 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7177 PERL_STATIC_INLINE UV*
7178 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7180 /* Return the address of the UV that is reserved to hold 0 if the inversion
7181 * list contains 0. This has to be the last element of the heading, as the
7182 * list proper starts with either it if 0, or the next element if not.
7183 * (But we force it to contain either 0 or 1) */
7185 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7187 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7190 #ifndef PERL_IN_XSUB_RE
7192 Perl__new_invlist(pTHX_ IV initial_size)
7195 /* Return a pointer to a newly constructed inversion list, with enough
7196 * space to store 'initial_size' elements. If that number is negative, a
7197 * system default is used instead */
7201 if (initial_size < 0) {
7202 initial_size = INVLIST_INITIAL_LEN;
7205 /* Allocate the initial space */
7206 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7207 invlist_set_len(new_list, 0);
7209 /* Force iterinit() to be used to get iteration to work */
7210 *get_invlist_iter_addr(new_list) = UV_MAX;
7212 /* This should force a segfault if a method doesn't initialize this
7214 *get_invlist_zero_addr(new_list) = UV_MAX;
7216 *get_invlist_previous_index_addr(new_list) = 0;
7217 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7218 #if HEADER_LENGTH != 5
7219 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7227 S__new_invlist_C_array(pTHX_ UV* list)
7229 /* Return a pointer to a newly constructed inversion list, initialized to
7230 * point to <list>, which has to be in the exact correct inversion list
7231 * form, including internal fields. Thus this is a dangerous routine that
7232 * should not be used in the wrong hands */
7234 SV* invlist = newSV_type(SVt_PV);
7236 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7238 SvPV_set(invlist, (char *) list);
7239 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7240 shouldn't touch it */
7241 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7243 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7244 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7251 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7253 /* Grow the maximum size of an inversion list */
7255 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7257 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7260 PERL_STATIC_INLINE void
7261 S_invlist_trim(pTHX_ SV* const invlist)
7263 PERL_ARGS_ASSERT_INVLIST_TRIM;
7265 /* Change the length of the inversion list to how many entries it currently
7268 SvPV_shrink_to_cur((SV *) invlist);
7271 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7274 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7276 /* Subject to change or removal. Append the range from 'start' to 'end' at
7277 * the end of the inversion list. The range must be above any existing
7281 UV max = invlist_max(invlist);
7282 UV len = _invlist_len(invlist);
7284 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7286 if (len == 0) { /* Empty lists must be initialized */
7287 array = _invlist_array_init(invlist, start == 0);
7290 /* Here, the existing list is non-empty. The current max entry in the
7291 * list is generally the first value not in the set, except when the
7292 * set extends to the end of permissible values, in which case it is
7293 * the first entry in that final set, and so this call is an attempt to
7294 * append out-of-order */
7296 UV final_element = len - 1;
7297 array = invlist_array(invlist);
7298 if (array[final_element] > start
7299 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7301 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7302 array[final_element], start,
7303 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7306 /* Here, it is a legal append. If the new range begins with the first
7307 * value not in the set, it is extending the set, so the new first
7308 * value not in the set is one greater than the newly extended range.
7310 if (array[final_element] == start) {
7311 if (end != UV_MAX) {
7312 array[final_element] = end + 1;
7315 /* But if the end is the maximum representable on the machine,
7316 * just let the range that this would extend to have no end */
7317 invlist_set_len(invlist, len - 1);
7323 /* Here the new range doesn't extend any existing set. Add it */
7325 len += 2; /* Includes an element each for the start and end of range */
7327 /* If overflows the existing space, extend, which may cause the array to be
7330 invlist_extend(invlist, len);
7331 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7332 failure in invlist_array() */
7333 array = invlist_array(invlist);
7336 invlist_set_len(invlist, len);
7339 /* The next item on the list starts the range, the one after that is
7340 * one past the new range. */
7341 array[len - 2] = start;
7342 if (end != UV_MAX) {
7343 array[len - 1] = end + 1;
7346 /* But if the end is the maximum representable on the machine, just let
7347 * the range have no end */
7348 invlist_set_len(invlist, len - 1);
7352 #ifndef PERL_IN_XSUB_RE
7355 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7357 /* Searches the inversion list for the entry that contains the input code
7358 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7359 * return value is the index into the list's array of the range that
7364 IV high = _invlist_len(invlist);
7365 const IV highest_element = high - 1;
7368 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7370 /* If list is empty, return failure. */
7375 /* If the code point is before the first element, return failure. (We
7376 * can't combine this with the test above, because we can't get the array
7377 * unless we know the list is non-empty) */
7378 array = invlist_array(invlist);
7380 mid = invlist_previous_index(invlist);
7381 assert(mid >=0 && mid <= highest_element);
7383 /* <mid> contains the cache of the result of the previous call to this
7384 * function (0 the first time). See if this call is for the same result,
7385 * or if it is for mid-1. This is under the theory that calls to this
7386 * function will often be for related code points that are near each other.
7387 * And benchmarks show that caching gives better results. We also test
7388 * here if the code point is within the bounds of the list. These tests
7389 * replace others that would have had to be made anyway to make sure that
7390 * the array bounds were not exceeded, and these give us extra information
7391 * at the same time */
7392 if (cp >= array[mid]) {
7393 if (cp >= array[highest_element]) {
7394 return highest_element;
7397 /* Here, array[mid] <= cp < array[highest_element]. This means that
7398 * the final element is not the answer, so can exclude it; it also
7399 * means that <mid> is not the final element, so can refer to 'mid + 1'
7401 if (cp < array[mid + 1]) {
7407 else { /* cp < aray[mid] */
7408 if (cp < array[0]) { /* Fail if outside the array */
7412 if (cp >= array[mid - 1]) {
7417 /* Binary search. What we are looking for is <i> such that
7418 * array[i] <= cp < array[i+1]
7419 * The loop below converges on the i+1. Note that there may not be an
7420 * (i+1)th element in the array, and things work nonetheless */
7421 while (low < high) {
7422 mid = (low + high) / 2;
7423 assert(mid <= highest_element);
7424 if (array[mid] <= cp) { /* cp >= array[mid] */
7427 /* We could do this extra test to exit the loop early.
7428 if (cp < array[low]) {
7433 else { /* cp < array[mid] */
7440 invlist_set_previous_index(invlist, high);
7445 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7447 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7448 * but is used when the swash has an inversion list. This makes this much
7449 * faster, as it uses a binary search instead of a linear one. This is
7450 * intimately tied to that function, and perhaps should be in utf8.c,
7451 * except it is intimately tied to inversion lists as well. It assumes
7452 * that <swatch> is all 0's on input */
7455 const IV len = _invlist_len(invlist);
7459 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7461 if (len == 0) { /* Empty inversion list */
7465 array = invlist_array(invlist);
7467 /* Find which element it is */
7468 i = _invlist_search(invlist, start);
7470 /* We populate from <start> to <end> */
7471 while (current < end) {
7474 /* The inversion list gives the results for every possible code point
7475 * after the first one in the list. Only those ranges whose index is
7476 * even are ones that the inversion list matches. For the odd ones,
7477 * and if the initial code point is not in the list, we have to skip
7478 * forward to the next element */
7479 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7481 if (i >= len) { /* Finished if beyond the end of the array */
7485 if (current >= end) { /* Finished if beyond the end of what we
7487 if (LIKELY(end < UV_MAX)) {
7491 /* We get here when the upper bound is the maximum
7492 * representable on the machine, and we are looking for just
7493 * that code point. Have to special case it */
7495 goto join_end_of_list;
7498 assert(current >= start);
7500 /* The current range ends one below the next one, except don't go past
7503 upper = (i < len && array[i] < end) ? array[i] : end;
7505 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7506 * for each code point in it */
7507 for (; current < upper; current++) {
7508 const STRLEN offset = (STRLEN)(current - start);
7509 swatch[offset >> 3] |= 1 << (offset & 7);
7514 /* Quit if at the end of the list */
7517 /* But first, have to deal with the highest possible code point on
7518 * the platform. The previous code assumes that <end> is one
7519 * beyond where we want to populate, but that is impossible at the
7520 * platform's infinity, so have to handle it specially */
7521 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7523 const STRLEN offset = (STRLEN)(end - start);
7524 swatch[offset >> 3] |= 1 << (offset & 7);
7529 /* Advance to the next range, which will be for code points not in the
7538 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7540 /* Take the union of two inversion lists and point <output> to it. *output
7541 * should be defined upon input, and if it points to one of the two lists,
7542 * the reference count to that list will be decremented. The first list,
7543 * <a>, may be NULL, in which case a copy of the second list is returned.
7544 * If <complement_b> is TRUE, the union is taken of the complement
7545 * (inversion) of <b> instead of b itself.
7547 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7548 * Richard Gillam, published by Addison-Wesley, and explained at some
7549 * length there. The preface says to incorporate its examples into your
7550 * code at your own risk.
7552 * The algorithm is like a merge sort.
7554 * XXX A potential performance improvement is to keep track as we go along
7555 * if only one of the inputs contributes to the result, meaning the other
7556 * is a subset of that one. In that case, we can skip the final copy and
7557 * return the larger of the input lists, but then outside code might need
7558 * to keep track of whether to free the input list or not */
7560 UV* array_a; /* a's array */
7562 UV len_a; /* length of a's array */
7565 SV* u; /* the resulting union */
7569 UV i_a = 0; /* current index into a's array */
7573 /* running count, as explained in the algorithm source book; items are
7574 * stopped accumulating and are output when the count changes to/from 0.
7575 * The count is incremented when we start a range that's in the set, and
7576 * decremented when we start a range that's not in the set. So its range
7577 * is 0 to 2. Only when the count is zero is something not in the set.
7581 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7584 /* If either one is empty, the union is the other one */
7585 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7592 *output = invlist_clone(b);
7594 _invlist_invert(*output);
7596 } /* else *output already = b; */
7599 else if ((len_b = _invlist_len(b)) == 0) {
7604 /* The complement of an empty list is a list that has everything in it,
7605 * so the union with <a> includes everything too */
7610 *output = _new_invlist(1);
7611 _append_range_to_invlist(*output, 0, UV_MAX);
7613 else if (*output != a) {
7614 *output = invlist_clone(a);
7616 /* else *output already = a; */
7620 /* Here both lists exist and are non-empty */
7621 array_a = invlist_array(a);
7622 array_b = invlist_array(b);
7624 /* If are to take the union of 'a' with the complement of b, set it
7625 * up so are looking at b's complement. */
7628 /* To complement, we invert: if the first element is 0, remove it. To
7629 * do this, we just pretend the array starts one later, and clear the
7630 * flag as we don't have to do anything else later */
7631 if (array_b[0] == 0) {
7634 complement_b = FALSE;
7638 /* But if the first element is not zero, we unshift a 0 before the
7639 * array. The data structure reserves a space for that 0 (which
7640 * should be a '1' right now), so physical shifting is unneeded,
7641 * but temporarily change that element to 0. Before exiting the
7642 * routine, we must restore the element to '1' */
7649 /* Size the union for the worst case: that the sets are completely
7651 u = _new_invlist(len_a + len_b);
7653 /* Will contain U+0000 if either component does */
7654 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7655 || (len_b > 0 && array_b[0] == 0));
7657 /* Go through each list item by item, stopping when exhausted one of
7659 while (i_a < len_a && i_b < len_b) {
7660 UV cp; /* The element to potentially add to the union's array */
7661 bool cp_in_set; /* is it in the the input list's set or not */
7663 /* We need to take one or the other of the two inputs for the union.
7664 * Since we are merging two sorted lists, we take the smaller of the
7665 * next items. In case of a tie, we take the one that is in its set
7666 * first. If we took one not in the set first, it would decrement the
7667 * count, possibly to 0 which would cause it to be output as ending the
7668 * range, and the next time through we would take the same number, and
7669 * output it again as beginning the next range. By doing it the
7670 * opposite way, there is no possibility that the count will be
7671 * momentarily decremented to 0, and thus the two adjoining ranges will
7672 * be seamlessly merged. (In a tie and both are in the set or both not
7673 * in the set, it doesn't matter which we take first.) */
7674 if (array_a[i_a] < array_b[i_b]
7675 || (array_a[i_a] == array_b[i_b]
7676 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7678 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7682 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7686 /* Here, have chosen which of the two inputs to look at. Only output
7687 * if the running count changes to/from 0, which marks the
7688 * beginning/end of a range in that's in the set */
7691 array_u[i_u++] = cp;
7698 array_u[i_u++] = cp;
7703 /* Here, we are finished going through at least one of the lists, which
7704 * means there is something remaining in at most one. We check if the list
7705 * that hasn't been exhausted is positioned such that we are in the middle
7706 * of a range in its set or not. (i_a and i_b point to the element beyond
7707 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7708 * is potentially more to output.
7709 * There are four cases:
7710 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7711 * in the union is entirely from the non-exhausted set.
7712 * 2) Both were in their sets, count is 2. Nothing further should
7713 * be output, as everything that remains will be in the exhausted
7714 * list's set, hence in the union; decrementing to 1 but not 0 insures
7716 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7717 * Nothing further should be output because the union includes
7718 * everything from the exhausted set. Not decrementing ensures that.
7719 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7720 * decrementing to 0 insures that we look at the remainder of the
7721 * non-exhausted set */
7722 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7723 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7728 /* The final length is what we've output so far, plus what else is about to
7729 * be output. (If 'count' is non-zero, then the input list we exhausted
7730 * has everything remaining up to the machine's limit in its set, and hence
7731 * in the union, so there will be no further output. */
7734 /* At most one of the subexpressions will be non-zero */
7735 len_u += (len_a - i_a) + (len_b - i_b);
7738 /* Set result to final length, which can change the pointer to array_u, so
7740 if (len_u != _invlist_len(u)) {
7741 invlist_set_len(u, len_u);
7743 array_u = invlist_array(u);
7746 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7747 * the other) ended with everything above it not in its set. That means
7748 * that the remaining part of the union is precisely the same as the
7749 * non-exhausted list, so can just copy it unchanged. (If both list were
7750 * exhausted at the same time, then the operations below will be both 0.)
7753 IV copy_count; /* At most one will have a non-zero copy count */
7754 if ((copy_count = len_a - i_a) > 0) {
7755 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7757 else if ((copy_count = len_b - i_b) > 0) {
7758 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7762 /* We may be removing a reference to one of the inputs */
7763 if (a == *output || b == *output) {
7764 SvREFCNT_dec(*output);
7767 /* If we've changed b, restore it */
7777 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7779 /* Take the intersection of two inversion lists and point <i> to it. *i
7780 * should be defined upon input, and if it points to one of the two lists,
7781 * the reference count to that list will be decremented.
7782 * If <complement_b> is TRUE, the result will be the intersection of <a>
7783 * and the complement (or inversion) of <b> instead of <b> directly.
7785 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7786 * Richard Gillam, published by Addison-Wesley, and explained at some
7787 * length there. The preface says to incorporate its examples into your
7788 * code at your own risk. In fact, it had bugs
7790 * The algorithm is like a merge sort, and is essentially the same as the
7794 UV* array_a; /* a's array */
7796 UV len_a; /* length of a's array */
7799 SV* r; /* the resulting intersection */
7803 UV i_a = 0; /* current index into a's array */
7807 /* running count, as explained in the algorithm source book; items are
7808 * stopped accumulating and are output when the count changes to/from 2.
7809 * The count is incremented when we start a range that's in the set, and
7810 * decremented when we start a range that's not in the set. So its range
7811 * is 0 to 2. Only when the count is 2 is something in the intersection.
7815 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7818 /* Special case if either one is empty */
7819 len_a = _invlist_len(a);
7820 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7822 if (len_a != 0 && complement_b) {
7824 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7825 * be empty. Here, also we are using 'b's complement, which hence
7826 * must be every possible code point. Thus the intersection is
7829 *i = invlist_clone(a);
7835 /* else *i is already 'a' */
7839 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7840 * intersection must be empty */
7847 *i = _new_invlist(0);
7851 /* Here both lists exist and are non-empty */
7852 array_a = invlist_array(a);
7853 array_b = invlist_array(b);
7855 /* If are to take the intersection of 'a' with the complement of b, set it
7856 * up so are looking at b's complement. */
7859 /* To complement, we invert: if the first element is 0, remove it. To
7860 * do this, we just pretend the array starts one later, and clear the
7861 * flag as we don't have to do anything else later */
7862 if (array_b[0] == 0) {
7865 complement_b = FALSE;
7869 /* But if the first element is not zero, we unshift a 0 before the
7870 * array. The data structure reserves a space for that 0 (which
7871 * should be a '1' right now), so physical shifting is unneeded,
7872 * but temporarily change that element to 0. Before exiting the
7873 * routine, we must restore the element to '1' */
7880 /* Size the intersection for the worst case: that the intersection ends up
7881 * fragmenting everything to be completely disjoint */
7882 r= _new_invlist(len_a + len_b);
7884 /* Will contain U+0000 iff both components do */
7885 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7886 && len_b > 0 && array_b[0] == 0);
7888 /* Go through each list item by item, stopping when exhausted one of
7890 while (i_a < len_a && i_b < len_b) {
7891 UV cp; /* The element to potentially add to the intersection's
7893 bool cp_in_set; /* Is it in the input list's set or not */
7895 /* We need to take one or the other of the two inputs for the
7896 * intersection. Since we are merging two sorted lists, we take the
7897 * smaller of the next items. In case of a tie, we take the one that
7898 * is not in its set first (a difference from the union algorithm). If
7899 * we took one in the set first, it would increment the count, possibly
7900 * to 2 which would cause it to be output as starting a range in the
7901 * intersection, and the next time through we would take that same
7902 * number, and output it again as ending the set. By doing it the
7903 * opposite of this, there is no possibility that the count will be
7904 * momentarily incremented to 2. (In a tie and both are in the set or
7905 * both not in the set, it doesn't matter which we take first.) */
7906 if (array_a[i_a] < array_b[i_b]
7907 || (array_a[i_a] == array_b[i_b]
7908 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7910 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7914 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7918 /* Here, have chosen which of the two inputs to look at. Only output
7919 * if the running count changes to/from 2, which marks the
7920 * beginning/end of a range that's in the intersection */
7924 array_r[i_r++] = cp;
7929 array_r[i_r++] = cp;
7935 /* Here, we are finished going through at least one of the lists, which
7936 * means there is something remaining in at most one. We check if the list
7937 * that has been exhausted is positioned such that we are in the middle
7938 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7939 * the ones we care about.) There are four cases:
7940 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7941 * nothing left in the intersection.
7942 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7943 * above 2. What should be output is exactly that which is in the
7944 * non-exhausted set, as everything it has is also in the intersection
7945 * set, and everything it doesn't have can't be in the intersection
7946 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7947 * gets incremented to 2. Like the previous case, the intersection is
7948 * everything that remains in the non-exhausted set.
7949 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7950 * remains 1. And the intersection has nothing more. */
7951 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7952 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7957 /* The final length is what we've output so far plus what else is in the
7958 * intersection. At most one of the subexpressions below will be non-zero */
7961 len_r += (len_a - i_a) + (len_b - i_b);
7964 /* Set result to final length, which can change the pointer to array_r, so
7966 if (len_r != _invlist_len(r)) {
7967 invlist_set_len(r, len_r);
7969 array_r = invlist_array(r);
7972 /* Finish outputting any remaining */
7973 if (count >= 2) { /* At most one will have a non-zero copy count */
7975 if ((copy_count = len_a - i_a) > 0) {
7976 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7978 else if ((copy_count = len_b - i_b) > 0) {
7979 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7983 /* We may be removing a reference to one of the inputs */
7984 if (a == *i || b == *i) {
7988 /* If we've changed b, restore it */
7998 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8000 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8001 * set. A pointer to the inversion list is returned. This may actually be
8002 * a new list, in which case the passed in one has been destroyed. The
8003 * passed in inversion list can be NULL, in which case a new one is created
8004 * with just the one range in it */
8009 if (invlist == NULL) {
8010 invlist = _new_invlist(2);
8014 len = _invlist_len(invlist);
8017 /* If comes after the final entry, can just append it to the end */
8019 || start >= invlist_array(invlist)
8020 [_invlist_len(invlist) - 1])
8022 _append_range_to_invlist(invlist, start, end);
8026 /* Here, can't just append things, create and return a new inversion list
8027 * which is the union of this range and the existing inversion list */
8028 range_invlist = _new_invlist(2);
8029 _append_range_to_invlist(range_invlist, start, end);
8031 _invlist_union(invlist, range_invlist, &invlist);
8033 /* The temporary can be freed */
8034 SvREFCNT_dec(range_invlist);
8041 PERL_STATIC_INLINE SV*
8042 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8043 return _add_range_to_invlist(invlist, cp, cp);
8046 #ifndef PERL_IN_XSUB_RE
8048 Perl__invlist_invert(pTHX_ SV* const invlist)
8050 /* Complement the input inversion list. This adds a 0 if the list didn't
8051 * have a zero; removes it otherwise. As described above, the data
8052 * structure is set up so that this is very efficient */
8054 UV* len_pos = _get_invlist_len_addr(invlist);
8056 PERL_ARGS_ASSERT__INVLIST_INVERT;
8058 /* The inverse of matching nothing is matching everything */
8059 if (*len_pos == 0) {
8060 _append_range_to_invlist(invlist, 0, UV_MAX);
8064 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8065 * zero element was a 0, so it is being removed, so the length decrements
8066 * by 1; and vice-versa. SvCUR is unaffected */
8067 if (*get_invlist_zero_addr(invlist) ^= 1) {
8076 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8078 /* Complement the input inversion list (which must be a Unicode property,
8079 * all of which don't match above the Unicode maximum code point.) And
8080 * Perl has chosen to not have the inversion match above that either. This
8081 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8087 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8089 _invlist_invert(invlist);
8091 len = _invlist_len(invlist);
8093 if (len != 0) { /* If empty do nothing */
8094 array = invlist_array(invlist);
8095 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8096 /* Add 0x110000. First, grow if necessary */
8098 if (invlist_max(invlist) < len) {
8099 invlist_extend(invlist, len);
8100 array = invlist_array(invlist);
8102 invlist_set_len(invlist, len);
8103 array[len - 1] = PERL_UNICODE_MAX + 1;
8105 else { /* Remove the 0x110000 */
8106 invlist_set_len(invlist, len - 1);
8114 PERL_STATIC_INLINE SV*
8115 S_invlist_clone(pTHX_ SV* const invlist)
8118 /* Return a new inversion list that is a copy of the input one, which is
8121 /* Need to allocate extra space to accommodate Perl's addition of a
8122 * trailing NUL to SvPV's, since it thinks they are always strings */
8123 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8124 STRLEN length = SvCUR(invlist);
8126 PERL_ARGS_ASSERT_INVLIST_CLONE;
8128 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8129 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8134 PERL_STATIC_INLINE UV*
8135 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8137 /* Return the address of the UV that contains the current iteration
8140 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8142 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8145 PERL_STATIC_INLINE UV*
8146 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8148 /* Return the address of the UV that contains the version id. */
8150 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8152 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8155 PERL_STATIC_INLINE void
8156 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8158 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8160 *get_invlist_iter_addr(invlist) = 0;
8164 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8166 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8167 * This call sets in <*start> and <*end>, the next range in <invlist>.
8168 * Returns <TRUE> if successful and the next call will return the next
8169 * range; <FALSE> if was already at the end of the list. If the latter,
8170 * <*start> and <*end> are unchanged, and the next call to this function
8171 * will start over at the beginning of the list */
8173 UV* pos = get_invlist_iter_addr(invlist);
8174 UV len = _invlist_len(invlist);
8177 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8180 *pos = UV_MAX; /* Force iternit() to be required next time */
8184 array = invlist_array(invlist);
8186 *start = array[(*pos)++];
8192 *end = array[(*pos)++] - 1;
8198 PERL_STATIC_INLINE UV
8199 S_invlist_highest(pTHX_ SV* const invlist)
8201 /* Returns the highest code point that matches an inversion list. This API
8202 * has an ambiguity, as it returns 0 under either the highest is actually
8203 * 0, or if the list is empty. If this distinction matters to you, check
8204 * for emptiness before calling this function */
8206 UV len = _invlist_len(invlist);
8209 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8215 array = invlist_array(invlist);
8217 /* The last element in the array in the inversion list always starts a
8218 * range that goes to infinity. That range may be for code points that are
8219 * matched in the inversion list, or it may be for ones that aren't
8220 * matched. In the latter case, the highest code point in the set is one
8221 * less than the beginning of this range; otherwise it is the final element
8222 * of this range: infinity */
8223 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8225 : array[len - 1] - 1;
8228 #ifndef PERL_IN_XSUB_RE
8230 Perl__invlist_contents(pTHX_ SV* const invlist)
8232 /* Get the contents of an inversion list into a string SV so that they can
8233 * be printed out. It uses the format traditionally done for debug tracing
8237 SV* output = newSVpvs("\n");
8239 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8241 invlist_iterinit(invlist);
8242 while (invlist_iternext(invlist, &start, &end)) {
8243 if (end == UV_MAX) {
8244 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8246 else if (end != start) {
8247 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8251 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8259 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8261 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8263 /* Dumps out the ranges in an inversion list. The string 'header'
8264 * if present is output on a line before the first range */
8268 PERL_ARGS_ASSERT__INVLIST_DUMP;
8270 if (header && strlen(header)) {
8271 PerlIO_printf(Perl_debug_log, "%s\n", header);
8273 invlist_iterinit(invlist);
8274 while (invlist_iternext(invlist, &start, &end)) {
8275 if (end == UV_MAX) {
8276 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8278 else if (end != start) {
8279 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8283 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8291 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8293 /* Return a boolean as to if the two passed in inversion lists are
8294 * identical. The final argument, if TRUE, says to take the complement of
8295 * the second inversion list before doing the comparison */
8297 UV* array_a = invlist_array(a);
8298 UV* array_b = invlist_array(b);
8299 UV len_a = _invlist_len(a);
8300 UV len_b = _invlist_len(b);
8302 UV i = 0; /* current index into the arrays */
8303 bool retval = TRUE; /* Assume are identical until proven otherwise */
8305 PERL_ARGS_ASSERT__INVLISTEQ;
8307 /* If are to compare 'a' with the complement of b, set it
8308 * up so are looking at b's complement. */
8311 /* The complement of nothing is everything, so <a> would have to have
8312 * just one element, starting at zero (ending at infinity) */
8314 return (len_a == 1 && array_a[0] == 0);
8316 else if (array_b[0] == 0) {
8318 /* Otherwise, to complement, we invert. Here, the first element is
8319 * 0, just remove it. To do this, we just pretend the array starts
8320 * one later, and clear the flag as we don't have to do anything
8325 complement_b = FALSE;
8329 /* But if the first element is not zero, we unshift a 0 before the
8330 * array. The data structure reserves a space for that 0 (which
8331 * should be a '1' right now), so physical shifting is unneeded,
8332 * but temporarily change that element to 0. Before exiting the
8333 * routine, we must restore the element to '1' */
8340 /* Make sure that the lengths are the same, as well as the final element
8341 * before looping through the remainder. (Thus we test the length, final,
8342 * and first elements right off the bat) */
8343 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8346 else for (i = 0; i < len_a - 1; i++) {
8347 if (array_a[i] != array_b[i]) {
8360 #undef HEADER_LENGTH
8361 #undef INVLIST_INITIAL_LENGTH
8362 #undef TO_INTERNAL_SIZE
8363 #undef FROM_INTERNAL_SIZE
8364 #undef INVLIST_LEN_OFFSET
8365 #undef INVLIST_ZERO_OFFSET
8366 #undef INVLIST_ITER_OFFSET
8367 #undef INVLIST_VERSION_ID
8369 /* End of inversion list object */
8372 - reg - regular expression, i.e. main body or parenthesized thing
8374 * Caller must absorb opening parenthesis.
8376 * Combining parenthesis handling with the base level of regular expression
8377 * is a trifle forced, but the need to tie the tails of the branches to what
8378 * follows makes it hard to avoid.
8380 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8382 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8384 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8388 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8389 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8392 regnode *ret; /* Will be the head of the group. */
8395 regnode *ender = NULL;
8398 U32 oregflags = RExC_flags;
8399 bool have_branch = 0;
8401 I32 freeze_paren = 0;
8402 I32 after_freeze = 0;
8404 /* for (?g), (?gc), and (?o) warnings; warning
8405 about (?c) will warn about (?g) -- japhy */
8407 #define WASTED_O 0x01
8408 #define WASTED_G 0x02
8409 #define WASTED_C 0x04
8410 #define WASTED_GC (0x02|0x04)
8411 I32 wastedflags = 0x00;
8413 char * parse_start = RExC_parse; /* MJD */
8414 char * const oregcomp_parse = RExC_parse;
8416 GET_RE_DEBUG_FLAGS_DECL;
8418 PERL_ARGS_ASSERT_REG;
8419 DEBUG_PARSE("reg ");
8421 *flagp = 0; /* Tentatively. */
8424 /* Make an OPEN node, if parenthesized. */
8426 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8427 char *start_verb = RExC_parse;
8428 STRLEN verb_len = 0;
8429 char *start_arg = NULL;
8430 unsigned char op = 0;
8432 int internal_argval = 0; /* internal_argval is only useful if !argok */
8433 while ( *RExC_parse && *RExC_parse != ')' ) {
8434 if ( *RExC_parse == ':' ) {
8435 start_arg = RExC_parse + 1;
8441 verb_len = RExC_parse - start_verb;
8444 while ( *RExC_parse && *RExC_parse != ')' )
8446 if ( *RExC_parse != ')' )
8447 vFAIL("Unterminated verb pattern argument");
8448 if ( RExC_parse == start_arg )
8451 if ( *RExC_parse != ')' )
8452 vFAIL("Unterminated verb pattern");
8455 switch ( *start_verb ) {
8456 case 'A': /* (*ACCEPT) */
8457 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8459 internal_argval = RExC_nestroot;
8462 case 'C': /* (*COMMIT) */
8463 if ( memEQs(start_verb,verb_len,"COMMIT") )
8466 case 'F': /* (*FAIL) */
8467 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8472 case ':': /* (*:NAME) */
8473 case 'M': /* (*MARK:NAME) */
8474 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8479 case 'P': /* (*PRUNE) */
8480 if ( memEQs(start_verb,verb_len,"PRUNE") )
8483 case 'S': /* (*SKIP) */
8484 if ( memEQs(start_verb,verb_len,"SKIP") )
8487 case 'T': /* (*THEN) */
8488 /* [19:06] <TimToady> :: is then */
8489 if ( memEQs(start_verb,verb_len,"THEN") ) {
8491 RExC_seen |= REG_SEEN_CUTGROUP;
8497 vFAIL3("Unknown verb pattern '%.*s'",
8498 verb_len, start_verb);
8501 if ( start_arg && internal_argval ) {
8502 vFAIL3("Verb pattern '%.*s' may not have an argument",
8503 verb_len, start_verb);
8504 } else if ( argok < 0 && !start_arg ) {
8505 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8506 verb_len, start_verb);
8508 ret = reganode(pRExC_state, op, internal_argval);
8509 if ( ! internal_argval && ! SIZE_ONLY ) {
8511 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8512 ARG(ret) = add_data( pRExC_state, 1, "S" );
8513 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8520 if (!internal_argval)
8521 RExC_seen |= REG_SEEN_VERBARG;
8522 } else if ( start_arg ) {
8523 vFAIL3("Verb pattern '%.*s' may not have an argument",
8524 verb_len, start_verb);
8526 ret = reg_node(pRExC_state, op);
8528 nextchar(pRExC_state);
8531 if (*RExC_parse == '?') { /* (?...) */
8532 bool is_logical = 0;
8533 const char * const seqstart = RExC_parse;
8534 bool has_use_defaults = FALSE;
8537 paren = *RExC_parse++;
8538 ret = NULL; /* For look-ahead/behind. */
8541 case 'P': /* (?P...) variants for those used to PCRE/Python */
8542 paren = *RExC_parse++;
8543 if ( paren == '<') /* (?P<...>) named capture */
8545 else if (paren == '>') { /* (?P>name) named recursion */
8546 goto named_recursion;
8548 else if (paren == '=') { /* (?P=...) named backref */
8549 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8550 you change this make sure you change that */
8551 char* name_start = RExC_parse;
8553 SV *sv_dat = reg_scan_name(pRExC_state,
8554 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8555 if (RExC_parse == name_start || *RExC_parse != ')')
8556 vFAIL2("Sequence %.3s... not terminated",parse_start);
8559 num = add_data( pRExC_state, 1, "S" );
8560 RExC_rxi->data->data[num]=(void*)sv_dat;
8561 SvREFCNT_inc_simple_void(sv_dat);
8564 ret = reganode(pRExC_state,
8567 : (ASCII_FOLD_RESTRICTED)
8569 : (AT_LEAST_UNI_SEMANTICS)
8577 Set_Node_Offset(ret, parse_start+1);
8578 Set_Node_Cur_Length(ret); /* MJD */
8580 nextchar(pRExC_state);
8584 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8586 case '<': /* (?<...) */
8587 if (*RExC_parse == '!')
8589 else if (*RExC_parse != '=')
8595 case '\'': /* (?'...') */
8596 name_start= RExC_parse;
8597 svname = reg_scan_name(pRExC_state,
8598 SIZE_ONLY ? /* reverse test from the others */
8599 REG_RSN_RETURN_NAME :
8600 REG_RSN_RETURN_NULL);
8601 if (RExC_parse == name_start) {
8603 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8606 if (*RExC_parse != paren)
8607 vFAIL2("Sequence (?%c... not terminated",
8608 paren=='>' ? '<' : paren);
8612 if (!svname) /* shouldn't happen */
8614 "panic: reg_scan_name returned NULL");
8615 if (!RExC_paren_names) {
8616 RExC_paren_names= newHV();
8617 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8619 RExC_paren_name_list= newAV();
8620 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8623 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8625 sv_dat = HeVAL(he_str);
8627 /* croak baby croak */
8629 "panic: paren_name hash element allocation failed");
8630 } else if ( SvPOK(sv_dat) ) {
8631 /* (?|...) can mean we have dupes so scan to check
8632 its already been stored. Maybe a flag indicating
8633 we are inside such a construct would be useful,
8634 but the arrays are likely to be quite small, so
8635 for now we punt -- dmq */
8636 IV count = SvIV(sv_dat);
8637 I32 *pv = (I32*)SvPVX(sv_dat);
8639 for ( i = 0 ; i < count ; i++ ) {
8640 if ( pv[i] == RExC_npar ) {
8646 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8647 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8648 pv[count] = RExC_npar;
8649 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8652 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8653 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8655 SvIV_set(sv_dat, 1);
8658 /* Yes this does cause a memory leak in debugging Perls */
8659 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8660 SvREFCNT_dec(svname);
8663 /*sv_dump(sv_dat);*/
8665 nextchar(pRExC_state);
8667 goto capturing_parens;
8669 RExC_seen |= REG_SEEN_LOOKBEHIND;
8670 RExC_in_lookbehind++;
8672 case '=': /* (?=...) */
8673 RExC_seen_zerolen++;
8675 case '!': /* (?!...) */
8676 RExC_seen_zerolen++;
8677 if (*RExC_parse == ')') {
8678 ret=reg_node(pRExC_state, OPFAIL);
8679 nextchar(pRExC_state);
8683 case '|': /* (?|...) */
8684 /* branch reset, behave like a (?:...) except that
8685 buffers in alternations share the same numbers */
8687 after_freeze = freeze_paren = RExC_npar;
8689 case ':': /* (?:...) */
8690 case '>': /* (?>...) */
8692 case '$': /* (?$...) */
8693 case '@': /* (?@...) */
8694 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8696 case '#': /* (?#...) */
8697 while (*RExC_parse && *RExC_parse != ')')
8699 if (*RExC_parse != ')')
8700 FAIL("Sequence (?#... not terminated");
8701 nextchar(pRExC_state);
8704 case '0' : /* (?0) */
8705 case 'R' : /* (?R) */
8706 if (*RExC_parse != ')')
8707 FAIL("Sequence (?R) not terminated");
8708 ret = reg_node(pRExC_state, GOSTART);
8709 *flagp |= POSTPONED;
8710 nextchar(pRExC_state);
8713 { /* named and numeric backreferences */
8715 case '&': /* (?&NAME) */
8716 parse_start = RExC_parse - 1;
8719 SV *sv_dat = reg_scan_name(pRExC_state,
8720 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8721 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8723 goto gen_recurse_regop;
8724 assert(0); /* NOT REACHED */
8726 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8728 vFAIL("Illegal pattern");
8730 goto parse_recursion;
8732 case '-': /* (?-1) */
8733 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8734 RExC_parse--; /* rewind to let it be handled later */
8738 case '1': case '2': case '3': case '4': /* (?1) */
8739 case '5': case '6': case '7': case '8': case '9':
8742 num = atoi(RExC_parse);
8743 parse_start = RExC_parse - 1; /* MJD */
8744 if (*RExC_parse == '-')
8746 while (isDIGIT(*RExC_parse))
8748 if (*RExC_parse!=')')
8749 vFAIL("Expecting close bracket");
8752 if ( paren == '-' ) {
8754 Diagram of capture buffer numbering.
8755 Top line is the normal capture buffer numbers
8756 Bottom line is the negative indexing as from
8760 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8764 num = RExC_npar + num;
8767 vFAIL("Reference to nonexistent group");
8769 } else if ( paren == '+' ) {
8770 num = RExC_npar + num - 1;
8773 ret = reganode(pRExC_state, GOSUB, num);
8775 if (num > (I32)RExC_rx->nparens) {
8777 vFAIL("Reference to nonexistent group");
8779 ARG2L_SET( ret, RExC_recurse_count++);
8781 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8782 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8786 RExC_seen |= REG_SEEN_RECURSE;
8787 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8788 Set_Node_Offset(ret, parse_start); /* MJD */
8790 *flagp |= POSTPONED;
8791 nextchar(pRExC_state);
8793 } /* named and numeric backreferences */
8794 assert(0); /* NOT REACHED */
8796 case '?': /* (??...) */
8798 if (*RExC_parse != '{') {
8800 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8803 *flagp |= POSTPONED;
8804 paren = *RExC_parse++;
8806 case '{': /* (?{...}) */
8809 struct reg_code_block *cb;
8811 RExC_seen_zerolen++;
8813 if ( !pRExC_state->num_code_blocks
8814 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8815 || pRExC_state->code_blocks[pRExC_state->code_index].start
8816 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8819 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8820 FAIL("panic: Sequence (?{...}): no code block found\n");
8821 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8823 /* this is a pre-compiled code block (?{...}) */
8824 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8825 RExC_parse = RExC_start + cb->end;
8828 if (cb->src_regex) {
8829 n = add_data(pRExC_state, 2, "rl");
8830 RExC_rxi->data->data[n] =
8831 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8832 RExC_rxi->data->data[n+1] = (void*)o;
8835 n = add_data(pRExC_state, 1,
8836 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8837 RExC_rxi->data->data[n] = (void*)o;
8840 pRExC_state->code_index++;
8841 nextchar(pRExC_state);
8845 ret = reg_node(pRExC_state, LOGICAL);
8846 eval = reganode(pRExC_state, EVAL, n);
8849 /* for later propagation into (??{}) return value */
8850 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8852 REGTAIL(pRExC_state, ret, eval);
8853 /* deal with the length of this later - MJD */
8856 ret = reganode(pRExC_state, EVAL, n);
8857 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8858 Set_Node_Offset(ret, parse_start);
8861 case '(': /* (?(?{...})...) and (?(?=...)...) */
8864 if (RExC_parse[0] == '?') { /* (?(?...)) */
8865 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8866 || RExC_parse[1] == '<'
8867 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8870 ret = reg_node(pRExC_state, LOGICAL);
8873 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8877 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8878 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8880 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8881 char *name_start= RExC_parse++;
8883 SV *sv_dat=reg_scan_name(pRExC_state,
8884 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8885 if (RExC_parse == name_start || *RExC_parse != ch)
8886 vFAIL2("Sequence (?(%c... not terminated",
8887 (ch == '>' ? '<' : ch));
8890 num = add_data( pRExC_state, 1, "S" );
8891 RExC_rxi->data->data[num]=(void*)sv_dat;
8892 SvREFCNT_inc_simple_void(sv_dat);
8894 ret = reganode(pRExC_state,NGROUPP,num);
8895 goto insert_if_check_paren;
8897 else if (RExC_parse[0] == 'D' &&
8898 RExC_parse[1] == 'E' &&
8899 RExC_parse[2] == 'F' &&
8900 RExC_parse[3] == 'I' &&
8901 RExC_parse[4] == 'N' &&
8902 RExC_parse[5] == 'E')
8904 ret = reganode(pRExC_state,DEFINEP,0);
8907 goto insert_if_check_paren;
8909 else if (RExC_parse[0] == 'R') {
8912 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8913 parno = atoi(RExC_parse++);
8914 while (isDIGIT(*RExC_parse))
8916 } else if (RExC_parse[0] == '&') {
8919 sv_dat = reg_scan_name(pRExC_state,
8920 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8921 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8923 ret = reganode(pRExC_state,INSUBP,parno);
8924 goto insert_if_check_paren;
8926 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8929 parno = atoi(RExC_parse++);
8931 while (isDIGIT(*RExC_parse))
8933 ret = reganode(pRExC_state, GROUPP, parno);
8935 insert_if_check_paren:
8936 if ((c = *nextchar(pRExC_state)) != ')')
8937 vFAIL("Switch condition not recognized");
8939 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8940 br = regbranch(pRExC_state, &flags, 1,depth+1);
8942 br = reganode(pRExC_state, LONGJMP, 0);
8944 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8945 c = *nextchar(pRExC_state);
8950 vFAIL("(?(DEFINE)....) does not allow branches");
8951 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8952 regbranch(pRExC_state, &flags, 1,depth+1);
8953 REGTAIL(pRExC_state, ret, lastbr);
8956 c = *nextchar(pRExC_state);
8961 vFAIL("Switch (?(condition)... contains too many branches");
8962 ender = reg_node(pRExC_state, TAIL);
8963 REGTAIL(pRExC_state, br, ender);
8965 REGTAIL(pRExC_state, lastbr, ender);
8966 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8969 REGTAIL(pRExC_state, ret, ender);
8970 RExC_size++; /* XXX WHY do we need this?!!
8971 For large programs it seems to be required
8972 but I can't figure out why. -- dmq*/
8976 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8980 RExC_parse--; /* for vFAIL to print correctly */
8981 vFAIL("Sequence (? incomplete");
8983 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8985 has_use_defaults = TRUE;
8986 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8987 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8988 ? REGEX_UNICODE_CHARSET
8989 : REGEX_DEPENDS_CHARSET);
8993 parse_flags: /* (?i) */
8995 U32 posflags = 0, negflags = 0;
8996 U32 *flagsp = &posflags;
8997 char has_charset_modifier = '\0';
8998 regex_charset cs = get_regex_charset(RExC_flags);
8999 if (cs == REGEX_DEPENDS_CHARSET
9000 && (RExC_utf8 || RExC_uni_semantics))
9002 cs = REGEX_UNICODE_CHARSET;
9005 while (*RExC_parse) {
9006 /* && strchr("iogcmsx", *RExC_parse) */
9007 /* (?g), (?gc) and (?o) are useless here
9008 and must be globally applied -- japhy */
9009 switch (*RExC_parse) {
9010 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9011 case LOCALE_PAT_MOD:
9012 if (has_charset_modifier) {
9013 goto excess_modifier;
9015 else if (flagsp == &negflags) {
9018 cs = REGEX_LOCALE_CHARSET;
9019 has_charset_modifier = LOCALE_PAT_MOD;
9020 RExC_contains_locale = 1;
9022 case UNICODE_PAT_MOD:
9023 if (has_charset_modifier) {
9024 goto excess_modifier;
9026 else if (flagsp == &negflags) {
9029 cs = REGEX_UNICODE_CHARSET;
9030 has_charset_modifier = UNICODE_PAT_MOD;
9032 case ASCII_RESTRICT_PAT_MOD:
9033 if (flagsp == &negflags) {
9036 if (has_charset_modifier) {
9037 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9038 goto excess_modifier;
9040 /* Doubled modifier implies more restricted */
9041 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9044 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9046 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9048 case DEPENDS_PAT_MOD:
9049 if (has_use_defaults) {
9050 goto fail_modifiers;
9052 else if (flagsp == &negflags) {
9055 else if (has_charset_modifier) {
9056 goto excess_modifier;
9059 /* The dual charset means unicode semantics if the
9060 * pattern (or target, not known until runtime) are
9061 * utf8, or something in the pattern indicates unicode
9063 cs = (RExC_utf8 || RExC_uni_semantics)
9064 ? REGEX_UNICODE_CHARSET
9065 : REGEX_DEPENDS_CHARSET;
9066 has_charset_modifier = DEPENDS_PAT_MOD;
9070 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9071 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9073 else if (has_charset_modifier == *(RExC_parse - 1)) {
9074 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9077 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9082 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9084 case ONCE_PAT_MOD: /* 'o' */
9085 case GLOBAL_PAT_MOD: /* 'g' */
9086 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9087 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9088 if (! (wastedflags & wflagbit) ) {
9089 wastedflags |= wflagbit;
9092 "Useless (%s%c) - %suse /%c modifier",
9093 flagsp == &negflags ? "?-" : "?",
9095 flagsp == &negflags ? "don't " : "",
9102 case CONTINUE_PAT_MOD: /* 'c' */
9103 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9104 if (! (wastedflags & WASTED_C) ) {
9105 wastedflags |= WASTED_GC;
9108 "Useless (%sc) - %suse /gc modifier",
9109 flagsp == &negflags ? "?-" : "?",
9110 flagsp == &negflags ? "don't " : ""
9115 case KEEPCOPY_PAT_MOD: /* 'p' */
9116 if (flagsp == &negflags) {
9118 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9120 *flagsp |= RXf_PMf_KEEPCOPY;
9124 /* A flag is a default iff it is following a minus, so
9125 * if there is a minus, it means will be trying to
9126 * re-specify a default which is an error */
9127 if (has_use_defaults || flagsp == &negflags) {
9130 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9134 wastedflags = 0; /* reset so (?g-c) warns twice */
9140 RExC_flags |= posflags;
9141 RExC_flags &= ~negflags;
9142 set_regex_charset(&RExC_flags, cs);
9144 oregflags |= posflags;
9145 oregflags &= ~negflags;
9146 set_regex_charset(&oregflags, cs);
9148 nextchar(pRExC_state);
9159 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9164 }} /* one for the default block, one for the switch */
9171 ret = reganode(pRExC_state, OPEN, parno);
9174 RExC_nestroot = parno;
9175 if (RExC_seen & REG_SEEN_RECURSE
9176 && !RExC_open_parens[parno-1])
9178 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9179 "Setting open paren #%"IVdf" to %d\n",
9180 (IV)parno, REG_NODE_NUM(ret)));
9181 RExC_open_parens[parno-1]= ret;
9184 Set_Node_Length(ret, 1); /* MJD */
9185 Set_Node_Offset(ret, RExC_parse); /* MJD */
9193 /* Pick up the branches, linking them together. */
9194 parse_start = RExC_parse; /* MJD */
9195 br = regbranch(pRExC_state, &flags, 1,depth+1);
9197 /* branch_len = (paren != 0); */
9201 if (*RExC_parse == '|') {
9202 if (!SIZE_ONLY && RExC_extralen) {
9203 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9206 reginsert(pRExC_state, BRANCH, br, depth+1);
9207 Set_Node_Length(br, paren != 0);
9208 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9212 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9214 else if (paren == ':') {
9215 *flagp |= flags&SIMPLE;
9217 if (is_open) { /* Starts with OPEN. */
9218 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9220 else if (paren != '?') /* Not Conditional */
9222 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9224 while (*RExC_parse == '|') {
9225 if (!SIZE_ONLY && RExC_extralen) {
9226 ender = reganode(pRExC_state, LONGJMP,0);
9227 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9230 RExC_extralen += 2; /* Account for LONGJMP. */
9231 nextchar(pRExC_state);
9233 if (RExC_npar > after_freeze)
9234 after_freeze = RExC_npar;
9235 RExC_npar = freeze_paren;
9237 br = regbranch(pRExC_state, &flags, 0, depth+1);
9241 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9243 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9246 if (have_branch || paren != ':') {
9247 /* Make a closing node, and hook it on the end. */
9250 ender = reg_node(pRExC_state, TAIL);
9253 ender = reganode(pRExC_state, CLOSE, parno);
9254 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9255 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9256 "Setting close paren #%"IVdf" to %d\n",
9257 (IV)parno, REG_NODE_NUM(ender)));
9258 RExC_close_parens[parno-1]= ender;
9259 if (RExC_nestroot == parno)
9262 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9263 Set_Node_Length(ender,1); /* MJD */
9269 *flagp &= ~HASWIDTH;
9272 ender = reg_node(pRExC_state, SUCCEED);
9275 ender = reg_node(pRExC_state, END);
9277 assert(!RExC_opend); /* there can only be one! */
9282 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9283 SV * const mysv_val1=sv_newmortal();
9284 SV * const mysv_val2=sv_newmortal();
9285 DEBUG_PARSE_MSG("lsbr");
9286 regprop(RExC_rx, mysv_val1, lastbr);
9287 regprop(RExC_rx, mysv_val2, ender);
9288 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9289 SvPV_nolen_const(mysv_val1),
9290 (IV)REG_NODE_NUM(lastbr),
9291 SvPV_nolen_const(mysv_val2),
9292 (IV)REG_NODE_NUM(ender),
9293 (IV)(ender - lastbr)
9296 REGTAIL(pRExC_state, lastbr, ender);
9298 if (have_branch && !SIZE_ONLY) {
9301 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9303 /* Hook the tails of the branches to the closing node. */
9304 for (br = ret; br; br = regnext(br)) {
9305 const U8 op = PL_regkind[OP(br)];
9307 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9308 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9311 else if (op == BRANCHJ) {
9312 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9313 /* for now we always disable this optimisation * /
9314 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9320 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9321 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9322 SV * const mysv_val1=sv_newmortal();
9323 SV * const mysv_val2=sv_newmortal();
9324 DEBUG_PARSE_MSG("NADA");
9325 regprop(RExC_rx, mysv_val1, ret);
9326 regprop(RExC_rx, mysv_val2, ender);
9327 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9328 SvPV_nolen_const(mysv_val1),
9329 (IV)REG_NODE_NUM(ret),
9330 SvPV_nolen_const(mysv_val2),
9331 (IV)REG_NODE_NUM(ender),
9336 if (OP(ender) == TAIL) {
9341 for ( opt= br + 1; opt < ender ; opt++ )
9343 NEXT_OFF(br)= ender - br;
9351 static const char parens[] = "=!<,>";
9353 if (paren && (p = strchr(parens, paren))) {
9354 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9355 int flag = (p - parens) > 1;
9358 node = SUSPEND, flag = 0;
9359 reginsert(pRExC_state, node,ret, depth+1);
9360 Set_Node_Cur_Length(ret);
9361 Set_Node_Offset(ret, parse_start + 1);
9363 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9367 /* Check for proper termination. */
9369 RExC_flags = oregflags;
9370 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9371 RExC_parse = oregcomp_parse;
9372 vFAIL("Unmatched (");
9375 else if (!paren && RExC_parse < RExC_end) {
9376 if (*RExC_parse == ')') {
9378 vFAIL("Unmatched )");
9381 FAIL("Junk on end of regexp"); /* "Can't happen". */
9382 assert(0); /* NOTREACHED */
9385 if (RExC_in_lookbehind) {
9386 RExC_in_lookbehind--;
9388 if (after_freeze > RExC_npar)
9389 RExC_npar = after_freeze;
9394 - regbranch - one alternative of an | operator
9396 * Implements the concatenation operator.
9399 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9403 regnode *chain = NULL;
9405 I32 flags = 0, c = 0;
9406 GET_RE_DEBUG_FLAGS_DECL;
9408 PERL_ARGS_ASSERT_REGBRANCH;
9410 DEBUG_PARSE("brnc");
9415 if (!SIZE_ONLY && RExC_extralen)
9416 ret = reganode(pRExC_state, BRANCHJ,0);
9418 ret = reg_node(pRExC_state, BRANCH);
9419 Set_Node_Length(ret, 1);
9423 if (!first && SIZE_ONLY)
9424 RExC_extralen += 1; /* BRANCHJ */
9426 *flagp = WORST; /* Tentatively. */
9429 nextchar(pRExC_state);
9430 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9432 latest = regpiece(pRExC_state, &flags,depth+1);
9433 if (latest == NULL) {
9434 if (flags & TRYAGAIN)
9438 else if (ret == NULL)
9440 *flagp |= flags&(HASWIDTH|POSTPONED);
9441 if (chain == NULL) /* First piece. */
9442 *flagp |= flags&SPSTART;
9445 REGTAIL(pRExC_state, chain, latest);
9450 if (chain == NULL) { /* Loop ran zero times. */
9451 chain = reg_node(pRExC_state, NOTHING);
9456 *flagp |= flags&SIMPLE;
9463 - regpiece - something followed by possible [*+?]
9465 * Note that the branching code sequences used for ? and the general cases
9466 * of * and + are somewhat optimized: they use the same NOTHING node as
9467 * both the endmarker for their branch list and the body of the last branch.
9468 * It might seem that this node could be dispensed with entirely, but the
9469 * endmarker role is not redundant.
9472 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9479 const char * const origparse = RExC_parse;
9481 I32 max = REG_INFTY;
9482 #ifdef RE_TRACK_PATTERN_OFFSETS
9485 const char *maxpos = NULL;
9487 /* Save the original in case we change the emitted regop to a FAIL. */
9488 regnode * const orig_emit = RExC_emit;
9490 GET_RE_DEBUG_FLAGS_DECL;
9492 PERL_ARGS_ASSERT_REGPIECE;
9494 DEBUG_PARSE("piec");
9496 ret = regatom(pRExC_state, &flags,depth+1);
9498 if (flags & TRYAGAIN)
9505 if (op == '{' && regcurly(RExC_parse)) {
9507 #ifdef RE_TRACK_PATTERN_OFFSETS
9508 parse_start = RExC_parse; /* MJD */
9510 next = RExC_parse + 1;
9511 while (isDIGIT(*next) || *next == ',') {
9520 if (*next == '}') { /* got one */
9524 min = atoi(RExC_parse);
9528 maxpos = RExC_parse;
9530 if (!max && *maxpos != '0')
9531 max = REG_INFTY; /* meaning "infinity" */
9532 else if (max >= REG_INFTY)
9533 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9535 nextchar(pRExC_state);
9536 if (max < min) { /* If can't match, warn and optimize to fail
9539 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9541 /* We can't back off the size because we have to reserve
9542 * enough space for all the things we are about to throw
9543 * away, but we can shrink it by the ammount we are about
9545 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9548 RExC_emit = orig_emit;
9550 ret = reg_node(pRExC_state, OPFAIL);
9555 if ((flags&SIMPLE)) {
9556 RExC_naughty += 2 + RExC_naughty / 2;
9557 reginsert(pRExC_state, CURLY, ret, depth+1);
9558 Set_Node_Offset(ret, parse_start+1); /* MJD */
9559 Set_Node_Cur_Length(ret);
9562 regnode * const w = reg_node(pRExC_state, WHILEM);
9565 REGTAIL(pRExC_state, ret, w);
9566 if (!SIZE_ONLY && RExC_extralen) {
9567 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9568 reginsert(pRExC_state, NOTHING,ret, depth+1);
9569 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9571 reginsert(pRExC_state, CURLYX,ret, depth+1);
9573 Set_Node_Offset(ret, parse_start+1);
9574 Set_Node_Length(ret,
9575 op == '{' ? (RExC_parse - parse_start) : 1);
9577 if (!SIZE_ONLY && RExC_extralen)
9578 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9579 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9581 RExC_whilem_seen++, RExC_extralen += 3;
9582 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9591 ARG1_SET(ret, (U16)min);
9592 ARG2_SET(ret, (U16)max);
9604 #if 0 /* Now runtime fix should be reliable. */
9606 /* if this is reinstated, don't forget to put this back into perldiag:
9608 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9610 (F) The part of the regexp subject to either the * or + quantifier
9611 could match an empty string. The {#} shows in the regular
9612 expression about where the problem was discovered.
9616 if (!(flags&HASWIDTH) && op != '?')
9617 vFAIL("Regexp *+ operand could be empty");
9620 #ifdef RE_TRACK_PATTERN_OFFSETS
9621 parse_start = RExC_parse;
9623 nextchar(pRExC_state);
9625 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9627 if (op == '*' && (flags&SIMPLE)) {
9628 reginsert(pRExC_state, STAR, ret, depth+1);
9632 else if (op == '*') {
9636 else if (op == '+' && (flags&SIMPLE)) {
9637 reginsert(pRExC_state, PLUS, ret, depth+1);
9641 else if (op == '+') {
9645 else if (op == '?') {
9650 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9651 ckWARN3reg(RExC_parse,
9652 "%.*s matches null string many times",
9653 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9657 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9658 nextchar(pRExC_state);
9659 reginsert(pRExC_state, MINMOD, ret, depth+1);
9660 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9662 #ifndef REG_ALLOW_MINMOD_SUSPEND
9665 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9667 nextchar(pRExC_state);
9668 ender = reg_node(pRExC_state, SUCCEED);
9669 REGTAIL(pRExC_state, ret, ender);
9670 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9672 ender = reg_node(pRExC_state, TAIL);
9673 REGTAIL(pRExC_state, ret, ender);
9677 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9679 vFAIL("Nested quantifiers");
9686 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9689 /* This is expected to be called by a parser routine that has recognized '\N'
9690 and needs to handle the rest. RExC_parse is expected to point at the first
9691 char following the N at the time of the call. On successful return,
9692 RExC_parse has been updated to point to just after the sequence identified
9693 by this routine, and <*flagp> has been updated.
9695 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9698 \N may begin either a named sequence, or if outside a character class, mean
9699 to match a non-newline. For non single-quoted regexes, the tokenizer has
9700 attempted to decide which, and in the case of a named sequence, converted it
9701 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9702 where c1... are the characters in the sequence. For single-quoted regexes,
9703 the tokenizer passes the \N sequence through unchanged; this code will not
9704 attempt to determine this nor expand those, instead raising a syntax error.
9705 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9706 or there is no '}', it signals that this \N occurrence means to match a
9709 Only the \N{U+...} form should occur in a character class, for the same
9710 reason that '.' inside a character class means to just match a period: it
9711 just doesn't make sense.
9713 The function raises an error (via vFAIL), and doesn't return for various
9714 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9715 success; it returns FALSE otherwise.
9717 If <valuep> is non-null, it means the caller can accept an input sequence
9718 consisting of a just a single code point; <*valuep> is set to that value
9719 if the input is such.
9721 If <node_p> is non-null it signifies that the caller can accept any other
9722 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9724 1) \N means not-a-NL: points to a newly created REG_ANY node;
9725 2) \N{}: points to a new NOTHING node;
9726 3) otherwise: points to a new EXACT node containing the resolved
9728 Note that FALSE is returned for single code point sequences if <valuep> is
9732 char * endbrace; /* '}' following the name */
9734 char *endchar; /* Points to '.' or '}' ending cur char in the input
9736 bool has_multiple_chars; /* true if the input stream contains a sequence of
9737 more than one character */
9739 GET_RE_DEBUG_FLAGS_DECL;
9741 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9745 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9747 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9748 * modifier. The other meaning does not */
9749 p = (RExC_flags & RXf_PMf_EXTENDED)
9750 ? regwhite( pRExC_state, RExC_parse )
9753 /* Disambiguate between \N meaning a named character versus \N meaning
9754 * [^\n]. The former is assumed when it can't be the latter. */
9755 if (*p != '{' || regcurly(p)) {
9758 /* no bare \N in a charclass */
9759 if (in_char_class) {
9760 vFAIL("\\N in a character class must be a named character: \\N{...}");
9764 nextchar(pRExC_state);
9765 *node_p = reg_node(pRExC_state, REG_ANY);
9766 *flagp |= HASWIDTH|SIMPLE;
9769 Set_Node_Length(*node_p, 1); /* MJD */
9773 /* Here, we have decided it should be a named character or sequence */
9775 /* The test above made sure that the next real character is a '{', but
9776 * under the /x modifier, it could be separated by space (or a comment and
9777 * \n) and this is not allowed (for consistency with \x{...} and the
9778 * tokenizer handling of \N{NAME}). */
9779 if (*RExC_parse != '{') {
9780 vFAIL("Missing braces on \\N{}");
9783 RExC_parse++; /* Skip past the '{' */
9785 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9786 || ! (endbrace == RExC_parse /* nothing between the {} */
9787 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9788 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9790 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9791 vFAIL("\\N{NAME} must be resolved by the lexer");
9794 if (endbrace == RExC_parse) { /* empty: \N{} */
9797 *node_p = reg_node(pRExC_state,NOTHING);
9799 else if (in_char_class) {
9800 if (SIZE_ONLY && in_char_class) {
9801 ckWARNreg(RExC_parse,
9802 "Ignoring zero length \\N{} in character class"
9810 nextchar(pRExC_state);
9814 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9815 RExC_parse += 2; /* Skip past the 'U+' */
9817 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9819 /* Code points are separated by dots. If none, there is only one code
9820 * point, and is terminated by the brace */
9821 has_multiple_chars = (endchar < endbrace);
9823 if (valuep && (! has_multiple_chars || in_char_class)) {
9824 /* We only pay attention to the first char of
9825 multichar strings being returned in char classes. I kinda wonder
9826 if this makes sense as it does change the behaviour
9827 from earlier versions, OTOH that behaviour was broken
9828 as well. XXX Solution is to recharacterize as
9829 [rest-of-class]|multi1|multi2... */
9831 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9832 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9833 | PERL_SCAN_DISALLOW_PREFIX
9834 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9836 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9838 /* The tokenizer should have guaranteed validity, but it's possible to
9839 * bypass it by using single quoting, so check */
9840 if (length_of_hex == 0
9841 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9843 RExC_parse += length_of_hex; /* Includes all the valid */
9844 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9845 ? UTF8SKIP(RExC_parse)
9847 /* Guard against malformed utf8 */
9848 if (RExC_parse >= endchar) {
9849 RExC_parse = endchar;
9851 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9854 if (in_char_class && has_multiple_chars) {
9855 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9858 RExC_parse = endbrace + 1;
9860 else if (! node_p || ! has_multiple_chars) {
9862 /* Here, the input is legal, but not according to the caller's
9863 * options. We fail without advancing the parse, so that the
9864 * caller can try again */
9870 /* What is done here is to convert this to a sub-pattern of the form
9871 * (?:\x{char1}\x{char2}...)
9872 * and then call reg recursively. That way, it retains its atomicness,
9873 * while not having to worry about special handling that some code
9874 * points may have. toke.c has converted the original Unicode values
9875 * to native, so that we can just pass on the hex values unchanged. We
9876 * do have to set a flag to keep recoding from happening in the
9879 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9881 char *orig_end = RExC_end;
9884 while (RExC_parse < endbrace) {
9886 /* Convert to notation the rest of the code understands */
9887 sv_catpv(substitute_parse, "\\x{");
9888 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9889 sv_catpv(substitute_parse, "}");
9891 /* Point to the beginning of the next character in the sequence. */
9892 RExC_parse = endchar + 1;
9893 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9895 sv_catpv(substitute_parse, ")");
9897 RExC_parse = SvPV(substitute_parse, len);
9899 /* Don't allow empty number */
9901 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9903 RExC_end = RExC_parse + len;
9905 /* The values are Unicode, and therefore not subject to recoding */
9906 RExC_override_recoding = 1;
9908 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9909 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9911 RExC_parse = endbrace;
9912 RExC_end = orig_end;
9913 RExC_override_recoding = 0;
9915 nextchar(pRExC_state);
9925 * It returns the code point in utf8 for the value in *encp.
9926 * value: a code value in the source encoding
9927 * encp: a pointer to an Encode object
9929 * If the result from Encode is not a single character,
9930 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9933 S_reg_recode(pTHX_ const char value, SV **encp)
9936 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9937 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9938 const STRLEN newlen = SvCUR(sv);
9939 UV uv = UNICODE_REPLACEMENT;
9941 PERL_ARGS_ASSERT_REG_RECODE;
9945 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9948 if (!newlen || numlen != newlen) {
9949 uv = UNICODE_REPLACEMENT;
9955 PERL_STATIC_INLINE U8
9956 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9960 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9966 op = get_regex_charset(RExC_flags);
9967 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9968 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9969 been, so there is no hole */
9975 PERL_STATIC_INLINE void
9976 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9978 /* This knows the details about sizing an EXACTish node, setting flags for
9979 * it (by setting <*flagp>, and potentially populating it with a single
9982 * If <len> (the length in bytes) is non-zero, this function assumes that
9983 * the node has already been populated, and just does the sizing. In this
9984 * case <code_point> should be the final code point that has already been
9985 * placed into the node. This value will be ignored except that under some
9986 * circumstances <*flagp> is set based on it.
9988 * If <len> is zero, the function assumes that the node is to contain only
9989 * the single character given by <code_point> and calculates what <len>
9990 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9991 * additionally will populate the node's STRING with <code_point>, if <len>
9992 * is 0. In both cases <*flagp> is appropriately set
9994 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9995 * folded (the latter only when the rules indicate it can match 'ss') */
9997 bool len_passed_in = cBOOL(len != 0);
9998 U8 character[UTF8_MAXBYTES_CASE+1];
10000 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10002 if (! len_passed_in) {
10005 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10008 uvchr_to_utf8( character, code_point);
10009 len = UTF8SKIP(character);
10013 || code_point != LATIN_SMALL_LETTER_SHARP_S
10014 || ASCII_FOLD_RESTRICTED
10015 || ! AT_LEAST_UNI_SEMANTICS)
10017 *character = (U8) code_point;
10022 *(character + 1) = 's';
10028 RExC_size += STR_SZ(len);
10031 RExC_emit += STR_SZ(len);
10032 STR_LEN(node) = len;
10033 if (! len_passed_in) {
10034 Copy((char *) character, STRING(node), len, char);
10038 *flagp |= HASWIDTH;
10040 /* A single character node is SIMPLE, except for the special-cased SHARP S
10042 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10043 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10044 || ! FOLD || ! DEPENDS_SEMANTICS))
10051 - regatom - the lowest level
10053 Try to identify anything special at the start of the pattern. If there
10054 is, then handle it as required. This may involve generating a single regop,
10055 such as for an assertion; or it may involve recursing, such as to
10056 handle a () structure.
10058 If the string doesn't start with something special then we gobble up
10059 as much literal text as we can.
10061 Once we have been able to handle whatever type of thing started the
10062 sequence, we return.
10064 Note: we have to be careful with escapes, as they can be both literal
10065 and special, and in the case of \10 and friends, context determines which.
10067 A summary of the code structure is:
10069 switch (first_byte) {
10070 cases for each special:
10071 handle this special;
10074 switch (2nd byte) {
10075 cases for each unambiguous special:
10076 handle this special;
10078 cases for each ambigous special/literal:
10080 if (special) handle here
10082 default: // unambiguously literal:
10085 default: // is a literal char
10088 create EXACTish node for literal;
10089 while (more input and node isn't full) {
10090 switch (input_byte) {
10091 cases for each special;
10092 make sure parse pointer is set so that the next call to
10093 regatom will see this special first
10094 goto loopdone; // EXACTish node terminated by prev. char
10096 append char to EXACTISH node;
10098 get next input byte;
10102 return the generated node;
10104 Specifically there are two separate switches for handling
10105 escape sequences, with the one for handling literal escapes requiring
10106 a dummy entry for all of the special escapes that are actually handled
10111 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10114 regnode *ret = NULL;
10116 char *parse_start = RExC_parse;
10118 GET_RE_DEBUG_FLAGS_DECL;
10119 DEBUG_PARSE("atom");
10120 *flagp = WORST; /* Tentatively. */
10122 PERL_ARGS_ASSERT_REGATOM;
10125 switch ((U8)*RExC_parse) {
10127 RExC_seen_zerolen++;
10128 nextchar(pRExC_state);
10129 if (RExC_flags & RXf_PMf_MULTILINE)
10130 ret = reg_node(pRExC_state, MBOL);
10131 else if (RExC_flags & RXf_PMf_SINGLELINE)
10132 ret = reg_node(pRExC_state, SBOL);
10134 ret = reg_node(pRExC_state, BOL);
10135 Set_Node_Length(ret, 1); /* MJD */
10138 nextchar(pRExC_state);
10140 RExC_seen_zerolen++;
10141 if (RExC_flags & RXf_PMf_MULTILINE)
10142 ret = reg_node(pRExC_state, MEOL);
10143 else if (RExC_flags & RXf_PMf_SINGLELINE)
10144 ret = reg_node(pRExC_state, SEOL);
10146 ret = reg_node(pRExC_state, EOL);
10147 Set_Node_Length(ret, 1); /* MJD */
10150 nextchar(pRExC_state);
10151 if (RExC_flags & RXf_PMf_SINGLELINE)
10152 ret = reg_node(pRExC_state, SANY);
10154 ret = reg_node(pRExC_state, REG_ANY);
10155 *flagp |= HASWIDTH|SIMPLE;
10157 Set_Node_Length(ret, 1); /* MJD */
10161 char * const oregcomp_parse = ++RExC_parse;
10162 ret = regclass(pRExC_state, flagp,depth+1);
10163 if (*RExC_parse != ']') {
10164 RExC_parse = oregcomp_parse;
10165 vFAIL("Unmatched [");
10167 nextchar(pRExC_state);
10168 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10172 nextchar(pRExC_state);
10173 ret = reg(pRExC_state, 1, &flags,depth+1);
10175 if (flags & TRYAGAIN) {
10176 if (RExC_parse == RExC_end) {
10177 /* Make parent create an empty node if needed. */
10178 *flagp |= TRYAGAIN;
10185 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10189 if (flags & TRYAGAIN) {
10190 *flagp |= TRYAGAIN;
10193 vFAIL("Internal urp");
10194 /* Supposed to be caught earlier. */
10200 vFAIL("Quantifier follows nothing");
10205 This switch handles escape sequences that resolve to some kind
10206 of special regop and not to literal text. Escape sequnces that
10207 resolve to literal text are handled below in the switch marked
10210 Every entry in this switch *must* have a corresponding entry
10211 in the literal escape switch. However, the opposite is not
10212 required, as the default for this switch is to jump to the
10213 literal text handling code.
10215 switch ((U8)*++RExC_parse) {
10216 /* Special Escapes */
10218 RExC_seen_zerolen++;
10219 ret = reg_node(pRExC_state, SBOL);
10221 goto finish_meta_pat;
10223 ret = reg_node(pRExC_state, GPOS);
10224 RExC_seen |= REG_SEEN_GPOS;
10226 goto finish_meta_pat;
10228 RExC_seen_zerolen++;
10229 ret = reg_node(pRExC_state, KEEPS);
10231 /* XXX:dmq : disabling in-place substitution seems to
10232 * be necessary here to avoid cases of memory corruption, as
10233 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10235 RExC_seen |= REG_SEEN_LOOKBEHIND;
10236 goto finish_meta_pat;
10238 ret = reg_node(pRExC_state, SEOL);
10240 RExC_seen_zerolen++; /* Do not optimize RE away */
10241 goto finish_meta_pat;
10243 ret = reg_node(pRExC_state, EOS);
10245 RExC_seen_zerolen++; /* Do not optimize RE away */
10246 goto finish_meta_pat;
10248 ret = reg_node(pRExC_state, CANY);
10249 RExC_seen |= REG_SEEN_CANY;
10250 *flagp |= HASWIDTH|SIMPLE;
10251 goto finish_meta_pat;
10253 ret = reg_node(pRExC_state, CLUMP);
10254 *flagp |= HASWIDTH;
10255 goto finish_meta_pat;
10257 op = ALNUM + get_regex_charset(RExC_flags);
10258 if (op > ALNUMA) { /* /aa is same as /a */
10261 ret = reg_node(pRExC_state, op);
10262 *flagp |= HASWIDTH|SIMPLE;
10263 goto finish_meta_pat;
10265 op = NALNUM + get_regex_charset(RExC_flags);
10266 if (op > NALNUMA) { /* /aa is same as /a */
10269 ret = reg_node(pRExC_state, op);
10270 *flagp |= HASWIDTH|SIMPLE;
10271 goto finish_meta_pat;
10273 RExC_seen_zerolen++;
10274 RExC_seen |= REG_SEEN_LOOKBEHIND;
10275 op = BOUND + get_regex_charset(RExC_flags);
10276 if (op > BOUNDA) { /* /aa is same as /a */
10279 ret = reg_node(pRExC_state, op);
10280 FLAGS(ret) = get_regex_charset(RExC_flags);
10282 goto finish_meta_pat;
10284 RExC_seen_zerolen++;
10285 RExC_seen |= REG_SEEN_LOOKBEHIND;
10286 op = NBOUND + get_regex_charset(RExC_flags);
10287 if (op > NBOUNDA) { /* /aa is same as /a */
10290 ret = reg_node(pRExC_state, op);
10291 FLAGS(ret) = get_regex_charset(RExC_flags);
10293 goto finish_meta_pat;
10295 op = SPACE + get_regex_charset(RExC_flags);
10296 if (op > SPACEA) { /* /aa is same as /a */
10299 ret = reg_node(pRExC_state, op);
10300 *flagp |= HASWIDTH|SIMPLE;
10301 goto finish_meta_pat;
10303 op = NSPACE + get_regex_charset(RExC_flags);
10304 if (op > NSPACEA) { /* /aa is same as /a */
10307 ret = reg_node(pRExC_state, op);
10308 *flagp |= HASWIDTH|SIMPLE;
10309 goto finish_meta_pat;
10317 U8 offset = get_regex_charset(RExC_flags);
10318 if (offset == REGEX_UNICODE_CHARSET) {
10319 offset = REGEX_DEPENDS_CHARSET;
10321 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10322 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10326 ret = reg_node(pRExC_state, op);
10327 *flagp |= HASWIDTH|SIMPLE;
10328 goto finish_meta_pat;
10330 ret = reg_node(pRExC_state, LNBREAK);
10331 *flagp |= HASWIDTH|SIMPLE;
10332 goto finish_meta_pat;
10334 ret = reg_node(pRExC_state, HORIZWS);
10335 *flagp |= HASWIDTH|SIMPLE;
10336 goto finish_meta_pat;
10338 ret = reg_node(pRExC_state, NHORIZWS);
10339 *flagp |= HASWIDTH|SIMPLE;
10340 goto finish_meta_pat;
10342 ret = reg_node(pRExC_state, VERTWS);
10343 *flagp |= HASWIDTH|SIMPLE;
10344 goto finish_meta_pat;
10346 ret = reg_node(pRExC_state, NVERTWS);
10347 *flagp |= HASWIDTH|SIMPLE;
10349 nextchar(pRExC_state);
10350 Set_Node_Length(ret, 2); /* MJD */
10355 char* const oldregxend = RExC_end;
10357 char* parse_start = RExC_parse - 2;
10360 if (RExC_parse[1] == '{') {
10361 /* a lovely hack--pretend we saw [\pX] instead */
10362 RExC_end = strchr(RExC_parse, '}');
10364 const U8 c = (U8)*RExC_parse;
10366 RExC_end = oldregxend;
10367 vFAIL2("Missing right brace on \\%c{}", c);
10372 RExC_end = RExC_parse + 2;
10373 if (RExC_end > oldregxend)
10374 RExC_end = oldregxend;
10378 ret = regclass(pRExC_state, flagp,depth+1);
10380 RExC_end = oldregxend;
10383 Set_Node_Offset(ret, parse_start + 2);
10384 Set_Node_Cur_Length(ret);
10385 nextchar(pRExC_state);
10389 /* Handle \N and \N{NAME} with multiple code points here and not
10390 * below because it can be multicharacter. join_exact() will join
10391 * them up later on. Also this makes sure that things like
10392 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10393 * The options to the grok function call causes it to fail if the
10394 * sequence is just a single code point. We then go treat it as
10395 * just another character in the current EXACT node, and hence it
10396 * gets uniform treatment with all the other characters. The
10397 * special treatment for quantifiers is not needed for such single
10398 * character sequences */
10400 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10405 case 'k': /* Handle \k<NAME> and \k'NAME' */
10408 char ch= RExC_parse[1];
10409 if (ch != '<' && ch != '\'' && ch != '{') {
10411 vFAIL2("Sequence %.2s... not terminated",parse_start);
10413 /* this pretty much dupes the code for (?P=...) in reg(), if
10414 you change this make sure you change that */
10415 char* name_start = (RExC_parse += 2);
10417 SV *sv_dat = reg_scan_name(pRExC_state,
10418 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10419 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10420 if (RExC_parse == name_start || *RExC_parse != ch)
10421 vFAIL2("Sequence %.3s... not terminated",parse_start);
10424 num = add_data( pRExC_state, 1, "S" );
10425 RExC_rxi->data->data[num]=(void*)sv_dat;
10426 SvREFCNT_inc_simple_void(sv_dat);
10430 ret = reganode(pRExC_state,
10433 : (ASCII_FOLD_RESTRICTED)
10435 : (AT_LEAST_UNI_SEMANTICS)
10441 *flagp |= HASWIDTH;
10443 /* override incorrect value set in reganode MJD */
10444 Set_Node_Offset(ret, parse_start+1);
10445 Set_Node_Cur_Length(ret); /* MJD */
10446 nextchar(pRExC_state);
10452 case '1': case '2': case '3': case '4':
10453 case '5': case '6': case '7': case '8': case '9':
10456 bool isg = *RExC_parse == 'g';
10461 if (*RExC_parse == '{') {
10465 if (*RExC_parse == '-') {
10469 if (hasbrace && !isDIGIT(*RExC_parse)) {
10470 if (isrel) RExC_parse--;
10472 goto parse_named_seq;
10474 num = atoi(RExC_parse);
10475 if (isg && num == 0)
10476 vFAIL("Reference to invalid group 0");
10478 num = RExC_npar - num;
10480 vFAIL("Reference to nonexistent or unclosed group");
10482 if (!isg && num > 9 && num >= RExC_npar)
10483 /* Probably a character specified in octal, e.g. \35 */
10486 char * const parse_start = RExC_parse - 1; /* MJD */
10487 while (isDIGIT(*RExC_parse))
10489 if (parse_start == RExC_parse - 1)
10490 vFAIL("Unterminated \\g... pattern");
10492 if (*RExC_parse != '}')
10493 vFAIL("Unterminated \\g{...} pattern");
10497 if (num > (I32)RExC_rx->nparens)
10498 vFAIL("Reference to nonexistent group");
10501 ret = reganode(pRExC_state,
10504 : (ASCII_FOLD_RESTRICTED)
10506 : (AT_LEAST_UNI_SEMANTICS)
10512 *flagp |= HASWIDTH;
10514 /* override incorrect value set in reganode MJD */
10515 Set_Node_Offset(ret, parse_start+1);
10516 Set_Node_Cur_Length(ret); /* MJD */
10518 nextchar(pRExC_state);
10523 if (RExC_parse >= RExC_end)
10524 FAIL("Trailing \\");
10527 /* Do not generate "unrecognized" warnings here, we fall
10528 back into the quick-grab loop below */
10535 if (RExC_flags & RXf_PMf_EXTENDED) {
10536 if ( reg_skipcomment( pRExC_state ) )
10543 parse_start = RExC_parse - 1;
10552 #define MAX_NODE_STRING_SIZE 127
10553 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10555 U8 upper_parse = MAX_NODE_STRING_SIZE;
10558 bool next_is_quantifier;
10559 char * oldp = NULL;
10561 /* If a folding node contains only code points that don't
10562 * participate in folds, it can be changed into an EXACT node,
10563 * which allows the optimizer more things to look for */
10567 node_type = compute_EXACTish(pRExC_state);
10568 ret = reg_node(pRExC_state, node_type);
10570 /* In pass1, folded, we use a temporary buffer instead of the
10571 * actual node, as the node doesn't exist yet */
10572 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10578 /* We do the EXACTFish to EXACT node only if folding, and not if in
10579 * locale, as whether a character folds or not isn't known until
10581 maybe_exact = FOLD && ! LOC;
10583 /* XXX The node can hold up to 255 bytes, yet this only goes to
10584 * 127. I (khw) do not know why. Keeping it somewhat less than
10585 * 255 allows us to not have to worry about overflow due to
10586 * converting to utf8 and fold expansion, but that value is
10587 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10588 * split up by this limit into a single one using the real max of
10589 * 255. Even at 127, this breaks under rare circumstances. If
10590 * folding, we do not want to split a node at a character that is a
10591 * non-final in a multi-char fold, as an input string could just
10592 * happen to want to match across the node boundary. The join
10593 * would solve that problem if the join actually happens. But a
10594 * series of more than two nodes in a row each of 127 would cause
10595 * the first join to succeed to get to 254, but then there wouldn't
10596 * be room for the next one, which could at be one of those split
10597 * multi-char folds. I don't know of any fool-proof solution. One
10598 * could back off to end with only a code point that isn't such a
10599 * non-final, but it is possible for there not to be any in the
10601 for (p = RExC_parse - 1;
10602 len < upper_parse && p < RExC_end;
10607 if (RExC_flags & RXf_PMf_EXTENDED)
10608 p = regwhite( pRExC_state, p );
10619 /* Literal Escapes Switch
10621 This switch is meant to handle escape sequences that
10622 resolve to a literal character.
10624 Every escape sequence that represents something
10625 else, like an assertion or a char class, is handled
10626 in the switch marked 'Special Escapes' above in this
10627 routine, but also has an entry here as anything that
10628 isn't explicitly mentioned here will be treated as
10629 an unescaped equivalent literal.
10632 switch ((U8)*++p) {
10633 /* These are all the special escapes. */
10634 case 'A': /* Start assertion */
10635 case 'b': case 'B': /* Word-boundary assertion*/
10636 case 'C': /* Single char !DANGEROUS! */
10637 case 'd': case 'D': /* digit class */
10638 case 'g': case 'G': /* generic-backref, pos assertion */
10639 case 'h': case 'H': /* HORIZWS */
10640 case 'k': case 'K': /* named backref, keep marker */
10641 case 'p': case 'P': /* Unicode property */
10642 case 'R': /* LNBREAK */
10643 case 's': case 'S': /* space class */
10644 case 'v': case 'V': /* VERTWS */
10645 case 'w': case 'W': /* word class */
10646 case 'X': /* eXtended Unicode "combining character sequence" */
10647 case 'z': case 'Z': /* End of line/string assertion */
10651 /* Anything after here is an escape that resolves to a
10652 literal. (Except digits, which may or may not)
10658 case 'N': /* Handle a single-code point named character. */
10659 /* The options cause it to fail if a multiple code
10660 * point sequence. Handle those in the switch() above
10662 RExC_parse = p + 1;
10663 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10664 flagp, depth, FALSE))
10666 RExC_parse = p = oldp;
10670 if (ender > 0xff) {
10687 ender = ASCII_TO_NATIVE('\033');
10691 ender = ASCII_TO_NATIVE('\007');
10696 STRLEN brace_len = len;
10698 const char* error_msg;
10700 bool valid = grok_bslash_o(p,
10707 RExC_parse = p; /* going to die anyway; point
10708 to exact spot of failure */
10715 if (PL_encoding && ender < 0x100) {
10716 goto recode_encoding;
10718 if (ender > 0xff) {
10725 STRLEN brace_len = len;
10727 const char* error_msg;
10729 bool valid = grok_bslash_x(p,
10736 RExC_parse = p; /* going to die anyway; point
10737 to exact spot of failure */
10743 if (PL_encoding && ender < 0x100) {
10744 goto recode_encoding;
10746 if (ender > 0xff) {
10753 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10755 case '0': case '1': case '2': case '3':case '4':
10756 case '5': case '6': case '7':
10758 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10760 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10762 ender = grok_oct(p, &numlen, &flags, NULL);
10763 if (ender > 0xff) {
10772 if (PL_encoding && ender < 0x100)
10773 goto recode_encoding;
10776 if (! RExC_override_recoding) {
10777 SV* enc = PL_encoding;
10778 ender = reg_recode((const char)(U8)ender, &enc);
10779 if (!enc && SIZE_ONLY)
10780 ckWARNreg(p, "Invalid escape in the specified encoding");
10786 FAIL("Trailing \\");
10789 if (!SIZE_ONLY&& isALNUMC(*p)) {
10790 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10792 goto normal_default;
10796 /* Currently we don't warn when the lbrace is at the start
10797 * of a construct. This catches it in the middle of a
10798 * literal string, or when its the first thing after
10799 * something like "\b" */
10801 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10803 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10808 if (UTF8_IS_START(*p) && UTF) {
10810 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10811 &numlen, UTF8_ALLOW_DEFAULT);
10817 } /* End of switch on the literal */
10819 /* Here, have looked at the literal character and <ender>
10820 * contains its ordinal, <p> points to the character after it
10823 if ( RExC_flags & RXf_PMf_EXTENDED)
10824 p = regwhite( pRExC_state, p );
10826 /* If the next thing is a quantifier, it applies to this
10827 * character only, which means that this character has to be in
10828 * its own node and can't just be appended to the string in an
10829 * existing node, so if there are already other characters in
10830 * the node, close the node with just them, and set up to do
10831 * this character again next time through, when it will be the
10832 * only thing in its new node */
10833 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10841 /* See comments for join_exact() as to why we fold
10842 * this non-UTF at compile time */
10843 || (node_type == EXACTFU
10844 && ender == LATIN_SMALL_LETTER_SHARP_S))
10848 /* Prime the casefolded buffer. Locale rules, which
10849 * apply only to code points < 256, aren't known until
10850 * execution, so for them, just output the original
10851 * character using utf8. If we start to fold non-UTF
10852 * patterns, be sure to update join_exact() */
10853 if (LOC && ender < 256) {
10854 if (UNI_IS_INVARIANT(ender)) {
10858 *s = UTF8_TWO_BYTE_HI(ender);
10859 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10864 UV folded = _to_uni_fold_flags(
10869 | ((LOC) ? FOLD_FLAGS_LOCALE
10870 : (ASCII_FOLD_RESTRICTED)
10871 ? FOLD_FLAGS_NOMIX_ASCII
10875 /* If this node only contains non-folding code
10876 * points so far, see if this new one is also
10879 if (folded != ender) {
10880 maybe_exact = FALSE;
10883 /* Here the fold is the original; we have
10884 * to check further to see if anything
10886 if (! PL_utf8_foldable) {
10887 SV* swash = swash_init("utf8",
10889 &PL_sv_undef, 1, 0);
10891 _get_swash_invlist(swash);
10892 SvREFCNT_dec(swash);
10894 if (_invlist_contains_cp(PL_utf8_foldable,
10897 maybe_exact = FALSE;
10905 /* The loop increments <len> each time, as all but this
10906 * path (and the one just below for UTF) through it add
10907 * a single byte to the EXACTish node. But this one
10908 * has changed len to be the correct final value, so
10909 * subtract one to cancel out the increment that
10911 len += foldlen - 1;
10915 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10919 const STRLEN unilen = reguni(pRExC_state, ender, s);
10925 /* See comment just above for - 1 */
10929 REGC((char)ender, s++);
10932 if (next_is_quantifier) {
10934 /* Here, the next input is a quantifier, and to get here,
10935 * the current character is the only one in the node.
10936 * Also, here <len> doesn't include the final byte for this
10942 } /* End of loop through literal characters */
10944 /* Here we have either exhausted the input or ran out of room in
10945 * the node. (If we encountered a character that can't be in the
10946 * node, transfer is made directly to <loopdone>, and so we
10947 * wouldn't have fallen off the end of the loop.) In the latter
10948 * case, we artificially have to split the node into two, because
10949 * we just don't have enough space to hold everything. This
10950 * creates a problem if the final character participates in a
10951 * multi-character fold in the non-final position, as a match that
10952 * should have occurred won't, due to the way nodes are matched,
10953 * and our artificial boundary. So back off until we find a non-
10954 * problematic character -- one that isn't at the beginning or
10955 * middle of such a fold. (Either it doesn't participate in any
10956 * folds, or appears only in the final position of all the folds it
10957 * does participate in.) A better solution with far fewer false
10958 * positives, and that would fill the nodes more completely, would
10959 * be to actually have available all the multi-character folds to
10960 * test against, and to back-off only far enough to be sure that
10961 * this node isn't ending with a partial one. <upper_parse> is set
10962 * further below (if we need to reparse the node) to include just
10963 * up through that final non-problematic character that this code
10964 * identifies, so when it is set to less than the full node, we can
10965 * skip the rest of this */
10966 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10968 const STRLEN full_len = len;
10970 assert(len >= MAX_NODE_STRING_SIZE);
10972 /* Here, <s> points to the final byte of the final character.
10973 * Look backwards through the string until find a non-
10974 * problematic character */
10978 /* These two have no multi-char folds to non-UTF characters
10980 if (ASCII_FOLD_RESTRICTED || LOC) {
10984 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10988 if (! PL_NonL1NonFinalFold) {
10989 PL_NonL1NonFinalFold = _new_invlist_C_array(
10990 NonL1_Perl_Non_Final_Folds_invlist);
10993 /* Point to the first byte of the final character */
10994 s = (char *) utf8_hop((U8 *) s, -1);
10996 while (s >= s0) { /* Search backwards until find
10997 non-problematic char */
10998 if (UTF8_IS_INVARIANT(*s)) {
11000 /* There are no ascii characters that participate
11001 * in multi-char folds under /aa. In EBCDIC, the
11002 * non-ascii invariants are all control characters,
11003 * so don't ever participate in any folds. */
11004 if (ASCII_FOLD_RESTRICTED
11005 || ! IS_NON_FINAL_FOLD(*s))
11010 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11012 /* No Latin1 characters participate in multi-char
11013 * folds under /l */
11015 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11021 else if (! _invlist_contains_cp(
11022 PL_NonL1NonFinalFold,
11023 valid_utf8_to_uvchr((U8 *) s, NULL)))
11028 /* Here, the current character is problematic in that
11029 * it does occur in the non-final position of some
11030 * fold, so try the character before it, but have to
11031 * special case the very first byte in the string, so
11032 * we don't read outside the string */
11033 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11034 } /* End of loop backwards through the string */
11036 /* If there were only problematic characters in the string,
11037 * <s> will point to before s0, in which case the length
11038 * should be 0, otherwise include the length of the
11039 * non-problematic character just found */
11040 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11043 /* Here, have found the final character, if any, that is
11044 * non-problematic as far as ending the node without splitting
11045 * it across a potential multi-char fold. <len> contains the
11046 * number of bytes in the node up-to and including that
11047 * character, or is 0 if there is no such character, meaning
11048 * the whole node contains only problematic characters. In
11049 * this case, give up and just take the node as-is. We can't
11055 /* Here, the node does contain some characters that aren't
11056 * problematic. If one such is the final character in the
11057 * node, we are done */
11058 if (len == full_len) {
11061 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11063 /* If the final character is problematic, but the
11064 * penultimate is not, back-off that last character to
11065 * later start a new node with it */
11070 /* Here, the final non-problematic character is earlier
11071 * in the input than the penultimate character. What we do
11072 * is reparse from the beginning, going up only as far as
11073 * this final ok one, thus guaranteeing that the node ends
11074 * in an acceptable character. The reason we reparse is
11075 * that we know how far in the character is, but we don't
11076 * know how to correlate its position with the input parse.
11077 * An alternate implementation would be to build that
11078 * correlation as we go along during the original parse,
11079 * but that would entail extra work for every node, whereas
11080 * this code gets executed only when the string is too
11081 * large for the node, and the final two characters are
11082 * problematic, an infrequent occurrence. Yet another
11083 * possible strategy would be to save the tail of the
11084 * string, and the next time regatom is called, initialize
11085 * with that. The problem with this is that unless you
11086 * back off one more character, you won't be guaranteed
11087 * regatom will get called again, unless regbranch,
11088 * regpiece ... are also changed. If you do back off that
11089 * extra character, so that there is input guaranteed to
11090 * force calling regatom, you can't handle the case where
11091 * just the first character in the node is acceptable. I
11092 * (khw) decided to try this method which doesn't have that
11093 * pitfall; if performance issues are found, we can do a
11094 * combination of the current approach plus that one */
11100 } /* End of verifying node ends with an appropriate char */
11102 loopdone: /* Jumped to when encounters something that shouldn't be in
11105 /* If 'maybe_exact' is still set here, means there are no
11106 * code points in the node that participate in folds */
11107 if (FOLD && maybe_exact) {
11111 /* I (khw) don't know if you can get here with zero length, but the
11112 * old code handled this situation by creating a zero-length EXACT
11113 * node. Might as well be NOTHING instead */
11118 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11121 RExC_parse = p - 1;
11122 Set_Node_Cur_Length(ret); /* MJD */
11123 nextchar(pRExC_state);
11125 /* len is STRLEN which is unsigned, need to copy to signed */
11128 vFAIL("Internal disaster");
11131 } /* End of label 'defchar:' */
11133 } /* End of giant switch on input character */
11139 S_regwhite( RExC_state_t *pRExC_state, char *p )
11141 const char *e = RExC_end;
11143 PERL_ARGS_ASSERT_REGWHITE;
11148 else if (*p == '#') {
11151 if (*p++ == '\n') {
11157 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11165 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11166 Character classes ([:foo:]) can also be negated ([:^foo:]).
11167 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11168 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11169 but trigger failures because they are currently unimplemented. */
11171 #define POSIXCC_DONE(c) ((c) == ':')
11172 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11173 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11175 PERL_STATIC_INLINE I32
11176 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11179 I32 namedclass = OOB_NAMEDCLASS;
11181 PERL_ARGS_ASSERT_REGPPOSIXCC;
11183 if (value == '[' && RExC_parse + 1 < RExC_end &&
11184 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11185 POSIXCC(UCHARAT(RExC_parse))) {
11186 const char c = UCHARAT(RExC_parse);
11187 char* const s = RExC_parse++;
11189 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11191 if (RExC_parse == RExC_end)
11192 /* Grandfather lone [:, [=, [. */
11195 const char* const t = RExC_parse++; /* skip over the c */
11198 if (UCHARAT(RExC_parse) == ']') {
11199 const char *posixcc = s + 1;
11200 RExC_parse++; /* skip over the ending ] */
11203 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11204 const I32 skip = t - posixcc;
11206 /* Initially switch on the length of the name. */
11209 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11210 namedclass = ANYOF_WORDCHAR;
11213 /* Names all of length 5. */
11214 /* alnum alpha ascii blank cntrl digit graph lower
11215 print punct space upper */
11216 /* Offset 4 gives the best switch position. */
11217 switch (posixcc[4]) {
11219 if (memEQ(posixcc, "alph", 4)) /* alpha */
11220 namedclass = ANYOF_ALPHA;
11223 if (memEQ(posixcc, "spac", 4)) /* space */
11224 namedclass = ANYOF_PSXSPC;
11227 if (memEQ(posixcc, "grap", 4)) /* graph */
11228 namedclass = ANYOF_GRAPH;
11231 if (memEQ(posixcc, "asci", 4)) /* ascii */
11232 namedclass = ANYOF_ASCII;
11235 if (memEQ(posixcc, "blan", 4)) /* blank */
11236 namedclass = ANYOF_BLANK;
11239 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11240 namedclass = ANYOF_CNTRL;
11243 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11244 namedclass = ANYOF_ALNUMC;
11247 if (memEQ(posixcc, "lowe", 4)) /* lower */
11248 namedclass = ANYOF_LOWER;
11249 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11250 namedclass = ANYOF_UPPER;
11253 if (memEQ(posixcc, "digi", 4)) /* digit */
11254 namedclass = ANYOF_DIGIT;
11255 else if (memEQ(posixcc, "prin", 4)) /* print */
11256 namedclass = ANYOF_PRINT;
11257 else if (memEQ(posixcc, "punc", 4)) /* punct */
11258 namedclass = ANYOF_PUNCT;
11263 if (memEQ(posixcc, "xdigit", 6))
11264 namedclass = ANYOF_XDIGIT;
11268 if (namedclass == OOB_NAMEDCLASS)
11269 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11272 /* The #defines are structured so each complement is +1 to
11273 * the normal one */
11277 assert (posixcc[skip] == ':');
11278 assert (posixcc[skip+1] == ']');
11279 } else if (!SIZE_ONLY) {
11280 /* [[=foo=]] and [[.foo.]] are still future. */
11282 /* adjust RExC_parse so the warning shows after
11283 the class closes */
11284 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11286 SvREFCNT_dec(free_me);
11287 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11290 /* Maternal grandfather:
11291 * "[:" ending in ":" but not in ":]" */
11300 /* Generate the code to add a full posix character <class> to the bracketed
11301 * character class given by <node>. (<node> is needed only under locale rules)
11302 * destlist is the inversion list for non-locale rules that this class is
11304 * sourcelist is the ASCII-range inversion list to add under /a rules
11305 * Xsourcelist is the full Unicode range list to use otherwise. */
11306 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11308 SV* scratch_list = NULL; \
11310 /* Set this class in the node for runtime matching */ \
11311 ANYOF_CLASS_SET(node, class); \
11313 /* For above Latin1 code points, we use the full Unicode range */ \
11314 _invlist_intersection(PL_AboveLatin1, \
11317 /* And set the output to it, adding instead if there already is an \
11318 * output. Checking if <destlist> is NULL first saves an extra \
11319 * clone. Its reference count will be decremented at the next \
11320 * union, etc, or if this is the only instance, at the end of the \
11322 if (! destlist) { \
11323 destlist = scratch_list; \
11326 _invlist_union(destlist, scratch_list, &destlist); \
11327 SvREFCNT_dec(scratch_list); \
11331 /* For non-locale, just add it to any existing list */ \
11332 _invlist_union(destlist, \
11333 (AT_LEAST_ASCII_RESTRICTED) \
11339 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11341 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11343 SV* scratch_list = NULL; \
11344 ANYOF_CLASS_SET(node, class); \
11345 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11346 if (! destlist) { \
11347 destlist = scratch_list; \
11350 _invlist_union(destlist, scratch_list, &destlist); \
11351 SvREFCNT_dec(scratch_list); \
11355 _invlist_union_complement_2nd(destlist, \
11356 (AT_LEAST_ASCII_RESTRICTED) \
11360 /* Under /d, everything in the upper half of the Latin1 range \
11361 * matches this complement */ \
11362 if (DEPENDS_SEMANTICS) { \
11363 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11367 /* Generate the code to add a posix character <class> to the bracketed
11368 * character class given by <node>. (<node> is needed only under locale rules)
11369 * destlist is the inversion list for non-locale rules that this class is
11371 * sourcelist is the ASCII-range inversion list to add under /a rules
11372 * l1_sourcelist is the Latin1 range list to use otherwise.
11373 * Xpropertyname is the name to add to <run_time_list> of the property to
11374 * specify the code points above Latin1 that will have to be
11375 * determined at run-time
11376 * run_time_list is a SV* that contains text names of properties that are to
11377 * be computed at run time. This concatenates <Xpropertyname>
11378 * to it, appropriately
11379 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11381 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11382 l1_sourcelist, Xpropertyname, run_time_list) \
11383 /* First, resolve whether to use the ASCII-only list or the L1 \
11385 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11386 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11387 Xpropertyname, run_time_list)
11389 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11390 Xpropertyname, run_time_list) \
11391 /* If not /a matching, there are going to be code points we will have \
11392 * to defer to runtime to look-up */ \
11393 if (! AT_LEAST_ASCII_RESTRICTED) { \
11394 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11397 ANYOF_CLASS_SET(node, class); \
11400 _invlist_union(destlist, sourcelist, &destlist); \
11403 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11404 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11406 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11407 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11408 if (AT_LEAST_ASCII_RESTRICTED) { \
11409 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11412 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11413 matches_above_unicode = TRUE; \
11415 ANYOF_CLASS_SET(node, namedclass); \
11418 SV* scratch_list = NULL; \
11419 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11420 if (! destlist) { \
11421 destlist = scratch_list; \
11424 _invlist_union(destlist, scratch_list, &destlist); \
11425 SvREFCNT_dec(scratch_list); \
11427 if (DEPENDS_SEMANTICS) { \
11428 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11433 /* The names of properties whose definitions are not known at compile time are
11434 * stored in this SV, after a constant heading. So if the length has been
11435 * changed since initialization, then there is a run-time definition. */
11436 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11438 /* This converts the named class defined in regcomp.h to its equivalent class
11439 * number defined in handy.h. */
11440 #define namedclass_to_classnum(class) ((class) / 2)
11443 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11445 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11446 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11447 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11448 * multi-character folds: it will be rewritten following the paradigm of
11449 * this example, where the <multi-fold>s are characters which fold to
11450 * multiple character sequences:
11451 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11452 * gets effectively rewritten as:
11453 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11454 * reg() gets called (recursively) on the rewritten version, and this
11455 * function will return what it constructs. (Actually the <multi-fold>s
11456 * aren't physically removed from the [abcdefghi], it's just that they are
11457 * ignored in the recursion by means of a flag:
11458 * <RExC_in_multi_char_class>.)
11460 * ANYOF nodes contain a bit map for the first 256 characters, with the
11461 * corresponding bit set if that character is in the list. For characters
11462 * above 255, a range list or swash is used. There are extra bits for \w,
11463 * etc. in locale ANYOFs, as what these match is not determinable at
11468 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11470 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11473 IV namedclass = OOB_NAMEDCLASS;
11474 char *rangebegin = NULL;
11475 bool need_class = 0;
11477 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11478 than just initialized. */
11479 SV* properties = NULL; /* Code points that match \p{} \P{} */
11480 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11481 extended beyond the Latin1 range */
11482 UV element_count = 0; /* Number of distinct elements in the class.
11483 Optimizations may be possible if this is tiny */
11484 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11485 character; used under /i */
11488 /* Unicode properties are stored in a swash; this holds the current one
11489 * being parsed. If this swash is the only above-latin1 component of the
11490 * character class, an optimization is to pass it directly on to the
11491 * execution engine. Otherwise, it is set to NULL to indicate that there
11492 * are other things in the class that have to be dealt with at execution
11494 SV* swash = NULL; /* Code points that match \p{} \P{} */
11496 /* Set if a component of this character class is user-defined; just passed
11497 * on to the engine */
11498 bool has_user_defined_property = FALSE;
11500 /* inversion list of code points this node matches only when the target
11501 * string is in UTF-8. (Because is under /d) */
11502 SV* depends_list = NULL;
11504 /* inversion list of code points this node matches. For much of the
11505 * function, it includes only those that match regardless of the utf8ness
11506 * of the target string */
11507 SV* cp_list = NULL;
11510 /* In a range, counts how many 0-2 of the ends of it came from literals,
11511 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11512 UV literal_endpoint = 0;
11514 bool invert = FALSE; /* Is this class to be complemented */
11516 /* Is there any thing like \W or [:^digit:] that matches above the legal
11517 * Unicode range? */
11518 bool runtime_posix_matches_above_Unicode = FALSE;
11520 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11521 case we need to change the emitted regop to an EXACT. */
11522 const char * orig_parse = RExC_parse;
11523 const I32 orig_size = RExC_size;
11524 GET_RE_DEBUG_FLAGS_DECL;
11526 PERL_ARGS_ASSERT_REGCLASS;
11528 PERL_UNUSED_ARG(depth);
11531 DEBUG_PARSE("clas");
11533 /* Assume we are going to generate an ANYOF node. */
11534 ret = reganode(pRExC_state, ANYOF, 0);
11537 ANYOF_FLAGS(ret) = 0;
11540 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11547 RExC_size += ANYOF_SKIP;
11548 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11551 RExC_emit += ANYOF_SKIP;
11553 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11555 listsv = newSVpvs("# comment\n");
11556 initial_listsv_len = SvCUR(listsv);
11559 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11561 if (!SIZE_ONLY && POSIXCC(nextvalue))
11563 const char *s = RExC_parse;
11564 const char c = *s++;
11566 while (isALNUM(*s))
11568 if (*s && c == *s && s[1] == ']') {
11570 "POSIX syntax [%c %c] belongs inside character classes",
11573 /* [[=foo=]] and [[.foo.]] are still future. */
11574 if (POSIXCC_NOTYET(c)) {
11575 /* adjust RExC_parse so the error shows after
11576 the class closes */
11577 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11579 SvREFCNT_dec(listsv);
11580 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11586 if (UCHARAT(RExC_parse) == ']')
11587 goto charclassloop;
11590 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11594 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11595 save_value = value;
11596 save_prevvalue = prevvalue;
11599 rangebegin = RExC_parse;
11603 value = utf8n_to_uvchr((U8*)RExC_parse,
11604 RExC_end - RExC_parse,
11605 &numlen, UTF8_ALLOW_DEFAULT);
11606 RExC_parse += numlen;
11609 value = UCHARAT(RExC_parse++);
11611 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11612 if (value == '[' && POSIXCC(nextvalue))
11613 namedclass = regpposixcc(pRExC_state, value, listsv);
11614 else if (value == '\\') {
11616 value = utf8n_to_uvchr((U8*)RExC_parse,
11617 RExC_end - RExC_parse,
11618 &numlen, UTF8_ALLOW_DEFAULT);
11619 RExC_parse += numlen;
11622 value = UCHARAT(RExC_parse++);
11623 /* Some compilers cannot handle switching on 64-bit integer
11624 * values, therefore value cannot be an UV. Yes, this will
11625 * be a problem later if we want switch on Unicode.
11626 * A similar issue a little bit later when switching on
11627 * namedclass. --jhi */
11628 switch ((I32)value) {
11629 case 'w': namedclass = ANYOF_WORDCHAR; break;
11630 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11631 case 's': namedclass = ANYOF_SPACE; break;
11632 case 'S': namedclass = ANYOF_NSPACE; break;
11633 case 'd': namedclass = ANYOF_DIGIT; break;
11634 case 'D': namedclass = ANYOF_NDIGIT; break;
11635 case 'v': namedclass = ANYOF_VERTWS; break;
11636 case 'V': namedclass = ANYOF_NVERTWS; break;
11637 case 'h': namedclass = ANYOF_HORIZWS; break;
11638 case 'H': namedclass = ANYOF_NHORIZWS; break;
11639 case 'N': /* Handle \N{NAME} in class */
11641 /* We only pay attention to the first char of
11642 multichar strings being returned. I kinda wonder
11643 if this makes sense as it does change the behaviour
11644 from earlier versions, OTOH that behaviour was broken
11646 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11647 TRUE /* => charclass */))
11658 /* This routine will handle any undefined properties */
11659 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11661 if (RExC_parse >= RExC_end)
11662 vFAIL2("Empty \\%c{}", (U8)value);
11663 if (*RExC_parse == '{') {
11664 const U8 c = (U8)value;
11665 e = strchr(RExC_parse++, '}');
11667 vFAIL2("Missing right brace on \\%c{}", c);
11668 while (isSPACE(UCHARAT(RExC_parse)))
11670 if (e == RExC_parse)
11671 vFAIL2("Empty \\%c{}", c);
11672 n = e - RExC_parse;
11673 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11684 if (UCHARAT(RExC_parse) == '^') {
11687 value = value == 'p' ? 'P' : 'p'; /* toggle */
11688 while (isSPACE(UCHARAT(RExC_parse))) {
11693 /* Try to get the definition of the property into
11694 * <invlist>. If /i is in effect, the effective property
11695 * will have its name be <__NAME_i>. The design is
11696 * discussed in commit
11697 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11698 Newx(name, n + sizeof("_i__\n"), char);
11700 sprintf(name, "%s%.*s%s\n",
11701 (FOLD) ? "__" : "",
11707 /* Look up the property name, and get its swash and
11708 * inversion list, if the property is found */
11710 SvREFCNT_dec(swash);
11712 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11715 NULL, /* No inversion list */
11718 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11720 SvREFCNT_dec(swash);
11724 /* Here didn't find it. It could be a user-defined
11725 * property that will be available at run-time. Add it
11726 * to the list to look up then */
11727 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11728 (value == 'p' ? '+' : '!'),
11730 has_user_defined_property = TRUE;
11732 /* We don't know yet, so have to assume that the
11733 * property could match something in the Latin1 range,
11734 * hence something that isn't utf8. Note that this
11735 * would cause things in <depends_list> to match
11736 * inappropriately, except that any \p{}, including
11737 * this one forces Unicode semantics, which means there
11738 * is <no depends_list> */
11739 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11743 /* Here, did get the swash and its inversion list. If
11744 * the swash is from a user-defined property, then this
11745 * whole character class should be regarded as such */
11746 has_user_defined_property =
11748 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11750 /* Invert if asking for the complement */
11751 if (value == 'P') {
11752 _invlist_union_complement_2nd(properties,
11756 /* The swash can't be used as-is, because we've
11757 * inverted things; delay removing it to here after
11758 * have copied its invlist above */
11759 SvREFCNT_dec(swash);
11763 _invlist_union(properties, invlist, &properties);
11768 RExC_parse = e + 1;
11769 namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
11771 /* \p means they want Unicode semantics */
11772 RExC_uni_semantics = 1;
11775 case 'n': value = '\n'; break;
11776 case 'r': value = '\r'; break;
11777 case 't': value = '\t'; break;
11778 case 'f': value = '\f'; break;
11779 case 'b': value = '\b'; break;
11780 case 'e': value = ASCII_TO_NATIVE('\033');break;
11781 case 'a': value = ASCII_TO_NATIVE('\007');break;
11783 RExC_parse--; /* function expects to be pointed at the 'o' */
11785 const char* error_msg;
11786 bool valid = grok_bslash_o(RExC_parse,
11791 RExC_parse += numlen;
11796 if (PL_encoding && value < 0x100) {
11797 goto recode_encoding;
11801 RExC_parse--; /* function expects to be pointed at the 'x' */
11803 const char* error_msg;
11804 bool valid = grok_bslash_x(RExC_parse,
11809 RExC_parse += numlen;
11814 if (PL_encoding && value < 0x100)
11815 goto recode_encoding;
11818 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11820 case '0': case '1': case '2': case '3': case '4':
11821 case '5': case '6': case '7':
11823 /* Take 1-3 octal digits */
11824 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11826 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11827 RExC_parse += numlen;
11828 if (PL_encoding && value < 0x100)
11829 goto recode_encoding;
11833 if (! RExC_override_recoding) {
11834 SV* enc = PL_encoding;
11835 value = reg_recode((const char)(U8)value, &enc);
11836 if (!enc && SIZE_ONLY)
11837 ckWARNreg(RExC_parse,
11838 "Invalid escape in the specified encoding");
11842 /* Allow \_ to not give an error */
11843 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11844 ckWARN2reg(RExC_parse,
11845 "Unrecognized escape \\%c in character class passed through",
11850 } /* end of \blah */
11853 literal_endpoint++;
11856 /* What matches in a locale is not known until runtime. This
11857 * includes what the Posix classes (like \w, [:space:]) match.
11858 * Room must be reserved (one time per class) to store such
11859 * classes, either if Perl is compiled so that locale nodes always
11860 * should have this space, or if there is such class info to be
11861 * stored. The space will contain a bit for each named class that
11862 * is to be matched against. This isn't needed for \p{} and
11863 * pseudo-classes, as they are not affected by locale, and hence
11864 * are dealt with separately */
11867 && (ANYOF_LOCALE == ANYOF_CLASS
11868 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11872 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11875 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11876 ANYOF_CLASS_ZERO(ret);
11878 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11881 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11883 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11884 * literal, as is the character that began the false range, i.e.
11885 * the 'a' in the examples */
11889 RExC_parse >= rangebegin ?
11890 RExC_parse - rangebegin : 0;
11891 ckWARN4reg(RExC_parse,
11892 "False [] range \"%*.*s\"",
11894 cp_list = add_cp_to_invlist(cp_list, '-');
11895 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11898 range = 0; /* this was not a true range */
11899 element_count += 2; /* So counts for three values */
11903 switch ((I32)namedclass) {
11905 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11906 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11907 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11909 case ANYOF_NALNUMC:
11910 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11911 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11912 runtime_posix_matches_above_Unicode);
11915 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11916 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11919 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11920 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11921 runtime_posix_matches_above_Unicode);
11926 ANYOF_CLASS_SET(ret, namedclass);
11929 #endif /* Not isascii(); just use the hard-coded definition for it */
11930 _invlist_union(posixes, PL_ASCII, &posixes);
11935 ANYOF_CLASS_SET(ret, namedclass);
11939 _invlist_union_complement_2nd(posixes,
11940 PL_ASCII, &posixes);
11941 if (DEPENDS_SEMANTICS) {
11942 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11949 if (hasISBLANK || ! LOC) {
11950 DO_POSIX(ret, namedclass, posixes,
11951 PL_PosixBlank, PL_XPosixBlank);
11953 else { /* There is no isblank() and we are in locale: We
11954 use the ASCII range and the above-Latin1 range
11956 SV* scratch_list = NULL;
11958 /* Include all above-Latin1 blanks */
11959 _invlist_intersection(PL_AboveLatin1,
11962 /* Add it to the running total of posix classes */
11964 posixes = scratch_list;
11967 _invlist_union(posixes, scratch_list, &posixes);
11968 SvREFCNT_dec(scratch_list);
11970 /* Add the ASCII-range blanks to the running total. */
11971 _invlist_union(posixes, PL_PosixBlank, &posixes);
11975 if (hasISBLANK || ! LOC) {
11976 DO_N_POSIX(ret, namedclass, posixes,
11977 PL_PosixBlank, PL_XPosixBlank);
11979 else { /* There is no isblank() and we are in locale */
11980 SV* scratch_list = NULL;
11982 /* Include all above-Latin1 non-blanks */
11983 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11986 /* Add them to the running total of posix classes */
11987 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11990 posixes = scratch_list;
11993 _invlist_union(posixes, scratch_list, &posixes);
11994 SvREFCNT_dec(scratch_list);
11997 /* Get the list of all non-ASCII-blanks in Latin 1, and
11998 * add them to the running total */
11999 _invlist_subtract(PL_Latin1, PL_PosixBlank,
12001 _invlist_union(posixes, scratch_list, &posixes);
12002 SvREFCNT_dec(scratch_list);
12006 DO_POSIX(ret, namedclass, posixes,
12007 PL_PosixCntrl, PL_XPosixCntrl);
12010 DO_N_POSIX(ret, namedclass, posixes,
12011 PL_PosixCntrl, PL_XPosixCntrl);
12014 /* There are no digits in the Latin1 range outside of
12015 * ASCII, so call the macro that doesn't have to resolve
12017 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
12018 PL_PosixDigit, "XPosixDigit", listsv);
12021 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12022 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
12023 runtime_posix_matches_above_Unicode);
12026 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12027 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
12030 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12031 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12032 runtime_posix_matches_above_Unicode);
12034 case ANYOF_HORIZWS:
12035 /* For these, we use the cp_list, as /d doesn't make a
12036 * difference in what these match. There would be problems
12037 * if these characters had folds other than themselves, as
12038 * cp_list is subject to folding. It turns out that \h
12039 * is just a synonym for XPosixBlank */
12040 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12042 case ANYOF_NHORIZWS:
12043 _invlist_union_complement_2nd(cp_list,
12044 PL_XPosixBlank, &cp_list);
12048 { /* These require special handling, as they differ under
12049 folding, matching Cased there (which in the ASCII range
12050 is the same as Alpha */
12056 if (FOLD && ! LOC) {
12057 ascii_source = PL_PosixAlpha;
12058 l1_source = PL_L1Cased;
12062 ascii_source = PL_PosixLower;
12063 l1_source = PL_L1PosixLower;
12064 Xname = "XPosixLower";
12066 if (namedclass == ANYOF_LOWER) {
12067 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12068 ascii_source, l1_source, Xname, listsv);
12071 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12072 posixes, ascii_source, l1_source, Xname, listsv,
12073 runtime_posix_matches_above_Unicode);
12078 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12079 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12082 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12083 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12084 runtime_posix_matches_above_Unicode);
12087 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12088 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12091 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12092 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12093 runtime_posix_matches_above_Unicode);
12096 DO_POSIX(ret, namedclass, posixes,
12097 PL_PosixSpace, PL_XPosixSpace);
12099 case ANYOF_NPSXSPC:
12100 DO_N_POSIX(ret, namedclass, posixes,
12101 PL_PosixSpace, PL_XPosixSpace);
12104 DO_POSIX(ret, namedclass, posixes,
12105 PL_PerlSpace, PL_XPerlSpace);
12108 DO_N_POSIX(ret, namedclass, posixes,
12109 PL_PerlSpace, PL_XPerlSpace);
12111 case ANYOF_UPPER: /* Same as LOWER, above */
12118 if (FOLD && ! LOC) {
12119 ascii_source = PL_PosixAlpha;
12120 l1_source = PL_L1Cased;
12124 ascii_source = PL_PosixUpper;
12125 l1_source = PL_L1PosixUpper;
12126 Xname = "XPosixUpper";
12128 if (namedclass == ANYOF_UPPER) {
12129 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12130 ascii_source, l1_source, Xname, listsv);
12133 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12134 posixes, ascii_source, l1_source, Xname, listsv,
12135 runtime_posix_matches_above_Unicode);
12139 case ANYOF_WORDCHAR:
12140 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12141 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12143 case ANYOF_NWORDCHAR:
12144 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12145 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12146 runtime_posix_matches_above_Unicode);
12149 /* For these, we use the cp_list, as /d doesn't make a
12150 * difference in what these match. There would be problems
12151 * if these characters had folds other than themselves, as
12152 * cp_list is subject to folding */
12153 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12155 case ANYOF_NVERTWS:
12156 _invlist_union_complement_2nd(cp_list,
12157 PL_VertSpace, &cp_list);
12160 DO_POSIX(ret, namedclass, posixes,
12161 PL_PosixXDigit, PL_XPosixXDigit);
12163 case ANYOF_NXDIGIT:
12164 DO_N_POSIX(ret, namedclass, posixes,
12165 PL_PosixXDigit, PL_XPosixXDigit);
12167 case ANYOF_UNIPROP: /* this is to handle \p and \P */
12170 vFAIL("Invalid [::] class");
12174 continue; /* Go get next character */
12176 } /* end of namedclass \blah */
12179 if (prevvalue > value) /* b-a */ {
12180 const int w = RExC_parse - rangebegin;
12181 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12182 range = 0; /* not a valid range */
12186 prevvalue = value; /* save the beginning of the potential range */
12187 if (RExC_parse+1 < RExC_end
12188 && *RExC_parse == '-'
12189 && RExC_parse[1] != ']')
12193 /* a bad range like \w-, [:word:]- ? */
12194 if (namedclass > OOB_NAMEDCLASS) {
12195 if (ckWARN(WARN_REGEXP)) {
12197 RExC_parse >= rangebegin ?
12198 RExC_parse - rangebegin : 0;
12200 "False [] range \"%*.*s\"",
12204 cp_list = add_cp_to_invlist(cp_list, '-');
12208 range = 1; /* yeah, it's a range! */
12209 continue; /* but do it the next time */
12213 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12216 /* non-Latin1 code point implies unicode semantics. Must be set in
12217 * pass1 so is there for the whole of pass 2 */
12219 RExC_uni_semantics = 1;
12222 /* Ready to process either the single value, or the completed range.
12223 * For single-valued non-inverted ranges, we consider the possibility
12224 * of multi-char folds. (We made a conscious decision to not do this
12225 * for the other cases because it can often lead to non-intuitive
12226 * results. For example, you have the peculiar case that:
12227 * "s s" =~ /^[^\xDF]+$/i => Y
12228 * "ss" =~ /^[^\xDF]+$/i => N
12230 * See [perl #89750] */
12231 if (FOLD && ! invert && value == prevvalue) {
12232 if (value == LATIN_SMALL_LETTER_SHARP_S
12233 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12236 /* Here <value> is indeed a multi-char fold. Get what it is */
12238 U8 foldbuf[UTF8_MAXBYTES_CASE];
12241 UV folded = _to_uni_fold_flags(
12246 | ((LOC) ? FOLD_FLAGS_LOCALE
12247 : (ASCII_FOLD_RESTRICTED)
12248 ? FOLD_FLAGS_NOMIX_ASCII
12252 /* Here, <folded> should be the first character of the
12253 * multi-char fold of <value>, with <foldbuf> containing the
12254 * whole thing. But, if this fold is not allowed (because of
12255 * the flags), <fold> will be the same as <value>, and should
12256 * be processed like any other character, so skip the special
12258 if (folded != value) {
12260 /* Skip if we are recursed, currently parsing the class
12261 * again. Otherwise add this character to the list of
12262 * multi-char folds. */
12263 if (! RExC_in_multi_char_class) {
12264 AV** this_array_ptr;
12266 STRLEN cp_count = utf8_length(foldbuf,
12267 foldbuf + foldlen);
12268 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12270 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12273 if (! multi_char_matches) {
12274 multi_char_matches = newAV();
12277 /* <multi_char_matches> is actually an array of arrays.
12278 * There will be one or two top-level elements: [2],
12279 * and/or [3]. The [2] element is an array, each
12280 * element thereof is a character which folds to two
12281 * characters; likewise for [3]. (Unicode guarantees a
12282 * maximum of 3 characters in any fold.) When we
12283 * rewrite the character class below, we will do so
12284 * such that the longest folds are written first, so
12285 * that it prefers the longest matching strings first.
12286 * This is done even if it turns out that any
12287 * quantifier is non-greedy, out of programmer
12288 * laziness. Tom Christiansen has agreed that this is
12289 * ok. This makes the test for the ligature 'ffi' come
12290 * before the test for 'ff' */
12291 if (av_exists(multi_char_matches, cp_count)) {
12292 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12294 this_array = *this_array_ptr;
12297 this_array = newAV();
12298 av_store(multi_char_matches, cp_count,
12301 av_push(this_array, multi_fold);
12304 /* This element should not be processed further in this
12307 value = save_value;
12308 prevvalue = save_prevvalue;
12314 /* Deal with this element of the class */
12317 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12319 UV* this_range = _new_invlist(1);
12320 _append_range_to_invlist(this_range, prevvalue, value);
12322 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12323 * If this range was specified using something like 'i-j', we want
12324 * to include only the 'i' and the 'j', and not anything in
12325 * between, so exclude non-ASCII, non-alphabetics from it.
12326 * However, if the range was specified with something like
12327 * [\x89-\x91] or [\x89-j], all code points within it should be
12328 * included. literal_endpoint==2 means both ends of the range used
12329 * a literal character, not \x{foo} */
12330 if (literal_endpoint == 2
12331 && (prevvalue >= 'a' && value <= 'z')
12332 || (prevvalue >= 'A' && value <= 'Z'))
12334 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12335 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12337 _invlist_union(cp_list, this_range, &cp_list);
12338 literal_endpoint = 0;
12342 range = 0; /* this range (if it was one) is done now */
12343 } /* End of loop through all the text within the brackets */
12345 /* If anything in the class expands to more than one character, we have to
12346 * deal with them by building up a substitute parse string, and recursively
12347 * calling reg() on it, instead of proceeding */
12348 if (multi_char_matches) {
12349 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12352 char *save_end = RExC_end;
12353 char *save_parse = RExC_parse;
12354 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12359 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12360 because too confusing */
12362 sv_catpv(substitute_parse, "(?:");
12366 /* Look at the longest folds first */
12367 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12369 if (av_exists(multi_char_matches, cp_count)) {
12370 AV** this_array_ptr;
12373 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12375 while ((this_sequence = av_pop(*this_array_ptr)) !=
12378 if (! first_time) {
12379 sv_catpv(substitute_parse, "|");
12381 first_time = FALSE;
12383 sv_catpv(substitute_parse, SvPVX(this_sequence));
12388 /* If the character class contains anything else besides these
12389 * multi-character folds, have to include it in recursive parsing */
12390 if (element_count) {
12391 sv_catpv(substitute_parse, "|[");
12392 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12393 sv_catpv(substitute_parse, "]");
12396 sv_catpv(substitute_parse, ")");
12399 /* This is a way to get the parse to skip forward a whole named
12400 * sequence instead of matching the 2nd character when it fails the
12402 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12406 RExC_parse = SvPV(substitute_parse, len);
12407 RExC_end = RExC_parse + len;
12408 RExC_in_multi_char_class = 1;
12409 RExC_emit = (regnode *)orig_emit;
12411 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12413 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12415 RExC_parse = save_parse;
12416 RExC_end = save_end;
12417 RExC_in_multi_char_class = 0;
12418 SvREFCNT_dec(multi_char_matches);
12419 SvREFCNT_dec(listsv);
12423 /* If the character class contains only a single element, it may be
12424 * optimizable into another node type which is smaller and runs faster.
12425 * Check if this is the case for this class */
12426 if (element_count == 1) {
12430 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12431 [:digit:] or \p{foo} */
12433 /* Certain named classes have equivalents that can appear outside a
12434 * character class, e.g. \w, \H. We use these instead of a
12435 * character class. */
12436 switch ((I32)namedclass) {
12439 /* The first group is for node types that depend on the charset
12440 * modifier to the regex. We first calculate the base node
12441 * type, and if it should be inverted */
12443 case ANYOF_NWORDCHAR:
12446 case ANYOF_WORDCHAR:
12448 goto join_charset_classes;
12455 goto join_charset_classes;
12463 join_charset_classes:
12465 /* Now that we have the base node type, we take advantage
12466 * of the enum ordering of the charset modifiers to get the
12467 * exact node type, For example the base SPACE also has
12468 * SPACEL, SPACEU, and SPACEA */
12470 offset = get_regex_charset(RExC_flags);
12472 /* /aa is the same as /a for these */
12473 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12474 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12476 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12477 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12482 /* The number of varieties of each of these is the same,
12483 * hence, so is the delta between the normal and
12484 * complemented nodes */
12486 op += NALNUM - ALNUM;
12488 *flagp |= HASWIDTH|SIMPLE;
12491 /* The second group doesn't depend of the charset modifiers.
12492 * We just have normal and complemented */
12493 case ANYOF_NHORIZWS:
12496 case ANYOF_HORIZWS:
12498 op = (invert) ? NHORIZWS : HORIZWS;
12499 *flagp |= HASWIDTH|SIMPLE;
12502 case ANYOF_NVERTWS:
12506 op = (invert) ? NVERTWS : VERTWS;
12507 *flagp |= HASWIDTH|SIMPLE;
12510 case ANYOF_UNIPROP:
12517 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12522 /* A generic posix class. All the /a ones can be handled
12523 * by the POSIXA opcode. And all are closed under folding
12524 * in the ASCII range, so FOLD doesn't matter */
12525 if (AT_LEAST_ASCII_RESTRICTED
12526 || (! LOC && namedclass == ANYOF_ASCII))
12528 /* The odd numbered ones are the complements of the
12529 * next-lower even number one */
12530 if (namedclass % 2 == 1) {
12534 arg = namedclass_to_classnum(namedclass);
12535 op = (invert) ? NPOSIXA : POSIXA;
12540 else if (value == prevvalue) {
12542 /* Here, the class consists of just a single code point */
12545 if (! LOC && value == '\n') {
12546 op = REG_ANY; /* Optimize [^\n] */
12547 *flagp |= HASWIDTH|SIMPLE;
12551 else if (value < 256 || UTF) {
12553 /* Optimize a single value into an EXACTish node, but not if it
12554 * would require converting the pattern to UTF-8. */
12555 op = compute_EXACTish(pRExC_state);
12557 } /* Otherwise is a range */
12558 else if (! LOC) { /* locale could vary these */
12559 if (prevvalue == '0') {
12560 if (value == '9') {
12561 op = (invert) ? NDIGITA : DIGITA;
12562 *flagp |= HASWIDTH|SIMPLE;
12567 /* Here, we have changed <op> away from its initial value iff we found
12568 * an optimization */
12571 /* Throw away this ANYOF regnode, and emit the calculated one,
12572 * which should correspond to the beginning, not current, state of
12574 const char * cur_parse = RExC_parse;
12575 RExC_parse = (char *)orig_parse;
12579 /* To get locale nodes to not use the full ANYOF size would
12580 * require moving the code above that writes the portions
12581 * of it that aren't in other nodes to after this point.
12582 * e.g. ANYOF_CLASS_SET */
12583 RExC_size = orig_size;
12587 RExC_emit = (regnode *)orig_emit;
12590 ret = reg_node(pRExC_state, op);
12592 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12596 *flagp |= HASWIDTH|SIMPLE;
12598 else if (PL_regkind[op] == EXACT) {
12599 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12602 RExC_parse = (char *) cur_parse;
12604 SvREFCNT_dec(posixes);
12605 SvREFCNT_dec(listsv);
12606 SvREFCNT_dec(cp_list);
12613 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12615 /* If folding, we calculate all characters that could fold to or from the
12616 * ones already on the list */
12617 if (FOLD && cp_list) {
12618 UV start, end; /* End points of code point ranges */
12620 SV* fold_intersection = NULL;
12622 /* If the highest code point is within Latin1, we can use the
12623 * compiled-in Alphas list, and not have to go out to disk. This
12624 * yields two false positives, the masculine and feminine oridinal
12625 * indicators, which are weeded out below using the
12626 * IS_IN_SOME_FOLD_L1() macro */
12627 if (invlist_highest(cp_list) < 256) {
12628 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12632 /* Here, there are non-Latin1 code points, so we will have to go
12633 * fetch the list of all the characters that participate in folds
12635 if (! PL_utf8_foldable) {
12636 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12637 &PL_sv_undef, 1, 0);
12638 PL_utf8_foldable = _get_swash_invlist(swash);
12639 SvREFCNT_dec(swash);
12642 /* This is a hash that for a particular fold gives all characters
12643 * that are involved in it */
12644 if (! PL_utf8_foldclosures) {
12646 /* If we were unable to find any folds, then we likely won't be
12647 * able to find the closures. So just create an empty list.
12648 * Folding will effectively be restricted to the non-Unicode
12649 * rules hard-coded into Perl. (This case happens legitimately
12650 * during compilation of Perl itself before the Unicode tables
12651 * are generated) */
12652 if (_invlist_len(PL_utf8_foldable) == 0) {
12653 PL_utf8_foldclosures = newHV();
12656 /* If the folds haven't been read in, call a fold function
12658 if (! PL_utf8_tofold) {
12659 U8 dummy[UTF8_MAXBYTES+1];
12661 /* This string is just a short named one above \xff */
12662 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12663 assert(PL_utf8_tofold); /* Verify that worked */
12665 PL_utf8_foldclosures =
12666 _swash_inversion_hash(PL_utf8_tofold);
12670 /* Only the characters in this class that participate in folds need
12671 * be checked. Get the intersection of this class and all the
12672 * possible characters that are foldable. This can quickly narrow
12673 * down a large class */
12674 _invlist_intersection(PL_utf8_foldable, cp_list,
12675 &fold_intersection);
12678 /* Now look at the foldable characters in this class individually */
12679 invlist_iterinit(fold_intersection);
12680 while (invlist_iternext(fold_intersection, &start, &end)) {
12683 /* Locale folding for Latin1 characters is deferred until runtime */
12684 if (LOC && start < 256) {
12688 /* Look at every character in the range */
12689 for (j = start; j <= end; j++) {
12691 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12697 /* We have the latin1 folding rules hard-coded here so that
12698 * an innocent-looking character class, like /[ks]/i won't
12699 * have to go out to disk to find the possible matches.
12700 * XXX It would be better to generate these via regen, in
12701 * case a new version of the Unicode standard adds new
12702 * mappings, though that is not really likely, and may be
12703 * caught by the default: case of the switch below. */
12705 if (IS_IN_SOME_FOLD_L1(j)) {
12707 /* ASCII is always matched; non-ASCII is matched only
12708 * under Unicode rules */
12709 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12711 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12715 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12719 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12720 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12722 /* Certain Latin1 characters have matches outside
12723 * Latin1. To get here, <j> is one of those
12724 * characters. None of these matches is valid for
12725 * ASCII characters under /aa, which is why the 'if'
12726 * just above excludes those. These matches only
12727 * happen when the target string is utf8. The code
12728 * below adds the single fold closures for <j> to the
12729 * inversion list. */
12734 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12738 cp_list = add_cp_to_invlist(cp_list,
12739 LATIN_SMALL_LETTER_LONG_S);
12742 cp_list = add_cp_to_invlist(cp_list,
12743 GREEK_CAPITAL_LETTER_MU);
12744 cp_list = add_cp_to_invlist(cp_list,
12745 GREEK_SMALL_LETTER_MU);
12747 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12748 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12750 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12752 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12753 cp_list = add_cp_to_invlist(cp_list,
12754 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12756 case LATIN_SMALL_LETTER_SHARP_S:
12757 cp_list = add_cp_to_invlist(cp_list,
12758 LATIN_CAPITAL_LETTER_SHARP_S);
12760 case 'F': case 'f':
12761 case 'I': case 'i':
12762 case 'L': case 'l':
12763 case 'T': case 't':
12764 case 'A': case 'a':
12765 case 'H': case 'h':
12766 case 'J': case 'j':
12767 case 'N': case 'n':
12768 case 'W': case 'w':
12769 case 'Y': case 'y':
12770 /* These all are targets of multi-character
12771 * folds from code points that require UTF8 to
12772 * express, so they can't match unless the
12773 * target string is in UTF-8, so no action here
12774 * is necessary, as regexec.c properly handles
12775 * the general case for UTF-8 matching and
12776 * multi-char folds */
12779 /* Use deprecated warning to increase the
12780 * chances of this being output */
12781 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12788 /* Here is an above Latin1 character. We don't have the rules
12789 * hard-coded for it. First, get its fold. This is the simple
12790 * fold, as the multi-character folds have been handled earlier
12791 * and separated out */
12792 _to_uni_fold_flags(j, foldbuf, &foldlen,
12794 ? FOLD_FLAGS_LOCALE
12795 : (ASCII_FOLD_RESTRICTED)
12796 ? FOLD_FLAGS_NOMIX_ASCII
12799 /* Single character fold of above Latin1. Add everything in
12800 * its fold closure to the list that this node should match.
12801 * The fold closures data structure is a hash with the keys
12802 * being the UTF-8 of every character that is folded to, like
12803 * 'k', and the values each an array of all code points that
12804 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12805 * Multi-character folds are not included */
12806 if ((listp = hv_fetch(PL_utf8_foldclosures,
12807 (char *) foldbuf, foldlen, FALSE)))
12809 AV* list = (AV*) *listp;
12811 for (k = 0; k <= av_len(list); k++) {
12812 SV** c_p = av_fetch(list, k, FALSE);
12815 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12819 /* /aa doesn't allow folds between ASCII and non-; /l
12820 * doesn't allow them between above and below 256 */
12821 if ((ASCII_FOLD_RESTRICTED
12822 && (isASCII(c) != isASCII(j)))
12823 || (LOC && ((c < 256) != (j < 256))))
12828 /* Folds involving non-ascii Latin1 characters
12829 * under /d are added to a separate list */
12830 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12832 cp_list = add_cp_to_invlist(cp_list, c);
12835 depends_list = add_cp_to_invlist(depends_list, c);
12841 SvREFCNT_dec(fold_intersection);
12844 /* And combine the result (if any) with any inversion list from posix
12845 * classes. The lists are kept separate up to now because we don't want to
12846 * fold the classes (folding of those is automatically handled by the swash
12847 * fetching code) */
12849 if (! DEPENDS_SEMANTICS) {
12851 _invlist_union(cp_list, posixes, &cp_list);
12852 SvREFCNT_dec(posixes);
12859 /* Under /d, we put into a separate list the Latin1 things that
12860 * match only when the target string is utf8 */
12861 SV* nonascii_but_latin1_properties = NULL;
12862 _invlist_intersection(posixes, PL_Latin1,
12863 &nonascii_but_latin1_properties);
12864 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12865 &nonascii_but_latin1_properties);
12866 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12869 _invlist_union(cp_list, posixes, &cp_list);
12870 SvREFCNT_dec(posixes);
12876 if (depends_list) {
12877 _invlist_union(depends_list, nonascii_but_latin1_properties,
12879 SvREFCNT_dec(nonascii_but_latin1_properties);
12882 depends_list = nonascii_but_latin1_properties;
12887 /* And combine the result (if any) with any inversion list from properties.
12888 * The lists are kept separate up to now so that we can distinguish the two
12889 * in regards to matching above-Unicode. A run-time warning is generated
12890 * if a Unicode property is matched against a non-Unicode code point. But,
12891 * we allow user-defined properties to match anything, without any warning,
12892 * and we also suppress the warning if there is a portion of the character
12893 * class that isn't a Unicode property, and which matches above Unicode, \W
12894 * or [\x{110000}] for example.
12895 * (Note that in this case, unlike the Posix one above, there is no
12896 * <depends_list>, because having a Unicode property forces Unicode
12899 bool warn_super = ! has_user_defined_property;
12902 /* If it matters to the final outcome, see if a non-property
12903 * component of the class matches above Unicode. If so, the
12904 * warning gets suppressed. This is true even if just a single
12905 * such code point is specified, as though not strictly correct if
12906 * another such code point is matched against, the fact that they
12907 * are using above-Unicode code points indicates they should know
12908 * the issues involved */
12910 bool non_prop_matches_above_Unicode =
12911 runtime_posix_matches_above_Unicode
12912 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12914 non_prop_matches_above_Unicode =
12915 ! non_prop_matches_above_Unicode;
12917 warn_super = ! non_prop_matches_above_Unicode;
12920 _invlist_union(properties, cp_list, &cp_list);
12921 SvREFCNT_dec(properties);
12924 cp_list = properties;
12928 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12932 /* Here, we have calculated what code points should be in the character
12935 * Now we can see about various optimizations. Fold calculation (which we
12936 * did above) needs to take place before inversion. Otherwise /[^k]/i
12937 * would invert to include K, which under /i would match k, which it
12938 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12939 * folded until runtime */
12941 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12942 * at compile time. Besides not inverting folded locale now, we can't
12943 * invert if there are things such as \w, which aren't known until runtime
12946 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12948 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12950 _invlist_invert(cp_list);
12952 /* Any swash can't be used as-is, because we've inverted things */
12954 SvREFCNT_dec(swash);
12958 /* Clear the invert flag since have just done it here */
12962 /* If we didn't do folding, it's because some information isn't available
12963 * until runtime; set the run-time fold flag for these. (We don't have to
12964 * worry about properties folding, as that is taken care of by the swash
12968 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12971 /* Some character classes are equivalent to other nodes. Such nodes take
12972 * up less room and generally fewer operations to execute than ANYOF nodes.
12973 * Above, we checked for and optimized into some such equivalents for
12974 * certain common classes that are easy to test. Getting to this point in
12975 * the code means that the class didn't get optimized there. Since this
12976 * code is only executed in Pass 2, it is too late to save space--it has
12977 * been allocated in Pass 1, and currently isn't given back. But turning
12978 * things into an EXACTish node can allow the optimizer to join it to any
12979 * adjacent such nodes. And if the class is equivalent to things like /./,
12980 * expensive run-time swashes can be avoided. Now that we have more
12981 * complete information, we can find things necessarily missed by the
12982 * earlier code. I (khw) am not sure how much to look for here. It would
12983 * be easy, but perhaps too slow, to check any candidates against all the
12984 * node types they could possibly match using _invlistEQ(). */
12989 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12990 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12993 U8 op = END; /* The optimzation node-type */
12994 const char * cur_parse= RExC_parse;
12996 invlist_iterinit(cp_list);
12997 if (! invlist_iternext(cp_list, &start, &end)) {
12999 /* Here, the list is empty. This happens, for example, when a
13000 * Unicode property is the only thing in the character class, and
13001 * it doesn't match anything. (perluniprops.pod notes such
13004 *flagp |= HASWIDTH|SIMPLE;
13006 else if (start == end) { /* The range is a single code point */
13007 if (! invlist_iternext(cp_list, &start, &end)
13009 /* Don't do this optimization if it would require changing
13010 * the pattern to UTF-8 */
13011 && (start < 256 || UTF))
13013 /* Here, the list contains a single code point. Can optimize
13014 * into an EXACT node */
13023 /* A locale node under folding with one code point can be
13024 * an EXACTFL, as its fold won't be calculated until
13030 /* Here, we are generally folding, but there is only one
13031 * code point to match. If we have to, we use an EXACT
13032 * node, but it would be better for joining with adjacent
13033 * nodes in the optimization pass if we used the same
13034 * EXACTFish node that any such are likely to be. We can
13035 * do this iff the code point doesn't participate in any
13036 * folds. For example, an EXACTF of a colon is the same as
13037 * an EXACT one, since nothing folds to or from a colon. */
13039 if (IS_IN_SOME_FOLD_L1(value)) {
13044 if (! PL_utf8_foldable) {
13045 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13046 &PL_sv_undef, 1, 0);
13047 PL_utf8_foldable = _get_swash_invlist(swash);
13048 SvREFCNT_dec(swash);
13050 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13055 /* If we haven't found the node type, above, it means we
13056 * can use the prevailing one */
13058 op = compute_EXACTish(pRExC_state);
13063 else if (start == 0) {
13064 if (end == UV_MAX) {
13066 *flagp |= HASWIDTH|SIMPLE;
13069 else if (end == '\n' - 1
13070 && invlist_iternext(cp_list, &start, &end)
13071 && start == '\n' + 1 && end == UV_MAX)
13074 *flagp |= HASWIDTH|SIMPLE;
13080 RExC_parse = (char *)orig_parse;
13081 RExC_emit = (regnode *)orig_emit;
13083 ret = reg_node(pRExC_state, op);
13085 RExC_parse = (char *)cur_parse;
13087 if (PL_regkind[op] == EXACT) {
13088 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13091 SvREFCNT_dec(cp_list);
13092 SvREFCNT_dec(listsv);
13097 /* Here, <cp_list> contains all the code points we can determine at
13098 * compile time that match under all conditions. Go through it, and
13099 * for things that belong in the bitmap, put them there, and delete from
13100 * <cp_list>. While we are at it, see if everything above 255 is in the
13101 * list, and if so, set a flag to speed up execution */
13102 ANYOF_BITMAP_ZERO(ret);
13105 /* This gets set if we actually need to modify things */
13106 bool change_invlist = FALSE;
13110 /* Start looking through <cp_list> */
13111 invlist_iterinit(cp_list);
13112 while (invlist_iternext(cp_list, &start, &end)) {
13116 if (end == UV_MAX && start <= 256) {
13117 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13120 /* Quit if are above what we should change */
13125 change_invlist = TRUE;
13127 /* Set all the bits in the range, up to the max that we are doing */
13128 high = (end < 255) ? end : 255;
13129 for (i = start; i <= (int) high; i++) {
13130 if (! ANYOF_BITMAP_TEST(ret, i)) {
13131 ANYOF_BITMAP_SET(ret, i);
13138 /* Done with loop; remove any code points that are in the bitmap from
13140 if (change_invlist) {
13141 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13144 /* If have completely emptied it, remove it completely */
13145 if (_invlist_len(cp_list) == 0) {
13146 SvREFCNT_dec(cp_list);
13152 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13155 /* Here, the bitmap has been populated with all the Latin1 code points that
13156 * always match. Can now add to the overall list those that match only
13157 * when the target string is UTF-8 (<depends_list>). */
13158 if (depends_list) {
13160 _invlist_union(cp_list, depends_list, &cp_list);
13161 SvREFCNT_dec(depends_list);
13164 cp_list = depends_list;
13168 /* If there is a swash and more than one element, we can't use the swash in
13169 * the optimization below. */
13170 if (swash && element_count > 1) {
13171 SvREFCNT_dec(swash);
13176 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13178 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13179 SvREFCNT_dec(listsv);
13182 /* av[0] stores the character class description in its textual form:
13183 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13184 * appropriate swash, and is also useful for dumping the regnode.
13185 * av[1] if NULL, is a placeholder to later contain the swash computed
13186 * from av[0]. But if no further computation need be done, the
13187 * swash is stored there now.
13188 * av[2] stores the cp_list inversion list for use in addition or
13189 * instead of av[0]; used only if av[1] is NULL
13190 * av[3] is set if any component of the class is from a user-defined
13191 * property; used only if av[1] is NULL */
13192 AV * const av = newAV();
13195 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13197 : (SvREFCNT_dec(listsv), &PL_sv_undef));
13199 av_store(av, 1, swash);
13200 SvREFCNT_dec(cp_list);
13203 av_store(av, 1, NULL);
13205 av_store(av, 2, cp_list);
13206 av_store(av, 3, newSVuv(has_user_defined_property));
13210 rv = newRV_noinc(MUTABLE_SV(av));
13211 n = add_data(pRExC_state, 1, "s");
13212 RExC_rxi->data->data[n] = (void*)rv;
13216 *flagp |= HASWIDTH|SIMPLE;
13219 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13222 /* reg_skipcomment()
13224 Absorbs an /x style # comments from the input stream.
13225 Returns true if there is more text remaining in the stream.
13226 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13227 terminates the pattern without including a newline.
13229 Note its the callers responsibility to ensure that we are
13230 actually in /x mode
13235 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13239 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13241 while (RExC_parse < RExC_end)
13242 if (*RExC_parse++ == '\n') {
13247 /* we ran off the end of the pattern without ending
13248 the comment, so we have to add an \n when wrapping */
13249 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13257 Advances the parse position, and optionally absorbs
13258 "whitespace" from the inputstream.
13260 Without /x "whitespace" means (?#...) style comments only,
13261 with /x this means (?#...) and # comments and whitespace proper.
13263 Returns the RExC_parse point from BEFORE the scan occurs.
13265 This is the /x friendly way of saying RExC_parse++.
13269 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13271 char* const retval = RExC_parse++;
13273 PERL_ARGS_ASSERT_NEXTCHAR;
13276 if (RExC_end - RExC_parse >= 3
13277 && *RExC_parse == '('
13278 && RExC_parse[1] == '?'
13279 && RExC_parse[2] == '#')
13281 while (*RExC_parse != ')') {
13282 if (RExC_parse == RExC_end)
13283 FAIL("Sequence (?#... not terminated");
13289 if (RExC_flags & RXf_PMf_EXTENDED) {
13290 if (isSPACE(*RExC_parse)) {
13294 else if (*RExC_parse == '#') {
13295 if ( reg_skipcomment( pRExC_state ) )
13304 - reg_node - emit a node
13306 STATIC regnode * /* Location. */
13307 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13311 regnode * const ret = RExC_emit;
13312 GET_RE_DEBUG_FLAGS_DECL;
13314 PERL_ARGS_ASSERT_REG_NODE;
13317 SIZE_ALIGN(RExC_size);
13321 if (RExC_emit >= RExC_emit_bound)
13322 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13323 op, RExC_emit, RExC_emit_bound);
13325 NODE_ALIGN_FILL(ret);
13327 FILL_ADVANCE_NODE(ptr, op);
13328 #ifdef RE_TRACK_PATTERN_OFFSETS
13329 if (RExC_offsets) { /* MJD */
13330 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13331 "reg_node", __LINE__,
13333 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13334 ? "Overwriting end of array!\n" : "OK",
13335 (UV)(RExC_emit - RExC_emit_start),
13336 (UV)(RExC_parse - RExC_start),
13337 (UV)RExC_offsets[0]));
13338 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13346 - reganode - emit a node with an argument
13348 STATIC regnode * /* Location. */
13349 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13353 regnode * const ret = RExC_emit;
13354 GET_RE_DEBUG_FLAGS_DECL;
13356 PERL_ARGS_ASSERT_REGANODE;
13359 SIZE_ALIGN(RExC_size);
13364 assert(2==regarglen[op]+1);
13366 Anything larger than this has to allocate the extra amount.
13367 If we changed this to be:
13369 RExC_size += (1 + regarglen[op]);
13371 then it wouldn't matter. Its not clear what side effect
13372 might come from that so its not done so far.
13377 if (RExC_emit >= RExC_emit_bound)
13378 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13379 op, RExC_emit, RExC_emit_bound);
13381 NODE_ALIGN_FILL(ret);
13383 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13384 #ifdef RE_TRACK_PATTERN_OFFSETS
13385 if (RExC_offsets) { /* MJD */
13386 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13390 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13391 "Overwriting end of array!\n" : "OK",
13392 (UV)(RExC_emit - RExC_emit_start),
13393 (UV)(RExC_parse - RExC_start),
13394 (UV)RExC_offsets[0]));
13395 Set_Cur_Node_Offset;
13403 - reguni - emit (if appropriate) a Unicode character
13406 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13410 PERL_ARGS_ASSERT_REGUNI;
13412 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13416 - reginsert - insert an operator in front of already-emitted operand
13418 * Means relocating the operand.
13421 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13427 const int offset = regarglen[(U8)op];
13428 const int size = NODE_STEP_REGNODE + offset;
13429 GET_RE_DEBUG_FLAGS_DECL;
13431 PERL_ARGS_ASSERT_REGINSERT;
13432 PERL_UNUSED_ARG(depth);
13433 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13434 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13443 if (RExC_open_parens) {
13445 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13446 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13447 if ( RExC_open_parens[paren] >= opnd ) {
13448 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13449 RExC_open_parens[paren] += size;
13451 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13453 if ( RExC_close_parens[paren] >= opnd ) {
13454 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13455 RExC_close_parens[paren] += size;
13457 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13462 while (src > opnd) {
13463 StructCopy(--src, --dst, regnode);
13464 #ifdef RE_TRACK_PATTERN_OFFSETS
13465 if (RExC_offsets) { /* MJD 20010112 */
13466 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13470 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13471 ? "Overwriting end of array!\n" : "OK",
13472 (UV)(src - RExC_emit_start),
13473 (UV)(dst - RExC_emit_start),
13474 (UV)RExC_offsets[0]));
13475 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13476 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13482 place = opnd; /* Op node, where operand used to be. */
13483 #ifdef RE_TRACK_PATTERN_OFFSETS
13484 if (RExC_offsets) { /* MJD */
13485 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13489 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13490 ? "Overwriting end of array!\n" : "OK",
13491 (UV)(place - RExC_emit_start),
13492 (UV)(RExC_parse - RExC_start),
13493 (UV)RExC_offsets[0]));
13494 Set_Node_Offset(place, RExC_parse);
13495 Set_Node_Length(place, 1);
13498 src = NEXTOPER(place);
13499 FILL_ADVANCE_NODE(place, op);
13500 Zero(src, offset, regnode);
13504 - regtail - set the next-pointer at the end of a node chain of p to val.
13505 - SEE ALSO: regtail_study
13507 /* TODO: All three parms should be const */
13509 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13513 GET_RE_DEBUG_FLAGS_DECL;
13515 PERL_ARGS_ASSERT_REGTAIL;
13517 PERL_UNUSED_ARG(depth);
13523 /* Find last node. */
13526 regnode * const temp = regnext(scan);
13528 SV * const mysv=sv_newmortal();
13529 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13530 regprop(RExC_rx, mysv, scan);
13531 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13532 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13533 (temp == NULL ? "->" : ""),
13534 (temp == NULL ? PL_reg_name[OP(val)] : "")
13542 if (reg_off_by_arg[OP(scan)]) {
13543 ARG_SET(scan, val - scan);
13546 NEXT_OFF(scan) = val - scan;
13552 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13553 - Look for optimizable sequences at the same time.
13554 - currently only looks for EXACT chains.
13556 This is experimental code. The idea is to use this routine to perform
13557 in place optimizations on branches and groups as they are constructed,
13558 with the long term intention of removing optimization from study_chunk so
13559 that it is purely analytical.
13561 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13562 to control which is which.
13565 /* TODO: All four parms should be const */
13568 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13573 #ifdef EXPERIMENTAL_INPLACESCAN
13576 GET_RE_DEBUG_FLAGS_DECL;
13578 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13584 /* Find last node. */
13588 regnode * const temp = regnext(scan);
13589 #ifdef EXPERIMENTAL_INPLACESCAN
13590 if (PL_regkind[OP(scan)] == EXACT) {
13591 bool has_exactf_sharp_s; /* Unexamined in this routine */
13592 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13597 switch (OP(scan)) {
13603 case EXACTFU_TRICKYFOLD:
13605 if( exact == PSEUDO )
13607 else if ( exact != OP(scan) )
13616 SV * const mysv=sv_newmortal();
13617 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13618 regprop(RExC_rx, mysv, scan);
13619 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13620 SvPV_nolen_const(mysv),
13621 REG_NODE_NUM(scan),
13622 PL_reg_name[exact]);
13629 SV * const mysv_val=sv_newmortal();
13630 DEBUG_PARSE_MSG("");
13631 regprop(RExC_rx, mysv_val, val);
13632 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13633 SvPV_nolen_const(mysv_val),
13634 (IV)REG_NODE_NUM(val),
13638 if (reg_off_by_arg[OP(scan)]) {
13639 ARG_SET(scan, val - scan);
13642 NEXT_OFF(scan) = val - scan;
13650 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13654 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13660 for (bit=0; bit<32; bit++) {
13661 if (flags & (1<<bit)) {
13662 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13665 if (!set++ && lead)
13666 PerlIO_printf(Perl_debug_log, "%s",lead);
13667 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13670 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13671 if (!set++ && lead) {
13672 PerlIO_printf(Perl_debug_log, "%s",lead);
13675 case REGEX_UNICODE_CHARSET:
13676 PerlIO_printf(Perl_debug_log, "UNICODE");
13678 case REGEX_LOCALE_CHARSET:
13679 PerlIO_printf(Perl_debug_log, "LOCALE");
13681 case REGEX_ASCII_RESTRICTED_CHARSET:
13682 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13684 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13685 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13688 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13694 PerlIO_printf(Perl_debug_log, "\n");
13696 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13702 Perl_regdump(pTHX_ const regexp *r)
13706 SV * const sv = sv_newmortal();
13707 SV *dsv= sv_newmortal();
13708 RXi_GET_DECL(r,ri);
13709 GET_RE_DEBUG_FLAGS_DECL;
13711 PERL_ARGS_ASSERT_REGDUMP;
13713 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13715 /* Header fields of interest. */
13716 if (r->anchored_substr) {
13717 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13718 RE_SV_DUMPLEN(r->anchored_substr), 30);
13719 PerlIO_printf(Perl_debug_log,
13720 "anchored %s%s at %"IVdf" ",
13721 s, RE_SV_TAIL(r->anchored_substr),
13722 (IV)r->anchored_offset);
13723 } else if (r->anchored_utf8) {
13724 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13725 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13726 PerlIO_printf(Perl_debug_log,
13727 "anchored utf8 %s%s at %"IVdf" ",
13728 s, RE_SV_TAIL(r->anchored_utf8),
13729 (IV)r->anchored_offset);
13731 if (r->float_substr) {
13732 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13733 RE_SV_DUMPLEN(r->float_substr), 30);
13734 PerlIO_printf(Perl_debug_log,
13735 "floating %s%s at %"IVdf"..%"UVuf" ",
13736 s, RE_SV_TAIL(r->float_substr),
13737 (IV)r->float_min_offset, (UV)r->float_max_offset);
13738 } else if (r->float_utf8) {
13739 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13740 RE_SV_DUMPLEN(r->float_utf8), 30);
13741 PerlIO_printf(Perl_debug_log,
13742 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13743 s, RE_SV_TAIL(r->float_utf8),
13744 (IV)r->float_min_offset, (UV)r->float_max_offset);
13746 if (r->check_substr || r->check_utf8)
13747 PerlIO_printf(Perl_debug_log,
13749 (r->check_substr == r->float_substr
13750 && r->check_utf8 == r->float_utf8
13751 ? "(checking floating" : "(checking anchored"));
13752 if (r->extflags & RXf_NOSCAN)
13753 PerlIO_printf(Perl_debug_log, " noscan");
13754 if (r->extflags & RXf_CHECK_ALL)
13755 PerlIO_printf(Perl_debug_log, " isall");
13756 if (r->check_substr || r->check_utf8)
13757 PerlIO_printf(Perl_debug_log, ") ");
13759 if (ri->regstclass) {
13760 regprop(r, sv, ri->regstclass);
13761 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13763 if (r->extflags & RXf_ANCH) {
13764 PerlIO_printf(Perl_debug_log, "anchored");
13765 if (r->extflags & RXf_ANCH_BOL)
13766 PerlIO_printf(Perl_debug_log, "(BOL)");
13767 if (r->extflags & RXf_ANCH_MBOL)
13768 PerlIO_printf(Perl_debug_log, "(MBOL)");
13769 if (r->extflags & RXf_ANCH_SBOL)
13770 PerlIO_printf(Perl_debug_log, "(SBOL)");
13771 if (r->extflags & RXf_ANCH_GPOS)
13772 PerlIO_printf(Perl_debug_log, "(GPOS)");
13773 PerlIO_putc(Perl_debug_log, ' ');
13775 if (r->extflags & RXf_GPOS_SEEN)
13776 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13777 if (r->intflags & PREGf_SKIP)
13778 PerlIO_printf(Perl_debug_log, "plus ");
13779 if (r->intflags & PREGf_IMPLICIT)
13780 PerlIO_printf(Perl_debug_log, "implicit ");
13781 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13782 if (r->extflags & RXf_EVAL_SEEN)
13783 PerlIO_printf(Perl_debug_log, "with eval ");
13784 PerlIO_printf(Perl_debug_log, "\n");
13785 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13787 PERL_ARGS_ASSERT_REGDUMP;
13788 PERL_UNUSED_CONTEXT;
13789 PERL_UNUSED_ARG(r);
13790 #endif /* DEBUGGING */
13794 - regprop - printable representation of opcode
13796 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13799 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13800 if (flags & ANYOF_INVERT) \
13801 /*make sure the invert info is in each */ \
13802 sv_catpvs(sv, "^"); \
13808 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13814 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13815 static const char * const anyofs[] = {
13847 RXi_GET_DECL(prog,progi);
13848 GET_RE_DEBUG_FLAGS_DECL;
13850 PERL_ARGS_ASSERT_REGPROP;
13854 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13855 /* It would be nice to FAIL() here, but this may be called from
13856 regexec.c, and it would be hard to supply pRExC_state. */
13857 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13858 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13860 k = PL_regkind[OP(o)];
13863 sv_catpvs(sv, " ");
13864 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13865 * is a crude hack but it may be the best for now since
13866 * we have no flag "this EXACTish node was UTF-8"
13868 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13869 PERL_PV_ESCAPE_UNI_DETECT |
13870 PERL_PV_ESCAPE_NONASCII |
13871 PERL_PV_PRETTY_ELLIPSES |
13872 PERL_PV_PRETTY_LTGT |
13873 PERL_PV_PRETTY_NOCLEAR
13875 } else if (k == TRIE) {
13876 /* print the details of the trie in dumpuntil instead, as
13877 * progi->data isn't available here */
13878 const char op = OP(o);
13879 const U32 n = ARG(o);
13880 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13881 (reg_ac_data *)progi->data->data[n] :
13883 const reg_trie_data * const trie
13884 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13886 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13887 DEBUG_TRIE_COMPILE_r(
13888 Perl_sv_catpvf(aTHX_ sv,
13889 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13890 (UV)trie->startstate,
13891 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13892 (UV)trie->wordcount,
13895 (UV)TRIE_CHARCOUNT(trie),
13896 (UV)trie->uniquecharcount
13899 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13901 int rangestart = -1;
13902 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13903 sv_catpvs(sv, "[");
13904 for (i = 0; i <= 256; i++) {
13905 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13906 if (rangestart == -1)
13908 } else if (rangestart != -1) {
13909 if (i <= rangestart + 3)
13910 for (; rangestart < i; rangestart++)
13911 put_byte(sv, rangestart);
13913 put_byte(sv, rangestart);
13914 sv_catpvs(sv, "-");
13915 put_byte(sv, i - 1);
13920 sv_catpvs(sv, "]");
13923 } else if (k == CURLY) {
13924 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13925 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13926 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13928 else if (k == WHILEM && o->flags) /* Ordinal/of */
13929 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13930 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13931 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13932 if ( RXp_PAREN_NAMES(prog) ) {
13933 if ( k != REF || (OP(o) < NREF)) {
13934 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13935 SV **name= av_fetch(list, ARG(o), 0 );
13937 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13940 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13941 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13942 I32 *nums=(I32*)SvPVX(sv_dat);
13943 SV **name= av_fetch(list, nums[0], 0 );
13946 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13947 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13948 (n ? "," : ""), (IV)nums[n]);
13950 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13954 } else if (k == GOSUB)
13955 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13956 else if (k == VERB) {
13958 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13959 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13960 } else if (k == LOGICAL)
13961 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13962 else if (k == ANYOF) {
13963 int i, rangestart = -1;
13964 const U8 flags = ANYOF_FLAGS(o);
13968 if (flags & ANYOF_LOCALE)
13969 sv_catpvs(sv, "{loc}");
13970 if (flags & ANYOF_LOC_FOLD)
13971 sv_catpvs(sv, "{i}");
13972 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13973 if (flags & ANYOF_INVERT)
13974 sv_catpvs(sv, "^");
13976 /* output what the standard cp 0-255 bitmap matches */
13977 for (i = 0; i <= 256; i++) {
13978 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13979 if (rangestart == -1)
13981 } else if (rangestart != -1) {
13982 if (i <= rangestart + 3)
13983 for (; rangestart < i; rangestart++)
13984 put_byte(sv, rangestart);
13986 put_byte(sv, rangestart);
13987 sv_catpvs(sv, "-");
13988 put_byte(sv, i - 1);
13995 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13996 /* output any special charclass tests (used entirely under use locale) */
13997 if (ANYOF_CLASS_TEST_ANY_SET(o))
13998 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13999 if (ANYOF_CLASS_TEST(o,i)) {
14000 sv_catpv(sv, anyofs[i]);
14004 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14006 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14007 sv_catpvs(sv, "{non-utf8-latin1-all}");
14010 /* output information about the unicode matching */
14011 if (flags & ANYOF_UNICODE_ALL)
14012 sv_catpvs(sv, "{unicode_all}");
14013 else if (ANYOF_NONBITMAP(o))
14014 sv_catpvs(sv, "{unicode}");
14015 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14016 sv_catpvs(sv, "{outside bitmap}");
14018 if (ANYOF_NONBITMAP(o)) {
14019 SV *lv; /* Set if there is something outside the bit map */
14020 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14021 bool byte_output = FALSE; /* If something in the bitmap has been
14024 if (lv && lv != &PL_sv_undef) {
14026 U8 s[UTF8_MAXBYTES_CASE+1];
14028 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14029 uvchr_to_utf8(s, i);
14032 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14036 && swash_fetch(sw, s, TRUE))
14038 if (rangestart == -1)
14040 } else if (rangestart != -1) {
14041 byte_output = TRUE;
14042 if (i <= rangestart + 3)
14043 for (; rangestart < i; rangestart++) {
14044 put_byte(sv, rangestart);
14047 put_byte(sv, rangestart);
14048 sv_catpvs(sv, "-");
14057 char *s = savesvpv(lv);
14058 char * const origs = s;
14060 while (*s && *s != '\n')
14064 const char * const t = ++s;
14067 sv_catpvs(sv, " ");
14073 /* Truncate very long output */
14074 if (s - origs > 256) {
14075 Perl_sv_catpvf(aTHX_ sv,
14077 (int) (s - origs - 1),
14083 else if (*s == '\t') {
14102 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14104 else if (k == POSIXD || k == NPOSIXD) {
14105 U8 index = FLAGS(o) * 2;
14106 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14107 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14110 sv_catpv(sv, anyofs[index]);
14113 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14114 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14116 PERL_UNUSED_CONTEXT;
14117 PERL_UNUSED_ARG(sv);
14118 PERL_UNUSED_ARG(o);
14119 PERL_UNUSED_ARG(prog);
14120 #endif /* DEBUGGING */
14124 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14125 { /* Assume that RE_INTUIT is set */
14127 struct regexp *const prog = ReANY(r);
14128 GET_RE_DEBUG_FLAGS_DECL;
14130 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14131 PERL_UNUSED_CONTEXT;
14135 const char * const s = SvPV_nolen_const(prog->check_substr
14136 ? prog->check_substr : prog->check_utf8);
14138 if (!PL_colorset) reginitcolors();
14139 PerlIO_printf(Perl_debug_log,
14140 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14142 prog->check_substr ? "" : "utf8 ",
14143 PL_colors[5],PL_colors[0],
14146 (strlen(s) > 60 ? "..." : ""));
14149 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14155 handles refcounting and freeing the perl core regexp structure. When
14156 it is necessary to actually free the structure the first thing it
14157 does is call the 'free' method of the regexp_engine associated to
14158 the regexp, allowing the handling of the void *pprivate; member
14159 first. (This routine is not overridable by extensions, which is why
14160 the extensions free is called first.)
14162 See regdupe and regdupe_internal if you change anything here.
14164 #ifndef PERL_IN_XSUB_RE
14166 Perl_pregfree(pTHX_ REGEXP *r)
14172 Perl_pregfree2(pTHX_ REGEXP *rx)
14175 struct regexp *const r = ReANY(rx);
14176 GET_RE_DEBUG_FLAGS_DECL;
14178 PERL_ARGS_ASSERT_PREGFREE2;
14180 if (r->mother_re) {
14181 ReREFCNT_dec(r->mother_re);
14183 CALLREGFREE_PVT(rx); /* free the private data */
14184 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14185 Safefree(r->xpv_len_u.xpvlenu_pv);
14188 SvREFCNT_dec(r->anchored_substr);
14189 SvREFCNT_dec(r->anchored_utf8);
14190 SvREFCNT_dec(r->float_substr);
14191 SvREFCNT_dec(r->float_utf8);
14192 Safefree(r->substrs);
14194 RX_MATCH_COPY_FREE(rx);
14195 #ifdef PERL_OLD_COPY_ON_WRITE
14196 SvREFCNT_dec(r->saved_copy);
14199 SvREFCNT_dec(r->qr_anoncv);
14200 rx->sv_u.svu_rx = 0;
14205 This is a hacky workaround to the structural issue of match results
14206 being stored in the regexp structure which is in turn stored in
14207 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14208 could be PL_curpm in multiple contexts, and could require multiple
14209 result sets being associated with the pattern simultaneously, such
14210 as when doing a recursive match with (??{$qr})
14212 The solution is to make a lightweight copy of the regexp structure
14213 when a qr// is returned from the code executed by (??{$qr}) this
14214 lightweight copy doesn't actually own any of its data except for
14215 the starp/end and the actual regexp structure itself.
14221 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14223 struct regexp *ret;
14224 struct regexp *const r = ReANY(rx);
14225 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14227 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14230 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14232 SvOK_off((SV *)ret_x);
14234 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14235 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14236 made both spots point to the same regexp body.) */
14237 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14238 assert(!SvPVX(ret_x));
14239 ret_x->sv_u.svu_rx = temp->sv_any;
14240 temp->sv_any = NULL;
14241 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14242 SvREFCNT_dec(temp);
14243 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14244 ing below will not set it. */
14245 SvCUR_set(ret_x, SvCUR(rx));
14248 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14249 sv_force_normal(sv) is called. */
14251 ret = ReANY(ret_x);
14253 SvFLAGS(ret_x) |= SvUTF8(rx);
14254 /* We share the same string buffer as the original regexp, on which we
14255 hold a reference count, incremented when mother_re is set below.
14256 The string pointer is copied here, being part of the regexp struct.
14258 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14259 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14261 const I32 npar = r->nparens+1;
14262 Newx(ret->offs, npar, regexp_paren_pair);
14263 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14266 Newx(ret->substrs, 1, struct reg_substr_data);
14267 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14269 SvREFCNT_inc_void(ret->anchored_substr);
14270 SvREFCNT_inc_void(ret->anchored_utf8);
14271 SvREFCNT_inc_void(ret->float_substr);
14272 SvREFCNT_inc_void(ret->float_utf8);
14274 /* check_substr and check_utf8, if non-NULL, point to either their
14275 anchored or float namesakes, and don't hold a second reference. */
14277 RX_MATCH_COPIED_off(ret_x);
14278 #ifdef PERL_OLD_COPY_ON_WRITE
14279 ret->saved_copy = NULL;
14281 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14282 SvREFCNT_inc_void(ret->qr_anoncv);
14288 /* regfree_internal()
14290 Free the private data in a regexp. This is overloadable by
14291 extensions. Perl takes care of the regexp structure in pregfree(),
14292 this covers the *pprivate pointer which technically perl doesn't
14293 know about, however of course we have to handle the
14294 regexp_internal structure when no extension is in use.
14296 Note this is called before freeing anything in the regexp
14301 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14304 struct regexp *const r = ReANY(rx);
14305 RXi_GET_DECL(r,ri);
14306 GET_RE_DEBUG_FLAGS_DECL;
14308 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14314 SV *dsv= sv_newmortal();
14315 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14316 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14317 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14318 PL_colors[4],PL_colors[5],s);
14321 #ifdef RE_TRACK_PATTERN_OFFSETS
14323 Safefree(ri->u.offsets); /* 20010421 MJD */
14325 if (ri->code_blocks) {
14327 for (n = 0; n < ri->num_code_blocks; n++)
14328 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14329 Safefree(ri->code_blocks);
14333 int n = ri->data->count;
14336 /* If you add a ->what type here, update the comment in regcomp.h */
14337 switch (ri->data->what[n]) {
14343 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14346 Safefree(ri->data->data[n]);
14352 { /* Aho Corasick add-on structure for a trie node.
14353 Used in stclass optimization only */
14355 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14357 refcount = --aho->refcount;
14360 PerlMemShared_free(aho->states);
14361 PerlMemShared_free(aho->fail);
14362 /* do this last!!!! */
14363 PerlMemShared_free(ri->data->data[n]);
14364 PerlMemShared_free(ri->regstclass);
14370 /* trie structure. */
14372 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14374 refcount = --trie->refcount;
14377 PerlMemShared_free(trie->charmap);
14378 PerlMemShared_free(trie->states);
14379 PerlMemShared_free(trie->trans);
14381 PerlMemShared_free(trie->bitmap);
14383 PerlMemShared_free(trie->jump);
14384 PerlMemShared_free(trie->wordinfo);
14385 /* do this last!!!! */
14386 PerlMemShared_free(ri->data->data[n]);
14391 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14394 Safefree(ri->data->what);
14395 Safefree(ri->data);
14401 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14402 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14403 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14406 re_dup - duplicate a regexp.
14408 This routine is expected to clone a given regexp structure. It is only
14409 compiled under USE_ITHREADS.
14411 After all of the core data stored in struct regexp is duplicated
14412 the regexp_engine.dupe method is used to copy any private data
14413 stored in the *pprivate pointer. This allows extensions to handle
14414 any duplication it needs to do.
14416 See pregfree() and regfree_internal() if you change anything here.
14418 #if defined(USE_ITHREADS)
14419 #ifndef PERL_IN_XSUB_RE
14421 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14425 const struct regexp *r = ReANY(sstr);
14426 struct regexp *ret = ReANY(dstr);
14428 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14430 npar = r->nparens+1;
14431 Newx(ret->offs, npar, regexp_paren_pair);
14432 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14434 /* no need to copy these */
14435 Newx(ret->swap, npar, regexp_paren_pair);
14438 if (ret->substrs) {
14439 /* Do it this way to avoid reading from *r after the StructCopy().
14440 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14441 cache, it doesn't matter. */
14442 const bool anchored = r->check_substr
14443 ? r->check_substr == r->anchored_substr
14444 : r->check_utf8 == r->anchored_utf8;
14445 Newx(ret->substrs, 1, struct reg_substr_data);
14446 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14448 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14449 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14450 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14451 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14453 /* check_substr and check_utf8, if non-NULL, point to either their
14454 anchored or float namesakes, and don't hold a second reference. */
14456 if (ret->check_substr) {
14458 assert(r->check_utf8 == r->anchored_utf8);
14459 ret->check_substr = ret->anchored_substr;
14460 ret->check_utf8 = ret->anchored_utf8;
14462 assert(r->check_substr == r->float_substr);
14463 assert(r->check_utf8 == r->float_utf8);
14464 ret->check_substr = ret->float_substr;
14465 ret->check_utf8 = ret->float_utf8;
14467 } else if (ret->check_utf8) {
14469 ret->check_utf8 = ret->anchored_utf8;
14471 ret->check_utf8 = ret->float_utf8;
14476 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14477 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14480 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14482 if (RX_MATCH_COPIED(dstr))
14483 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14485 ret->subbeg = NULL;
14486 #ifdef PERL_OLD_COPY_ON_WRITE
14487 ret->saved_copy = NULL;
14490 /* Whether mother_re be set or no, we need to copy the string. We
14491 cannot refrain from copying it when the storage points directly to
14492 our mother regexp, because that's
14493 1: a buffer in a different thread
14494 2: something we no longer hold a reference on
14495 so we need to copy it locally. */
14496 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14497 ret->mother_re = NULL;
14500 #endif /* PERL_IN_XSUB_RE */
14505 This is the internal complement to regdupe() which is used to copy
14506 the structure pointed to by the *pprivate pointer in the regexp.
14507 This is the core version of the extension overridable cloning hook.
14508 The regexp structure being duplicated will be copied by perl prior
14509 to this and will be provided as the regexp *r argument, however
14510 with the /old/ structures pprivate pointer value. Thus this routine
14511 may override any copying normally done by perl.
14513 It returns a pointer to the new regexp_internal structure.
14517 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14520 struct regexp *const r = ReANY(rx);
14521 regexp_internal *reti;
14523 RXi_GET_DECL(r,ri);
14525 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14529 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14530 Copy(ri->program, reti->program, len+1, regnode);
14532 reti->num_code_blocks = ri->num_code_blocks;
14533 if (ri->code_blocks) {
14535 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14536 struct reg_code_block);
14537 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14538 struct reg_code_block);
14539 for (n = 0; n < ri->num_code_blocks; n++)
14540 reti->code_blocks[n].src_regex = (REGEXP*)
14541 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14544 reti->code_blocks = NULL;
14546 reti->regstclass = NULL;
14549 struct reg_data *d;
14550 const int count = ri->data->count;
14553 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14554 char, struct reg_data);
14555 Newx(d->what, count, U8);
14558 for (i = 0; i < count; i++) {
14559 d->what[i] = ri->data->what[i];
14560 switch (d->what[i]) {
14561 /* see also regcomp.h and regfree_internal() */
14562 case 'a': /* actually an AV, but the dup function is identical. */
14566 case 'u': /* actually an HV, but the dup function is identical. */
14567 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14570 /* This is cheating. */
14571 Newx(d->data[i], 1, struct regnode_charclass_class);
14572 StructCopy(ri->data->data[i], d->data[i],
14573 struct regnode_charclass_class);
14574 reti->regstclass = (regnode*)d->data[i];
14577 /* Trie stclasses are readonly and can thus be shared
14578 * without duplication. We free the stclass in pregfree
14579 * when the corresponding reg_ac_data struct is freed.
14581 reti->regstclass= ri->regstclass;
14585 ((reg_trie_data*)ri->data->data[i])->refcount++;
14590 d->data[i] = ri->data->data[i];
14593 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14602 reti->name_list_idx = ri->name_list_idx;
14604 #ifdef RE_TRACK_PATTERN_OFFSETS
14605 if (ri->u.offsets) {
14606 Newx(reti->u.offsets, 2*len+1, U32);
14607 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14610 SetProgLen(reti,len);
14613 return (void*)reti;
14616 #endif /* USE_ITHREADS */
14618 #ifndef PERL_IN_XSUB_RE
14621 - regnext - dig the "next" pointer out of a node
14624 Perl_regnext(pTHX_ regnode *p)
14632 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14633 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14636 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14645 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14648 STRLEN l1 = strlen(pat1);
14649 STRLEN l2 = strlen(pat2);
14652 const char *message;
14654 PERL_ARGS_ASSERT_RE_CROAK2;
14660 Copy(pat1, buf, l1 , char);
14661 Copy(pat2, buf + l1, l2 , char);
14662 buf[l1 + l2] = '\n';
14663 buf[l1 + l2 + 1] = '\0';
14665 /* ANSI variant takes additional second argument */
14666 va_start(args, pat2);
14670 msv = vmess(buf, &args);
14672 message = SvPV_const(msv,l1);
14675 Copy(message, buf, l1 , char);
14676 buf[l1-1] = '\0'; /* Overwrite \n */
14677 Perl_croak(aTHX_ "%s", buf);
14680 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14682 #ifndef PERL_IN_XSUB_RE
14684 Perl_save_re_context(pTHX)
14688 struct re_save_state *state;
14690 SAVEVPTR(PL_curcop);
14691 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14693 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14694 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14695 SSPUSHUV(SAVEt_RE_STATE);
14697 Copy(&PL_reg_state, state, 1, struct re_save_state);
14699 PL_reg_oldsaved = NULL;
14700 PL_reg_oldsavedlen = 0;
14701 PL_reg_oldsavedoffset = 0;
14702 PL_reg_oldsavedcoffset = 0;
14703 PL_reg_maxiter = 0;
14704 PL_reg_leftiter = 0;
14705 PL_reg_poscache = NULL;
14706 PL_reg_poscache_size = 0;
14707 #ifdef PERL_OLD_COPY_ON_WRITE
14711 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14713 const REGEXP * const rx = PM_GETRE(PL_curpm);
14716 for (i = 1; i <= RX_NPARENS(rx); i++) {
14717 char digits[TYPE_CHARS(long)];
14718 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14719 GV *const *const gvp
14720 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14723 GV * const gv = *gvp;
14724 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14734 clear_re(pTHX_ void *r)
14737 ReREFCNT_dec((REGEXP *)r);
14743 S_put_byte(pTHX_ SV *sv, int c)
14745 PERL_ARGS_ASSERT_PUT_BYTE;
14747 /* Our definition of isPRINT() ignores locales, so only bytes that are
14748 not part of UTF-8 are considered printable. I assume that the same
14749 holds for UTF-EBCDIC.
14750 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14751 which Wikipedia says:
14753 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14754 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14755 identical, to the ASCII delete (DEL) or rubout control character.
14756 ) So the old condition can be simplified to !isPRINT(c) */
14759 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14762 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14766 const char string = c;
14767 if (c == '-' || c == ']' || c == '\\' || c == '^')
14768 sv_catpvs(sv, "\\");
14769 sv_catpvn(sv, &string, 1);
14774 #define CLEAR_OPTSTART \
14775 if (optstart) STMT_START { \
14776 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14780 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14782 STATIC const regnode *
14783 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14784 const regnode *last, const regnode *plast,
14785 SV* sv, I32 indent, U32 depth)
14788 U8 op = PSEUDO; /* Arbitrary non-END op. */
14789 const regnode *next;
14790 const regnode *optstart= NULL;
14792 RXi_GET_DECL(r,ri);
14793 GET_RE_DEBUG_FLAGS_DECL;
14795 PERL_ARGS_ASSERT_DUMPUNTIL;
14797 #ifdef DEBUG_DUMPUNTIL
14798 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14799 last ? last-start : 0,plast ? plast-start : 0);
14802 if (plast && plast < last)
14805 while (PL_regkind[op] != END && (!last || node < last)) {
14806 /* While that wasn't END last time... */
14809 if (op == CLOSE || op == WHILEM)
14811 next = regnext((regnode *)node);
14814 if (OP(node) == OPTIMIZED) {
14815 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14822 regprop(r, sv, node);
14823 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14824 (int)(2*indent + 1), "", SvPVX_const(sv));
14826 if (OP(node) != OPTIMIZED) {
14827 if (next == NULL) /* Next ptr. */
14828 PerlIO_printf(Perl_debug_log, " (0)");
14829 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14830 PerlIO_printf(Perl_debug_log, " (FAIL)");
14832 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14833 (void)PerlIO_putc(Perl_debug_log, '\n');
14837 if (PL_regkind[(U8)op] == BRANCHJ) {
14840 const regnode *nnode = (OP(next) == LONGJMP
14841 ? regnext((regnode *)next)
14843 if (last && nnode > last)
14845 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14848 else if (PL_regkind[(U8)op] == BRANCH) {
14850 DUMPUNTIL(NEXTOPER(node), next);
14852 else if ( PL_regkind[(U8)op] == TRIE ) {
14853 const regnode *this_trie = node;
14854 const char op = OP(node);
14855 const U32 n = ARG(node);
14856 const reg_ac_data * const ac = op>=AHOCORASICK ?
14857 (reg_ac_data *)ri->data->data[n] :
14859 const reg_trie_data * const trie =
14860 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14862 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14864 const regnode *nextbranch= NULL;
14867 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14868 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14870 PerlIO_printf(Perl_debug_log, "%*s%s ",
14871 (int)(2*(indent+3)), "",
14872 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14873 PL_colors[0], PL_colors[1],
14874 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14875 PERL_PV_PRETTY_ELLIPSES |
14876 PERL_PV_PRETTY_LTGT
14881 U16 dist= trie->jump[word_idx+1];
14882 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14883 (UV)((dist ? this_trie + dist : next) - start));
14886 nextbranch= this_trie + trie->jump[0];
14887 DUMPUNTIL(this_trie + dist, nextbranch);
14889 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14890 nextbranch= regnext((regnode *)nextbranch);
14892 PerlIO_printf(Perl_debug_log, "\n");
14895 if (last && next > last)
14900 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14901 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14902 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14904 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14906 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14908 else if ( op == PLUS || op == STAR) {
14909 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14911 else if (PL_regkind[(U8)op] == ANYOF) {
14912 /* arglen 1 + class block */
14913 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14914 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14915 node = NEXTOPER(node);
14917 else if (PL_regkind[(U8)op] == EXACT) {
14918 /* Literal string, where present. */
14919 node += NODE_SZ_STR(node) - 1;
14920 node = NEXTOPER(node);
14923 node = NEXTOPER(node);
14924 node += regarglen[(U8)op];
14926 if (op == CURLYX || op == OPEN)
14930 #ifdef DEBUG_DUMPUNTIL
14931 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14936 #endif /* DEBUGGING */
14940 * c-indentation-style: bsd
14941 * c-basic-offset: 4
14942 * indent-tabs-mode: nil
14945 * ex: set ts=8 sts=4 sw=4 et: