5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235 * character. (There needs to be a case: in the switch statement in regexec.c
236 * for any node marked SIMPLE.) Note that this is not the same thing as
239 #define SPSTART 0x04 /* Starts with * or + */
240 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
241 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
243 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
245 /* whether trie related optimizations are enabled */
246 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
247 #define TRIE_STUDY_OPT
248 #define FULL_TRIE_STUDY
254 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
255 #define PBITVAL(paren) (1 << ((paren) & 7))
256 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
257 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
258 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
260 /* If not already in utf8, do a longjmp back to the beginning */
261 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
262 #define REQUIRE_UTF8 STMT_START { \
263 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
266 /* About scan_data_t.
268 During optimisation we recurse through the regexp program performing
269 various inplace (keyhole style) optimisations. In addition study_chunk
270 and scan_commit populate this data structure with information about
271 what strings MUST appear in the pattern. We look for the longest
272 string that must appear at a fixed location, and we look for the
273 longest string that may appear at a floating location. So for instance
278 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
279 strings (because they follow a .* construct). study_chunk will identify
280 both FOO and BAR as being the longest fixed and floating strings respectively.
282 The strings can be composites, for instance
286 will result in a composite fixed substring 'foo'.
288 For each string some basic information is maintained:
290 - offset or min_offset
291 This is the position the string must appear at, or not before.
292 It also implicitly (when combined with minlenp) tells us how many
293 characters must match before the string we are searching for.
294 Likewise when combined with minlenp and the length of the string it
295 tells us how many characters must appear after the string we have
299 Only used for floating strings. This is the rightmost point that
300 the string can appear at. If set to I32 max it indicates that the
301 string can occur infinitely far to the right.
304 A pointer to the minimum number of characters of the pattern that the
305 string was found inside. This is important as in the case of positive
306 lookahead or positive lookbehind we can have multiple patterns
311 The minimum length of the pattern overall is 3, the minimum length
312 of the lookahead part is 3, but the minimum length of the part that
313 will actually match is 1. So 'FOO's minimum length is 3, but the
314 minimum length for the F is 1. This is important as the minimum length
315 is used to determine offsets in front of and behind the string being
316 looked for. Since strings can be composites this is the length of the
317 pattern at the time it was committed with a scan_commit. Note that
318 the length is calculated by study_chunk, so that the minimum lengths
319 are not known until the full pattern has been compiled, thus the
320 pointer to the value.
324 In the case of lookbehind the string being searched for can be
325 offset past the start point of the final matching string.
326 If this value was just blithely removed from the min_offset it would
327 invalidate some of the calculations for how many chars must match
328 before or after (as they are derived from min_offset and minlen and
329 the length of the string being searched for).
330 When the final pattern is compiled and the data is moved from the
331 scan_data_t structure into the regexp structure the information
332 about lookbehind is factored in, with the information that would
333 have been lost precalculated in the end_shift field for the
336 The fields pos_min and pos_delta are used to store the minimum offset
337 and the delta to the maximum offset at the current point in the pattern.
341 typedef struct scan_data_t {
342 /*I32 len_min; unused */
343 /*I32 len_delta; unused */
347 I32 last_end; /* min value, <0 unless valid. */
350 SV **longest; /* Either &l_fixed, or &l_float. */
351 SV *longest_fixed; /* longest fixed string found in pattern */
352 I32 offset_fixed; /* offset where it starts */
353 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
354 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
355 SV *longest_float; /* longest floating string found in pattern */
356 I32 offset_float_min; /* earliest point in string it can appear */
357 I32 offset_float_max; /* latest point in string it can appear */
358 I32 *minlen_float; /* pointer to the minlen relevant to the string */
359 I32 lookbehind_float; /* is the position of the string modified by LB */
363 struct regnode_charclass_class *start_class;
367 * Forward declarations for pregcomp()'s friends.
370 static const scan_data_t zero_scan_data =
371 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
373 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
374 #define SF_BEFORE_SEOL 0x0001
375 #define SF_BEFORE_MEOL 0x0002
376 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
377 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
380 # define SF_FIX_SHIFT_EOL (0+2)
381 # define SF_FL_SHIFT_EOL (0+4)
383 # define SF_FIX_SHIFT_EOL (+2)
384 # define SF_FL_SHIFT_EOL (+4)
387 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
390 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
391 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
392 #define SF_IS_INF 0x0040
393 #define SF_HAS_PAR 0x0080
394 #define SF_IN_PAR 0x0100
395 #define SF_HAS_EVAL 0x0200
396 #define SCF_DO_SUBSTR 0x0400
397 #define SCF_DO_STCLASS_AND 0x0800
398 #define SCF_DO_STCLASS_OR 0x1000
399 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
400 #define SCF_WHILEM_VISITED_POS 0x2000
402 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
403 #define SCF_SEEN_ACCEPT 0x8000
405 #define UTF cBOOL(RExC_utf8)
407 /* The enums for all these are ordered so things work out correctly */
408 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
409 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
410 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
411 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
412 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
413 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
414 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
416 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
418 #define OOB_NAMEDCLASS -1
420 /* There is no code point that is out-of-bounds, so this is problematic. But
421 * its only current use is to initialize a variable that is always set before
423 #define OOB_UNICODE 0xDEADBEEF
425 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
426 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
429 /* length of regex to show in messages that don't mark a position within */
430 #define RegexLengthToShowInErrorMessages 127
433 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
434 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
435 * op/pragma/warn/regcomp.
437 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
438 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
440 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
443 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
444 * arg. Show regex, up to a maximum length. If it's too long, chop and add
447 #define _FAIL(code) STMT_START { \
448 const char *ellipses = ""; \
449 IV len = RExC_end - RExC_precomp; \
452 SAVEFREESV(RExC_rx_sv); \
453 if (len > RegexLengthToShowInErrorMessages) { \
454 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
455 len = RegexLengthToShowInErrorMessages - 10; \
461 #define FAIL(msg) _FAIL( \
462 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
463 msg, (int)len, RExC_precomp, ellipses))
465 #define FAIL2(msg,arg) _FAIL( \
466 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
467 arg, (int)len, RExC_precomp, ellipses))
470 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
472 #define Simple_vFAIL(m) STMT_START { \
473 const IV offset = RExC_parse - RExC_precomp; \
474 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
475 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
479 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
481 #define vFAIL(m) STMT_START { \
483 SAVEFREESV(RExC_rx_sv); \
488 * Like Simple_vFAIL(), but accepts two arguments.
490 #define Simple_vFAIL2(m,a1) STMT_START { \
491 const IV offset = RExC_parse - RExC_precomp; \
492 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
493 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
499 #define vFAIL2(m,a1) STMT_START { \
501 SAVEFREESV(RExC_rx_sv); \
502 Simple_vFAIL2(m, a1); \
507 * Like Simple_vFAIL(), but accepts three arguments.
509 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
510 const IV offset = RExC_parse - RExC_precomp; \
511 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
512 (int)offset, RExC_precomp, RExC_precomp + offset); \
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
518 #define vFAIL3(m,a1,a2) STMT_START { \
520 SAVEFREESV(RExC_rx_sv); \
521 Simple_vFAIL3(m, a1, a2); \
525 * Like Simple_vFAIL(), but accepts four arguments.
527 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
528 const IV offset = RExC_parse - RExC_precomp; \
529 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
530 (int)offset, RExC_precomp, RExC_precomp + offset); \
533 #define ckWARNreg(loc,m) STMT_START { \
534 const IV offset = loc - RExC_precomp; \
535 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
536 (int)offset, RExC_precomp, RExC_precomp + offset); \
539 #define ckWARNregdep(loc,m) STMT_START { \
540 const IV offset = loc - RExC_precomp; \
541 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
543 (int)offset, RExC_precomp, RExC_precomp + offset); \
546 #define ckWARN2regdep(loc,m, a1) STMT_START { \
547 const IV offset = loc - RExC_precomp; \
548 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
550 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
553 #define ckWARN2reg(loc, m, a1) STMT_START { \
554 const IV offset = loc - RExC_precomp; \
555 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
556 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
559 #define vWARN3(loc, m, a1, a2) STMT_START { \
560 const IV offset = loc - RExC_precomp; \
561 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
562 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
566 const IV offset = loc - RExC_precomp; \
567 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
568 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
571 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
572 const IV offset = loc - RExC_precomp; \
573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
574 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
577 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
578 const IV offset = loc - RExC_precomp; \
579 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
580 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
583 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
584 const IV offset = loc - RExC_precomp; \
585 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
586 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
590 /* Allow for side effects in s */
591 #define REGC(c,s) STMT_START { \
592 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
595 /* Macros for recording node offsets. 20001227 mjd@plover.com
596 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
597 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
598 * Element 0 holds the number n.
599 * Position is 1 indexed.
601 #ifndef RE_TRACK_PATTERN_OFFSETS
602 #define Set_Node_Offset_To_R(node,byte)
603 #define Set_Node_Offset(node,byte)
604 #define Set_Cur_Node_Offset
605 #define Set_Node_Length_To_R(node,len)
606 #define Set_Node_Length(node,len)
607 #define Set_Node_Cur_Length(node)
608 #define Node_Offset(n)
609 #define Node_Length(n)
610 #define Set_Node_Offset_Length(node,offset,len)
611 #define ProgLen(ri) ri->u.proglen
612 #define SetProgLen(ri,x) ri->u.proglen = x
614 #define ProgLen(ri) ri->u.offsets[0]
615 #define SetProgLen(ri,x) ri->u.offsets[0] = x
616 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(byte))); \
621 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
623 RExC_offsets[2*(node)-1] = (byte); \
628 #define Set_Node_Offset(node,byte) \
629 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
630 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
632 #define Set_Node_Length_To_R(node,len) STMT_START { \
634 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
635 __LINE__, (int)(node), (int)(len))); \
637 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
639 RExC_offsets[2*(node)] = (len); \
644 #define Set_Node_Length(node,len) \
645 Set_Node_Length_To_R((node)-RExC_emit_start, len)
646 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
647 #define Set_Node_Cur_Length(node) \
648 Set_Node_Length(node, RExC_parse - parse_start)
650 /* Get offsets and lengths */
651 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
652 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
654 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
655 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
656 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
660 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
661 #define EXPERIMENTAL_INPLACESCAN
662 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
664 #define DEBUG_STUDYDATA(str,data,depth) \
665 DEBUG_OPTIMISE_MORE_r(if(data){ \
666 PerlIO_printf(Perl_debug_log, \
667 "%*s" str "Pos:%"IVdf"/%"IVdf \
668 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
669 (int)(depth)*2, "", \
670 (IV)((data)->pos_min), \
671 (IV)((data)->pos_delta), \
672 (UV)((data)->flags), \
673 (IV)((data)->whilem_c), \
674 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
675 is_inf ? "INF " : "" \
677 if ((data)->last_found) \
678 PerlIO_printf(Perl_debug_log, \
679 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
680 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
681 SvPVX_const((data)->last_found), \
682 (IV)((data)->last_end), \
683 (IV)((data)->last_start_min), \
684 (IV)((data)->last_start_max), \
685 ((data)->longest && \
686 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
687 SvPVX_const((data)->longest_fixed), \
688 (IV)((data)->offset_fixed), \
689 ((data)->longest && \
690 (data)->longest==&((data)->longest_float)) ? "*" : "", \
691 SvPVX_const((data)->longest_float), \
692 (IV)((data)->offset_float_min), \
693 (IV)((data)->offset_float_max) \
695 PerlIO_printf(Perl_debug_log,"\n"); \
698 /* Mark that we cannot extend a found fixed substring at this point.
699 Update the longest found anchored substring and the longest found
700 floating substrings if needed. */
703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
705 const STRLEN l = CHR_SVLEN(data->last_found);
706 const STRLEN old_l = CHR_SVLEN(*data->longest);
707 GET_RE_DEBUG_FLAGS_DECL;
709 PERL_ARGS_ASSERT_SCAN_COMMIT;
711 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
712 SvSetMagicSV(*data->longest, data->last_found);
713 if (*data->longest == data->longest_fixed) {
714 data->offset_fixed = l ? data->last_start_min : data->pos_min;
715 if (data->flags & SF_BEFORE_EOL)
717 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
719 data->flags &= ~SF_FIX_BEFORE_EOL;
720 data->minlen_fixed=minlenp;
721 data->lookbehind_fixed=0;
723 else { /* *data->longest == data->longest_float */
724 data->offset_float_min = l ? data->last_start_min : data->pos_min;
725 data->offset_float_max = (l
726 ? data->last_start_max
727 : data->pos_min + data->pos_delta);
728 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
729 data->offset_float_max = I32_MAX;
730 if (data->flags & SF_BEFORE_EOL)
732 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
734 data->flags &= ~SF_FL_BEFORE_EOL;
735 data->minlen_float=minlenp;
736 data->lookbehind_float=0;
739 SvCUR_set(data->last_found, 0);
741 SV * const sv = data->last_found;
742 if (SvUTF8(sv) && SvMAGICAL(sv)) {
743 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
749 data->flags &= ~SF_BEFORE_EOL;
750 DEBUG_STUDYDATA("commit: ",data,0);
753 /* Can match anything (initialization) */
755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
757 PERL_ARGS_ASSERT_CL_ANYTHING;
759 ANYOF_BITMAP_SETALL(cl);
760 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
761 |ANYOF_NON_UTF8_LATIN1_ALL;
763 /* If any portion of the regex is to operate under locale rules,
764 * initialization includes it. The reason this isn't done for all regexes
765 * is that the optimizer was written under the assumption that locale was
766 * all-or-nothing. Given the complexity and lack of documentation in the
767 * optimizer, and that there are inadequate test cases for locale, so many
768 * parts of it may not work properly, it is safest to avoid locale unless
770 if (RExC_contains_locale) {
771 ANYOF_CLASS_SETALL(cl); /* /l uses class */
772 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
775 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
779 /* Can match anything (initialization) */
781 S_cl_is_anything(const struct regnode_charclass_class *cl)
785 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
787 for (value = 0; value <= ANYOF_MAX; value += 2)
788 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
790 if (!(cl->flags & ANYOF_UNICODE_ALL))
792 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
797 /* Can match anything (initialization) */
799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
801 PERL_ARGS_ASSERT_CL_INIT;
803 Zero(cl, 1, struct regnode_charclass_class);
805 cl_anything(pRExC_state, cl);
806 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
809 /* These two functions currently do the exact same thing */
810 #define cl_init_zero S_cl_init
812 /* 'AND' a given class with another one. Can create false positives. 'cl'
813 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
814 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
816 S_cl_and(struct regnode_charclass_class *cl,
817 const struct regnode_charclass_class *and_with)
819 PERL_ARGS_ASSERT_CL_AND;
821 assert(and_with->type == ANYOF);
823 /* I (khw) am not sure all these restrictions are necessary XXX */
824 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
825 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
826 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && !(and_with->flags & ANYOF_LOC_FOLD)
828 && !(cl->flags & ANYOF_LOC_FOLD)) {
831 if (and_with->flags & ANYOF_INVERT)
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] &= ~and_with->bitmap[i];
835 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
836 cl->bitmap[i] &= and_with->bitmap[i];
837 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
839 if (and_with->flags & ANYOF_INVERT) {
841 /* Here, the and'ed node is inverted. Get the AND of the flags that
842 * aren't affected by the inversion. Those that are affected are
843 * handled individually below */
844 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
845 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
846 cl->flags |= affected_flags;
848 /* We currently don't know how to deal with things that aren't in the
849 * bitmap, but we know that the intersection is no greater than what
850 * is already in cl, so let there be false positives that get sorted
851 * out after the synthetic start class succeeds, and the node is
852 * matched for real. */
854 /* The inversion of these two flags indicate that the resulting
855 * intersection doesn't have them */
856 if (and_with->flags & ANYOF_UNICODE_ALL) {
857 cl->flags &= ~ANYOF_UNICODE_ALL;
859 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
860 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
863 else { /* and'd node is not inverted */
864 U8 outside_bitmap_but_not_utf8; /* Temp variable */
866 if (! ANYOF_NONBITMAP(and_with)) {
868 /* Here 'and_with' doesn't match anything outside the bitmap
869 * (except possibly ANYOF_UNICODE_ALL), which means the
870 * intersection can't either, except for ANYOF_UNICODE_ALL, in
871 * which case we don't know what the intersection is, but it's no
872 * greater than what cl already has, so can just leave it alone,
873 * with possible false positives */
874 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
875 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
876 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
879 else if (! ANYOF_NONBITMAP(cl)) {
881 /* Here, 'and_with' does match something outside the bitmap, and cl
882 * doesn't have a list of things to match outside the bitmap. If
883 * cl can match all code points above 255, the intersection will
884 * be those above-255 code points that 'and_with' matches. If cl
885 * can't match all Unicode code points, it means that it can't
886 * match anything outside the bitmap (since the 'if' that got us
887 * into this block tested for that), so we leave the bitmap empty.
889 if (cl->flags & ANYOF_UNICODE_ALL) {
890 ARG_SET(cl, ARG(and_with));
892 /* and_with's ARG may match things that don't require UTF8.
893 * And now cl's will too, in spite of this being an 'and'. See
894 * the comments below about the kludge */
895 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
899 /* Here, both 'and_with' and cl match something outside the
900 * bitmap. Currently we do not do the intersection, so just match
901 * whatever cl had at the beginning. */
905 /* Take the intersection of the two sets of flags. However, the
906 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
907 * kludge around the fact that this flag is not treated like the others
908 * which are initialized in cl_anything(). The way the optimizer works
909 * is that the synthetic start class (SSC) is initialized to match
910 * anything, and then the first time a real node is encountered, its
911 * values are AND'd with the SSC's with the result being the values of
912 * the real node. However, there are paths through the optimizer where
913 * the AND never gets called, so those initialized bits are set
914 * inappropriately, which is not usually a big deal, as they just cause
915 * false positives in the SSC, which will just mean a probably
916 * imperceptible slow down in execution. However this bit has a
917 * higher false positive consequence in that it can cause utf8.pm,
918 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
919 * bigger slowdown and also causes significant extra memory to be used.
920 * In order to prevent this, the code now takes a different tack. The
921 * bit isn't set unless some part of the regular expression needs it,
922 * but once set it won't get cleared. This means that these extra
923 * modules won't get loaded unless there was some path through the
924 * pattern that would have required them anyway, and so any false
925 * positives that occur by not ANDing them out when they could be
926 * aren't as severe as they would be if we treated this bit like all
928 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
929 & ANYOF_NONBITMAP_NON_UTF8;
930 cl->flags &= and_with->flags;
931 cl->flags |= outside_bitmap_but_not_utf8;
935 /* 'OR' a given class with another one. Can create false positives. 'cl'
936 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
937 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
941 PERL_ARGS_ASSERT_CL_OR;
943 if (or_with->flags & ANYOF_INVERT) {
945 /* Here, the or'd node is to be inverted. This means we take the
946 * complement of everything not in the bitmap, but currently we don't
947 * know what that is, so give up and match anything */
948 if (ANYOF_NONBITMAP(or_with)) {
949 cl_anything(pRExC_state, cl);
952 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
953 * <= (B1 | !B2) | (CL1 | !CL2)
954 * which is wasteful if CL2 is small, but we ignore CL2:
955 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
956 * XXXX Can we handle case-fold? Unclear:
957 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
958 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
960 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
961 && !(or_with->flags & ANYOF_LOC_FOLD)
962 && !(cl->flags & ANYOF_LOC_FOLD) ) {
965 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
966 cl->bitmap[i] |= ~or_with->bitmap[i];
967 } /* XXXX: logic is complicated otherwise */
969 cl_anything(pRExC_state, cl);
972 /* And, we can just take the union of the flags that aren't affected
973 * by the inversion */
974 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
976 /* For the remaining flags:
977 ANYOF_UNICODE_ALL and inverted means to not match anything above
978 255, which means that the union with cl should just be
979 what cl has in it, so can ignore this flag
980 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
981 is 127-255 to match them, but then invert that, so the
982 union with cl should just be what cl has in it, so can
985 } else { /* 'or_with' is not inverted */
986 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
987 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
988 && (!(or_with->flags & ANYOF_LOC_FOLD)
989 || (cl->flags & ANYOF_LOC_FOLD)) ) {
992 /* OR char bitmap and class bitmap separately */
993 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
994 cl->bitmap[i] |= or_with->bitmap[i];
995 ANYOF_CLASS_OR(or_with, cl);
997 else { /* XXXX: logic is complicated, leave it along for a moment. */
998 cl_anything(pRExC_state, cl);
1001 if (ANYOF_NONBITMAP(or_with)) {
1003 /* Use the added node's outside-the-bit-map match if there isn't a
1004 * conflict. If there is a conflict (both nodes match something
1005 * outside the bitmap, but what they match outside is not the same
1006 * pointer, and hence not easily compared until XXX we extend
1007 * inversion lists this far), give up and allow the start class to
1008 * match everything outside the bitmap. If that stuff is all above
1009 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1010 if (! ANYOF_NONBITMAP(cl)) {
1011 ARG_SET(cl, ARG(or_with));
1013 else if (ARG(cl) != ARG(or_with)) {
1015 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1016 cl_anything(pRExC_state, cl);
1019 cl->flags |= ANYOF_UNICODE_ALL;
1024 /* Take the union */
1025 cl->flags |= or_with->flags;
1029 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1030 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1031 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1032 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1037 dump_trie(trie,widecharmap,revcharmap)
1038 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1039 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1041 These routines dump out a trie in a somewhat readable format.
1042 The _interim_ variants are used for debugging the interim
1043 tables that are used to generate the final compressed
1044 representation which is what dump_trie expects.
1046 Part of the reason for their existence is to provide a form
1047 of documentation as to how the different representations function.
1052 Dumps the final compressed table form of the trie to Perl_debug_log.
1053 Used for debugging make_trie().
1057 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1058 AV *revcharmap, U32 depth)
1061 SV *sv=sv_newmortal();
1062 int colwidth= widecharmap ? 6 : 4;
1064 GET_RE_DEBUG_FLAGS_DECL;
1066 PERL_ARGS_ASSERT_DUMP_TRIE;
1068 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1069 (int)depth * 2 + 2,"",
1070 "Match","Base","Ofs" );
1072 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1073 SV ** const tmp = av_fetch( revcharmap, state, 0);
1075 PerlIO_printf( Perl_debug_log, "%*s",
1077 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1078 PL_colors[0], PL_colors[1],
1079 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1080 PERL_PV_ESCAPE_FIRSTCHAR
1085 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1086 (int)depth * 2 + 2,"");
1088 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1089 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1090 PerlIO_printf( Perl_debug_log, "\n");
1092 for( state = 1 ; state < trie->statecount ; state++ ) {
1093 const U32 base = trie->states[ state ].trans.base;
1095 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1097 if ( trie->states[ state ].wordnum ) {
1098 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1100 PerlIO_printf( Perl_debug_log, "%6s", "" );
1103 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1108 while( ( base + ofs < trie->uniquecharcount ) ||
1109 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1110 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1113 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1115 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1116 if ( ( base + ofs >= trie->uniquecharcount ) &&
1117 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1118 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1120 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1122 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1124 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1128 PerlIO_printf( Perl_debug_log, "]");
1131 PerlIO_printf( Perl_debug_log, "\n" );
1133 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1134 for (word=1; word <= trie->wordcount; word++) {
1135 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1136 (int)word, (int)(trie->wordinfo[word].prev),
1137 (int)(trie->wordinfo[word].len));
1139 PerlIO_printf(Perl_debug_log, "\n" );
1142 Dumps a fully constructed but uncompressed trie in list form.
1143 List tries normally only are used for construction when the number of
1144 possible chars (trie->uniquecharcount) is very high.
1145 Used for debugging make_trie().
1148 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1149 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1153 SV *sv=sv_newmortal();
1154 int colwidth= widecharmap ? 6 : 4;
1155 GET_RE_DEBUG_FLAGS_DECL;
1157 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1159 /* print out the table precompression. */
1160 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1161 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1162 "------:-----+-----------------\n" );
1164 for( state=1 ; state < next_alloc ; state ++ ) {
1167 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1168 (int)depth * 2 + 2,"", (UV)state );
1169 if ( ! trie->states[ state ].wordnum ) {
1170 PerlIO_printf( Perl_debug_log, "%5s| ","");
1172 PerlIO_printf( Perl_debug_log, "W%4x| ",
1173 trie->states[ state ].wordnum
1176 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1177 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1179 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1181 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1182 PL_colors[0], PL_colors[1],
1183 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1184 PERL_PV_ESCAPE_FIRSTCHAR
1186 TRIE_LIST_ITEM(state,charid).forid,
1187 (UV)TRIE_LIST_ITEM(state,charid).newstate
1190 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1191 (int)((depth * 2) + 14), "");
1194 PerlIO_printf( Perl_debug_log, "\n");
1199 Dumps a fully constructed but uncompressed trie in table form.
1200 This is the normal DFA style state transition table, with a few
1201 twists to facilitate compression later.
1202 Used for debugging make_trie().
1205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1206 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1211 SV *sv=sv_newmortal();
1212 int colwidth= widecharmap ? 6 : 4;
1213 GET_RE_DEBUG_FLAGS_DECL;
1215 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1218 print out the table precompression so that we can do a visual check
1219 that they are identical.
1222 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1224 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1225 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1227 PerlIO_printf( Perl_debug_log, "%*s",
1229 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1230 PL_colors[0], PL_colors[1],
1231 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1232 PERL_PV_ESCAPE_FIRSTCHAR
1238 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1240 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1241 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1244 PerlIO_printf( Perl_debug_log, "\n" );
1246 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1248 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1249 (int)depth * 2 + 2,"",
1250 (UV)TRIE_NODENUM( state ) );
1252 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1253 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1255 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1257 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1259 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1260 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1262 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1263 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1271 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1272 startbranch: the first branch in the whole branch sequence
1273 first : start branch of sequence of branch-exact nodes.
1274 May be the same as startbranch
1275 last : Thing following the last branch.
1276 May be the same as tail.
1277 tail : item following the branch sequence
1278 count : words in the sequence
1279 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1280 depth : indent depth
1282 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1284 A trie is an N'ary tree where the branches are determined by digital
1285 decomposition of the key. IE, at the root node you look up the 1st character and
1286 follow that branch repeat until you find the end of the branches. Nodes can be
1287 marked as "accepting" meaning they represent a complete word. Eg:
1291 would convert into the following structure. Numbers represent states, letters
1292 following numbers represent valid transitions on the letter from that state, if
1293 the number is in square brackets it represents an accepting state, otherwise it
1294 will be in parenthesis.
1296 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1300 (1) +-i->(6)-+-s->[7]
1302 +-s->(3)-+-h->(4)-+-e->[5]
1304 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1306 This shows that when matching against the string 'hers' we will begin at state 1
1307 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1308 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1309 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1310 single traverse. We store a mapping from accepting to state to which word was
1311 matched, and then when we have multiple possibilities we try to complete the
1312 rest of the regex in the order in which they occured in the alternation.
1314 The only prior NFA like behaviour that would be changed by the TRIE support is
1315 the silent ignoring of duplicate alternations which are of the form:
1317 / (DUPE|DUPE) X? (?{ ... }) Y /x
1319 Thus EVAL blocks following a trie may be called a different number of times with
1320 and without the optimisation. With the optimisations dupes will be silently
1321 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1322 the following demonstrates:
1324 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1326 which prints out 'word' three times, but
1328 'words'=~/(word|word|word)(?{ print $1 })S/
1330 which doesnt print it out at all. This is due to other optimisations kicking in.
1332 Example of what happens on a structural level:
1334 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1336 1: CURLYM[1] {1,32767}(18)
1347 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1348 and should turn into:
1350 1: CURLYM[1] {1,32767}(18)
1352 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1360 Cases where tail != last would be like /(?foo|bar)baz/:
1370 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1371 and would end up looking like:
1374 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1381 d = uvuni_to_utf8_flags(d, uv, 0);
1383 is the recommended Unicode-aware way of saying
1388 #define TRIE_STORE_REVCHAR(val) \
1391 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1392 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1393 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1394 SvCUR_set(zlopp, kapow - flrbbbbb); \
1397 av_push(revcharmap, zlopp); \
1399 char ooooff = (char)val; \
1400 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1404 #define TRIE_READ_CHAR STMT_START { \
1407 /* if it is UTF then it is either already folded, or does not need folding */ \
1408 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1410 else if (folder == PL_fold_latin1) { \
1411 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1412 if ( foldlen > 0 ) { \
1413 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1419 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1420 skiplen = UNISKIP(uvc); \
1421 foldlen -= skiplen; \
1422 scan = foldbuf + skiplen; \
1425 /* raw data, will be folded later if needed */ \
1433 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1434 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1435 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1436 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1438 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1439 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1440 TRIE_LIST_CUR( state )++; \
1443 #define TRIE_LIST_NEW(state) STMT_START { \
1444 Newxz( trie->states[ state ].trans.list, \
1445 4, reg_trie_trans_le ); \
1446 TRIE_LIST_CUR( state ) = 1; \
1447 TRIE_LIST_LEN( state ) = 4; \
1450 #define TRIE_HANDLE_WORD(state) STMT_START { \
1451 U16 dupe= trie->states[ state ].wordnum; \
1452 regnode * const noper_next = regnext( noper ); \
1455 /* store the word for dumping */ \
1457 if (OP(noper) != NOTHING) \
1458 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1460 tmp = newSVpvn_utf8( "", 0, UTF ); \
1461 av_push( trie_words, tmp ); \
1465 trie->wordinfo[curword].prev = 0; \
1466 trie->wordinfo[curword].len = wordlen; \
1467 trie->wordinfo[curword].accept = state; \
1469 if ( noper_next < tail ) { \
1471 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1472 trie->jump[curword] = (U16)(noper_next - convert); \
1474 jumper = noper_next; \
1476 nextbranch= regnext(cur); \
1480 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1481 /* chain, so that when the bits of chain are later */\
1482 /* linked together, the dups appear in the chain */\
1483 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1484 trie->wordinfo[dupe].prev = curword; \
1486 /* we haven't inserted this word yet. */ \
1487 trie->states[ state ].wordnum = curword; \
1492 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1493 ( ( base + charid >= ucharcount \
1494 && base + charid < ubound \
1495 && state == trie->trans[ base - ucharcount + charid ].check \
1496 && trie->trans[ base - ucharcount + charid ].next ) \
1497 ? trie->trans[ base - ucharcount + charid ].next \
1498 : ( state==1 ? special : 0 ) \
1502 #define MADE_JUMP_TRIE 2
1503 #define MADE_EXACT_TRIE 4
1506 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1509 /* first pass, loop through and scan words */
1510 reg_trie_data *trie;
1511 HV *widecharmap = NULL;
1512 AV *revcharmap = newAV();
1514 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1519 regnode *jumper = NULL;
1520 regnode *nextbranch = NULL;
1521 regnode *convert = NULL;
1522 U32 *prev_states; /* temp array mapping each state to previous one */
1523 /* we just use folder as a flag in utf8 */
1524 const U8 * folder = NULL;
1527 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1528 AV *trie_words = NULL;
1529 /* along with revcharmap, this only used during construction but both are
1530 * useful during debugging so we store them in the struct when debugging.
1533 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1534 STRLEN trie_charcount=0;
1536 SV *re_trie_maxbuff;
1537 GET_RE_DEBUG_FLAGS_DECL;
1539 PERL_ARGS_ASSERT_MAKE_TRIE;
1541 PERL_UNUSED_ARG(depth);
1548 case EXACTFU_TRICKYFOLD:
1549 case EXACTFU: folder = PL_fold_latin1; break;
1550 case EXACTF: folder = PL_fold; break;
1551 case EXACTFL: folder = PL_fold_locale; break;
1552 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1555 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1557 trie->startstate = 1;
1558 trie->wordcount = word_count;
1559 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1560 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1562 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1563 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1564 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1567 trie_words = newAV();
1570 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1571 if (!SvIOK(re_trie_maxbuff)) {
1572 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1574 DEBUG_TRIE_COMPILE_r({
1575 PerlIO_printf( Perl_debug_log,
1576 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1577 (int)depth * 2 + 2, "",
1578 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1579 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1583 /* Find the node we are going to overwrite */
1584 if ( first == startbranch && OP( last ) != BRANCH ) {
1585 /* whole branch chain */
1588 /* branch sub-chain */
1589 convert = NEXTOPER( first );
1592 /* -- First loop and Setup --
1594 We first traverse the branches and scan each word to determine if it
1595 contains widechars, and how many unique chars there are, this is
1596 important as we have to build a table with at least as many columns as we
1599 We use an array of integers to represent the character codes 0..255
1600 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1601 native representation of the character value as the key and IV's for the
1604 *TODO* If we keep track of how many times each character is used we can
1605 remap the columns so that the table compression later on is more
1606 efficient in terms of memory by ensuring the most common value is in the
1607 middle and the least common are on the outside. IMO this would be better
1608 than a most to least common mapping as theres a decent chance the most
1609 common letter will share a node with the least common, meaning the node
1610 will not be compressible. With a middle is most common approach the worst
1611 case is when we have the least common nodes twice.
1615 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1616 regnode *noper = NEXTOPER( cur );
1617 const U8 *uc = (U8*)STRING( noper );
1618 const U8 *e = uc + STR_LEN( noper );
1620 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1622 const U8 *scan = (U8*)NULL;
1623 U32 wordlen = 0; /* required init */
1625 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1627 if (OP(noper) == NOTHING) {
1628 regnode *noper_next= regnext(noper);
1629 if (noper_next != tail && OP(noper_next) == flags) {
1631 uc= (U8*)STRING(noper);
1632 e= uc + STR_LEN(noper);
1633 trie->minlen= STR_LEN(noper);
1640 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1641 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1642 regardless of encoding */
1643 if (OP( noper ) == EXACTFU_SS) {
1644 /* false positives are ok, so just set this */
1645 TRIE_BITMAP_SET(trie,0xDF);
1648 for ( ; uc < e ; uc += len ) {
1649 TRIE_CHARCOUNT(trie)++;
1654 U8 folded= folder[ (U8) uvc ];
1655 if ( !trie->charmap[ folded ] ) {
1656 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1657 TRIE_STORE_REVCHAR( folded );
1660 if ( !trie->charmap[ uvc ] ) {
1661 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1662 TRIE_STORE_REVCHAR( uvc );
1665 /* store the codepoint in the bitmap, and its folded
1667 TRIE_BITMAP_SET(trie, uvc);
1669 /* store the folded codepoint */
1670 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1673 /* store first byte of utf8 representation of
1674 variant codepoints */
1675 if (! UNI_IS_INVARIANT(uvc)) {
1676 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1679 set_bit = 0; /* We've done our bit :-) */
1684 widecharmap = newHV();
1686 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1689 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1691 if ( !SvTRUE( *svpp ) ) {
1692 sv_setiv( *svpp, ++trie->uniquecharcount );
1693 TRIE_STORE_REVCHAR(uvc);
1697 if( cur == first ) {
1698 trie->minlen = chars;
1699 trie->maxlen = chars;
1700 } else if (chars < trie->minlen) {
1701 trie->minlen = chars;
1702 } else if (chars > trie->maxlen) {
1703 trie->maxlen = chars;
1705 if (OP( noper ) == EXACTFU_SS) {
1706 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1707 if (trie->minlen > 1)
1710 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1711 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1712 * - We assume that any such sequence might match a 2 byte string */
1713 if (trie->minlen > 2 )
1717 } /* end first pass */
1718 DEBUG_TRIE_COMPILE_r(
1719 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1720 (int)depth * 2 + 2,"",
1721 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1722 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1723 (int)trie->minlen, (int)trie->maxlen )
1727 We now know what we are dealing with in terms of unique chars and
1728 string sizes so we can calculate how much memory a naive
1729 representation using a flat table will take. If it's over a reasonable
1730 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1731 conservative but potentially much slower representation using an array
1734 At the end we convert both representations into the same compressed
1735 form that will be used in regexec.c for matching with. The latter
1736 is a form that cannot be used to construct with but has memory
1737 properties similar to the list form and access properties similar
1738 to the table form making it both suitable for fast searches and
1739 small enough that its feasable to store for the duration of a program.
1741 See the comment in the code where the compressed table is produced
1742 inplace from the flat tabe representation for an explanation of how
1743 the compression works.
1748 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1751 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1753 Second Pass -- Array Of Lists Representation
1755 Each state will be represented by a list of charid:state records
1756 (reg_trie_trans_le) the first such element holds the CUR and LEN
1757 points of the allocated array. (See defines above).
1759 We build the initial structure using the lists, and then convert
1760 it into the compressed table form which allows faster lookups
1761 (but cant be modified once converted).
1764 STRLEN transcount = 1;
1766 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1767 "%*sCompiling trie using list compiler\n",
1768 (int)depth * 2 + 2, ""));
1770 trie->states = (reg_trie_state *)
1771 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1772 sizeof(reg_trie_state) );
1776 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1778 regnode *noper = NEXTOPER( cur );
1779 U8 *uc = (U8*)STRING( noper );
1780 const U8 *e = uc + STR_LEN( noper );
1781 U32 state = 1; /* required init */
1782 U16 charid = 0; /* sanity init */
1783 U8 *scan = (U8*)NULL; /* sanity init */
1784 STRLEN foldlen = 0; /* required init */
1785 U32 wordlen = 0; /* required init */
1786 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1789 if (OP(noper) == NOTHING) {
1790 regnode *noper_next= regnext(noper);
1791 if (noper_next != tail && OP(noper_next) == flags) {
1793 uc= (U8*)STRING(noper);
1794 e= uc + STR_LEN(noper);
1798 if (OP(noper) != NOTHING) {
1799 for ( ; uc < e ; uc += len ) {
1804 charid = trie->charmap[ uvc ];
1806 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1810 charid=(U16)SvIV( *svpp );
1813 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1820 if ( !trie->states[ state ].trans.list ) {
1821 TRIE_LIST_NEW( state );
1823 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1824 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1825 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1830 newstate = next_alloc++;
1831 prev_states[newstate] = state;
1832 TRIE_LIST_PUSH( state, charid, newstate );
1837 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1841 TRIE_HANDLE_WORD(state);
1843 } /* end second pass */
1845 /* next alloc is the NEXT state to be allocated */
1846 trie->statecount = next_alloc;
1847 trie->states = (reg_trie_state *)
1848 PerlMemShared_realloc( trie->states,
1850 * sizeof(reg_trie_state) );
1852 /* and now dump it out before we compress it */
1853 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1854 revcharmap, next_alloc,
1858 trie->trans = (reg_trie_trans *)
1859 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1866 for( state=1 ; state < next_alloc ; state ++ ) {
1870 DEBUG_TRIE_COMPILE_MORE_r(
1871 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1875 if (trie->states[state].trans.list) {
1876 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1880 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1881 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1882 if ( forid < minid ) {
1884 } else if ( forid > maxid ) {
1888 if ( transcount < tp + maxid - minid + 1) {
1890 trie->trans = (reg_trie_trans *)
1891 PerlMemShared_realloc( trie->trans,
1893 * sizeof(reg_trie_trans) );
1894 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1896 base = trie->uniquecharcount + tp - minid;
1897 if ( maxid == minid ) {
1899 for ( ; zp < tp ; zp++ ) {
1900 if ( ! trie->trans[ zp ].next ) {
1901 base = trie->uniquecharcount + zp - minid;
1902 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1903 trie->trans[ zp ].check = state;
1909 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1910 trie->trans[ tp ].check = state;
1915 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1916 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1917 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1918 trie->trans[ tid ].check = state;
1920 tp += ( maxid - minid + 1 );
1922 Safefree(trie->states[ state ].trans.list);
1925 DEBUG_TRIE_COMPILE_MORE_r(
1926 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1929 trie->states[ state ].trans.base=base;
1931 trie->lasttrans = tp + 1;
1935 Second Pass -- Flat Table Representation.
1937 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1938 We know that we will need Charcount+1 trans at most to store the data
1939 (one row per char at worst case) So we preallocate both structures
1940 assuming worst case.
1942 We then construct the trie using only the .next slots of the entry
1945 We use the .check field of the first entry of the node temporarily to
1946 make compression both faster and easier by keeping track of how many non
1947 zero fields are in the node.
1949 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1952 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1953 number representing the first entry of the node, and state as a
1954 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1955 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1956 are 2 entrys per node. eg:
1964 The table is internally in the right hand, idx form. However as we also
1965 have to deal with the states array which is indexed by nodenum we have to
1966 use TRIE_NODENUM() to convert.
1969 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1970 "%*sCompiling trie using table compiler\n",
1971 (int)depth * 2 + 2, ""));
1973 trie->trans = (reg_trie_trans *)
1974 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1975 * trie->uniquecharcount + 1,
1976 sizeof(reg_trie_trans) );
1977 trie->states = (reg_trie_state *)
1978 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1979 sizeof(reg_trie_state) );
1980 next_alloc = trie->uniquecharcount + 1;
1983 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1985 regnode *noper = NEXTOPER( cur );
1986 const U8 *uc = (U8*)STRING( noper );
1987 const U8 *e = uc + STR_LEN( noper );
1989 U32 state = 1; /* required init */
1991 U16 charid = 0; /* sanity init */
1992 U32 accept_state = 0; /* sanity init */
1993 U8 *scan = (U8*)NULL; /* sanity init */
1995 STRLEN foldlen = 0; /* required init */
1996 U32 wordlen = 0; /* required init */
1998 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2000 if (OP(noper) == NOTHING) {
2001 regnode *noper_next= regnext(noper);
2002 if (noper_next != tail && OP(noper_next) == flags) {
2004 uc= (U8*)STRING(noper);
2005 e= uc + STR_LEN(noper);
2009 if ( OP(noper) != NOTHING ) {
2010 for ( ; uc < e ; uc += len ) {
2015 charid = trie->charmap[ uvc ];
2017 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2018 charid = svpp ? (U16)SvIV(*svpp) : 0;
2022 if ( !trie->trans[ state + charid ].next ) {
2023 trie->trans[ state + charid ].next = next_alloc;
2024 trie->trans[ state ].check++;
2025 prev_states[TRIE_NODENUM(next_alloc)]
2026 = TRIE_NODENUM(state);
2027 next_alloc += trie->uniquecharcount;
2029 state = trie->trans[ state + charid ].next;
2031 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2033 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2036 accept_state = TRIE_NODENUM( state );
2037 TRIE_HANDLE_WORD(accept_state);
2039 } /* end second pass */
2041 /* and now dump it out before we compress it */
2042 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2044 next_alloc, depth+1));
2048 * Inplace compress the table.*
2050 For sparse data sets the table constructed by the trie algorithm will
2051 be mostly 0/FAIL transitions or to put it another way mostly empty.
2052 (Note that leaf nodes will not contain any transitions.)
2054 This algorithm compresses the tables by eliminating most such
2055 transitions, at the cost of a modest bit of extra work during lookup:
2057 - Each states[] entry contains a .base field which indicates the
2058 index in the state[] array wheres its transition data is stored.
2060 - If .base is 0 there are no valid transitions from that node.
2062 - If .base is nonzero then charid is added to it to find an entry in
2065 -If trans[states[state].base+charid].check!=state then the
2066 transition is taken to be a 0/Fail transition. Thus if there are fail
2067 transitions at the front of the node then the .base offset will point
2068 somewhere inside the previous nodes data (or maybe even into a node
2069 even earlier), but the .check field determines if the transition is
2073 The following process inplace converts the table to the compressed
2074 table: We first do not compress the root node 1,and mark all its
2075 .check pointers as 1 and set its .base pointer as 1 as well. This
2076 allows us to do a DFA construction from the compressed table later,
2077 and ensures that any .base pointers we calculate later are greater
2080 - We set 'pos' to indicate the first entry of the second node.
2082 - We then iterate over the columns of the node, finding the first and
2083 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2084 and set the .check pointers accordingly, and advance pos
2085 appropriately and repreat for the next node. Note that when we copy
2086 the next pointers we have to convert them from the original
2087 NODEIDX form to NODENUM form as the former is not valid post
2090 - If a node has no transitions used we mark its base as 0 and do not
2091 advance the pos pointer.
2093 - If a node only has one transition we use a second pointer into the
2094 structure to fill in allocated fail transitions from other states.
2095 This pointer is independent of the main pointer and scans forward
2096 looking for null transitions that are allocated to a state. When it
2097 finds one it writes the single transition into the "hole". If the
2098 pointer doesnt find one the single transition is appended as normal.
2100 - Once compressed we can Renew/realloc the structures to release the
2103 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2104 specifically Fig 3.47 and the associated pseudocode.
2108 const U32 laststate = TRIE_NODENUM( next_alloc );
2111 trie->statecount = laststate;
2113 for ( state = 1 ; state < laststate ; state++ ) {
2115 const U32 stateidx = TRIE_NODEIDX( state );
2116 const U32 o_used = trie->trans[ stateidx ].check;
2117 U32 used = trie->trans[ stateidx ].check;
2118 trie->trans[ stateidx ].check = 0;
2120 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2121 if ( flag || trie->trans[ stateidx + charid ].next ) {
2122 if ( trie->trans[ stateidx + charid ].next ) {
2124 for ( ; zp < pos ; zp++ ) {
2125 if ( ! trie->trans[ zp ].next ) {
2129 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2130 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2131 trie->trans[ zp ].check = state;
2132 if ( ++zp > pos ) pos = zp;
2139 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2141 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2142 trie->trans[ pos ].check = state;
2147 trie->lasttrans = pos + 1;
2148 trie->states = (reg_trie_state *)
2149 PerlMemShared_realloc( trie->states, laststate
2150 * sizeof(reg_trie_state) );
2151 DEBUG_TRIE_COMPILE_MORE_r(
2152 PerlIO_printf( Perl_debug_log,
2153 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2154 (int)depth * 2 + 2,"",
2155 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2158 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2161 } /* end table compress */
2163 DEBUG_TRIE_COMPILE_MORE_r(
2164 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2165 (int)depth * 2 + 2, "",
2166 (UV)trie->statecount,
2167 (UV)trie->lasttrans)
2169 /* resize the trans array to remove unused space */
2170 trie->trans = (reg_trie_trans *)
2171 PerlMemShared_realloc( trie->trans, trie->lasttrans
2172 * sizeof(reg_trie_trans) );
2174 { /* Modify the program and insert the new TRIE node */
2175 U8 nodetype =(U8)(flags & 0xFF);
2179 regnode *optimize = NULL;
2180 #ifdef RE_TRACK_PATTERN_OFFSETS
2183 U32 mjd_nodelen = 0;
2184 #endif /* RE_TRACK_PATTERN_OFFSETS */
2185 #endif /* DEBUGGING */
2187 This means we convert either the first branch or the first Exact,
2188 depending on whether the thing following (in 'last') is a branch
2189 or not and whther first is the startbranch (ie is it a sub part of
2190 the alternation or is it the whole thing.)
2191 Assuming its a sub part we convert the EXACT otherwise we convert
2192 the whole branch sequence, including the first.
2194 /* Find the node we are going to overwrite */
2195 if ( first != startbranch || OP( last ) == BRANCH ) {
2196 /* branch sub-chain */
2197 NEXT_OFF( first ) = (U16)(last - first);
2198 #ifdef RE_TRACK_PATTERN_OFFSETS
2200 mjd_offset= Node_Offset((convert));
2201 mjd_nodelen= Node_Length((convert));
2204 /* whole branch chain */
2206 #ifdef RE_TRACK_PATTERN_OFFSETS
2209 const regnode *nop = NEXTOPER( convert );
2210 mjd_offset= Node_Offset((nop));
2211 mjd_nodelen= Node_Length((nop));
2215 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2216 (int)depth * 2 + 2, "",
2217 (UV)mjd_offset, (UV)mjd_nodelen)
2220 /* But first we check to see if there is a common prefix we can
2221 split out as an EXACT and put in front of the TRIE node. */
2222 trie->startstate= 1;
2223 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2225 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2229 const U32 base = trie->states[ state ].trans.base;
2231 if ( trie->states[state].wordnum )
2234 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2235 if ( ( base + ofs >= trie->uniquecharcount ) &&
2236 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2237 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2239 if ( ++count > 1 ) {
2240 SV **tmp = av_fetch( revcharmap, ofs, 0);
2241 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2242 if ( state == 1 ) break;
2244 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2246 PerlIO_printf(Perl_debug_log,
2247 "%*sNew Start State=%"UVuf" Class: [",
2248 (int)depth * 2 + 2, "",
2251 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2252 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2254 TRIE_BITMAP_SET(trie,*ch);
2256 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2258 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2262 TRIE_BITMAP_SET(trie,*ch);
2264 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2265 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2271 SV **tmp = av_fetch( revcharmap, idx, 0);
2273 char *ch = SvPV( *tmp, len );
2275 SV *sv=sv_newmortal();
2276 PerlIO_printf( Perl_debug_log,
2277 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2278 (int)depth * 2 + 2, "",
2280 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2281 PL_colors[0], PL_colors[1],
2282 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2283 PERL_PV_ESCAPE_FIRSTCHAR
2288 OP( convert ) = nodetype;
2289 str=STRING(convert);
2292 STR_LEN(convert) += len;
2298 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2303 trie->prefixlen = (state-1);
2305 regnode *n = convert+NODE_SZ_STR(convert);
2306 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2307 trie->startstate = state;
2308 trie->minlen -= (state - 1);
2309 trie->maxlen -= (state - 1);
2311 /* At least the UNICOS C compiler choked on this
2312 * being argument to DEBUG_r(), so let's just have
2315 #ifdef PERL_EXT_RE_BUILD
2321 regnode *fix = convert;
2322 U32 word = trie->wordcount;
2324 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2325 while( ++fix < n ) {
2326 Set_Node_Offset_Length(fix, 0, 0);
2329 SV ** const tmp = av_fetch( trie_words, word, 0 );
2331 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2332 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2334 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2342 NEXT_OFF(convert) = (U16)(tail - convert);
2343 DEBUG_r(optimize= n);
2349 if ( trie->maxlen ) {
2350 NEXT_OFF( convert ) = (U16)(tail - convert);
2351 ARG_SET( convert, data_slot );
2352 /* Store the offset to the first unabsorbed branch in
2353 jump[0], which is otherwise unused by the jump logic.
2354 We use this when dumping a trie and during optimisation. */
2356 trie->jump[0] = (U16)(nextbranch - convert);
2358 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2359 * and there is a bitmap
2360 * and the first "jump target" node we found leaves enough room
2361 * then convert the TRIE node into a TRIEC node, with the bitmap
2362 * embedded inline in the opcode - this is hypothetically faster.
2364 if ( !trie->states[trie->startstate].wordnum
2366 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2368 OP( convert ) = TRIEC;
2369 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2370 PerlMemShared_free(trie->bitmap);
2373 OP( convert ) = TRIE;
2375 /* store the type in the flags */
2376 convert->flags = nodetype;
2380 + regarglen[ OP( convert ) ];
2382 /* XXX We really should free up the resource in trie now,
2383 as we won't use them - (which resources?) dmq */
2385 /* needed for dumping*/
2386 DEBUG_r(if (optimize) {
2387 regnode *opt = convert;
2389 while ( ++opt < optimize) {
2390 Set_Node_Offset_Length(opt,0,0);
2393 Try to clean up some of the debris left after the
2396 while( optimize < jumper ) {
2397 mjd_nodelen += Node_Length((optimize));
2398 OP( optimize ) = OPTIMIZED;
2399 Set_Node_Offset_Length(optimize,0,0);
2402 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2404 } /* end node insert */
2406 /* Finish populating the prev field of the wordinfo array. Walk back
2407 * from each accept state until we find another accept state, and if
2408 * so, point the first word's .prev field at the second word. If the
2409 * second already has a .prev field set, stop now. This will be the
2410 * case either if we've already processed that word's accept state,
2411 * or that state had multiple words, and the overspill words were
2412 * already linked up earlier.
2419 for (word=1; word <= trie->wordcount; word++) {
2421 if (trie->wordinfo[word].prev)
2423 state = trie->wordinfo[word].accept;
2425 state = prev_states[state];
2428 prev = trie->states[state].wordnum;
2432 trie->wordinfo[word].prev = prev;
2434 Safefree(prev_states);
2438 /* and now dump out the compressed format */
2439 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2441 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2443 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2444 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2446 SvREFCNT_dec(revcharmap);
2450 : trie->startstate>1
2456 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2458 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2460 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2461 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2464 We find the fail state for each state in the trie, this state is the longest proper
2465 suffix of the current state's 'word' that is also a proper prefix of another word in our
2466 trie. State 1 represents the word '' and is thus the default fail state. This allows
2467 the DFA not to have to restart after its tried and failed a word at a given point, it
2468 simply continues as though it had been matching the other word in the first place.
2470 'abcdgu'=~/abcdefg|cdgu/
2471 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2472 fail, which would bring us to the state representing 'd' in the second word where we would
2473 try 'g' and succeed, proceeding to match 'cdgu'.
2475 /* add a fail transition */
2476 const U32 trie_offset = ARG(source);
2477 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2479 const U32 ucharcount = trie->uniquecharcount;
2480 const U32 numstates = trie->statecount;
2481 const U32 ubound = trie->lasttrans + ucharcount;
2485 U32 base = trie->states[ 1 ].trans.base;
2488 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2489 GET_RE_DEBUG_FLAGS_DECL;
2491 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2493 PERL_UNUSED_ARG(depth);
2497 ARG_SET( stclass, data_slot );
2498 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2499 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2500 aho->trie=trie_offset;
2501 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2502 Copy( trie->states, aho->states, numstates, reg_trie_state );
2503 Newxz( q, numstates, U32);
2504 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2507 /* initialize fail[0..1] to be 1 so that we always have
2508 a valid final fail state */
2509 fail[ 0 ] = fail[ 1 ] = 1;
2511 for ( charid = 0; charid < ucharcount ; charid++ ) {
2512 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2514 q[ q_write ] = newstate;
2515 /* set to point at the root */
2516 fail[ q[ q_write++ ] ]=1;
2519 while ( q_read < q_write) {
2520 const U32 cur = q[ q_read++ % numstates ];
2521 base = trie->states[ cur ].trans.base;
2523 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2524 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2526 U32 fail_state = cur;
2529 fail_state = fail[ fail_state ];
2530 fail_base = aho->states[ fail_state ].trans.base;
2531 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2533 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2534 fail[ ch_state ] = fail_state;
2535 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2537 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2539 q[ q_write++ % numstates] = ch_state;
2543 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2544 when we fail in state 1, this allows us to use the
2545 charclass scan to find a valid start char. This is based on the principle
2546 that theres a good chance the string being searched contains lots of stuff
2547 that cant be a start char.
2549 fail[ 0 ] = fail[ 1 ] = 0;
2550 DEBUG_TRIE_COMPILE_r({
2551 PerlIO_printf(Perl_debug_log,
2552 "%*sStclass Failtable (%"UVuf" states): 0",
2553 (int)(depth * 2), "", (UV)numstates
2555 for( q_read=1; q_read<numstates; q_read++ ) {
2556 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2558 PerlIO_printf(Perl_debug_log, "\n");
2561 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2566 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2567 * These need to be revisited when a newer toolchain becomes available.
2569 #if defined(__sparc64__) && defined(__GNUC__)
2570 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2571 # undef SPARC64_GCC_WORKAROUND
2572 # define SPARC64_GCC_WORKAROUND 1
2576 #define DEBUG_PEEP(str,scan,depth) \
2577 DEBUG_OPTIMISE_r({if (scan){ \
2578 SV * const mysv=sv_newmortal(); \
2579 regnode *Next = regnext(scan); \
2580 regprop(RExC_rx, mysv, scan); \
2581 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2582 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2583 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2587 /* The below joins as many adjacent EXACTish nodes as possible into a single
2588 * one. The regop may be changed if the node(s) contain certain sequences that
2589 * require special handling. The joining is only done if:
2590 * 1) there is room in the current conglomerated node to entirely contain the
2592 * 2) they are the exact same node type
2594 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2595 * these get optimized out
2597 * If a node is to match under /i (folded), the number of characters it matches
2598 * can be different than its character length if it contains a multi-character
2599 * fold. *min_subtract is set to the total delta of the input nodes.
2601 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2602 * and contains LATIN SMALL LETTER SHARP S
2604 * This is as good a place as any to discuss the design of handling these
2605 * multi-character fold sequences. It's been wrong in Perl for a very long
2606 * time. There are three code points in Unicode whose multi-character folds
2607 * were long ago discovered to mess things up. The previous designs for
2608 * dealing with these involved assigning a special node for them. This
2609 * approach doesn't work, as evidenced by this example:
2610 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2611 * Both these fold to "sss", but if the pattern is parsed to create a node that
2612 * would match just the \xDF, it won't be able to handle the case where a
2613 * successful match would have to cross the node's boundary. The new approach
2614 * that hopefully generally solves the problem generates an EXACTFU_SS node
2617 * It turns out that there are problems with all multi-character folds, and not
2618 * just these three. Now the code is general, for all such cases, but the
2619 * three still have some special handling. The approach taken is:
2620 * 1) This routine examines each EXACTFish node that could contain multi-
2621 * character fold sequences. It returns in *min_subtract how much to
2622 * subtract from the the actual length of the string to get a real minimum
2623 * match length; it is 0 if there are no multi-char folds. This delta is
2624 * used by the caller to adjust the min length of the match, and the delta
2625 * between min and max, so that the optimizer doesn't reject these
2626 * possibilities based on size constraints.
2627 * 2) Certain of these sequences require special handling by the trie code,
2628 * so, if found, this code changes the joined node type to special ops:
2629 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2630 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2631 * is used for an EXACTFU node that contains at least one "ss" sequence in
2632 * it. For non-UTF-8 patterns and strings, this is the only case where
2633 * there is a possible fold length change. That means that a regular
2634 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2635 * with length changes, and so can be processed faster. regexec.c takes
2636 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2637 * pre-folded by regcomp.c. This saves effort in regex matching.
2638 * However, the pre-folding isn't done for non-UTF8 patterns because the
2639 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2640 * down by forcing the pattern into UTF8 unless necessary. Also what
2641 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2642 * possibilities for the non-UTF8 patterns are quite simple, except for
2643 * the sharp s. All the ones that don't involve a UTF-8 target string are
2644 * members of a fold-pair, and arrays are set up for all of them so that
2645 * the other member of the pair can be found quickly. Code elsewhere in
2646 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2647 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2648 * described in the next item.
2649 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2650 * 'ss' or not is not knowable at compile time. It will match iff the
2651 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2652 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2653 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2654 * described in item 3). An assumption that the optimizer part of
2655 * regexec.c (probably unwittingly) makes is that a character in the
2656 * pattern corresponds to at most a single character in the target string.
2657 * (And I do mean character, and not byte here, unlike other parts of the
2658 * documentation that have never been updated to account for multibyte
2659 * Unicode.) This assumption is wrong only in this case, as all other
2660 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2661 * virtue of having this file pre-fold UTF-8 patterns. I'm
2662 * reluctant to try to change this assumption, so instead the code punts.
2663 * This routine examines EXACTF nodes for the sharp s, and returns a
2664 * boolean indicating whether or not the node is an EXACTF node that
2665 * contains a sharp s. When it is true, the caller sets a flag that later
2666 * causes the optimizer in this file to not set values for the floating
2667 * and fixed string lengths, and thus avoids the optimizer code in
2668 * regexec.c that makes the invalid assumption. Thus, there is no
2669 * optimization based on string lengths for EXACTF nodes that contain the
2670 * sharp s. This only happens for /id rules (which means the pattern
2674 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2675 if (PL_regkind[OP(scan)] == EXACT) \
2676 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2679 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2680 /* Merge several consecutive EXACTish nodes into one. */
2681 regnode *n = regnext(scan);
2683 regnode *next = scan + NODE_SZ_STR(scan);
2687 regnode *stop = scan;
2688 GET_RE_DEBUG_FLAGS_DECL;
2690 PERL_UNUSED_ARG(depth);
2693 PERL_ARGS_ASSERT_JOIN_EXACT;
2694 #ifndef EXPERIMENTAL_INPLACESCAN
2695 PERL_UNUSED_ARG(flags);
2696 PERL_UNUSED_ARG(val);
2698 DEBUG_PEEP("join",scan,depth);
2700 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2701 * EXACT ones that are mergeable to the current one. */
2703 && (PL_regkind[OP(n)] == NOTHING
2704 || (stringok && OP(n) == OP(scan)))
2706 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2709 if (OP(n) == TAIL || n > next)
2711 if (PL_regkind[OP(n)] == NOTHING) {
2712 DEBUG_PEEP("skip:",n,depth);
2713 NEXT_OFF(scan) += NEXT_OFF(n);
2714 next = n + NODE_STEP_REGNODE;
2721 else if (stringok) {
2722 const unsigned int oldl = STR_LEN(scan);
2723 regnode * const nnext = regnext(n);
2725 /* XXX I (khw) kind of doubt that this works on platforms where
2726 * U8_MAX is above 255 because of lots of other assumptions */
2727 if (oldl + STR_LEN(n) > U8_MAX)
2730 DEBUG_PEEP("merg",n,depth);
2733 NEXT_OFF(scan) += NEXT_OFF(n);
2734 STR_LEN(scan) += STR_LEN(n);
2735 next = n + NODE_SZ_STR(n);
2736 /* Now we can overwrite *n : */
2737 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2745 #ifdef EXPERIMENTAL_INPLACESCAN
2746 if (flags && !NEXT_OFF(n)) {
2747 DEBUG_PEEP("atch", val, depth);
2748 if (reg_off_by_arg[OP(n)]) {
2749 ARG_SET(n, val - n);
2752 NEXT_OFF(n) = val - n;
2760 *has_exactf_sharp_s = FALSE;
2762 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2763 * can now analyze for sequences of problematic code points. (Prior to
2764 * this final joining, sequences could have been split over boundaries, and
2765 * hence missed). The sequences only happen in folding, hence for any
2766 * non-EXACT EXACTish node */
2767 if (OP(scan) != EXACT) {
2768 const U8 * const s0 = (U8*) STRING(scan);
2770 const U8 * const s_end = s0 + STR_LEN(scan);
2772 /* One pass is made over the node's string looking for all the
2773 * possibilities. to avoid some tests in the loop, there are two main
2774 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2778 /* Examine the string for a multi-character fold sequence. UTF-8
2779 * patterns have all characters pre-folded by the time this code is
2781 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2782 length sequence we are looking for is 2 */
2785 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2786 if (! len) { /* Not a multi-char fold: get next char */
2791 /* Nodes with 'ss' require special handling, except for EXACTFL
2792 * and EXACTFA for which there is no multi-char fold to this */
2793 if (len == 2 && *s == 's' && *(s+1) == 's'
2794 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2797 OP(scan) = EXACTFU_SS;
2800 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2801 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2802 COMBINING_DIAERESIS_UTF8
2803 COMBINING_ACUTE_ACCENT_UTF8,
2805 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2806 COMBINING_DIAERESIS_UTF8
2807 COMBINING_ACUTE_ACCENT_UTF8,
2812 /* These two folds require special handling by trie's, so
2813 * change the node type to indicate this. If EXACTFA and
2814 * EXACTFL were ever to be handled by trie's, this would
2815 * have to be changed. If this node has already been
2816 * changed to EXACTFU_SS in this loop, leave it as is. (I
2817 * (khw) think it doesn't matter in regexec.c for UTF
2818 * patterns, but no need to change it */
2819 if (OP(scan) == EXACTFU) {
2820 OP(scan) = EXACTFU_TRICKYFOLD;
2824 else { /* Here is a generic multi-char fold. */
2825 const U8* multi_end = s + len;
2827 /* Count how many characters in it. In the case of /l and
2828 * /aa, no folds which contain ASCII code points are
2829 * allowed, so check for those, and skip if found. (In
2830 * EXACTFL, no folds are allowed to any Latin1 code point,
2831 * not just ASCII. But there aren't any of these
2832 * currently, nor ever likely, so don't take the time to
2833 * test for them. The code that generates the
2834 * is_MULTI_foo() macros croaks should one actually get put
2835 * into Unicode .) */
2836 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2837 count = utf8_length(s, multi_end);
2841 while (s < multi_end) {
2844 goto next_iteration;
2854 /* The delta is how long the sequence is minus 1 (1 is how long
2855 * the character that folds to the sequence is) */
2856 *min_subtract += count - 1;
2860 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2862 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2863 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2864 * nodes can't have multi-char folds to this range (and there are
2865 * no existing ones in the upper latin1 range). In the EXACTF
2866 * case we look also for the sharp s, which can be in the final
2867 * position. Otherwise we can stop looking 1 byte earlier because
2868 * have to find at least two characters for a multi-fold */
2869 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2871 /* The below is perhaps overboard, but this allows us to save a
2872 * test each time through the loop at the expense of a mask. This
2873 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2874 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2875 * are 64. This uses an exclusive 'or' to find that bit and then
2876 * inverts it to form a mask, with just a single 0, in the bit
2877 * position where 'S' and 's' differ. */
2878 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2879 const U8 s_masked = 's' & S_or_s_mask;
2882 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2883 if (! len) { /* Not a multi-char fold. */
2884 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2886 *has_exactf_sharp_s = TRUE;
2893 && ((*s & S_or_s_mask) == s_masked)
2894 && ((*(s+1) & S_or_s_mask) == s_masked))
2897 /* EXACTF nodes need to know that the minimum length
2898 * changed so that a sharp s in the string can match this
2899 * ss in the pattern, but they remain EXACTF nodes, as they
2900 * won't match this unless the target string is is UTF-8,
2901 * which we don't know until runtime */
2902 if (OP(scan) != EXACTF) {
2903 OP(scan) = EXACTFU_SS;
2907 *min_subtract += len - 1;
2914 /* Allow dumping but overwriting the collection of skipped
2915 * ops and/or strings with fake optimized ops */
2916 n = scan + NODE_SZ_STR(scan);
2924 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2928 /* REx optimizer. Converts nodes into quicker variants "in place".
2929 Finds fixed substrings. */
2931 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2932 to the position after last scanned or to NULL. */
2934 #define INIT_AND_WITHP \
2935 assert(!and_withp); \
2936 Newx(and_withp,1,struct regnode_charclass_class); \
2937 SAVEFREEPV(and_withp)
2939 /* this is a chain of data about sub patterns we are processing that
2940 need to be handled separately/specially in study_chunk. Its so
2941 we can simulate recursion without losing state. */
2943 typedef struct scan_frame {
2944 regnode *last; /* last node to process in this frame */
2945 regnode *next; /* next node to process when last is reached */
2946 struct scan_frame *prev; /*previous frame*/
2947 I32 stop; /* what stopparen do we use */
2951 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2953 #define CASE_SYNST_FNC(nAmE) \
2955 if (flags & SCF_DO_STCLASS_AND) { \
2956 for (value = 0; value < 256; value++) \
2957 if (!is_ ## nAmE ## _cp(value)) \
2958 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2961 for (value = 0; value < 256; value++) \
2962 if (is_ ## nAmE ## _cp(value)) \
2963 ANYOF_BITMAP_SET(data->start_class, value); \
2967 if (flags & SCF_DO_STCLASS_AND) { \
2968 for (value = 0; value < 256; value++) \
2969 if (is_ ## nAmE ## _cp(value)) \
2970 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2973 for (value = 0; value < 256; value++) \
2974 if (!is_ ## nAmE ## _cp(value)) \
2975 ANYOF_BITMAP_SET(data->start_class, value); \
2982 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2983 I32 *minlenp, I32 *deltap,
2988 struct regnode_charclass_class *and_withp,
2989 U32 flags, U32 depth)
2990 /* scanp: Start here (read-write). */
2991 /* deltap: Write maxlen-minlen here. */
2992 /* last: Stop before this one. */
2993 /* data: string data about the pattern */
2994 /* stopparen: treat close N as END */
2995 /* recursed: which subroutines have we recursed into */
2996 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2999 I32 min = 0; /* There must be at least this number of characters to match */
3001 regnode *scan = *scanp, *next;
3003 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3004 int is_inf_internal = 0; /* The studied chunk is infinite */
3005 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3006 scan_data_t data_fake;
3007 SV *re_trie_maxbuff = NULL;
3008 regnode *first_non_open = scan;
3009 I32 stopmin = I32_MAX;
3010 scan_frame *frame = NULL;
3011 GET_RE_DEBUG_FLAGS_DECL;
3013 PERL_ARGS_ASSERT_STUDY_CHUNK;
3016 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3020 while (first_non_open && OP(first_non_open) == OPEN)
3021 first_non_open=regnext(first_non_open);
3026 while ( scan && OP(scan) != END && scan < last ){
3027 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3028 node length to get a real minimum (because
3029 the folded version may be shorter) */
3030 bool has_exactf_sharp_s = FALSE;
3031 /* Peephole optimizer: */
3032 DEBUG_STUDYDATA("Peep:", data,depth);
3033 DEBUG_PEEP("Peep",scan,depth);
3035 /* Its not clear to khw or hv why this is done here, and not in the
3036 * clauses that deal with EXACT nodes. khw's guess is that it's
3037 * because of a previous design */
3038 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3040 /* Follow the next-chain of the current node and optimize
3041 away all the NOTHINGs from it. */
3042 if (OP(scan) != CURLYX) {
3043 const int max = (reg_off_by_arg[OP(scan)]
3045 /* I32 may be smaller than U16 on CRAYs! */
3046 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3047 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3051 /* Skip NOTHING and LONGJMP. */
3052 while ((n = regnext(n))
3053 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3054 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3055 && off + noff < max)
3057 if (reg_off_by_arg[OP(scan)])
3060 NEXT_OFF(scan) = off;
3065 /* The principal pseudo-switch. Cannot be a switch, since we
3066 look into several different things. */
3067 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3068 || OP(scan) == IFTHEN) {
3069 next = regnext(scan);
3071 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3073 if (OP(next) == code || code == IFTHEN) {
3074 /* NOTE - There is similar code to this block below for handling
3075 TRIE nodes on a re-study. If you change stuff here check there
3077 I32 max1 = 0, min1 = I32_MAX, num = 0;
3078 struct regnode_charclass_class accum;
3079 regnode * const startbranch=scan;
3081 if (flags & SCF_DO_SUBSTR)
3082 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3083 if (flags & SCF_DO_STCLASS)
3084 cl_init_zero(pRExC_state, &accum);
3086 while (OP(scan) == code) {
3087 I32 deltanext, minnext, f = 0, fake;
3088 struct regnode_charclass_class this_class;
3091 data_fake.flags = 0;
3093 data_fake.whilem_c = data->whilem_c;
3094 data_fake.last_closep = data->last_closep;
3097 data_fake.last_closep = &fake;
3099 data_fake.pos_delta = delta;
3100 next = regnext(scan);
3101 scan = NEXTOPER(scan);
3103 scan = NEXTOPER(scan);
3104 if (flags & SCF_DO_STCLASS) {
3105 cl_init(pRExC_state, &this_class);
3106 data_fake.start_class = &this_class;
3107 f = SCF_DO_STCLASS_AND;
3109 if (flags & SCF_WHILEM_VISITED_POS)
3110 f |= SCF_WHILEM_VISITED_POS;
3112 /* we suppose the run is continuous, last=next...*/
3113 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3115 stopparen, recursed, NULL, f,depth+1);
3118 if (max1 < minnext + deltanext)
3119 max1 = minnext + deltanext;
3120 if (deltanext == I32_MAX)
3121 is_inf = is_inf_internal = 1;
3123 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3125 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3126 if ( stopmin > minnext)
3127 stopmin = min + min1;
3128 flags &= ~SCF_DO_SUBSTR;
3130 data->flags |= SCF_SEEN_ACCEPT;
3133 if (data_fake.flags & SF_HAS_EVAL)
3134 data->flags |= SF_HAS_EVAL;
3135 data->whilem_c = data_fake.whilem_c;
3137 if (flags & SCF_DO_STCLASS)
3138 cl_or(pRExC_state, &accum, &this_class);
3140 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3142 if (flags & SCF_DO_SUBSTR) {
3143 data->pos_min += min1;
3144 data->pos_delta += max1 - min1;
3145 if (max1 != min1 || is_inf)
3146 data->longest = &(data->longest_float);
3149 delta += max1 - min1;
3150 if (flags & SCF_DO_STCLASS_OR) {
3151 cl_or(pRExC_state, data->start_class, &accum);
3153 cl_and(data->start_class, and_withp);
3154 flags &= ~SCF_DO_STCLASS;
3157 else if (flags & SCF_DO_STCLASS_AND) {
3159 cl_and(data->start_class, &accum);
3160 flags &= ~SCF_DO_STCLASS;
3163 /* Switch to OR mode: cache the old value of
3164 * data->start_class */
3166 StructCopy(data->start_class, and_withp,
3167 struct regnode_charclass_class);
3168 flags &= ~SCF_DO_STCLASS_AND;
3169 StructCopy(&accum, data->start_class,
3170 struct regnode_charclass_class);
3171 flags |= SCF_DO_STCLASS_OR;
3172 data->start_class->flags |= ANYOF_EOS;
3176 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3179 Assuming this was/is a branch we are dealing with: 'scan' now
3180 points at the item that follows the branch sequence, whatever
3181 it is. We now start at the beginning of the sequence and look
3188 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3190 If we can find such a subsequence we need to turn the first
3191 element into a trie and then add the subsequent branch exact
3192 strings to the trie.
3196 1. patterns where the whole set of branches can be converted.
3198 2. patterns where only a subset can be converted.
3200 In case 1 we can replace the whole set with a single regop
3201 for the trie. In case 2 we need to keep the start and end
3204 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3205 becomes BRANCH TRIE; BRANCH X;
3207 There is an additional case, that being where there is a
3208 common prefix, which gets split out into an EXACT like node
3209 preceding the TRIE node.
3211 If x(1..n)==tail then we can do a simple trie, if not we make
3212 a "jump" trie, such that when we match the appropriate word
3213 we "jump" to the appropriate tail node. Essentially we turn
3214 a nested if into a case structure of sorts.
3219 if (!re_trie_maxbuff) {
3220 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3221 if (!SvIOK(re_trie_maxbuff))
3222 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3224 if ( SvIV(re_trie_maxbuff)>=0 ) {
3226 regnode *first = (regnode *)NULL;
3227 regnode *last = (regnode *)NULL;
3228 regnode *tail = scan;
3233 SV * const mysv = sv_newmortal(); /* for dumping */
3235 /* var tail is used because there may be a TAIL
3236 regop in the way. Ie, the exacts will point to the
3237 thing following the TAIL, but the last branch will
3238 point at the TAIL. So we advance tail. If we
3239 have nested (?:) we may have to move through several
3243 while ( OP( tail ) == TAIL ) {
3244 /* this is the TAIL generated by (?:) */
3245 tail = regnext( tail );
3249 DEBUG_TRIE_COMPILE_r({
3250 regprop(RExC_rx, mysv, tail );
3251 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3252 (int)depth * 2 + 2, "",
3253 "Looking for TRIE'able sequences. Tail node is: ",
3254 SvPV_nolen_const( mysv )
3260 Step through the branches
3261 cur represents each branch,
3262 noper is the first thing to be matched as part of that branch
3263 noper_next is the regnext() of that node.
3265 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3266 via a "jump trie" but we also support building with NOJUMPTRIE,
3267 which restricts the trie logic to structures like /FOO|BAR/.
3269 If noper is a trieable nodetype then the branch is a possible optimization
3270 target. If we are building under NOJUMPTRIE then we require that noper_next
3271 is the same as scan (our current position in the regex program).
3273 Once we have two or more consecutive such branches we can create a
3274 trie of the EXACT's contents and stitch it in place into the program.
3276 If the sequence represents all of the branches in the alternation we
3277 replace the entire thing with a single TRIE node.
3279 Otherwise when it is a subsequence we need to stitch it in place and
3280 replace only the relevant branches. This means the first branch has
3281 to remain as it is used by the alternation logic, and its next pointer,
3282 and needs to be repointed at the item on the branch chain following
3283 the last branch we have optimized away.
3285 This could be either a BRANCH, in which case the subsequence is internal,
3286 or it could be the item following the branch sequence in which case the
3287 subsequence is at the end (which does not necessarily mean the first node
3288 is the start of the alternation).
3290 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3293 ----------------+-----------
3297 EXACTFU_SS | EXACTFU
3298 EXACTFU_TRICKYFOLD | EXACTFU
3303 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3304 ( EXACT == (X) ) ? EXACT : \
3305 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3308 /* dont use tail as the end marker for this traverse */
3309 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3310 regnode * const noper = NEXTOPER( cur );
3311 U8 noper_type = OP( noper );
3312 U8 noper_trietype = TRIE_TYPE( noper_type );
3313 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3314 regnode * const noper_next = regnext( noper );
3315 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3316 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3319 DEBUG_TRIE_COMPILE_r({
3320 regprop(RExC_rx, mysv, cur);
3321 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3322 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3324 regprop(RExC_rx, mysv, noper);
3325 PerlIO_printf( Perl_debug_log, " -> %s",
3326 SvPV_nolen_const(mysv));
3329 regprop(RExC_rx, mysv, noper_next );
3330 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3331 SvPV_nolen_const(mysv));
3333 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3334 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3335 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3339 /* Is noper a trieable nodetype that can be merged with the
3340 * current trie (if there is one)? */
3344 ( noper_trietype == NOTHING)
3345 || ( trietype == NOTHING )
3346 || ( trietype == noper_trietype )
3349 && noper_next == tail
3353 /* Handle mergable triable node
3354 * Either we are the first node in a new trieable sequence,
3355 * in which case we do some bookkeeping, otherwise we update
3356 * the end pointer. */
3359 if ( noper_trietype == NOTHING ) {
3360 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3361 regnode * const noper_next = regnext( noper );
3362 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3363 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3366 if ( noper_next_trietype ) {
3367 trietype = noper_next_trietype;
3368 } else if (noper_next_type) {
3369 /* a NOTHING regop is 1 regop wide. We need at least two
3370 * for a trie so we can't merge this in */
3374 trietype = noper_trietype;
3377 if ( trietype == NOTHING )
3378 trietype = noper_trietype;
3383 } /* end handle mergable triable node */
3385 /* handle unmergable node -
3386 * noper may either be a triable node which can not be tried
3387 * together with the current trie, or a non triable node */
3389 /* If last is set and trietype is not NOTHING then we have found
3390 * at least two triable branch sequences in a row of a similar
3391 * trietype so we can turn them into a trie. If/when we
3392 * allow NOTHING to start a trie sequence this condition will be
3393 * required, and it isn't expensive so we leave it in for now. */
3394 if ( trietype && trietype != NOTHING )
3395 make_trie( pRExC_state,
3396 startbranch, first, cur, tail, count,
3397 trietype, depth+1 );
3398 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3402 && noper_next == tail
3405 /* noper is triable, so we can start a new trie sequence */
3408 trietype = noper_trietype;
3410 /* if we already saw a first but the current node is not triable then we have
3411 * to reset the first information. */
3416 } /* end handle unmergable node */
3417 } /* loop over branches */
3418 DEBUG_TRIE_COMPILE_r({
3419 regprop(RExC_rx, mysv, cur);
3420 PerlIO_printf( Perl_debug_log,
3421 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3422 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3425 if ( last && trietype ) {
3426 if ( trietype != NOTHING ) {
3427 /* the last branch of the sequence was part of a trie,
3428 * so we have to construct it here outside of the loop
3430 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3431 #ifdef TRIE_STUDY_OPT
3432 if ( ((made == MADE_EXACT_TRIE &&
3433 startbranch == first)
3434 || ( first_non_open == first )) &&
3436 flags |= SCF_TRIE_RESTUDY;
3437 if ( startbranch == first
3440 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3445 /* at this point we know whatever we have is a NOTHING sequence/branch
3446 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3448 if ( startbranch == first ) {
3450 /* the entire thing is a NOTHING sequence, something like this:
3451 * (?:|) So we can turn it into a plain NOTHING op. */
3452 DEBUG_TRIE_COMPILE_r({
3453 regprop(RExC_rx, mysv, cur);
3454 PerlIO_printf( Perl_debug_log,
3455 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3456 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3459 OP(startbranch)= NOTHING;
3460 NEXT_OFF(startbranch)= tail - startbranch;
3461 for ( opt= startbranch + 1; opt < tail ; opt++ )
3465 } /* end if ( last) */
3466 } /* TRIE_MAXBUF is non zero */
3471 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3472 scan = NEXTOPER(NEXTOPER(scan));
3473 } else /* single branch is optimized. */
3474 scan = NEXTOPER(scan);
3476 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3477 scan_frame *newframe = NULL;
3482 if (OP(scan) != SUSPEND) {
3483 /* set the pointer */
3484 if (OP(scan) == GOSUB) {
3486 RExC_recurse[ARG2L(scan)] = scan;
3487 start = RExC_open_parens[paren-1];
3488 end = RExC_close_parens[paren-1];
3491 start = RExC_rxi->program + 1;
3495 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3496 SAVEFREEPV(recursed);
3498 if (!PAREN_TEST(recursed,paren+1)) {
3499 PAREN_SET(recursed,paren+1);
3500 Newx(newframe,1,scan_frame);
3502 if (flags & SCF_DO_SUBSTR) {
3503 SCAN_COMMIT(pRExC_state,data,minlenp);
3504 data->longest = &(data->longest_float);
3506 is_inf = is_inf_internal = 1;
3507 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3508 cl_anything(pRExC_state, data->start_class);
3509 flags &= ~SCF_DO_STCLASS;
3512 Newx(newframe,1,scan_frame);
3515 end = regnext(scan);
3520 SAVEFREEPV(newframe);
3521 newframe->next = regnext(scan);
3522 newframe->last = last;
3523 newframe->stop = stopparen;
3524 newframe->prev = frame;
3534 else if (OP(scan) == EXACT) {
3535 I32 l = STR_LEN(scan);
3538 const U8 * const s = (U8*)STRING(scan);
3539 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3540 l = utf8_length(s, s + l);
3542 uc = *((U8*)STRING(scan));
3545 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3546 /* The code below prefers earlier match for fixed
3547 offset, later match for variable offset. */
3548 if (data->last_end == -1) { /* Update the start info. */
3549 data->last_start_min = data->pos_min;
3550 data->last_start_max = is_inf
3551 ? I32_MAX : data->pos_min + data->pos_delta;
3553 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3555 SvUTF8_on(data->last_found);
3557 SV * const sv = data->last_found;
3558 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3559 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3560 if (mg && mg->mg_len >= 0)
3561 mg->mg_len += utf8_length((U8*)STRING(scan),
3562 (U8*)STRING(scan)+STR_LEN(scan));
3564 data->last_end = data->pos_min + l;
3565 data->pos_min += l; /* As in the first entry. */
3566 data->flags &= ~SF_BEFORE_EOL;
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 /* Check whether it is compatible with what we know already! */
3573 /* If compatible, we or it in below. It is compatible if is
3574 * in the bitmp and either 1) its bit or its fold is set, or 2)
3575 * it's for a locale. Even if there isn't unicode semantics
3576 * here, at runtime there may be because of matching against a
3577 * utf8 string, so accept a possible false positive for
3578 * latin1-range folds */
3580 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3581 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3582 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3583 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3588 ANYOF_CLASS_ZERO(data->start_class);
3589 ANYOF_BITMAP_ZERO(data->start_class);
3591 ANYOF_BITMAP_SET(data->start_class, uc);
3592 else if (uc >= 0x100) {
3595 /* Some Unicode code points fold to the Latin1 range; as
3596 * XXX temporary code, instead of figuring out if this is
3597 * one, just assume it is and set all the start class bits
3598 * that could be some such above 255 code point's fold
3599 * which will generate fals positives. As the code
3600 * elsewhere that does compute the fold settles down, it
3601 * can be extracted out and re-used here */
3602 for (i = 0; i < 256; i++){
3603 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3604 ANYOF_BITMAP_SET(data->start_class, i);
3608 data->start_class->flags &= ~ANYOF_EOS;
3610 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3612 else if (flags & SCF_DO_STCLASS_OR) {
3613 /* false positive possible if the class is case-folded */
3615 ANYOF_BITMAP_SET(data->start_class, uc);
3617 data->start_class->flags |= ANYOF_UNICODE_ALL;
3618 data->start_class->flags &= ~ANYOF_EOS;
3619 cl_and(data->start_class, and_withp);
3621 flags &= ~SCF_DO_STCLASS;
3623 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3624 I32 l = STR_LEN(scan);
3625 UV uc = *((U8*)STRING(scan));
3627 /* Search for fixed substrings supports EXACT only. */
3628 if (flags & SCF_DO_SUBSTR) {
3630 SCAN_COMMIT(pRExC_state, data, minlenp);
3633 const U8 * const s = (U8 *)STRING(scan);
3634 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3635 l = utf8_length(s, s + l);
3637 if (has_exactf_sharp_s) {
3638 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3640 min += l - min_subtract;
3642 delta += min_subtract;
3643 if (flags & SCF_DO_SUBSTR) {
3644 data->pos_min += l - min_subtract;
3645 if (data->pos_min < 0) {
3648 data->pos_delta += min_subtract;
3650 data->longest = &(data->longest_float);
3653 if (flags & SCF_DO_STCLASS_AND) {
3654 /* Check whether it is compatible with what we know already! */
3657 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3658 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3659 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3663 ANYOF_CLASS_ZERO(data->start_class);
3664 ANYOF_BITMAP_ZERO(data->start_class);
3666 ANYOF_BITMAP_SET(data->start_class, uc);
3667 data->start_class->flags &= ~ANYOF_EOS;
3668 if (OP(scan) == EXACTFL) {
3669 /* XXX This set is probably no longer necessary, and
3670 * probably wrong as LOCALE now is on in the initial
3672 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3676 /* Also set the other member of the fold pair. In case
3677 * that unicode semantics is called for at runtime, use
3678 * the full latin1 fold. (Can't do this for locale,
3679 * because not known until runtime) */
3680 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3682 /* All other (EXACTFL handled above) folds except under
3683 * /iaa that include s, S, and sharp_s also may include
3685 if (OP(scan) != EXACTFA) {
3686 if (uc == 's' || uc == 'S') {
3687 ANYOF_BITMAP_SET(data->start_class,
3688 LATIN_SMALL_LETTER_SHARP_S);
3690 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3691 ANYOF_BITMAP_SET(data->start_class, 's');
3692 ANYOF_BITMAP_SET(data->start_class, 'S');
3697 else if (uc >= 0x100) {
3699 for (i = 0; i < 256; i++){
3700 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3701 ANYOF_BITMAP_SET(data->start_class, i);
3706 else if (flags & SCF_DO_STCLASS_OR) {
3707 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3708 /* false positive possible if the class is case-folded.
3709 Assume that the locale settings are the same... */
3711 ANYOF_BITMAP_SET(data->start_class, uc);
3712 if (OP(scan) != EXACTFL) {
3714 /* And set the other member of the fold pair, but
3715 * can't do that in locale because not known until
3717 ANYOF_BITMAP_SET(data->start_class,
3718 PL_fold_latin1[uc]);
3720 /* All folds except under /iaa that include s, S,
3721 * and sharp_s also may include the others */
3722 if (OP(scan) != EXACTFA) {
3723 if (uc == 's' || uc == 'S') {
3724 ANYOF_BITMAP_SET(data->start_class,
3725 LATIN_SMALL_LETTER_SHARP_S);
3727 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3728 ANYOF_BITMAP_SET(data->start_class, 's');
3729 ANYOF_BITMAP_SET(data->start_class, 'S');
3734 data->start_class->flags &= ~ANYOF_EOS;
3736 cl_and(data->start_class, and_withp);
3738 flags &= ~SCF_DO_STCLASS;
3740 else if (REGNODE_VARIES(OP(scan))) {
3741 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3742 I32 f = flags, pos_before = 0;
3743 regnode * const oscan = scan;
3744 struct regnode_charclass_class this_class;
3745 struct regnode_charclass_class *oclass = NULL;
3746 I32 next_is_eval = 0;
3748 switch (PL_regkind[OP(scan)]) {
3749 case WHILEM: /* End of (?:...)* . */
3750 scan = NEXTOPER(scan);
3753 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3754 next = NEXTOPER(scan);
3755 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3757 maxcount = REG_INFTY;
3758 next = regnext(scan);
3759 scan = NEXTOPER(scan);
3763 if (flags & SCF_DO_SUBSTR)
3768 if (flags & SCF_DO_STCLASS) {
3770 maxcount = REG_INFTY;
3771 next = regnext(scan);
3772 scan = NEXTOPER(scan);
3775 is_inf = is_inf_internal = 1;
3776 scan = regnext(scan);
3777 if (flags & SCF_DO_SUBSTR) {
3778 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3779 data->longest = &(data->longest_float);
3781 goto optimize_curly_tail;
3783 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3784 && (scan->flags == stopparen))
3789 mincount = ARG1(scan);
3790 maxcount = ARG2(scan);
3792 next = regnext(scan);
3793 if (OP(scan) == CURLYX) {
3794 I32 lp = (data ? *(data->last_closep) : 0);
3795 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3797 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3798 next_is_eval = (OP(scan) == EVAL);
3800 if (flags & SCF_DO_SUBSTR) {
3801 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3802 pos_before = data->pos_min;
3806 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3808 data->flags |= SF_IS_INF;
3810 if (flags & SCF_DO_STCLASS) {
3811 cl_init(pRExC_state, &this_class);
3812 oclass = data->start_class;
3813 data->start_class = &this_class;
3814 f |= SCF_DO_STCLASS_AND;
3815 f &= ~SCF_DO_STCLASS_OR;
3817 /* Exclude from super-linear cache processing any {n,m}
3818 regops for which the combination of input pos and regex
3819 pos is not enough information to determine if a match
3822 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3823 regex pos at the \s*, the prospects for a match depend not
3824 only on the input position but also on how many (bar\s*)
3825 repeats into the {4,8} we are. */
3826 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3827 f &= ~SCF_WHILEM_VISITED_POS;
3829 /* This will finish on WHILEM, setting scan, or on NULL: */
3830 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3831 last, data, stopparen, recursed, NULL,
3833 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3835 if (flags & SCF_DO_STCLASS)
3836 data->start_class = oclass;
3837 if (mincount == 0 || minnext == 0) {
3838 if (flags & SCF_DO_STCLASS_OR) {
3839 cl_or(pRExC_state, data->start_class, &this_class);
3841 else if (flags & SCF_DO_STCLASS_AND) {
3842 /* Switch to OR mode: cache the old value of
3843 * data->start_class */
3845 StructCopy(data->start_class, and_withp,
3846 struct regnode_charclass_class);
3847 flags &= ~SCF_DO_STCLASS_AND;
3848 StructCopy(&this_class, data->start_class,
3849 struct regnode_charclass_class);
3850 flags |= SCF_DO_STCLASS_OR;
3851 data->start_class->flags |= ANYOF_EOS;
3853 } else { /* Non-zero len */
3854 if (flags & SCF_DO_STCLASS_OR) {
3855 cl_or(pRExC_state, data->start_class, &this_class);
3856 cl_and(data->start_class, and_withp);
3858 else if (flags & SCF_DO_STCLASS_AND)
3859 cl_and(data->start_class, &this_class);
3860 flags &= ~SCF_DO_STCLASS;
3862 if (!scan) /* It was not CURLYX, but CURLY. */
3864 if ( /* ? quantifier ok, except for (?{ ... }) */
3865 (next_is_eval || !(mincount == 0 && maxcount == 1))
3866 && (minnext == 0) && (deltanext == 0)
3867 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3868 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3870 /* Fatal warnings may leak the regexp without this: */
3871 SAVEFREESV(RExC_rx_sv);
3872 ckWARNreg(RExC_parse,
3873 "Quantifier unexpected on zero-length expression");
3874 ReREFCNT_inc(RExC_rx_sv);
3877 min += minnext * mincount;
3878 is_inf_internal |= ((maxcount == REG_INFTY
3879 && (minnext + deltanext) > 0)
3880 || deltanext == I32_MAX);
3881 is_inf |= is_inf_internal;
3882 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3884 /* Try powerful optimization CURLYX => CURLYN. */
3885 if ( OP(oscan) == CURLYX && data
3886 && data->flags & SF_IN_PAR
3887 && !(data->flags & SF_HAS_EVAL)
3888 && !deltanext && minnext == 1 ) {
3889 /* Try to optimize to CURLYN. */
3890 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3891 regnode * const nxt1 = nxt;
3898 if (!REGNODE_SIMPLE(OP(nxt))
3899 && !(PL_regkind[OP(nxt)] == EXACT
3900 && STR_LEN(nxt) == 1))
3906 if (OP(nxt) != CLOSE)
3908 if (RExC_open_parens) {
3909 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3910 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3912 /* Now we know that nxt2 is the only contents: */
3913 oscan->flags = (U8)ARG(nxt);
3915 OP(nxt1) = NOTHING; /* was OPEN. */
3918 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3919 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3920 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3921 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3922 OP(nxt + 1) = OPTIMIZED; /* was count. */
3923 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3928 /* Try optimization CURLYX => CURLYM. */
3929 if ( OP(oscan) == CURLYX && data
3930 && !(data->flags & SF_HAS_PAR)
3931 && !(data->flags & SF_HAS_EVAL)
3932 && !deltanext /* atom is fixed width */
3933 && minnext != 0 /* CURLYM can't handle zero width */
3934 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3936 /* XXXX How to optimize if data == 0? */
3937 /* Optimize to a simpler form. */
3938 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3942 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3943 && (OP(nxt2) != WHILEM))
3945 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3946 /* Need to optimize away parenths. */
3947 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3948 /* Set the parenth number. */
3949 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3951 oscan->flags = (U8)ARG(nxt);
3952 if (RExC_open_parens) {
3953 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3954 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3956 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3957 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3960 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3961 OP(nxt + 1) = OPTIMIZED; /* was count. */
3962 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3963 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3966 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3967 regnode *nnxt = regnext(nxt1);
3969 if (reg_off_by_arg[OP(nxt1)])
3970 ARG_SET(nxt1, nxt2 - nxt1);
3971 else if (nxt2 - nxt1 < U16_MAX)
3972 NEXT_OFF(nxt1) = nxt2 - nxt1;
3974 OP(nxt) = NOTHING; /* Cannot beautify */
3979 /* Optimize again: */
3980 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3981 NULL, stopparen, recursed, NULL, 0,depth+1);
3986 else if ((OP(oscan) == CURLYX)
3987 && (flags & SCF_WHILEM_VISITED_POS)
3988 /* See the comment on a similar expression above.
3989 However, this time it's not a subexpression
3990 we care about, but the expression itself. */
3991 && (maxcount == REG_INFTY)
3992 && data && ++data->whilem_c < 16) {
3993 /* This stays as CURLYX, we can put the count/of pair. */
3994 /* Find WHILEM (as in regexec.c) */
3995 regnode *nxt = oscan + NEXT_OFF(oscan);
3997 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3999 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4000 | (RExC_whilem_seen << 4)); /* On WHILEM */
4002 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4004 if (flags & SCF_DO_SUBSTR) {
4005 SV *last_str = NULL;
4006 int counted = mincount != 0;
4008 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4009 #if defined(SPARC64_GCC_WORKAROUND)
4012 const char *s = NULL;
4015 if (pos_before >= data->last_start_min)
4018 b = data->last_start_min;
4021 s = SvPV_const(data->last_found, l);
4022 old = b - data->last_start_min;
4025 I32 b = pos_before >= data->last_start_min
4026 ? pos_before : data->last_start_min;
4028 const char * const s = SvPV_const(data->last_found, l);
4029 I32 old = b - data->last_start_min;
4033 old = utf8_hop((U8*)s, old) - (U8*)s;
4035 /* Get the added string: */
4036 last_str = newSVpvn_utf8(s + old, l, UTF);
4037 if (deltanext == 0 && pos_before == b) {
4038 /* What was added is a constant string */
4040 SvGROW(last_str, (mincount * l) + 1);
4041 repeatcpy(SvPVX(last_str) + l,
4042 SvPVX_const(last_str), l, mincount - 1);
4043 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4044 /* Add additional parts. */
4045 SvCUR_set(data->last_found,
4046 SvCUR(data->last_found) - l);
4047 sv_catsv(data->last_found, last_str);
4049 SV * sv = data->last_found;
4051 SvUTF8(sv) && SvMAGICAL(sv) ?
4052 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4053 if (mg && mg->mg_len >= 0)
4054 mg->mg_len += CHR_SVLEN(last_str) - l;
4056 data->last_end += l * (mincount - 1);
4059 /* start offset must point into the last copy */
4060 data->last_start_min += minnext * (mincount - 1);
4061 data->last_start_max += is_inf ? I32_MAX
4062 : (maxcount - 1) * (minnext + data->pos_delta);
4065 /* It is counted once already... */
4066 data->pos_min += minnext * (mincount - counted);
4067 data->pos_delta += - counted * deltanext +
4068 (minnext + deltanext) * maxcount - minnext * mincount;
4069 if (mincount != maxcount) {
4070 /* Cannot extend fixed substrings found inside
4072 SCAN_COMMIT(pRExC_state,data,minlenp);
4073 if (mincount && last_str) {
4074 SV * const sv = data->last_found;
4075 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4076 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4080 sv_setsv(sv, last_str);
4081 data->last_end = data->pos_min;
4082 data->last_start_min =
4083 data->pos_min - CHR_SVLEN(last_str);
4084 data->last_start_max = is_inf
4086 : data->pos_min + data->pos_delta
4087 - CHR_SVLEN(last_str);
4089 data->longest = &(data->longest_float);
4091 SvREFCNT_dec(last_str);
4093 if (data && (fl & SF_HAS_EVAL))
4094 data->flags |= SF_HAS_EVAL;
4095 optimize_curly_tail:
4096 if (OP(oscan) != CURLYX) {
4097 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4099 NEXT_OFF(oscan) += NEXT_OFF(next);
4102 default: /* REF, ANYOFV, and CLUMP only? */
4103 if (flags & SCF_DO_SUBSTR) {
4104 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4105 data->longest = &(data->longest_float);
4107 is_inf = is_inf_internal = 1;
4108 if (flags & SCF_DO_STCLASS_OR)
4109 cl_anything(pRExC_state, data->start_class);
4110 flags &= ~SCF_DO_STCLASS;
4114 else if (OP(scan) == LNBREAK) {
4115 if (flags & SCF_DO_STCLASS) {
4117 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4118 if (flags & SCF_DO_STCLASS_AND) {
4119 for (value = 0; value < 256; value++)
4120 if (!is_VERTWS_cp(value))
4121 ANYOF_BITMAP_CLEAR(data->start_class, value);
4124 for (value = 0; value < 256; value++)
4125 if (is_VERTWS_cp(value))
4126 ANYOF_BITMAP_SET(data->start_class, value);
4128 if (flags & SCF_DO_STCLASS_OR)
4129 cl_and(data->start_class, and_withp);
4130 flags &= ~SCF_DO_STCLASS;
4133 delta++; /* Because of the 2 char string cr-lf */
4134 if (flags & SCF_DO_SUBSTR) {
4135 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4137 data->pos_delta += 1;
4138 data->longest = &(data->longest_float);
4141 else if (REGNODE_SIMPLE(OP(scan))) {
4144 if (flags & SCF_DO_SUBSTR) {
4145 SCAN_COMMIT(pRExC_state,data,minlenp);
4149 if (flags & SCF_DO_STCLASS) {
4150 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4152 /* Some of the logic below assumes that switching
4153 locale on will only add false positives. */
4154 switch (PL_regkind[OP(scan)]) {
4158 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4159 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4160 cl_anything(pRExC_state, data->start_class);
4163 if (OP(scan) == SANY)
4165 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4166 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4167 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4168 cl_anything(pRExC_state, data->start_class);
4170 if (flags & SCF_DO_STCLASS_AND || !value)
4171 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4174 if (flags & SCF_DO_STCLASS_AND)
4175 cl_and(data->start_class,
4176 (struct regnode_charclass_class*)scan);
4178 cl_or(pRExC_state, data->start_class,
4179 (struct regnode_charclass_class*)scan);
4182 if (flags & SCF_DO_STCLASS_AND) {
4183 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4184 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4185 if (OP(scan) == ALNUMU) {
4186 for (value = 0; value < 256; value++) {
4187 if (!isWORDCHAR_L1(value)) {
4188 ANYOF_BITMAP_CLEAR(data->start_class, value);
4192 for (value = 0; value < 256; value++) {
4193 if (!isALNUM(value)) {
4194 ANYOF_BITMAP_CLEAR(data->start_class, value);
4201 if (data->start_class->flags & ANYOF_LOCALE)
4202 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4204 /* Even if under locale, set the bits for non-locale
4205 * in case it isn't a true locale-node. This will
4206 * create false positives if it truly is locale */
4207 if (OP(scan) == ALNUMU) {
4208 for (value = 0; value < 256; value++) {
4209 if (isWORDCHAR_L1(value)) {
4210 ANYOF_BITMAP_SET(data->start_class, value);
4214 for (value = 0; value < 256; value++) {
4215 if (isALNUM(value)) {
4216 ANYOF_BITMAP_SET(data->start_class, value);
4223 if (flags & SCF_DO_STCLASS_AND) {
4224 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4225 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4226 if (OP(scan) == NALNUMU) {
4227 for (value = 0; value < 256; value++) {
4228 if (isWORDCHAR_L1(value)) {
4229 ANYOF_BITMAP_CLEAR(data->start_class, value);
4233 for (value = 0; value < 256; value++) {
4234 if (isALNUM(value)) {
4235 ANYOF_BITMAP_CLEAR(data->start_class, value);
4242 if (data->start_class->flags & ANYOF_LOCALE)
4243 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4245 /* Even if under locale, set the bits for non-locale in
4246 * case it isn't a true locale-node. This will create
4247 * false positives if it truly is locale */
4248 if (OP(scan) == NALNUMU) {
4249 for (value = 0; value < 256; value++) {
4250 if (! isWORDCHAR_L1(value)) {
4251 ANYOF_BITMAP_SET(data->start_class, value);
4255 for (value = 0; value < 256; value++) {
4256 if (! isALNUM(value)) {
4257 ANYOF_BITMAP_SET(data->start_class, value);
4264 if (flags & SCF_DO_STCLASS_AND) {
4265 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4266 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4267 if (OP(scan) == SPACEU) {
4268 for (value = 0; value < 256; value++) {
4269 if (!isSPACE_L1(value)) {
4270 ANYOF_BITMAP_CLEAR(data->start_class, value);
4274 for (value = 0; value < 256; value++) {
4275 if (!isSPACE(value)) {
4276 ANYOF_BITMAP_CLEAR(data->start_class, value);
4283 if (data->start_class->flags & ANYOF_LOCALE) {
4284 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4286 if (OP(scan) == SPACEU) {
4287 for (value = 0; value < 256; value++) {
4288 if (isSPACE_L1(value)) {
4289 ANYOF_BITMAP_SET(data->start_class, value);
4293 for (value = 0; value < 256; value++) {
4294 if (isSPACE(value)) {
4295 ANYOF_BITMAP_SET(data->start_class, value);
4302 if (flags & SCF_DO_STCLASS_AND) {
4303 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4305 if (OP(scan) == NSPACEU) {
4306 for (value = 0; value < 256; value++) {
4307 if (isSPACE_L1(value)) {
4308 ANYOF_BITMAP_CLEAR(data->start_class, value);
4312 for (value = 0; value < 256; value++) {
4313 if (isSPACE(value)) {
4314 ANYOF_BITMAP_CLEAR(data->start_class, value);
4321 if (data->start_class->flags & ANYOF_LOCALE)
4322 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4323 if (OP(scan) == NSPACEU) {
4324 for (value = 0; value < 256; value++) {
4325 if (!isSPACE_L1(value)) {
4326 ANYOF_BITMAP_SET(data->start_class, value);
4331 for (value = 0; value < 256; value++) {
4332 if (!isSPACE(value)) {
4333 ANYOF_BITMAP_SET(data->start_class, value);
4340 if (flags & SCF_DO_STCLASS_AND) {
4341 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4342 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4343 for (value = 0; value < 256; value++)
4344 if (!isDIGIT(value))
4345 ANYOF_BITMAP_CLEAR(data->start_class, value);
4349 if (data->start_class->flags & ANYOF_LOCALE)
4350 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4351 for (value = 0; value < 256; value++)
4353 ANYOF_BITMAP_SET(data->start_class, value);
4357 if (flags & SCF_DO_STCLASS_AND) {
4358 if (!(data->start_class->flags & ANYOF_LOCALE))
4359 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4360 for (value = 0; value < 256; value++)
4362 ANYOF_BITMAP_CLEAR(data->start_class, value);
4365 if (data->start_class->flags & ANYOF_LOCALE)
4366 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4367 for (value = 0; value < 256; value++)
4368 if (!isDIGIT(value))
4369 ANYOF_BITMAP_SET(data->start_class, value);
4372 CASE_SYNST_FNC(VERTWS);
4373 CASE_SYNST_FNC(HORIZWS);
4376 if (flags & SCF_DO_STCLASS_OR)
4377 cl_and(data->start_class, and_withp);
4378 flags &= ~SCF_DO_STCLASS;
4381 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4382 data->flags |= (OP(scan) == MEOL
4385 SCAN_COMMIT(pRExC_state, data, minlenp);
4388 else if ( PL_regkind[OP(scan)] == BRANCHJ
4389 /* Lookbehind, or need to calculate parens/evals/stclass: */
4390 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4391 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4392 if ( OP(scan) == UNLESSM &&
4394 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4395 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4398 regnode *upto= regnext(scan);
4400 SV * const mysv_val=sv_newmortal();
4401 DEBUG_STUDYDATA("OPFAIL",data,depth);
4403 /*DEBUG_PARSE_MSG("opfail");*/
4404 regprop(RExC_rx, mysv_val, upto);
4405 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4406 SvPV_nolen_const(mysv_val),
4407 (IV)REG_NODE_NUM(upto),
4412 NEXT_OFF(scan) = upto - scan;
4413 for (opt= scan + 1; opt < upto ; opt++)
4414 OP(opt) = OPTIMIZED;
4418 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4419 || OP(scan) == UNLESSM )
4421 /* Negative Lookahead/lookbehind
4422 In this case we can't do fixed string optimisation.
4425 I32 deltanext, minnext, fake = 0;
4427 struct regnode_charclass_class intrnl;
4430 data_fake.flags = 0;
4432 data_fake.whilem_c = data->whilem_c;
4433 data_fake.last_closep = data->last_closep;
4436 data_fake.last_closep = &fake;
4437 data_fake.pos_delta = delta;
4438 if ( flags & SCF_DO_STCLASS && !scan->flags
4439 && OP(scan) == IFMATCH ) { /* Lookahead */
4440 cl_init(pRExC_state, &intrnl);
4441 data_fake.start_class = &intrnl;
4442 f |= SCF_DO_STCLASS_AND;
4444 if (flags & SCF_WHILEM_VISITED_POS)
4445 f |= SCF_WHILEM_VISITED_POS;
4446 next = regnext(scan);
4447 nscan = NEXTOPER(NEXTOPER(scan));
4448 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4449 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4452 FAIL("Variable length lookbehind not implemented");
4454 else if (minnext > (I32)U8_MAX) {
4455 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4457 scan->flags = (U8)minnext;
4460 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4462 if (data_fake.flags & SF_HAS_EVAL)
4463 data->flags |= SF_HAS_EVAL;
4464 data->whilem_c = data_fake.whilem_c;
4466 if (f & SCF_DO_STCLASS_AND) {
4467 if (flags & SCF_DO_STCLASS_OR) {
4468 /* OR before, AND after: ideally we would recurse with
4469 * data_fake to get the AND applied by study of the
4470 * remainder of the pattern, and then derecurse;
4471 * *** HACK *** for now just treat as "no information".
4472 * See [perl #56690].
4474 cl_init(pRExC_state, data->start_class);
4476 /* AND before and after: combine and continue */
4477 const int was = (data->start_class->flags & ANYOF_EOS);
4479 cl_and(data->start_class, &intrnl);
4481 data->start_class->flags |= ANYOF_EOS;
4485 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4487 /* Positive Lookahead/lookbehind
4488 In this case we can do fixed string optimisation,
4489 but we must be careful about it. Note in the case of
4490 lookbehind the positions will be offset by the minimum
4491 length of the pattern, something we won't know about
4492 until after the recurse.
4494 I32 deltanext, fake = 0;
4496 struct regnode_charclass_class intrnl;
4498 /* We use SAVEFREEPV so that when the full compile
4499 is finished perl will clean up the allocated
4500 minlens when it's all done. This way we don't
4501 have to worry about freeing them when we know
4502 they wont be used, which would be a pain.
4505 Newx( minnextp, 1, I32 );
4506 SAVEFREEPV(minnextp);
4509 StructCopy(data, &data_fake, scan_data_t);
4510 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4513 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4514 data_fake.last_found=newSVsv(data->last_found);
4518 data_fake.last_closep = &fake;
4519 data_fake.flags = 0;
4520 data_fake.pos_delta = delta;
4522 data_fake.flags |= SF_IS_INF;
4523 if ( flags & SCF_DO_STCLASS && !scan->flags
4524 && OP(scan) == IFMATCH ) { /* Lookahead */
4525 cl_init(pRExC_state, &intrnl);
4526 data_fake.start_class = &intrnl;
4527 f |= SCF_DO_STCLASS_AND;
4529 if (flags & SCF_WHILEM_VISITED_POS)
4530 f |= SCF_WHILEM_VISITED_POS;
4531 next = regnext(scan);
4532 nscan = NEXTOPER(NEXTOPER(scan));
4534 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4535 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4538 FAIL("Variable length lookbehind not implemented");
4540 else if (*minnextp > (I32)U8_MAX) {
4541 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4543 scan->flags = (U8)*minnextp;
4548 if (f & SCF_DO_STCLASS_AND) {
4549 const int was = (data->start_class->flags & ANYOF_EOS);
4551 cl_and(data->start_class, &intrnl);
4553 data->start_class->flags |= ANYOF_EOS;
4556 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4558 if (data_fake.flags & SF_HAS_EVAL)
4559 data->flags |= SF_HAS_EVAL;
4560 data->whilem_c = data_fake.whilem_c;
4561 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4562 if (RExC_rx->minlen<*minnextp)
4563 RExC_rx->minlen=*minnextp;
4564 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4565 SvREFCNT_dec(data_fake.last_found);
4567 if ( data_fake.minlen_fixed != minlenp )
4569 data->offset_fixed= data_fake.offset_fixed;
4570 data->minlen_fixed= data_fake.minlen_fixed;
4571 data->lookbehind_fixed+= scan->flags;
4573 if ( data_fake.minlen_float != minlenp )
4575 data->minlen_float= data_fake.minlen_float;
4576 data->offset_float_min=data_fake.offset_float_min;
4577 data->offset_float_max=data_fake.offset_float_max;
4578 data->lookbehind_float+= scan->flags;
4585 else if (OP(scan) == OPEN) {
4586 if (stopparen != (I32)ARG(scan))
4589 else if (OP(scan) == CLOSE) {
4590 if (stopparen == (I32)ARG(scan)) {
4593 if ((I32)ARG(scan) == is_par) {
4594 next = regnext(scan);
4596 if ( next && (OP(next) != WHILEM) && next < last)
4597 is_par = 0; /* Disable optimization */
4600 *(data->last_closep) = ARG(scan);
4602 else if (OP(scan) == EVAL) {
4604 data->flags |= SF_HAS_EVAL;
4606 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4607 if (flags & SCF_DO_SUBSTR) {
4608 SCAN_COMMIT(pRExC_state,data,minlenp);
4609 flags &= ~SCF_DO_SUBSTR;
4611 if (data && OP(scan)==ACCEPT) {
4612 data->flags |= SCF_SEEN_ACCEPT;
4617 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4619 if (flags & SCF_DO_SUBSTR) {
4620 SCAN_COMMIT(pRExC_state,data,minlenp);
4621 data->longest = &(data->longest_float);
4623 is_inf = is_inf_internal = 1;
4624 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4625 cl_anything(pRExC_state, data->start_class);
4626 flags &= ~SCF_DO_STCLASS;
4628 else if (OP(scan) == GPOS) {
4629 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4630 !(delta || is_inf || (data && data->pos_delta)))
4632 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4633 RExC_rx->extflags |= RXf_ANCH_GPOS;
4634 if (RExC_rx->gofs < (U32)min)
4635 RExC_rx->gofs = min;
4637 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4641 #ifdef TRIE_STUDY_OPT
4642 #ifdef FULL_TRIE_STUDY
4643 else if (PL_regkind[OP(scan)] == TRIE) {
4644 /* NOTE - There is similar code to this block above for handling
4645 BRANCH nodes on the initial study. If you change stuff here
4647 regnode *trie_node= scan;
4648 regnode *tail= regnext(scan);
4649 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4650 I32 max1 = 0, min1 = I32_MAX;
4651 struct regnode_charclass_class accum;
4653 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4654 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4655 if (flags & SCF_DO_STCLASS)
4656 cl_init_zero(pRExC_state, &accum);
4662 const regnode *nextbranch= NULL;
4665 for ( word=1 ; word <= trie->wordcount ; word++)
4667 I32 deltanext=0, minnext=0, f = 0, fake;
4668 struct regnode_charclass_class this_class;
4670 data_fake.flags = 0;
4672 data_fake.whilem_c = data->whilem_c;
4673 data_fake.last_closep = data->last_closep;
4676 data_fake.last_closep = &fake;
4677 data_fake.pos_delta = delta;
4678 if (flags & SCF_DO_STCLASS) {
4679 cl_init(pRExC_state, &this_class);
4680 data_fake.start_class = &this_class;
4681 f = SCF_DO_STCLASS_AND;
4683 if (flags & SCF_WHILEM_VISITED_POS)
4684 f |= SCF_WHILEM_VISITED_POS;
4686 if (trie->jump[word]) {
4688 nextbranch = trie_node + trie->jump[0];
4689 scan= trie_node + trie->jump[word];
4690 /* We go from the jump point to the branch that follows
4691 it. Note this means we need the vestigal unused branches
4692 even though they arent otherwise used.
4694 minnext = study_chunk(pRExC_state, &scan, minlenp,
4695 &deltanext, (regnode *)nextbranch, &data_fake,
4696 stopparen, recursed, NULL, f,depth+1);
4698 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4699 nextbranch= regnext((regnode*)nextbranch);
4701 if (min1 > (I32)(minnext + trie->minlen))
4702 min1 = minnext + trie->minlen;
4703 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4704 max1 = minnext + deltanext + trie->maxlen;
4705 if (deltanext == I32_MAX)
4706 is_inf = is_inf_internal = 1;
4708 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4710 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4711 if ( stopmin > min + min1)
4712 stopmin = min + min1;
4713 flags &= ~SCF_DO_SUBSTR;
4715 data->flags |= SCF_SEEN_ACCEPT;
4718 if (data_fake.flags & SF_HAS_EVAL)
4719 data->flags |= SF_HAS_EVAL;
4720 data->whilem_c = data_fake.whilem_c;
4722 if (flags & SCF_DO_STCLASS)
4723 cl_or(pRExC_state, &accum, &this_class);
4726 if (flags & SCF_DO_SUBSTR) {
4727 data->pos_min += min1;
4728 data->pos_delta += max1 - min1;
4729 if (max1 != min1 || is_inf)
4730 data->longest = &(data->longest_float);
4733 delta += max1 - min1;
4734 if (flags & SCF_DO_STCLASS_OR) {
4735 cl_or(pRExC_state, data->start_class, &accum);
4737 cl_and(data->start_class, and_withp);
4738 flags &= ~SCF_DO_STCLASS;
4741 else if (flags & SCF_DO_STCLASS_AND) {
4743 cl_and(data->start_class, &accum);
4744 flags &= ~SCF_DO_STCLASS;
4747 /* Switch to OR mode: cache the old value of
4748 * data->start_class */
4750 StructCopy(data->start_class, and_withp,
4751 struct regnode_charclass_class);
4752 flags &= ~SCF_DO_STCLASS_AND;
4753 StructCopy(&accum, data->start_class,
4754 struct regnode_charclass_class);
4755 flags |= SCF_DO_STCLASS_OR;
4756 data->start_class->flags |= ANYOF_EOS;
4763 else if (PL_regkind[OP(scan)] == TRIE) {
4764 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4767 min += trie->minlen;
4768 delta += (trie->maxlen - trie->minlen);
4769 flags &= ~SCF_DO_STCLASS; /* xxx */
4770 if (flags & SCF_DO_SUBSTR) {
4771 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4772 data->pos_min += trie->minlen;
4773 data->pos_delta += (trie->maxlen - trie->minlen);
4774 if (trie->maxlen != trie->minlen)
4775 data->longest = &(data->longest_float);
4777 if (trie->jump) /* no more substrings -- for now /grr*/
4778 flags &= ~SCF_DO_SUBSTR;
4780 #endif /* old or new */
4781 #endif /* TRIE_STUDY_OPT */
4783 /* Else: zero-length, ignore. */
4784 scan = regnext(scan);
4789 stopparen = frame->stop;
4790 frame = frame->prev;
4791 goto fake_study_recurse;
4796 DEBUG_STUDYDATA("pre-fin:",data,depth);
4799 *deltap = is_inf_internal ? I32_MAX : delta;
4800 if (flags & SCF_DO_SUBSTR && is_inf)
4801 data->pos_delta = I32_MAX - data->pos_min;
4802 if (is_par > (I32)U8_MAX)
4804 if (is_par && pars==1 && data) {
4805 data->flags |= SF_IN_PAR;
4806 data->flags &= ~SF_HAS_PAR;
4808 else if (pars && data) {
4809 data->flags |= SF_HAS_PAR;
4810 data->flags &= ~SF_IN_PAR;
4812 if (flags & SCF_DO_STCLASS_OR)
4813 cl_and(data->start_class, and_withp);
4814 if (flags & SCF_TRIE_RESTUDY)
4815 data->flags |= SCF_TRIE_RESTUDY;
4817 DEBUG_STUDYDATA("post-fin:",data,depth);
4819 return min < stopmin ? min : stopmin;
4823 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4825 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4827 PERL_ARGS_ASSERT_ADD_DATA;
4829 Renewc(RExC_rxi->data,
4830 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4831 char, struct reg_data);
4833 Renew(RExC_rxi->data->what, count + n, U8);
4835 Newx(RExC_rxi->data->what, n, U8);
4836 RExC_rxi->data->count = count + n;
4837 Copy(s, RExC_rxi->data->what + count, n, U8);
4841 /*XXX: todo make this not included in a non debugging perl */
4842 #ifndef PERL_IN_XSUB_RE
4844 Perl_reginitcolors(pTHX)
4847 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4849 char *t = savepv(s);
4853 t = strchr(t, '\t');
4859 PL_colors[i] = t = (char *)"";
4864 PL_colors[i++] = (char *)"";
4871 #ifdef TRIE_STUDY_OPT
4872 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4875 (data.flags & SCF_TRIE_RESTUDY) \
4883 #define CHECK_RESTUDY_GOTO_butfirst
4887 * pregcomp - compile a regular expression into internal code
4889 * Decides which engine's compiler to call based on the hint currently in
4893 #ifndef PERL_IN_XSUB_RE
4895 /* return the currently in-scope regex engine (or the default if none) */
4897 regexp_engine const *
4898 Perl_current_re_engine(pTHX)
4902 if (IN_PERL_COMPILETIME) {
4903 HV * const table = GvHV(PL_hintgv);
4907 return &PL_core_reg_engine;
4908 ptr = hv_fetchs(table, "regcomp", FALSE);
4909 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4910 return &PL_core_reg_engine;
4911 return INT2PTR(regexp_engine*,SvIV(*ptr));
4915 if (!PL_curcop->cop_hints_hash)
4916 return &PL_core_reg_engine;
4917 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4918 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4919 return &PL_core_reg_engine;
4920 return INT2PTR(regexp_engine*,SvIV(ptr));
4926 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4929 regexp_engine const *eng = current_re_engine();
4930 GET_RE_DEBUG_FLAGS_DECL;
4932 PERL_ARGS_ASSERT_PREGCOMP;
4934 /* Dispatch a request to compile a regexp to correct regexp engine. */
4936 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4939 return CALLREGCOMP_ENG(eng, pattern, flags);
4943 /* public(ish) entry point for the perl core's own regex compiling code.
4944 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4945 * pattern rather than a list of OPs, and uses the internal engine rather
4946 * than the current one */
4949 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4951 SV *pat = pattern; /* defeat constness! */
4952 PERL_ARGS_ASSERT_RE_COMPILE;
4953 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4954 #ifdef PERL_IN_XSUB_RE
4957 &PL_core_reg_engine,
4959 NULL, NULL, rx_flags, 0);
4962 /* see if there are any run-time code blocks in the pattern.
4963 * False positives are allowed */
4966 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4967 U32 pm_flags, char *pat, STRLEN plen)
4972 /* avoid infinitely recursing when we recompile the pattern parcelled up
4973 * as qr'...'. A single constant qr// string can't have have any
4974 * run-time component in it, and thus, no runtime code. (A non-qr
4975 * string, however, can, e.g. $x =~ '(?{})') */
4976 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4979 for (s = 0; s < plen; s++) {
4980 if (n < pRExC_state->num_code_blocks
4981 && s == pRExC_state->code_blocks[n].start)
4983 s = pRExC_state->code_blocks[n].end;
4987 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4989 if (pat[s] == '(' && pat[s+1] == '?' &&
4990 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4997 /* Handle run-time code blocks. We will already have compiled any direct
4998 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4999 * copy of it, but with any literal code blocks blanked out and
5000 * appropriate chars escaped; then feed it into
5002 * eval "qr'modified_pattern'"
5006 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5010 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5012 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5013 * and merge them with any code blocks of the original regexp.
5015 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5016 * instead, just save the qr and return FALSE; this tells our caller that
5017 * the original pattern needs upgrading to utf8.
5021 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5022 char *pat, STRLEN plen)
5026 GET_RE_DEBUG_FLAGS_DECL;
5028 if (pRExC_state->runtime_code_qr) {
5029 /* this is the second time we've been called; this should
5030 * only happen if the main pattern got upgraded to utf8
5031 * during compilation; re-use the qr we compiled first time
5032 * round (which should be utf8 too)
5034 qr = pRExC_state->runtime_code_qr;
5035 pRExC_state->runtime_code_qr = NULL;
5036 assert(RExC_utf8 && SvUTF8(qr));
5042 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5046 /* determine how many extra chars we need for ' and \ escaping */
5047 for (s = 0; s < plen; s++) {
5048 if (pat[s] == '\'' || pat[s] == '\\')
5052 Newx(newpat, newlen, char);
5054 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5056 for (s = 0; s < plen; s++) {
5057 if (n < pRExC_state->num_code_blocks
5058 && s == pRExC_state->code_blocks[n].start)
5060 /* blank out literal code block */
5061 assert(pat[s] == '(');
5062 while (s <= pRExC_state->code_blocks[n].end) {
5070 if (pat[s] == '\'' || pat[s] == '\\')
5075 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5079 PerlIO_printf(Perl_debug_log,
5080 "%sre-parsing pattern for runtime code:%s %s\n",
5081 PL_colors[4],PL_colors[5],newpat);
5084 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5090 PUSHSTACKi(PERLSI_REQUIRE);
5091 /* this causes the toker to collapse \\ into \ when parsing
5092 * qr''; normally only q'' does this. It also alters hints
5094 PL_reg_state.re_reparsing = TRUE;
5095 eval_sv(sv, G_SCALAR);
5101 SV * const errsv = ERRSV;
5102 if (SvTRUE_NN(errsv))
5104 Safefree(pRExC_state->code_blocks);
5105 /* use croak_sv ? */
5106 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5109 assert(SvROK(qr_ref));
5111 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5112 /* the leaving below frees the tmp qr_ref.
5113 * Give qr a life of its own */
5121 if (!RExC_utf8 && SvUTF8(qr)) {
5122 /* first time through; the pattern got upgraded; save the
5123 * qr for the next time through */
5124 assert(!pRExC_state->runtime_code_qr);
5125 pRExC_state->runtime_code_qr = qr;
5130 /* extract any code blocks within the returned qr// */
5133 /* merge the main (r1) and run-time (r2) code blocks into one */
5135 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5136 struct reg_code_block *new_block, *dst;
5137 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5140 if (!r2->num_code_blocks) /* we guessed wrong */
5147 r1->num_code_blocks + r2->num_code_blocks,
5148 struct reg_code_block);
5151 while ( i1 < r1->num_code_blocks
5152 || i2 < r2->num_code_blocks)
5154 struct reg_code_block *src;
5157 if (i1 == r1->num_code_blocks) {
5158 src = &r2->code_blocks[i2++];
5161 else if (i2 == r2->num_code_blocks)
5162 src = &r1->code_blocks[i1++];
5163 else if ( r1->code_blocks[i1].start
5164 < r2->code_blocks[i2].start)
5166 src = &r1->code_blocks[i1++];
5167 assert(src->end < r2->code_blocks[i2].start);
5170 assert( r1->code_blocks[i1].start
5171 > r2->code_blocks[i2].start);
5172 src = &r2->code_blocks[i2++];
5174 assert(src->end < r1->code_blocks[i1].start);
5177 assert(pat[src->start] == '(');
5178 assert(pat[src->end] == ')');
5179 dst->start = src->start;
5180 dst->end = src->end;
5181 dst->block = src->block;
5182 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5186 r1->num_code_blocks += r2->num_code_blocks;
5187 Safefree(r1->code_blocks);
5188 r1->code_blocks = new_block;
5197 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5199 /* This is the common code for setting up the floating and fixed length
5200 * string data extracted from Perlre_op_compile() below. Returns a boolean
5201 * as to whether succeeded or not */
5205 if (! (longest_length
5206 || (eol /* Can't have SEOL and MULTI */
5207 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5209 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5210 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5215 /* copy the information about the longest from the reg_scan_data
5216 over to the program. */
5217 if (SvUTF8(sv_longest)) {
5218 *rx_utf8 = sv_longest;
5221 *rx_substr = sv_longest;
5224 /* end_shift is how many chars that must be matched that
5225 follow this item. We calculate it ahead of time as once the
5226 lookbehind offset is added in we lose the ability to correctly
5228 ml = minlen ? *(minlen) : (I32)longest_length;
5229 *rx_end_shift = ml - offset
5230 - longest_length + (SvTAIL(sv_longest) != 0)
5233 t = (eol/* Can't have SEOL and MULTI */
5234 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5235 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5241 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5242 * regular expression into internal code.
5243 * The pattern may be passed either as:
5244 * a list of SVs (patternp plus pat_count)
5245 * a list of OPs (expr)
5246 * If both are passed, the SV list is used, but the OP list indicates
5247 * which SVs are actually pre-compiled code blocks
5249 * The SVs in the list have magic and qr overloading applied to them (and
5250 * the list may be modified in-place with replacement SVs in the latter
5253 * If the pattern hasn't changed from old_re, then old_re will be
5256 * eng is the current engine. If that engine has an op_comp method, then
5257 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5258 * do the initial concatenation of arguments and pass on to the external
5261 * If is_bare_re is not null, set it to a boolean indicating whether the
5262 * arg list reduced (after overloading) to a single bare regex which has
5263 * been returned (i.e. /$qr/).
5265 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5267 * pm_flags contains the PMf_* flags, typically based on those from the
5268 * pm_flags field of the related PMOP. Currently we're only interested in
5269 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5271 * We can't allocate space until we know how big the compiled form will be,
5272 * but we can't compile it (and thus know how big it is) until we've got a
5273 * place to put the code. So we cheat: we compile it twice, once with code
5274 * generation turned off and size counting turned on, and once "for real".
5275 * This also means that we don't allocate space until we are sure that the
5276 * thing really will compile successfully, and we never have to move the
5277 * code and thus invalidate pointers into it. (Note that it has to be in
5278 * one piece because free() must be able to free it all.) [NB: not true in perl]
5280 * Beware that the optimization-preparation code in here knows about some
5281 * of the structure of the compiled regexp. [I'll say.]
5285 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5286 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5287 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5292 regexp_internal *ri;
5301 SV * VOL code_blocksv = NULL;
5303 /* these are all flags - maybe they should be turned
5304 * into a single int with different bit masks */
5305 I32 sawlookahead = 0;
5308 bool used_setjump = FALSE;
5309 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5310 bool code_is_utf8 = 0;
5311 bool VOL recompile = 0;
5312 bool runtime_code = 0;
5316 RExC_state_t RExC_state;
5317 RExC_state_t * const pRExC_state = &RExC_state;
5318 #ifdef TRIE_STUDY_OPT
5320 RExC_state_t copyRExC_state;
5322 GET_RE_DEBUG_FLAGS_DECL;
5324 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5326 DEBUG_r(if (!PL_colorset) reginitcolors());
5328 #ifndef PERL_IN_XSUB_RE
5329 /* Initialize these here instead of as-needed, as is quick and avoids
5330 * having to test them each time otherwise */
5331 if (! PL_AboveLatin1) {
5332 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5333 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5334 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5336 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5337 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5339 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5340 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5342 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5343 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5345 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5347 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5348 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5350 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5352 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5353 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5355 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5356 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5358 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5359 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5361 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5362 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5364 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5365 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5367 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5368 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5370 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5371 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5373 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5375 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5376 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5378 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5379 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5381 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5385 pRExC_state->code_blocks = NULL;
5386 pRExC_state->num_code_blocks = 0;
5389 *is_bare_re = FALSE;
5391 if (expr && (expr->op_type == OP_LIST ||
5392 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5394 /* is the source UTF8, and how many code blocks are there? */
5398 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5399 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5401 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5402 /* count of DO blocks */
5406 pRExC_state->num_code_blocks = ncode;
5407 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5412 /* handle a list of SVs */
5416 /* apply magic and RE overloading to each arg */
5417 for (svp = patternp; svp < patternp + pat_count; svp++) {
5420 if (SvROK(rx) && SvAMAGIC(rx)) {
5421 SV *sv = AMG_CALLunary(rx, regexp_amg);
5425 if (SvTYPE(sv) != SVt_REGEXP)
5426 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5432 if (pat_count > 1) {
5433 /* concat multiple args and find any code block indexes */
5438 STRLEN orig_patlen = 0;
5440 if (pRExC_state->num_code_blocks) {
5441 o = cLISTOPx(expr)->op_first;
5442 assert( o->op_type == OP_PUSHMARK
5443 || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
5444 || o->op_type == OP_PADRANGE);
5448 pat = newSVpvn("", 0);
5451 /* determine if the pattern is going to be utf8 (needed
5452 * in advance to align code block indices correctly).
5453 * XXX This could fail to be detected for an arg with
5454 * overloading but not concat overloading; but the main effect
5455 * in this obscure case is to need a 'use re eval' for a
5456 * literal code block */
5457 for (svp = patternp; svp < patternp + pat_count; svp++) {
5464 for (svp = patternp; svp < patternp + pat_count; svp++) {
5465 SV *sv, *msv = *svp;
5468 /* we make the assumption here that each op in the list of
5469 * op_siblings maps to one SV pushed onto the stack,
5470 * except for code blocks, with have both an OP_NULL and
5472 * This allows us to match up the list of SVs against the
5473 * list of OPs to find the next code block.
5475 * Note that PUSHMARK PADSV PADSV ..
5477 * PADRANGE NULL NULL ..
5478 * so the alignment still works. */
5480 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5481 assert(n < pRExC_state->num_code_blocks);
5482 pRExC_state->code_blocks[n].start = SvCUR(pat);
5483 pRExC_state->code_blocks[n].block = o;
5484 pRExC_state->code_blocks[n].src_regex = NULL;
5487 o = o->op_sibling; /* skip CONST */
5493 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5494 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5497 /* overloading involved: all bets are off over literal
5498 * code. Pretend we haven't seen it */
5499 pRExC_state->num_code_blocks -= n;
5505 while (SvAMAGIC(msv)
5506 && (sv = AMG_CALLunary(msv, string_amg))
5510 && SvRV(msv) == SvRV(sv))
5515 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5517 orig_patlen = SvCUR(pat);
5518 sv_catsv_nomg(pat, msv);
5521 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5524 /* extract any code blocks within any embedded qr//'s */
5525 if (rx && SvTYPE(rx) == SVt_REGEXP
5526 && RX_ENGINE((REGEXP*)rx)->op_comp)
5529 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5530 if (ri->num_code_blocks) {
5532 /* the presence of an embedded qr// with code means
5533 * we should always recompile: the text of the
5534 * qr// may not have changed, but it may be a
5535 * different closure than last time */
5537 Renew(pRExC_state->code_blocks,
5538 pRExC_state->num_code_blocks + ri->num_code_blocks,
5539 struct reg_code_block);
5540 pRExC_state->num_code_blocks += ri->num_code_blocks;
5541 for (i=0; i < ri->num_code_blocks; i++) {
5542 struct reg_code_block *src, *dst;
5543 STRLEN offset = orig_patlen
5544 + ReANY((REGEXP *)rx)->pre_prefix;
5545 assert(n < pRExC_state->num_code_blocks);
5546 src = &ri->code_blocks[i];
5547 dst = &pRExC_state->code_blocks[n];
5548 dst->start = src->start + offset;
5549 dst->end = src->end + offset;
5550 dst->block = src->block;
5551 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5565 while (SvAMAGIC(pat)
5566 && (sv = AMG_CALLunary(pat, string_amg))
5574 /* handle bare regex: foo =~ $re */
5579 if (SvTYPE(re) == SVt_REGEXP) {
5583 Safefree(pRExC_state->code_blocks);
5589 /* not a list of SVs, so must be a list of OPs */
5591 if (expr->op_type == OP_LIST) {
5596 pat = newSVpvn("", 0);
5601 /* given a list of CONSTs and DO blocks in expr, append all
5602 * the CONSTs to pat, and record the start and end of each
5603 * code block in code_blocks[] (each DO{} op is followed by an
5604 * OP_CONST containing the corresponding literal '(?{...})
5607 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5608 if (o->op_type == OP_CONST) {
5609 sv_catsv(pat, cSVOPo_sv);
5611 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5615 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5616 assert(i+1 < pRExC_state->num_code_blocks);
5617 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5618 pRExC_state->code_blocks[i].block = o;
5619 pRExC_state->code_blocks[i].src_regex = NULL;
5625 assert(expr->op_type == OP_CONST);
5626 pat = cSVOPx_sv(expr);
5630 exp = SvPV_nomg(pat, plen);
5632 if (!eng->op_comp) {
5633 if ((SvUTF8(pat) && IN_BYTES)
5634 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5636 /* make a temporary copy; either to convert to bytes,
5637 * or to avoid repeating get-magic / overloaded stringify */
5638 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5639 (IN_BYTES ? 0 : SvUTF8(pat)));
5641 Safefree(pRExC_state->code_blocks);
5642 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5645 /* ignore the utf8ness if the pattern is 0 length */
5646 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5647 RExC_uni_semantics = 0;
5648 RExC_contains_locale = 0;
5649 pRExC_state->runtime_code_qr = NULL;
5651 /****************** LONG JUMP TARGET HERE***********************/
5652 /* Longjmp back to here if have to switch in midstream to utf8 */
5653 if (! RExC_orig_utf8) {
5654 JMPENV_PUSH(jump_ret);
5655 used_setjump = TRUE;
5658 if (jump_ret == 0) { /* First time through */
5662 SV *dsv= sv_newmortal();
5663 RE_PV_QUOTED_DECL(s, RExC_utf8,
5664 dsv, exp, plen, 60);
5665 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5666 PL_colors[4],PL_colors[5],s);
5669 else { /* longjumped back */
5672 STRLEN s = 0, d = 0;
5675 /* If the cause for the longjmp was other than changing to utf8, pop
5676 * our own setjmp, and longjmp to the correct handler */
5677 if (jump_ret != UTF8_LONGJMP) {
5679 JMPENV_JUMP(jump_ret);
5684 /* It's possible to write a regexp in ascii that represents Unicode
5685 codepoints outside of the byte range, such as via \x{100}. If we
5686 detect such a sequence we have to convert the entire pattern to utf8
5687 and then recompile, as our sizing calculation will have been based
5688 on 1 byte == 1 character, but we will need to use utf8 to encode
5689 at least some part of the pattern, and therefore must convert the whole
5692 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5693 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5695 /* upgrade pattern to UTF8, and if there are code blocks,
5696 * recalculate the indices.
5697 * This is essentially an unrolled Perl_bytes_to_utf8() */
5699 src = (U8*)SvPV_nomg(pat, plen);
5700 Newx(dst, plen * 2 + 1, U8);
5703 const UV uv = NATIVE_TO_ASCII(src[s]);
5704 if (UNI_IS_INVARIANT(uv))
5705 dst[d] = (U8)UTF_TO_NATIVE(uv);
5707 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5708 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5710 if (n < pRExC_state->num_code_blocks) {
5711 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5712 pRExC_state->code_blocks[n].start = d;
5713 assert(dst[d] == '(');
5716 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5717 pRExC_state->code_blocks[n].end = d;
5718 assert(dst[d] == ')');
5731 RExC_orig_utf8 = RExC_utf8 = 1;
5734 /* return old regex if pattern hasn't changed */
5738 && !!RX_UTF8(old_re) == !!RExC_utf8
5739 && RX_PRECOMP(old_re)
5740 && RX_PRELEN(old_re) == plen
5741 && memEQ(RX_PRECOMP(old_re), exp, plen))
5743 /* with runtime code, always recompile */
5744 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5746 if (!runtime_code) {
5750 Safefree(pRExC_state->code_blocks);
5754 else if ((pm_flags & PMf_USE_RE_EVAL)
5755 /* this second condition covers the non-regex literal case,
5756 * i.e. $foo =~ '(?{})'. */
5757 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5758 && (PL_hints & HINT_RE_EVAL))
5760 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5763 #ifdef TRIE_STUDY_OPT
5767 rx_flags = orig_rx_flags;
5769 if (initial_charset == REGEX_LOCALE_CHARSET) {
5770 RExC_contains_locale = 1;
5772 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5774 /* Set to use unicode semantics if the pattern is in utf8 and has the
5775 * 'depends' charset specified, as it means unicode when utf8 */
5776 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5780 RExC_flags = rx_flags;
5781 RExC_pm_flags = pm_flags;
5784 if (TAINTING_get && TAINT_get)
5785 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5787 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5788 /* whoops, we have a non-utf8 pattern, whilst run-time code
5789 * got compiled as utf8. Try again with a utf8 pattern */
5790 JMPENV_JUMP(UTF8_LONGJMP);
5793 assert(!pRExC_state->runtime_code_qr);
5798 RExC_in_lookbehind = 0;
5799 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5801 RExC_override_recoding = 0;
5802 RExC_in_multi_char_class = 0;
5804 /* First pass: determine size, legality. */
5812 RExC_emit = &PL_regdummy;
5813 RExC_whilem_seen = 0;
5814 RExC_open_parens = NULL;
5815 RExC_close_parens = NULL;
5817 RExC_paren_names = NULL;
5819 RExC_paren_name_list = NULL;
5821 RExC_recurse = NULL;
5822 RExC_recurse_count = 0;
5823 pRExC_state->code_index = 0;
5825 #if 0 /* REGC() is (currently) a NOP at the first pass.
5826 * Clever compilers notice this and complain. --jhi */
5827 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5830 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5832 RExC_lastparse=NULL;
5834 /* reg may croak on us, not giving us a chance to free
5835 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5836 need it to survive as long as the regexp (qr/(?{})/).
5837 We must check that code_blocksv is not already set, because we may
5838 have longjmped back. */
5839 if (pRExC_state->code_blocks && !code_blocksv) {
5840 code_blocksv = newSV_type(SVt_PV);
5841 SAVEFREESV(code_blocksv);
5842 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5843 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5845 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5846 RExC_precomp = NULL;
5850 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5852 /* Here, finished first pass. Get rid of any added setjmp */
5858 PerlIO_printf(Perl_debug_log,
5859 "Required size %"IVdf" nodes\n"
5860 "Starting second pass (creation)\n",
5863 RExC_lastparse=NULL;
5866 /* The first pass could have found things that force Unicode semantics */
5867 if ((RExC_utf8 || RExC_uni_semantics)
5868 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5870 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5873 /* Small enough for pointer-storage convention?
5874 If extralen==0, this means that we will not need long jumps. */
5875 if (RExC_size >= 0x10000L && RExC_extralen)
5876 RExC_size += RExC_extralen;
5879 if (RExC_whilem_seen > 15)
5880 RExC_whilem_seen = 15;
5882 /* Allocate space and zero-initialize. Note, the two step process
5883 of zeroing when in debug mode, thus anything assigned has to
5884 happen after that */
5885 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5887 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5888 char, regexp_internal);
5889 if ( r == NULL || ri == NULL )
5890 FAIL("Regexp out of space");
5892 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5893 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5895 /* bulk initialize base fields with 0. */
5896 Zero(ri, sizeof(regexp_internal), char);
5899 /* non-zero initialization begins here */
5902 r->extflags = rx_flags;
5903 if (pm_flags & PMf_IS_QR) {
5904 ri->code_blocks = pRExC_state->code_blocks;
5905 ri->num_code_blocks = pRExC_state->num_code_blocks;
5910 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5911 if (pRExC_state->code_blocks[n].src_regex)
5912 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5913 SAVEFREEPV(pRExC_state->code_blocks);
5917 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5918 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5920 /* The caret is output if there are any defaults: if not all the STD
5921 * flags are set, or if no character set specifier is needed */
5923 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5925 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5926 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5927 >> RXf_PMf_STD_PMMOD_SHIFT);
5928 const char *fptr = STD_PAT_MODS; /*"msix"*/
5930 /* Allocate for the worst case, which is all the std flags are turned
5931 * on. If more precision is desired, we could do a population count of
5932 * the flags set. This could be done with a small lookup table, or by
5933 * shifting, masking and adding, or even, when available, assembly
5934 * language for a machine-language population count.
5935 * We never output a minus, as all those are defaults, so are
5936 * covered by the caret */
5937 const STRLEN wraplen = plen + has_p + has_runon
5938 + has_default /* If needs a caret */
5940 /* If needs a character set specifier */
5941 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5942 + (sizeof(STD_PAT_MODS) - 1)
5943 + (sizeof("(?:)") - 1);
5945 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5946 r->xpv_len_u.xpvlenu_pv = p;
5948 SvFLAGS(rx) |= SVf_UTF8;
5951 /* If a default, cover it using the caret */
5953 *p++= DEFAULT_PAT_MOD;
5957 const char* const name = get_regex_charset_name(r->extflags, &len);
5958 Copy(name, p, len, char);
5962 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5965 while((ch = *fptr++)) {
5973 Copy(RExC_precomp, p, plen, char);
5974 assert ((RX_WRAPPED(rx) - p) < 16);
5975 r->pre_prefix = p - RX_WRAPPED(rx);
5981 SvCUR_set(rx, p - RX_WRAPPED(rx));
5985 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5987 if (RExC_seen & REG_SEEN_RECURSE) {
5988 Newxz(RExC_open_parens, RExC_npar,regnode *);
5989 SAVEFREEPV(RExC_open_parens);
5990 Newxz(RExC_close_parens,RExC_npar,regnode *);
5991 SAVEFREEPV(RExC_close_parens);
5994 /* Useful during FAIL. */
5995 #ifdef RE_TRACK_PATTERN_OFFSETS
5996 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5997 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5998 "%s %"UVuf" bytes for offset annotations.\n",
5999 ri->u.offsets ? "Got" : "Couldn't get",
6000 (UV)((2*RExC_size+1) * sizeof(U32))));
6002 SetProgLen(ri,RExC_size);
6007 /* Second pass: emit code. */
6008 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6009 RExC_pm_flags = pm_flags;
6014 RExC_emit_start = ri->program;
6015 RExC_emit = ri->program;
6016 RExC_emit_bound = ri->program + RExC_size + 1;
6017 pRExC_state->code_index = 0;
6019 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6020 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6024 /* XXXX To minimize changes to RE engine we always allocate
6025 3-units-long substrs field. */
6026 Newx(r->substrs, 1, struct reg_substr_data);
6027 if (RExC_recurse_count) {
6028 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6029 SAVEFREEPV(RExC_recurse);
6033 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
6034 Zero(r->substrs, 1, struct reg_substr_data);
6036 #ifdef TRIE_STUDY_OPT
6038 StructCopy(&zero_scan_data, &data, scan_data_t);
6039 copyRExC_state = RExC_state;
6042 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6044 RExC_state = copyRExC_state;
6045 if (seen & REG_TOP_LEVEL_BRANCHES)
6046 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6048 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6049 StructCopy(&zero_scan_data, &data, scan_data_t);
6052 StructCopy(&zero_scan_data, &data, scan_data_t);
6055 /* Dig out information for optimizations. */
6056 r->extflags = RExC_flags; /* was pm_op */
6057 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6060 SvUTF8_on(rx); /* Unicode in it? */
6061 ri->regstclass = NULL;
6062 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6063 r->intflags |= PREGf_NAUGHTY;
6064 scan = ri->program + 1; /* First BRANCH. */
6066 /* testing for BRANCH here tells us whether there is "must appear"
6067 data in the pattern. If there is then we can use it for optimisations */
6068 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6070 STRLEN longest_float_length, longest_fixed_length;
6071 struct regnode_charclass_class ch_class; /* pointed to by data */
6073 I32 last_close = 0; /* pointed to by data */
6074 regnode *first= scan;
6075 regnode *first_next= regnext(first);
6077 * Skip introductions and multiplicators >= 1
6078 * so that we can extract the 'meat' of the pattern that must
6079 * match in the large if() sequence following.
6080 * NOTE that EXACT is NOT covered here, as it is normally
6081 * picked up by the optimiser separately.
6083 * This is unfortunate as the optimiser isnt handling lookahead
6084 * properly currently.
6087 while ((OP(first) == OPEN && (sawopen = 1)) ||
6088 /* An OR of *one* alternative - should not happen now. */
6089 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6090 /* for now we can't handle lookbehind IFMATCH*/
6091 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6092 (OP(first) == PLUS) ||
6093 (OP(first) == MINMOD) ||
6094 /* An {n,m} with n>0 */
6095 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6096 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6099 * the only op that could be a regnode is PLUS, all the rest
6100 * will be regnode_1 or regnode_2.
6103 if (OP(first) == PLUS)
6106 first += regarglen[OP(first)];
6108 first = NEXTOPER(first);
6109 first_next= regnext(first);
6112 /* Starting-point info. */
6114 DEBUG_PEEP("first:",first,0);
6115 /* Ignore EXACT as we deal with it later. */
6116 if (PL_regkind[OP(first)] == EXACT) {
6117 if (OP(first) == EXACT)
6118 NOOP; /* Empty, get anchored substr later. */
6120 ri->regstclass = first;
6123 else if (PL_regkind[OP(first)] == TRIE &&
6124 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6127 /* this can happen only on restudy */
6128 if ( OP(first) == TRIE ) {
6129 struct regnode_1 *trieop = (struct regnode_1 *)
6130 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6131 StructCopy(first,trieop,struct regnode_1);
6132 trie_op=(regnode *)trieop;
6134 struct regnode_charclass *trieop = (struct regnode_charclass *)
6135 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6136 StructCopy(first,trieop,struct regnode_charclass);
6137 trie_op=(regnode *)trieop;
6140 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6141 ri->regstclass = trie_op;
6144 else if (REGNODE_SIMPLE(OP(first)))
6145 ri->regstclass = first;
6146 else if (PL_regkind[OP(first)] == BOUND ||
6147 PL_regkind[OP(first)] == NBOUND)
6148 ri->regstclass = first;
6149 else if (PL_regkind[OP(first)] == BOL) {
6150 r->extflags |= (OP(first) == MBOL
6152 : (OP(first) == SBOL
6155 first = NEXTOPER(first);
6158 else if (OP(first) == GPOS) {
6159 r->extflags |= RXf_ANCH_GPOS;
6160 first = NEXTOPER(first);
6163 else if ((!sawopen || !RExC_sawback) &&
6164 (OP(first) == STAR &&
6165 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6166 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6168 /* turn .* into ^.* with an implied $*=1 */
6170 (OP(NEXTOPER(first)) == REG_ANY)
6173 r->extflags |= type;
6174 r->intflags |= PREGf_IMPLICIT;
6175 first = NEXTOPER(first);
6178 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6179 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6180 /* x+ must match at the 1st pos of run of x's */
6181 r->intflags |= PREGf_SKIP;
6183 /* Scan is after the zeroth branch, first is atomic matcher. */
6184 #ifdef TRIE_STUDY_OPT
6187 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6188 (IV)(first - scan + 1))
6192 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6193 (IV)(first - scan + 1))
6199 * If there's something expensive in the r.e., find the
6200 * longest literal string that must appear and make it the
6201 * regmust. Resolve ties in favor of later strings, since
6202 * the regstart check works with the beginning of the r.e.
6203 * and avoiding duplication strengthens checking. Not a
6204 * strong reason, but sufficient in the absence of others.
6205 * [Now we resolve ties in favor of the earlier string if
6206 * it happens that c_offset_min has been invalidated, since the
6207 * earlier string may buy us something the later one won't.]
6210 data.longest_fixed = newSVpvs("");
6211 data.longest_float = newSVpvs("");
6212 data.last_found = newSVpvs("");
6213 data.longest = &(data.longest_fixed);
6214 ENTER_with_name("study_chunk");
6215 SAVEFREESV(data.longest_fixed);
6216 SAVEFREESV(data.longest_float);
6217 SAVEFREESV(data.last_found);
6219 if (!ri->regstclass) {
6220 cl_init(pRExC_state, &ch_class);
6221 data.start_class = &ch_class;
6222 stclass_flag = SCF_DO_STCLASS_AND;
6223 } else /* XXXX Check for BOUND? */
6225 data.last_closep = &last_close;
6227 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6228 &data, -1, NULL, NULL,
6229 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6232 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6235 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6236 && data.last_start_min == 0 && data.last_end > 0
6237 && !RExC_seen_zerolen
6238 && !(RExC_seen & REG_SEEN_VERBARG)
6239 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6240 r->extflags |= RXf_CHECK_ALL;
6241 scan_commit(pRExC_state, &data,&minlen,0);
6243 longest_float_length = CHR_SVLEN(data.longest_float);
6245 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6246 && data.offset_fixed == data.offset_float_min
6247 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6248 && S_setup_longest (aTHX_ pRExC_state,
6252 &(r->float_end_shift),
6253 data.lookbehind_float,
6254 data.offset_float_min,
6256 longest_float_length,
6257 data.flags & SF_FL_BEFORE_EOL,
6258 data.flags & SF_FL_BEFORE_MEOL))
6260 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6261 r->float_max_offset = data.offset_float_max;
6262 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6263 r->float_max_offset -= data.lookbehind_float;
6264 SvREFCNT_inc_simple_void_NN(data.longest_float);
6267 r->float_substr = r->float_utf8 = NULL;
6268 longest_float_length = 0;
6271 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6273 if (S_setup_longest (aTHX_ pRExC_state,
6275 &(r->anchored_utf8),
6276 &(r->anchored_substr),
6277 &(r->anchored_end_shift),
6278 data.lookbehind_fixed,
6281 longest_fixed_length,
6282 data.flags & SF_FIX_BEFORE_EOL,
6283 data.flags & SF_FIX_BEFORE_MEOL))
6285 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6286 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6289 r->anchored_substr = r->anchored_utf8 = NULL;
6290 longest_fixed_length = 0;
6292 LEAVE_with_name("study_chunk");
6295 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6296 ri->regstclass = NULL;
6298 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6300 && !(data.start_class->flags & ANYOF_EOS)
6301 && !cl_is_anything(data.start_class))
6303 const U32 n = add_data(pRExC_state, 1, "f");
6304 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6306 Newx(RExC_rxi->data->data[n], 1,
6307 struct regnode_charclass_class);
6308 StructCopy(data.start_class,
6309 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6310 struct regnode_charclass_class);
6311 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6312 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6313 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6314 regprop(r, sv, (regnode*)data.start_class);
6315 PerlIO_printf(Perl_debug_log,
6316 "synthetic stclass \"%s\".\n",
6317 SvPVX_const(sv));});
6320 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6321 if (longest_fixed_length > longest_float_length) {
6322 r->check_end_shift = r->anchored_end_shift;
6323 r->check_substr = r->anchored_substr;
6324 r->check_utf8 = r->anchored_utf8;
6325 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6326 if (r->extflags & RXf_ANCH_SINGLE)
6327 r->extflags |= RXf_NOSCAN;
6330 r->check_end_shift = r->float_end_shift;
6331 r->check_substr = r->float_substr;
6332 r->check_utf8 = r->float_utf8;
6333 r->check_offset_min = r->float_min_offset;
6334 r->check_offset_max = r->float_max_offset;
6336 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6337 This should be changed ASAP! */
6338 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6339 r->extflags |= RXf_USE_INTUIT;
6340 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6341 r->extflags |= RXf_INTUIT_TAIL;
6343 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6344 if ( (STRLEN)minlen < longest_float_length )
6345 minlen= longest_float_length;
6346 if ( (STRLEN)minlen < longest_fixed_length )
6347 minlen= longest_fixed_length;
6351 /* Several toplevels. Best we can is to set minlen. */
6353 struct regnode_charclass_class ch_class;
6356 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6358 scan = ri->program + 1;
6359 cl_init(pRExC_state, &ch_class);
6360 data.start_class = &ch_class;
6361 data.last_closep = &last_close;
6364 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6365 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6367 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6369 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6370 = r->float_substr = r->float_utf8 = NULL;
6372 if (!(data.start_class->flags & ANYOF_EOS)
6373 && !cl_is_anything(data.start_class))
6375 const U32 n = add_data(pRExC_state, 1, "f");
6376 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6378 Newx(RExC_rxi->data->data[n], 1,
6379 struct regnode_charclass_class);
6380 StructCopy(data.start_class,
6381 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6382 struct regnode_charclass_class);
6383 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6384 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6385 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6386 regprop(r, sv, (regnode*)data.start_class);
6387 PerlIO_printf(Perl_debug_log,
6388 "synthetic stclass \"%s\".\n",
6389 SvPVX_const(sv));});
6393 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6394 the "real" pattern. */
6396 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6397 (IV)minlen, (IV)r->minlen);
6399 r->minlenret = minlen;
6400 if (r->minlen < minlen)
6403 if (RExC_seen & REG_SEEN_GPOS)
6404 r->extflags |= RXf_GPOS_SEEN;
6405 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6406 r->extflags |= RXf_LOOKBEHIND_SEEN;
6407 if (pRExC_state->num_code_blocks)
6408 r->extflags |= RXf_EVAL_SEEN;
6409 if (RExC_seen & REG_SEEN_CANY)
6410 r->extflags |= RXf_CANY_SEEN;
6411 if (RExC_seen & REG_SEEN_VERBARG)
6413 r->intflags |= PREGf_VERBARG_SEEN;
6414 r->extflags |= RXf_MODIFIES_VARS;
6416 if (RExC_seen & REG_SEEN_CUTGROUP)
6417 r->intflags |= PREGf_CUTGROUP_SEEN;
6418 if (pm_flags & PMf_USE_RE_EVAL)
6419 r->intflags |= PREGf_USE_RE_EVAL;
6420 if (RExC_paren_names)
6421 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6423 RXp_PAREN_NAMES(r) = NULL;
6425 #ifdef STUPID_PATTERN_CHECKS
6426 if (RX_PRELEN(rx) == 0)
6427 r->extflags |= RXf_NULL;
6428 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6429 r->extflags |= RXf_WHITE;
6430 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6431 r->extflags |= RXf_START_ONLY;
6434 regnode *first = ri->program + 1;
6437 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6438 r->extflags |= RXf_NULL;
6439 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6440 r->extflags |= RXf_START_ONLY;
6441 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6442 && OP(regnext(first)) == END)
6443 r->extflags |= RXf_WHITE;
6447 if (RExC_paren_names) {
6448 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6449 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6452 ri->name_list_idx = 0;
6454 if (RExC_recurse_count) {
6455 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6456 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6457 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6460 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6461 /* assume we don't need to swap parens around before we match */
6464 PerlIO_printf(Perl_debug_log,"Final program:\n");
6467 #ifdef RE_TRACK_PATTERN_OFFSETS
6468 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6469 const U32 len = ri->u.offsets[0];
6471 GET_RE_DEBUG_FLAGS_DECL;
6472 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6473 for (i = 1; i <= len; i++) {
6474 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6475 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6476 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6478 PerlIO_printf(Perl_debug_log, "\n");
6486 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6489 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6491 PERL_UNUSED_ARG(value);
6493 if (flags & RXapif_FETCH) {
6494 return reg_named_buff_fetch(rx, key, flags);
6495 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6496 Perl_croak_no_modify();
6498 } else if (flags & RXapif_EXISTS) {
6499 return reg_named_buff_exists(rx, key, flags)
6502 } else if (flags & RXapif_REGNAMES) {
6503 return reg_named_buff_all(rx, flags);
6504 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6505 return reg_named_buff_scalar(rx, flags);
6507 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6513 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6516 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6517 PERL_UNUSED_ARG(lastkey);
6519 if (flags & RXapif_FIRSTKEY)
6520 return reg_named_buff_firstkey(rx, flags);
6521 else if (flags & RXapif_NEXTKEY)
6522 return reg_named_buff_nextkey(rx, flags);
6524 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6530 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6533 AV *retarray = NULL;
6535 struct regexp *const rx = ReANY(r);
6537 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6539 if (flags & RXapif_ALL)
6542 if (rx && RXp_PAREN_NAMES(rx)) {
6543 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6546 SV* sv_dat=HeVAL(he_str);
6547 I32 *nums=(I32*)SvPVX(sv_dat);
6548 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6549 if ((I32)(rx->nparens) >= nums[i]
6550 && rx->offs[nums[i]].start != -1
6551 && rx->offs[nums[i]].end != -1)
6554 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6559 ret = newSVsv(&PL_sv_undef);
6562 av_push(retarray, ret);
6565 return newRV_noinc(MUTABLE_SV(retarray));
6572 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6575 struct regexp *const rx = ReANY(r);
6577 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6579 if (rx && RXp_PAREN_NAMES(rx)) {
6580 if (flags & RXapif_ALL) {
6581 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6583 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6597 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6599 struct regexp *const rx = ReANY(r);
6601 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6603 if ( rx && RXp_PAREN_NAMES(rx) ) {
6604 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6606 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6613 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6615 struct regexp *const rx = ReANY(r);
6616 GET_RE_DEBUG_FLAGS_DECL;
6618 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6620 if (rx && RXp_PAREN_NAMES(rx)) {
6621 HV *hv = RXp_PAREN_NAMES(rx);
6623 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6626 SV* sv_dat = HeVAL(temphe);
6627 I32 *nums = (I32*)SvPVX(sv_dat);
6628 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6629 if ((I32)(rx->lastparen) >= nums[i] &&
6630 rx->offs[nums[i]].start != -1 &&
6631 rx->offs[nums[i]].end != -1)
6637 if (parno || flags & RXapif_ALL) {
6638 return newSVhek(HeKEY_hek(temphe));
6646 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6651 struct regexp *const rx = ReANY(r);
6653 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6655 if (rx && RXp_PAREN_NAMES(rx)) {
6656 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6657 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6658 } else if (flags & RXapif_ONE) {
6659 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6660 av = MUTABLE_AV(SvRV(ret));
6661 length = av_len(av);
6663 return newSViv(length + 1);
6665 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6669 return &PL_sv_undef;
6673 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6675 struct regexp *const rx = ReANY(r);
6678 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6680 if (rx && RXp_PAREN_NAMES(rx)) {
6681 HV *hv= RXp_PAREN_NAMES(rx);
6683 (void)hv_iterinit(hv);
6684 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6687 SV* sv_dat = HeVAL(temphe);
6688 I32 *nums = (I32*)SvPVX(sv_dat);
6689 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6690 if ((I32)(rx->lastparen) >= nums[i] &&
6691 rx->offs[nums[i]].start != -1 &&
6692 rx->offs[nums[i]].end != -1)
6698 if (parno || flags & RXapif_ALL) {
6699 av_push(av, newSVhek(HeKEY_hek(temphe)));
6704 return newRV_noinc(MUTABLE_SV(av));
6708 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6711 struct regexp *const rx = ReANY(r);
6717 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6719 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6720 || n == RX_BUFF_IDX_CARET_FULLMATCH
6721 || n == RX_BUFF_IDX_CARET_POSTMATCH
6723 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6730 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6731 /* no need to distinguish between them any more */
6732 n = RX_BUFF_IDX_FULLMATCH;
6734 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6735 && rx->offs[0].start != -1)
6737 /* $`, ${^PREMATCH} */
6738 i = rx->offs[0].start;
6742 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6743 && rx->offs[0].end != -1)
6745 /* $', ${^POSTMATCH} */
6746 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6747 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6750 if ( 0 <= n && n <= (I32)rx->nparens &&
6751 (s1 = rx->offs[n].start) != -1 &&
6752 (t1 = rx->offs[n].end) != -1)
6754 /* $&, ${^MATCH}, $1 ... */
6756 s = rx->subbeg + s1 - rx->suboffset;
6761 assert(s >= rx->subbeg);
6762 assert(rx->sublen >= (s - rx->subbeg) + i );
6764 #if NO_TAINT_SUPPORT
6765 sv_setpvn(sv, s, i);
6767 const int oldtainted = TAINT_get;
6769 sv_setpvn(sv, s, i);
6770 TAINT_set(oldtainted);
6772 if ( (rx->extflags & RXf_CANY_SEEN)
6773 ? (RXp_MATCH_UTF8(rx)
6774 && (!i || is_utf8_string((U8*)s, i)))
6775 : (RXp_MATCH_UTF8(rx)) )
6782 if (RXp_MATCH_TAINTED(rx)) {
6783 if (SvTYPE(sv) >= SVt_PVMG) {
6784 MAGIC* const mg = SvMAGIC(sv);
6787 SvMAGIC_set(sv, mg->mg_moremagic);
6789 if ((mgt = SvMAGIC(sv))) {
6790 mg->mg_moremagic = mgt;
6791 SvMAGIC_set(sv, mg);
6802 sv_setsv(sv,&PL_sv_undef);
6808 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6809 SV const * const value)
6811 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6813 PERL_UNUSED_ARG(rx);
6814 PERL_UNUSED_ARG(paren);
6815 PERL_UNUSED_ARG(value);
6818 Perl_croak_no_modify();
6822 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6825 struct regexp *const rx = ReANY(r);
6829 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6831 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6833 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6834 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6838 case RX_BUFF_IDX_PREMATCH: /* $` */
6839 if (rx->offs[0].start != -1) {
6840 i = rx->offs[0].start;
6849 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6850 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6852 case RX_BUFF_IDX_POSTMATCH: /* $' */
6853 if (rx->offs[0].end != -1) {
6854 i = rx->sublen - rx->offs[0].end;
6856 s1 = rx->offs[0].end;
6863 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6864 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6868 /* $& / ${^MATCH}, $1, $2, ... */
6870 if (paren <= (I32)rx->nparens &&
6871 (s1 = rx->offs[paren].start) != -1 &&
6872 (t1 = rx->offs[paren].end) != -1)
6878 if (ckWARN(WARN_UNINITIALIZED))
6879 report_uninit((const SV *)sv);
6884 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6885 const char * const s = rx->subbeg - rx->suboffset + s1;
6890 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6897 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6899 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6900 PERL_UNUSED_ARG(rx);
6904 return newSVpvs("Regexp");
6907 /* Scans the name of a named buffer from the pattern.
6908 * If flags is REG_RSN_RETURN_NULL returns null.
6909 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6910 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6911 * to the parsed name as looked up in the RExC_paren_names hash.
6912 * If there is an error throws a vFAIL().. type exception.
6915 #define REG_RSN_RETURN_NULL 0
6916 #define REG_RSN_RETURN_NAME 1
6917 #define REG_RSN_RETURN_DATA 2
6920 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6922 char *name_start = RExC_parse;
6924 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6926 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6927 /* skip IDFIRST by using do...while */
6930 RExC_parse += UTF8SKIP(RExC_parse);
6931 } while (isALNUM_utf8((U8*)RExC_parse));
6935 } while (isALNUM(*RExC_parse));
6937 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6938 vFAIL("Group name must start with a non-digit word character");
6942 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6943 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6944 if ( flags == REG_RSN_RETURN_NAME)
6946 else if (flags==REG_RSN_RETURN_DATA) {
6949 if ( ! sv_name ) /* should not happen*/
6950 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6951 if (RExC_paren_names)
6952 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6954 sv_dat = HeVAL(he_str);
6956 vFAIL("Reference to nonexistent named group");
6960 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6961 (unsigned long) flags);
6963 assert(0); /* NOT REACHED */
6968 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6969 int rem=(int)(RExC_end - RExC_parse); \
6978 if (RExC_lastparse!=RExC_parse) \
6979 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6982 iscut ? "..." : "<" \
6985 PerlIO_printf(Perl_debug_log,"%16s",""); \
6988 num = RExC_size + 1; \
6990 num=REG_NODE_NUM(RExC_emit); \
6991 if (RExC_lastnum!=num) \
6992 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6994 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6995 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6996 (int)((depth*2)), "", \
7000 RExC_lastparse=RExC_parse; \
7005 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7006 DEBUG_PARSE_MSG((funcname)); \
7007 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7009 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7010 DEBUG_PARSE_MSG((funcname)); \
7011 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7014 /* This section of code defines the inversion list object and its methods. The
7015 * interfaces are highly subject to change, so as much as possible is static to
7016 * this file. An inversion list is here implemented as a malloc'd C UV array
7017 * with some added info that is placed as UVs at the beginning in a header
7018 * portion. An inversion list for Unicode is an array of code points, sorted
7019 * by ordinal number. The zeroth element is the first code point in the list.
7020 * The 1th element is the first element beyond that not in the list. In other
7021 * words, the first range is
7022 * invlist[0]..(invlist[1]-1)
7023 * The other ranges follow. Thus every element whose index is divisible by two
7024 * marks the beginning of a range that is in the list, and every element not
7025 * divisible by two marks the beginning of a range not in the list. A single
7026 * element inversion list that contains the single code point N generally
7027 * consists of two elements
7030 * (The exception is when N is the highest representable value on the
7031 * machine, in which case the list containing just it would be a single
7032 * element, itself. By extension, if the last range in the list extends to
7033 * infinity, then the first element of that range will be in the inversion list
7034 * at a position that is divisible by two, and is the final element in the
7036 * Taking the complement (inverting) an inversion list is quite simple, if the
7037 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7038 * This implementation reserves an element at the beginning of each inversion
7039 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
7040 * actual beginning of the list is either that element if 0, or the next one if
7043 * More about inversion lists can be found in "Unicode Demystified"
7044 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7045 * More will be coming when functionality is added later.
7047 * The inversion list data structure is currently implemented as an SV pointing
7048 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7049 * array of UV whose memory management is automatically handled by the existing
7050 * facilities for SV's.
7052 * Some of the methods should always be private to the implementation, and some
7053 * should eventually be made public */
7055 /* The header definitions are in F<inline_invlist.c> */
7057 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7058 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7060 #define INVLIST_INITIAL_LEN 10
7062 PERL_STATIC_INLINE UV*
7063 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7065 /* Returns a pointer to the first element in the inversion list's array.
7066 * This is called upon initialization of an inversion list. Where the
7067 * array begins depends on whether the list has the code point U+0000
7068 * in it or not. The other parameter tells it whether the code that
7069 * follows this call is about to put a 0 in the inversion list or not.
7070 * The first element is either the element with 0, if 0, or the next one,
7073 UV* zero = get_invlist_zero_addr(invlist);
7075 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7078 assert(! *_get_invlist_len_addr(invlist));
7080 /* 1^1 = 0; 1^0 = 1 */
7081 *zero = 1 ^ will_have_0;
7082 return zero + *zero;
7085 PERL_STATIC_INLINE UV*
7086 S_invlist_array(pTHX_ SV* const invlist)
7088 /* Returns the pointer to the inversion list's array. Every time the
7089 * length changes, this needs to be called in case malloc or realloc moved
7092 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7094 /* Must not be empty. If these fail, you probably didn't check for <len>
7095 * being non-zero before trying to get the array */
7096 assert(*_get_invlist_len_addr(invlist));
7097 assert(*get_invlist_zero_addr(invlist) == 0
7098 || *get_invlist_zero_addr(invlist) == 1);
7100 /* The array begins either at the element reserved for zero if the
7101 * list contains 0 (that element will be set to 0), or otherwise the next
7102 * element (in which case the reserved element will be set to 1). */
7103 return (UV *) (get_invlist_zero_addr(invlist)
7104 + *get_invlist_zero_addr(invlist));
7107 PERL_STATIC_INLINE void
7108 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7110 /* Sets the current number of elements stored in the inversion list */
7112 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7114 *_get_invlist_len_addr(invlist) = len;
7116 assert(len <= SvLEN(invlist));
7118 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7119 /* If the list contains U+0000, that element is part of the header,
7120 * and should not be counted as part of the array. It will contain
7121 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7123 * SvCUR_set(invlist,
7124 * TO_INTERNAL_SIZE(len
7125 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7126 * But, this is only valid if len is not 0. The consequences of not doing
7127 * this is that the memory allocation code may think that 1 more UV is
7128 * being used than actually is, and so might do an unnecessary grow. That
7129 * seems worth not bothering to make this the precise amount.
7131 * Note that when inverting, SvCUR shouldn't change */
7134 PERL_STATIC_INLINE IV*
7135 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7137 /* Return the address of the UV that is reserved to hold the cached index
7140 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7142 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7145 PERL_STATIC_INLINE IV
7146 S_invlist_previous_index(pTHX_ SV* const invlist)
7148 /* Returns cached index of previous search */
7150 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7152 return *get_invlist_previous_index_addr(invlist);
7155 PERL_STATIC_INLINE void
7156 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7158 /* Caches <index> for later retrieval */
7160 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7162 assert(index == 0 || index < (int) _invlist_len(invlist));
7164 *get_invlist_previous_index_addr(invlist) = index;
7167 PERL_STATIC_INLINE UV
7168 S_invlist_max(pTHX_ SV* const invlist)
7170 /* Returns the maximum number of elements storable in the inversion list's
7171 * array, without having to realloc() */
7173 PERL_ARGS_ASSERT_INVLIST_MAX;
7175 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7178 PERL_STATIC_INLINE UV*
7179 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7181 /* Return the address of the UV that is reserved to hold 0 if the inversion
7182 * list contains 0. This has to be the last element of the heading, as the
7183 * list proper starts with either it if 0, or the next element if not.
7184 * (But we force it to contain either 0 or 1) */
7186 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7188 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7191 #ifndef PERL_IN_XSUB_RE
7193 Perl__new_invlist(pTHX_ IV initial_size)
7196 /* Return a pointer to a newly constructed inversion list, with enough
7197 * space to store 'initial_size' elements. If that number is negative, a
7198 * system default is used instead */
7202 if (initial_size < 0) {
7203 initial_size = INVLIST_INITIAL_LEN;
7206 /* Allocate the initial space */
7207 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7208 invlist_set_len(new_list, 0);
7210 /* Force iterinit() to be used to get iteration to work */
7211 *get_invlist_iter_addr(new_list) = UV_MAX;
7213 /* This should force a segfault if a method doesn't initialize this
7215 *get_invlist_zero_addr(new_list) = UV_MAX;
7217 *get_invlist_previous_index_addr(new_list) = 0;
7218 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7219 #if HEADER_LENGTH != 5
7220 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7228 S__new_invlist_C_array(pTHX_ UV* list)
7230 /* Return a pointer to a newly constructed inversion list, initialized to
7231 * point to <list>, which has to be in the exact correct inversion list
7232 * form, including internal fields. Thus this is a dangerous routine that
7233 * should not be used in the wrong hands */
7235 SV* invlist = newSV_type(SVt_PV);
7237 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7239 SvPV_set(invlist, (char *) list);
7240 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7241 shouldn't touch it */
7242 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7244 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7245 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7252 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7254 /* Grow the maximum size of an inversion list */
7256 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7258 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7261 PERL_STATIC_INLINE void
7262 S_invlist_trim(pTHX_ SV* const invlist)
7264 PERL_ARGS_ASSERT_INVLIST_TRIM;
7266 /* Change the length of the inversion list to how many entries it currently
7269 SvPV_shrink_to_cur((SV *) invlist);
7272 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7275 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7277 /* Subject to change or removal. Append the range from 'start' to 'end' at
7278 * the end of the inversion list. The range must be above any existing
7282 UV max = invlist_max(invlist);
7283 UV len = _invlist_len(invlist);
7285 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7287 if (len == 0) { /* Empty lists must be initialized */
7288 array = _invlist_array_init(invlist, start == 0);
7291 /* Here, the existing list is non-empty. The current max entry in the
7292 * list is generally the first value not in the set, except when the
7293 * set extends to the end of permissible values, in which case it is
7294 * the first entry in that final set, and so this call is an attempt to
7295 * append out-of-order */
7297 UV final_element = len - 1;
7298 array = invlist_array(invlist);
7299 if (array[final_element] > start
7300 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7302 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7303 array[final_element], start,
7304 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7307 /* Here, it is a legal append. If the new range begins with the first
7308 * value not in the set, it is extending the set, so the new first
7309 * value not in the set is one greater than the newly extended range.
7311 if (array[final_element] == start) {
7312 if (end != UV_MAX) {
7313 array[final_element] = end + 1;
7316 /* But if the end is the maximum representable on the machine,
7317 * just let the range that this would extend to have no end */
7318 invlist_set_len(invlist, len - 1);
7324 /* Here the new range doesn't extend any existing set. Add it */
7326 len += 2; /* Includes an element each for the start and end of range */
7328 /* If overflows the existing space, extend, which may cause the array to be
7331 invlist_extend(invlist, len);
7332 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7333 failure in invlist_array() */
7334 array = invlist_array(invlist);
7337 invlist_set_len(invlist, len);
7340 /* The next item on the list starts the range, the one after that is
7341 * one past the new range. */
7342 array[len - 2] = start;
7343 if (end != UV_MAX) {
7344 array[len - 1] = end + 1;
7347 /* But if the end is the maximum representable on the machine, just let
7348 * the range have no end */
7349 invlist_set_len(invlist, len - 1);
7353 #ifndef PERL_IN_XSUB_RE
7356 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7358 /* Searches the inversion list for the entry that contains the input code
7359 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7360 * return value is the index into the list's array of the range that
7365 IV high = _invlist_len(invlist);
7366 const IV highest_element = high - 1;
7369 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7371 /* If list is empty, return failure. */
7376 /* If the code point is before the first element, return failure. (We
7377 * can't combine this with the test above, because we can't get the array
7378 * unless we know the list is non-empty) */
7379 array = invlist_array(invlist);
7381 mid = invlist_previous_index(invlist);
7382 assert(mid >=0 && mid <= highest_element);
7384 /* <mid> contains the cache of the result of the previous call to this
7385 * function (0 the first time). See if this call is for the same result,
7386 * or if it is for mid-1. This is under the theory that calls to this
7387 * function will often be for related code points that are near each other.
7388 * And benchmarks show that caching gives better results. We also test
7389 * here if the code point is within the bounds of the list. These tests
7390 * replace others that would have had to be made anyway to make sure that
7391 * the array bounds were not exceeded, and these give us extra information
7392 * at the same time */
7393 if (cp >= array[mid]) {
7394 if (cp >= array[highest_element]) {
7395 return highest_element;
7398 /* Here, array[mid] <= cp < array[highest_element]. This means that
7399 * the final element is not the answer, so can exclude it; it also
7400 * means that <mid> is not the final element, so can refer to 'mid + 1'
7402 if (cp < array[mid + 1]) {
7408 else { /* cp < aray[mid] */
7409 if (cp < array[0]) { /* Fail if outside the array */
7413 if (cp >= array[mid - 1]) {
7418 /* Binary search. What we are looking for is <i> such that
7419 * array[i] <= cp < array[i+1]
7420 * The loop below converges on the i+1. Note that there may not be an
7421 * (i+1)th element in the array, and things work nonetheless */
7422 while (low < high) {
7423 mid = (low + high) / 2;
7424 assert(mid <= highest_element);
7425 if (array[mid] <= cp) { /* cp >= array[mid] */
7428 /* We could do this extra test to exit the loop early.
7429 if (cp < array[low]) {
7434 else { /* cp < array[mid] */
7441 invlist_set_previous_index(invlist, high);
7446 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7448 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7449 * but is used when the swash has an inversion list. This makes this much
7450 * faster, as it uses a binary search instead of a linear one. This is
7451 * intimately tied to that function, and perhaps should be in utf8.c,
7452 * except it is intimately tied to inversion lists as well. It assumes
7453 * that <swatch> is all 0's on input */
7456 const IV len = _invlist_len(invlist);
7460 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7462 if (len == 0) { /* Empty inversion list */
7466 array = invlist_array(invlist);
7468 /* Find which element it is */
7469 i = _invlist_search(invlist, start);
7471 /* We populate from <start> to <end> */
7472 while (current < end) {
7475 /* The inversion list gives the results for every possible code point
7476 * after the first one in the list. Only those ranges whose index is
7477 * even are ones that the inversion list matches. For the odd ones,
7478 * and if the initial code point is not in the list, we have to skip
7479 * forward to the next element */
7480 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7482 if (i >= len) { /* Finished if beyond the end of the array */
7486 if (current >= end) { /* Finished if beyond the end of what we
7488 if (LIKELY(end < UV_MAX)) {
7492 /* We get here when the upper bound is the maximum
7493 * representable on the machine, and we are looking for just
7494 * that code point. Have to special case it */
7496 goto join_end_of_list;
7499 assert(current >= start);
7501 /* The current range ends one below the next one, except don't go past
7504 upper = (i < len && array[i] < end) ? array[i] : end;
7506 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7507 * for each code point in it */
7508 for (; current < upper; current++) {
7509 const STRLEN offset = (STRLEN)(current - start);
7510 swatch[offset >> 3] |= 1 << (offset & 7);
7515 /* Quit if at the end of the list */
7518 /* But first, have to deal with the highest possible code point on
7519 * the platform. The previous code assumes that <end> is one
7520 * beyond where we want to populate, but that is impossible at the
7521 * platform's infinity, so have to handle it specially */
7522 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7524 const STRLEN offset = (STRLEN)(end - start);
7525 swatch[offset >> 3] |= 1 << (offset & 7);
7530 /* Advance to the next range, which will be for code points not in the
7539 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7541 /* Take the union of two inversion lists and point <output> to it. *output
7542 * should be defined upon input, and if it points to one of the two lists,
7543 * the reference count to that list will be decremented. The first list,
7544 * <a>, may be NULL, in which case a copy of the second list is returned.
7545 * If <complement_b> is TRUE, the union is taken of the complement
7546 * (inversion) of <b> instead of b itself.
7548 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7549 * Richard Gillam, published by Addison-Wesley, and explained at some
7550 * length there. The preface says to incorporate its examples into your
7551 * code at your own risk.
7553 * The algorithm is like a merge sort.
7555 * XXX A potential performance improvement is to keep track as we go along
7556 * if only one of the inputs contributes to the result, meaning the other
7557 * is a subset of that one. In that case, we can skip the final copy and
7558 * return the larger of the input lists, but then outside code might need
7559 * to keep track of whether to free the input list or not */
7561 UV* array_a; /* a's array */
7563 UV len_a; /* length of a's array */
7566 SV* u; /* the resulting union */
7570 UV i_a = 0; /* current index into a's array */
7574 /* running count, as explained in the algorithm source book; items are
7575 * stopped accumulating and are output when the count changes to/from 0.
7576 * The count is incremented when we start a range that's in the set, and
7577 * decremented when we start a range that's not in the set. So its range
7578 * is 0 to 2. Only when the count is zero is something not in the set.
7582 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7585 /* If either one is empty, the union is the other one */
7586 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7593 *output = invlist_clone(b);
7595 _invlist_invert(*output);
7597 } /* else *output already = b; */
7600 else if ((len_b = _invlist_len(b)) == 0) {
7605 /* The complement of an empty list is a list that has everything in it,
7606 * so the union with <a> includes everything too */
7611 *output = _new_invlist(1);
7612 _append_range_to_invlist(*output, 0, UV_MAX);
7614 else if (*output != a) {
7615 *output = invlist_clone(a);
7617 /* else *output already = a; */
7621 /* Here both lists exist and are non-empty */
7622 array_a = invlist_array(a);
7623 array_b = invlist_array(b);
7625 /* If are to take the union of 'a' with the complement of b, set it
7626 * up so are looking at b's complement. */
7629 /* To complement, we invert: if the first element is 0, remove it. To
7630 * do this, we just pretend the array starts one later, and clear the
7631 * flag as we don't have to do anything else later */
7632 if (array_b[0] == 0) {
7635 complement_b = FALSE;
7639 /* But if the first element is not zero, we unshift a 0 before the
7640 * array. The data structure reserves a space for that 0 (which
7641 * should be a '1' right now), so physical shifting is unneeded,
7642 * but temporarily change that element to 0. Before exiting the
7643 * routine, we must restore the element to '1' */
7650 /* Size the union for the worst case: that the sets are completely
7652 u = _new_invlist(len_a + len_b);
7654 /* Will contain U+0000 if either component does */
7655 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7656 || (len_b > 0 && array_b[0] == 0));
7658 /* Go through each list item by item, stopping when exhausted one of
7660 while (i_a < len_a && i_b < len_b) {
7661 UV cp; /* The element to potentially add to the union's array */
7662 bool cp_in_set; /* is it in the the input list's set or not */
7664 /* We need to take one or the other of the two inputs for the union.
7665 * Since we are merging two sorted lists, we take the smaller of the
7666 * next items. In case of a tie, we take the one that is in its set
7667 * first. If we took one not in the set first, it would decrement the
7668 * count, possibly to 0 which would cause it to be output as ending the
7669 * range, and the next time through we would take the same number, and
7670 * output it again as beginning the next range. By doing it the
7671 * opposite way, there is no possibility that the count will be
7672 * momentarily decremented to 0, and thus the two adjoining ranges will
7673 * be seamlessly merged. (In a tie and both are in the set or both not
7674 * in the set, it doesn't matter which we take first.) */
7675 if (array_a[i_a] < array_b[i_b]
7676 || (array_a[i_a] == array_b[i_b]
7677 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7679 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7683 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7687 /* Here, have chosen which of the two inputs to look at. Only output
7688 * if the running count changes to/from 0, which marks the
7689 * beginning/end of a range in that's in the set */
7692 array_u[i_u++] = cp;
7699 array_u[i_u++] = cp;
7704 /* Here, we are finished going through at least one of the lists, which
7705 * means there is something remaining in at most one. We check if the list
7706 * that hasn't been exhausted is positioned such that we are in the middle
7707 * of a range in its set or not. (i_a and i_b point to the element beyond
7708 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7709 * is potentially more to output.
7710 * There are four cases:
7711 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7712 * in the union is entirely from the non-exhausted set.
7713 * 2) Both were in their sets, count is 2. Nothing further should
7714 * be output, as everything that remains will be in the exhausted
7715 * list's set, hence in the union; decrementing to 1 but not 0 insures
7717 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7718 * Nothing further should be output because the union includes
7719 * everything from the exhausted set. Not decrementing ensures that.
7720 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7721 * decrementing to 0 insures that we look at the remainder of the
7722 * non-exhausted set */
7723 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7724 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7729 /* The final length is what we've output so far, plus what else is about to
7730 * be output. (If 'count' is non-zero, then the input list we exhausted
7731 * has everything remaining up to the machine's limit in its set, and hence
7732 * in the union, so there will be no further output. */
7735 /* At most one of the subexpressions will be non-zero */
7736 len_u += (len_a - i_a) + (len_b - i_b);
7739 /* Set result to final length, which can change the pointer to array_u, so
7741 if (len_u != _invlist_len(u)) {
7742 invlist_set_len(u, len_u);
7744 array_u = invlist_array(u);
7747 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7748 * the other) ended with everything above it not in its set. That means
7749 * that the remaining part of the union is precisely the same as the
7750 * non-exhausted list, so can just copy it unchanged. (If both list were
7751 * exhausted at the same time, then the operations below will be both 0.)
7754 IV copy_count; /* At most one will have a non-zero copy count */
7755 if ((copy_count = len_a - i_a) > 0) {
7756 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7758 else if ((copy_count = len_b - i_b) > 0) {
7759 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7763 /* We may be removing a reference to one of the inputs */
7764 if (a == *output || b == *output) {
7765 SvREFCNT_dec(*output);
7768 /* If we've changed b, restore it */
7778 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7780 /* Take the intersection of two inversion lists and point <i> to it. *i
7781 * should be defined upon input, and if it points to one of the two lists,
7782 * the reference count to that list will be decremented.
7783 * If <complement_b> is TRUE, the result will be the intersection of <a>
7784 * and the complement (or inversion) of <b> instead of <b> directly.
7786 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7787 * Richard Gillam, published by Addison-Wesley, and explained at some
7788 * length there. The preface says to incorporate its examples into your
7789 * code at your own risk. In fact, it had bugs
7791 * The algorithm is like a merge sort, and is essentially the same as the
7795 UV* array_a; /* a's array */
7797 UV len_a; /* length of a's array */
7800 SV* r; /* the resulting intersection */
7804 UV i_a = 0; /* current index into a's array */
7808 /* running count, as explained in the algorithm source book; items are
7809 * stopped accumulating and are output when the count changes to/from 2.
7810 * The count is incremented when we start a range that's in the set, and
7811 * decremented when we start a range that's not in the set. So its range
7812 * is 0 to 2. Only when the count is 2 is something in the intersection.
7816 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7819 /* Special case if either one is empty */
7820 len_a = _invlist_len(a);
7821 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7823 if (len_a != 0 && complement_b) {
7825 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7826 * be empty. Here, also we are using 'b's complement, which hence
7827 * must be every possible code point. Thus the intersection is
7830 *i = invlist_clone(a);
7836 /* else *i is already 'a' */
7840 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7841 * intersection must be empty */
7848 *i = _new_invlist(0);
7852 /* Here both lists exist and are non-empty */
7853 array_a = invlist_array(a);
7854 array_b = invlist_array(b);
7856 /* If are to take the intersection of 'a' with the complement of b, set it
7857 * up so are looking at b's complement. */
7860 /* To complement, we invert: if the first element is 0, remove it. To
7861 * do this, we just pretend the array starts one later, and clear the
7862 * flag as we don't have to do anything else later */
7863 if (array_b[0] == 0) {
7866 complement_b = FALSE;
7870 /* But if the first element is not zero, we unshift a 0 before the
7871 * array. The data structure reserves a space for that 0 (which
7872 * should be a '1' right now), so physical shifting is unneeded,
7873 * but temporarily change that element to 0. Before exiting the
7874 * routine, we must restore the element to '1' */
7881 /* Size the intersection for the worst case: that the intersection ends up
7882 * fragmenting everything to be completely disjoint */
7883 r= _new_invlist(len_a + len_b);
7885 /* Will contain U+0000 iff both components do */
7886 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7887 && len_b > 0 && array_b[0] == 0);
7889 /* Go through each list item by item, stopping when exhausted one of
7891 while (i_a < len_a && i_b < len_b) {
7892 UV cp; /* The element to potentially add to the intersection's
7894 bool cp_in_set; /* Is it in the input list's set or not */
7896 /* We need to take one or the other of the two inputs for the
7897 * intersection. Since we are merging two sorted lists, we take the
7898 * smaller of the next items. In case of a tie, we take the one that
7899 * is not in its set first (a difference from the union algorithm). If
7900 * we took one in the set first, it would increment the count, possibly
7901 * to 2 which would cause it to be output as starting a range in the
7902 * intersection, and the next time through we would take that same
7903 * number, and output it again as ending the set. By doing it the
7904 * opposite of this, there is no possibility that the count will be
7905 * momentarily incremented to 2. (In a tie and both are in the set or
7906 * both not in the set, it doesn't matter which we take first.) */
7907 if (array_a[i_a] < array_b[i_b]
7908 || (array_a[i_a] == array_b[i_b]
7909 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7911 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7915 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7919 /* Here, have chosen which of the two inputs to look at. Only output
7920 * if the running count changes to/from 2, which marks the
7921 * beginning/end of a range that's in the intersection */
7925 array_r[i_r++] = cp;
7930 array_r[i_r++] = cp;
7936 /* Here, we are finished going through at least one of the lists, which
7937 * means there is something remaining in at most one. We check if the list
7938 * that has been exhausted is positioned such that we are in the middle
7939 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7940 * the ones we care about.) There are four cases:
7941 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7942 * nothing left in the intersection.
7943 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7944 * above 2. What should be output is exactly that which is in the
7945 * non-exhausted set, as everything it has is also in the intersection
7946 * set, and everything it doesn't have can't be in the intersection
7947 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7948 * gets incremented to 2. Like the previous case, the intersection is
7949 * everything that remains in the non-exhausted set.
7950 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7951 * remains 1. And the intersection has nothing more. */
7952 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7953 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7958 /* The final length is what we've output so far plus what else is in the
7959 * intersection. At most one of the subexpressions below will be non-zero */
7962 len_r += (len_a - i_a) + (len_b - i_b);
7965 /* Set result to final length, which can change the pointer to array_r, so
7967 if (len_r != _invlist_len(r)) {
7968 invlist_set_len(r, len_r);
7970 array_r = invlist_array(r);
7973 /* Finish outputting any remaining */
7974 if (count >= 2) { /* At most one will have a non-zero copy count */
7976 if ((copy_count = len_a - i_a) > 0) {
7977 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7979 else if ((copy_count = len_b - i_b) > 0) {
7980 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7984 /* We may be removing a reference to one of the inputs */
7985 if (a == *i || b == *i) {
7989 /* If we've changed b, restore it */
7999 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8001 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8002 * set. A pointer to the inversion list is returned. This may actually be
8003 * a new list, in which case the passed in one has been destroyed. The
8004 * passed in inversion list can be NULL, in which case a new one is created
8005 * with just the one range in it */
8010 if (invlist == NULL) {
8011 invlist = _new_invlist(2);
8015 len = _invlist_len(invlist);
8018 /* If comes after the final entry, can just append it to the end */
8020 || start >= invlist_array(invlist)
8021 [_invlist_len(invlist) - 1])
8023 _append_range_to_invlist(invlist, start, end);
8027 /* Here, can't just append things, create and return a new inversion list
8028 * which is the union of this range and the existing inversion list */
8029 range_invlist = _new_invlist(2);
8030 _append_range_to_invlist(range_invlist, start, end);
8032 _invlist_union(invlist, range_invlist, &invlist);
8034 /* The temporary can be freed */
8035 SvREFCNT_dec(range_invlist);
8042 PERL_STATIC_INLINE SV*
8043 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8044 return _add_range_to_invlist(invlist, cp, cp);
8047 #ifndef PERL_IN_XSUB_RE
8049 Perl__invlist_invert(pTHX_ SV* const invlist)
8051 /* Complement the input inversion list. This adds a 0 if the list didn't
8052 * have a zero; removes it otherwise. As described above, the data
8053 * structure is set up so that this is very efficient */
8055 UV* len_pos = _get_invlist_len_addr(invlist);
8057 PERL_ARGS_ASSERT__INVLIST_INVERT;
8059 /* The inverse of matching nothing is matching everything */
8060 if (*len_pos == 0) {
8061 _append_range_to_invlist(invlist, 0, UV_MAX);
8065 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8066 * zero element was a 0, so it is being removed, so the length decrements
8067 * by 1; and vice-versa. SvCUR is unaffected */
8068 if (*get_invlist_zero_addr(invlist) ^= 1) {
8077 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8079 /* Complement the input inversion list (which must be a Unicode property,
8080 * all of which don't match above the Unicode maximum code point.) And
8081 * Perl has chosen to not have the inversion match above that either. This
8082 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8088 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8090 _invlist_invert(invlist);
8092 len = _invlist_len(invlist);
8094 if (len != 0) { /* If empty do nothing */
8095 array = invlist_array(invlist);
8096 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8097 /* Add 0x110000. First, grow if necessary */
8099 if (invlist_max(invlist) < len) {
8100 invlist_extend(invlist, len);
8101 array = invlist_array(invlist);
8103 invlist_set_len(invlist, len);
8104 array[len - 1] = PERL_UNICODE_MAX + 1;
8106 else { /* Remove the 0x110000 */
8107 invlist_set_len(invlist, len - 1);
8115 PERL_STATIC_INLINE SV*
8116 S_invlist_clone(pTHX_ SV* const invlist)
8119 /* Return a new inversion list that is a copy of the input one, which is
8122 /* Need to allocate extra space to accommodate Perl's addition of a
8123 * trailing NUL to SvPV's, since it thinks they are always strings */
8124 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8125 STRLEN length = SvCUR(invlist);
8127 PERL_ARGS_ASSERT_INVLIST_CLONE;
8129 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8130 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8135 PERL_STATIC_INLINE UV*
8136 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8138 /* Return the address of the UV that contains the current iteration
8141 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8143 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8146 PERL_STATIC_INLINE UV*
8147 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8149 /* Return the address of the UV that contains the version id. */
8151 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8153 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8156 PERL_STATIC_INLINE void
8157 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8159 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8161 *get_invlist_iter_addr(invlist) = 0;
8165 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8167 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8168 * This call sets in <*start> and <*end>, the next range in <invlist>.
8169 * Returns <TRUE> if successful and the next call will return the next
8170 * range; <FALSE> if was already at the end of the list. If the latter,
8171 * <*start> and <*end> are unchanged, and the next call to this function
8172 * will start over at the beginning of the list */
8174 UV* pos = get_invlist_iter_addr(invlist);
8175 UV len = _invlist_len(invlist);
8178 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8181 *pos = UV_MAX; /* Force iternit() to be required next time */
8185 array = invlist_array(invlist);
8187 *start = array[(*pos)++];
8193 *end = array[(*pos)++] - 1;
8199 PERL_STATIC_INLINE UV
8200 S_invlist_highest(pTHX_ SV* const invlist)
8202 /* Returns the highest code point that matches an inversion list. This API
8203 * has an ambiguity, as it returns 0 under either the highest is actually
8204 * 0, or if the list is empty. If this distinction matters to you, check
8205 * for emptiness before calling this function */
8207 UV len = _invlist_len(invlist);
8210 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8216 array = invlist_array(invlist);
8218 /* The last element in the array in the inversion list always starts a
8219 * range that goes to infinity. That range may be for code points that are
8220 * matched in the inversion list, or it may be for ones that aren't
8221 * matched. In the latter case, the highest code point in the set is one
8222 * less than the beginning of this range; otherwise it is the final element
8223 * of this range: infinity */
8224 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8226 : array[len - 1] - 1;
8229 #ifndef PERL_IN_XSUB_RE
8231 Perl__invlist_contents(pTHX_ SV* const invlist)
8233 /* Get the contents of an inversion list into a string SV so that they can
8234 * be printed out. It uses the format traditionally done for debug tracing
8238 SV* output = newSVpvs("\n");
8240 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8242 invlist_iterinit(invlist);
8243 while (invlist_iternext(invlist, &start, &end)) {
8244 if (end == UV_MAX) {
8245 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8247 else if (end != start) {
8248 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8252 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8260 #ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
8262 Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
8264 /* Dumps out the ranges in an inversion list. The string 'header'
8265 * if present is output on a line before the first range */
8269 PERL_ARGS_ASSERT__INVLIST_DUMP;
8271 if (header && strlen(header)) {
8272 PerlIO_printf(Perl_debug_log, "%s\n", header);
8274 invlist_iterinit(invlist);
8275 while (invlist_iternext(invlist, &start, &end)) {
8276 if (end == UV_MAX) {
8277 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8279 else if (end != start) {
8280 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
8284 PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
8292 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8294 /* Return a boolean as to if the two passed in inversion lists are
8295 * identical. The final argument, if TRUE, says to take the complement of
8296 * the second inversion list before doing the comparison */
8298 UV* array_a = invlist_array(a);
8299 UV* array_b = invlist_array(b);
8300 UV len_a = _invlist_len(a);
8301 UV len_b = _invlist_len(b);
8303 UV i = 0; /* current index into the arrays */
8304 bool retval = TRUE; /* Assume are identical until proven otherwise */
8306 PERL_ARGS_ASSERT__INVLISTEQ;
8308 /* If are to compare 'a' with the complement of b, set it
8309 * up so are looking at b's complement. */
8312 /* The complement of nothing is everything, so <a> would have to have
8313 * just one element, starting at zero (ending at infinity) */
8315 return (len_a == 1 && array_a[0] == 0);
8317 else if (array_b[0] == 0) {
8319 /* Otherwise, to complement, we invert. Here, the first element is
8320 * 0, just remove it. To do this, we just pretend the array starts
8321 * one later, and clear the flag as we don't have to do anything
8326 complement_b = FALSE;
8330 /* But if the first element is not zero, we unshift a 0 before the
8331 * array. The data structure reserves a space for that 0 (which
8332 * should be a '1' right now), so physical shifting is unneeded,
8333 * but temporarily change that element to 0. Before exiting the
8334 * routine, we must restore the element to '1' */
8341 /* Make sure that the lengths are the same, as well as the final element
8342 * before looping through the remainder. (Thus we test the length, final,
8343 * and first elements right off the bat) */
8344 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8347 else for (i = 0; i < len_a - 1; i++) {
8348 if (array_a[i] != array_b[i]) {
8361 #undef HEADER_LENGTH
8362 #undef INVLIST_INITIAL_LENGTH
8363 #undef TO_INTERNAL_SIZE
8364 #undef FROM_INTERNAL_SIZE
8365 #undef INVLIST_LEN_OFFSET
8366 #undef INVLIST_ZERO_OFFSET
8367 #undef INVLIST_ITER_OFFSET
8368 #undef INVLIST_VERSION_ID
8370 /* End of inversion list object */
8373 - reg - regular expression, i.e. main body or parenthesized thing
8375 * Caller must absorb opening parenthesis.
8377 * Combining parenthesis handling with the base level of regular expression
8378 * is a trifle forced, but the need to tie the tails of the branches to what
8379 * follows makes it hard to avoid.
8381 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8383 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8385 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8389 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8390 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8393 regnode *ret; /* Will be the head of the group. */
8396 regnode *ender = NULL;
8399 U32 oregflags = RExC_flags;
8400 bool have_branch = 0;
8402 I32 freeze_paren = 0;
8403 I32 after_freeze = 0;
8405 /* for (?g), (?gc), and (?o) warnings; warning
8406 about (?c) will warn about (?g) -- japhy */
8408 #define WASTED_O 0x01
8409 #define WASTED_G 0x02
8410 #define WASTED_C 0x04
8411 #define WASTED_GC (0x02|0x04)
8412 I32 wastedflags = 0x00;
8414 char * parse_start = RExC_parse; /* MJD */
8415 char * const oregcomp_parse = RExC_parse;
8417 GET_RE_DEBUG_FLAGS_DECL;
8419 PERL_ARGS_ASSERT_REG;
8420 DEBUG_PARSE("reg ");
8422 *flagp = 0; /* Tentatively. */
8425 /* Make an OPEN node, if parenthesized. */
8427 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8428 char *start_verb = RExC_parse;
8429 STRLEN verb_len = 0;
8430 char *start_arg = NULL;
8431 unsigned char op = 0;
8433 int internal_argval = 0; /* internal_argval is only useful if !argok */
8434 while ( *RExC_parse && *RExC_parse != ')' ) {
8435 if ( *RExC_parse == ':' ) {
8436 start_arg = RExC_parse + 1;
8442 verb_len = RExC_parse - start_verb;
8445 while ( *RExC_parse && *RExC_parse != ')' )
8447 if ( *RExC_parse != ')' )
8448 vFAIL("Unterminated verb pattern argument");
8449 if ( RExC_parse == start_arg )
8452 if ( *RExC_parse != ')' )
8453 vFAIL("Unterminated verb pattern");
8456 switch ( *start_verb ) {
8457 case 'A': /* (*ACCEPT) */
8458 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8460 internal_argval = RExC_nestroot;
8463 case 'C': /* (*COMMIT) */
8464 if ( memEQs(start_verb,verb_len,"COMMIT") )
8467 case 'F': /* (*FAIL) */
8468 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8473 case ':': /* (*:NAME) */
8474 case 'M': /* (*MARK:NAME) */
8475 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8480 case 'P': /* (*PRUNE) */
8481 if ( memEQs(start_verb,verb_len,"PRUNE") )
8484 case 'S': /* (*SKIP) */
8485 if ( memEQs(start_verb,verb_len,"SKIP") )
8488 case 'T': /* (*THEN) */
8489 /* [19:06] <TimToady> :: is then */
8490 if ( memEQs(start_verb,verb_len,"THEN") ) {
8492 RExC_seen |= REG_SEEN_CUTGROUP;
8498 vFAIL3("Unknown verb pattern '%.*s'",
8499 verb_len, start_verb);
8502 if ( start_arg && internal_argval ) {
8503 vFAIL3("Verb pattern '%.*s' may not have an argument",
8504 verb_len, start_verb);
8505 } else if ( argok < 0 && !start_arg ) {
8506 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8507 verb_len, start_verb);
8509 ret = reganode(pRExC_state, op, internal_argval);
8510 if ( ! internal_argval && ! SIZE_ONLY ) {
8512 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8513 ARG(ret) = add_data( pRExC_state, 1, "S" );
8514 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8521 if (!internal_argval)
8522 RExC_seen |= REG_SEEN_VERBARG;
8523 } else if ( start_arg ) {
8524 vFAIL3("Verb pattern '%.*s' may not have an argument",
8525 verb_len, start_verb);
8527 ret = reg_node(pRExC_state, op);
8529 nextchar(pRExC_state);
8532 if (*RExC_parse == '?') { /* (?...) */
8533 bool is_logical = 0;
8534 const char * const seqstart = RExC_parse;
8535 bool has_use_defaults = FALSE;
8538 paren = *RExC_parse++;
8539 ret = NULL; /* For look-ahead/behind. */
8542 case 'P': /* (?P...) variants for those used to PCRE/Python */
8543 paren = *RExC_parse++;
8544 if ( paren == '<') /* (?P<...>) named capture */
8546 else if (paren == '>') { /* (?P>name) named recursion */
8547 goto named_recursion;
8549 else if (paren == '=') { /* (?P=...) named backref */
8550 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8551 you change this make sure you change that */
8552 char* name_start = RExC_parse;
8554 SV *sv_dat = reg_scan_name(pRExC_state,
8555 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8556 if (RExC_parse == name_start || *RExC_parse != ')')
8557 vFAIL2("Sequence %.3s... not terminated",parse_start);
8560 num = add_data( pRExC_state, 1, "S" );
8561 RExC_rxi->data->data[num]=(void*)sv_dat;
8562 SvREFCNT_inc_simple_void(sv_dat);
8565 ret = reganode(pRExC_state,
8568 : (ASCII_FOLD_RESTRICTED)
8570 : (AT_LEAST_UNI_SEMANTICS)
8578 Set_Node_Offset(ret, parse_start+1);
8579 Set_Node_Cur_Length(ret); /* MJD */
8581 nextchar(pRExC_state);
8585 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8587 case '<': /* (?<...) */
8588 if (*RExC_parse == '!')
8590 else if (*RExC_parse != '=')
8596 case '\'': /* (?'...') */
8597 name_start= RExC_parse;
8598 svname = reg_scan_name(pRExC_state,
8599 SIZE_ONLY ? /* reverse test from the others */
8600 REG_RSN_RETURN_NAME :
8601 REG_RSN_RETURN_NULL);
8602 if (RExC_parse == name_start) {
8604 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8607 if (*RExC_parse != paren)
8608 vFAIL2("Sequence (?%c... not terminated",
8609 paren=='>' ? '<' : paren);
8613 if (!svname) /* shouldn't happen */
8615 "panic: reg_scan_name returned NULL");
8616 if (!RExC_paren_names) {
8617 RExC_paren_names= newHV();
8618 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8620 RExC_paren_name_list= newAV();
8621 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8624 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8626 sv_dat = HeVAL(he_str);
8628 /* croak baby croak */
8630 "panic: paren_name hash element allocation failed");
8631 } else if ( SvPOK(sv_dat) ) {
8632 /* (?|...) can mean we have dupes so scan to check
8633 its already been stored. Maybe a flag indicating
8634 we are inside such a construct would be useful,
8635 but the arrays are likely to be quite small, so
8636 for now we punt -- dmq */
8637 IV count = SvIV(sv_dat);
8638 I32 *pv = (I32*)SvPVX(sv_dat);
8640 for ( i = 0 ; i < count ; i++ ) {
8641 if ( pv[i] == RExC_npar ) {
8647 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8648 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8649 pv[count] = RExC_npar;
8650 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8653 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8654 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8656 SvIV_set(sv_dat, 1);
8659 /* Yes this does cause a memory leak in debugging Perls */
8660 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8661 SvREFCNT_dec(svname);
8664 /*sv_dump(sv_dat);*/
8666 nextchar(pRExC_state);
8668 goto capturing_parens;
8670 RExC_seen |= REG_SEEN_LOOKBEHIND;
8671 RExC_in_lookbehind++;
8673 case '=': /* (?=...) */
8674 RExC_seen_zerolen++;
8676 case '!': /* (?!...) */
8677 RExC_seen_zerolen++;
8678 if (*RExC_parse == ')') {
8679 ret=reg_node(pRExC_state, OPFAIL);
8680 nextchar(pRExC_state);
8684 case '|': /* (?|...) */
8685 /* branch reset, behave like a (?:...) except that
8686 buffers in alternations share the same numbers */
8688 after_freeze = freeze_paren = RExC_npar;
8690 case ':': /* (?:...) */
8691 case '>': /* (?>...) */
8693 case '$': /* (?$...) */
8694 case '@': /* (?@...) */
8695 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8697 case '#': /* (?#...) */
8698 while (*RExC_parse && *RExC_parse != ')')
8700 if (*RExC_parse != ')')
8701 FAIL("Sequence (?#... not terminated");
8702 nextchar(pRExC_state);
8705 case '0' : /* (?0) */
8706 case 'R' : /* (?R) */
8707 if (*RExC_parse != ')')
8708 FAIL("Sequence (?R) not terminated");
8709 ret = reg_node(pRExC_state, GOSTART);
8710 *flagp |= POSTPONED;
8711 nextchar(pRExC_state);
8714 { /* named and numeric backreferences */
8716 case '&': /* (?&NAME) */
8717 parse_start = RExC_parse - 1;
8720 SV *sv_dat = reg_scan_name(pRExC_state,
8721 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8722 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8724 goto gen_recurse_regop;
8725 assert(0); /* NOT REACHED */
8727 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8729 vFAIL("Illegal pattern");
8731 goto parse_recursion;
8733 case '-': /* (?-1) */
8734 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8735 RExC_parse--; /* rewind to let it be handled later */
8739 case '1': case '2': case '3': case '4': /* (?1) */
8740 case '5': case '6': case '7': case '8': case '9':
8743 num = atoi(RExC_parse);
8744 parse_start = RExC_parse - 1; /* MJD */
8745 if (*RExC_parse == '-')
8747 while (isDIGIT(*RExC_parse))
8749 if (*RExC_parse!=')')
8750 vFAIL("Expecting close bracket");
8753 if ( paren == '-' ) {
8755 Diagram of capture buffer numbering.
8756 Top line is the normal capture buffer numbers
8757 Bottom line is the negative indexing as from
8761 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8765 num = RExC_npar + num;
8768 vFAIL("Reference to nonexistent group");
8770 } else if ( paren == '+' ) {
8771 num = RExC_npar + num - 1;
8774 ret = reganode(pRExC_state, GOSUB, num);
8776 if (num > (I32)RExC_rx->nparens) {
8778 vFAIL("Reference to nonexistent group");
8780 ARG2L_SET( ret, RExC_recurse_count++);
8782 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8783 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8787 RExC_seen |= REG_SEEN_RECURSE;
8788 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8789 Set_Node_Offset(ret, parse_start); /* MJD */
8791 *flagp |= POSTPONED;
8792 nextchar(pRExC_state);
8794 } /* named and numeric backreferences */
8795 assert(0); /* NOT REACHED */
8797 case '?': /* (??...) */
8799 if (*RExC_parse != '{') {
8801 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8804 *flagp |= POSTPONED;
8805 paren = *RExC_parse++;
8807 case '{': /* (?{...}) */
8810 struct reg_code_block *cb;
8812 RExC_seen_zerolen++;
8814 if ( !pRExC_state->num_code_blocks
8815 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8816 || pRExC_state->code_blocks[pRExC_state->code_index].start
8817 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8820 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8821 FAIL("panic: Sequence (?{...}): no code block found\n");
8822 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8824 /* this is a pre-compiled code block (?{...}) */
8825 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8826 RExC_parse = RExC_start + cb->end;
8829 if (cb->src_regex) {
8830 n = add_data(pRExC_state, 2, "rl");
8831 RExC_rxi->data->data[n] =
8832 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8833 RExC_rxi->data->data[n+1] = (void*)o;
8836 n = add_data(pRExC_state, 1,
8837 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8838 RExC_rxi->data->data[n] = (void*)o;
8841 pRExC_state->code_index++;
8842 nextchar(pRExC_state);
8846 ret = reg_node(pRExC_state, LOGICAL);
8847 eval = reganode(pRExC_state, EVAL, n);
8850 /* for later propagation into (??{}) return value */
8851 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8853 REGTAIL(pRExC_state, ret, eval);
8854 /* deal with the length of this later - MJD */
8857 ret = reganode(pRExC_state, EVAL, n);
8858 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8859 Set_Node_Offset(ret, parse_start);
8862 case '(': /* (?(?{...})...) and (?(?=...)...) */
8865 if (RExC_parse[0] == '?') { /* (?(?...)) */
8866 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8867 || RExC_parse[1] == '<'
8868 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8871 ret = reg_node(pRExC_state, LOGICAL);
8874 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8878 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8879 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8881 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8882 char *name_start= RExC_parse++;
8884 SV *sv_dat=reg_scan_name(pRExC_state,
8885 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8886 if (RExC_parse == name_start || *RExC_parse != ch)
8887 vFAIL2("Sequence (?(%c... not terminated",
8888 (ch == '>' ? '<' : ch));
8891 num = add_data( pRExC_state, 1, "S" );
8892 RExC_rxi->data->data[num]=(void*)sv_dat;
8893 SvREFCNT_inc_simple_void(sv_dat);
8895 ret = reganode(pRExC_state,NGROUPP,num);
8896 goto insert_if_check_paren;
8898 else if (RExC_parse[0] == 'D' &&
8899 RExC_parse[1] == 'E' &&
8900 RExC_parse[2] == 'F' &&
8901 RExC_parse[3] == 'I' &&
8902 RExC_parse[4] == 'N' &&
8903 RExC_parse[5] == 'E')
8905 ret = reganode(pRExC_state,DEFINEP,0);
8908 goto insert_if_check_paren;
8910 else if (RExC_parse[0] == 'R') {
8913 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8914 parno = atoi(RExC_parse++);
8915 while (isDIGIT(*RExC_parse))
8917 } else if (RExC_parse[0] == '&') {
8920 sv_dat = reg_scan_name(pRExC_state,
8921 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8922 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8924 ret = reganode(pRExC_state,INSUBP,parno);
8925 goto insert_if_check_paren;
8927 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8930 parno = atoi(RExC_parse++);
8932 while (isDIGIT(*RExC_parse))
8934 ret = reganode(pRExC_state, GROUPP, parno);
8936 insert_if_check_paren:
8937 if ((c = *nextchar(pRExC_state)) != ')')
8938 vFAIL("Switch condition not recognized");
8940 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8941 br = regbranch(pRExC_state, &flags, 1,depth+1);
8943 br = reganode(pRExC_state, LONGJMP, 0);
8945 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8946 c = *nextchar(pRExC_state);
8951 vFAIL("(?(DEFINE)....) does not allow branches");
8952 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8953 regbranch(pRExC_state, &flags, 1,depth+1);
8954 REGTAIL(pRExC_state, ret, lastbr);
8957 c = *nextchar(pRExC_state);
8962 vFAIL("Switch (?(condition)... contains too many branches");
8963 ender = reg_node(pRExC_state, TAIL);
8964 REGTAIL(pRExC_state, br, ender);
8966 REGTAIL(pRExC_state, lastbr, ender);
8967 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8970 REGTAIL(pRExC_state, ret, ender);
8971 RExC_size++; /* XXX WHY do we need this?!!
8972 For large programs it seems to be required
8973 but I can't figure out why. -- dmq*/
8977 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8981 RExC_parse--; /* for vFAIL to print correctly */
8982 vFAIL("Sequence (? incomplete");
8984 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8986 has_use_defaults = TRUE;
8987 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8988 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8989 ? REGEX_UNICODE_CHARSET
8990 : REGEX_DEPENDS_CHARSET);
8994 parse_flags: /* (?i) */
8996 U32 posflags = 0, negflags = 0;
8997 U32 *flagsp = &posflags;
8998 char has_charset_modifier = '\0';
8999 regex_charset cs = get_regex_charset(RExC_flags);
9000 if (cs == REGEX_DEPENDS_CHARSET
9001 && (RExC_utf8 || RExC_uni_semantics))
9003 cs = REGEX_UNICODE_CHARSET;
9006 while (*RExC_parse) {
9007 /* && strchr("iogcmsx", *RExC_parse) */
9008 /* (?g), (?gc) and (?o) are useless here
9009 and must be globally applied -- japhy */
9010 switch (*RExC_parse) {
9011 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9012 case LOCALE_PAT_MOD:
9013 if (has_charset_modifier) {
9014 goto excess_modifier;
9016 else if (flagsp == &negflags) {
9019 cs = REGEX_LOCALE_CHARSET;
9020 has_charset_modifier = LOCALE_PAT_MOD;
9021 RExC_contains_locale = 1;
9023 case UNICODE_PAT_MOD:
9024 if (has_charset_modifier) {
9025 goto excess_modifier;
9027 else if (flagsp == &negflags) {
9030 cs = REGEX_UNICODE_CHARSET;
9031 has_charset_modifier = UNICODE_PAT_MOD;
9033 case ASCII_RESTRICT_PAT_MOD:
9034 if (flagsp == &negflags) {
9037 if (has_charset_modifier) {
9038 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9039 goto excess_modifier;
9041 /* Doubled modifier implies more restricted */
9042 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9045 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9047 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9049 case DEPENDS_PAT_MOD:
9050 if (has_use_defaults) {
9051 goto fail_modifiers;
9053 else if (flagsp == &negflags) {
9056 else if (has_charset_modifier) {
9057 goto excess_modifier;
9060 /* The dual charset means unicode semantics if the
9061 * pattern (or target, not known until runtime) are
9062 * utf8, or something in the pattern indicates unicode
9064 cs = (RExC_utf8 || RExC_uni_semantics)
9065 ? REGEX_UNICODE_CHARSET
9066 : REGEX_DEPENDS_CHARSET;
9067 has_charset_modifier = DEPENDS_PAT_MOD;
9071 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9072 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9074 else if (has_charset_modifier == *(RExC_parse - 1)) {
9075 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9078 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9083 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9085 case ONCE_PAT_MOD: /* 'o' */
9086 case GLOBAL_PAT_MOD: /* 'g' */
9087 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9088 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9089 if (! (wastedflags & wflagbit) ) {
9090 wastedflags |= wflagbit;
9093 "Useless (%s%c) - %suse /%c modifier",
9094 flagsp == &negflags ? "?-" : "?",
9096 flagsp == &negflags ? "don't " : "",
9103 case CONTINUE_PAT_MOD: /* 'c' */
9104 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9105 if (! (wastedflags & WASTED_C) ) {
9106 wastedflags |= WASTED_GC;
9109 "Useless (%sc) - %suse /gc modifier",
9110 flagsp == &negflags ? "?-" : "?",
9111 flagsp == &negflags ? "don't " : ""
9116 case KEEPCOPY_PAT_MOD: /* 'p' */
9117 if (flagsp == &negflags) {
9119 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9121 *flagsp |= RXf_PMf_KEEPCOPY;
9125 /* A flag is a default iff it is following a minus, so
9126 * if there is a minus, it means will be trying to
9127 * re-specify a default which is an error */
9128 if (has_use_defaults || flagsp == &negflags) {
9131 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9135 wastedflags = 0; /* reset so (?g-c) warns twice */
9141 RExC_flags |= posflags;
9142 RExC_flags &= ~negflags;
9143 set_regex_charset(&RExC_flags, cs);
9145 oregflags |= posflags;
9146 oregflags &= ~negflags;
9147 set_regex_charset(&oregflags, cs);
9149 nextchar(pRExC_state);
9160 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9165 }} /* one for the default block, one for the switch */
9172 ret = reganode(pRExC_state, OPEN, parno);
9175 RExC_nestroot = parno;
9176 if (RExC_seen & REG_SEEN_RECURSE
9177 && !RExC_open_parens[parno-1])
9179 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9180 "Setting open paren #%"IVdf" to %d\n",
9181 (IV)parno, REG_NODE_NUM(ret)));
9182 RExC_open_parens[parno-1]= ret;
9185 Set_Node_Length(ret, 1); /* MJD */
9186 Set_Node_Offset(ret, RExC_parse); /* MJD */
9194 /* Pick up the branches, linking them together. */
9195 parse_start = RExC_parse; /* MJD */
9196 br = regbranch(pRExC_state, &flags, 1,depth+1);
9198 /* branch_len = (paren != 0); */
9202 if (*RExC_parse == '|') {
9203 if (!SIZE_ONLY && RExC_extralen) {
9204 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9207 reginsert(pRExC_state, BRANCH, br, depth+1);
9208 Set_Node_Length(br, paren != 0);
9209 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9213 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9215 else if (paren == ':') {
9216 *flagp |= flags&SIMPLE;
9218 if (is_open) { /* Starts with OPEN. */
9219 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9221 else if (paren != '?') /* Not Conditional */
9223 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9225 while (*RExC_parse == '|') {
9226 if (!SIZE_ONLY && RExC_extralen) {
9227 ender = reganode(pRExC_state, LONGJMP,0);
9228 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9231 RExC_extralen += 2; /* Account for LONGJMP. */
9232 nextchar(pRExC_state);
9234 if (RExC_npar > after_freeze)
9235 after_freeze = RExC_npar;
9236 RExC_npar = freeze_paren;
9238 br = regbranch(pRExC_state, &flags, 0, depth+1);
9242 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9244 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9247 if (have_branch || paren != ':') {
9248 /* Make a closing node, and hook it on the end. */
9251 ender = reg_node(pRExC_state, TAIL);
9254 ender = reganode(pRExC_state, CLOSE, parno);
9255 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9256 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9257 "Setting close paren #%"IVdf" to %d\n",
9258 (IV)parno, REG_NODE_NUM(ender)));
9259 RExC_close_parens[parno-1]= ender;
9260 if (RExC_nestroot == parno)
9263 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9264 Set_Node_Length(ender,1); /* MJD */
9270 *flagp &= ~HASWIDTH;
9273 ender = reg_node(pRExC_state, SUCCEED);
9276 ender = reg_node(pRExC_state, END);
9278 assert(!RExC_opend); /* there can only be one! */
9283 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9284 SV * const mysv_val1=sv_newmortal();
9285 SV * const mysv_val2=sv_newmortal();
9286 DEBUG_PARSE_MSG("lsbr");
9287 regprop(RExC_rx, mysv_val1, lastbr);
9288 regprop(RExC_rx, mysv_val2, ender);
9289 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9290 SvPV_nolen_const(mysv_val1),
9291 (IV)REG_NODE_NUM(lastbr),
9292 SvPV_nolen_const(mysv_val2),
9293 (IV)REG_NODE_NUM(ender),
9294 (IV)(ender - lastbr)
9297 REGTAIL(pRExC_state, lastbr, ender);
9299 if (have_branch && !SIZE_ONLY) {
9302 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9304 /* Hook the tails of the branches to the closing node. */
9305 for (br = ret; br; br = regnext(br)) {
9306 const U8 op = PL_regkind[OP(br)];
9308 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9309 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9312 else if (op == BRANCHJ) {
9313 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9314 /* for now we always disable this optimisation * /
9315 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9321 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9322 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9323 SV * const mysv_val1=sv_newmortal();
9324 SV * const mysv_val2=sv_newmortal();
9325 DEBUG_PARSE_MSG("NADA");
9326 regprop(RExC_rx, mysv_val1, ret);
9327 regprop(RExC_rx, mysv_val2, ender);
9328 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9329 SvPV_nolen_const(mysv_val1),
9330 (IV)REG_NODE_NUM(ret),
9331 SvPV_nolen_const(mysv_val2),
9332 (IV)REG_NODE_NUM(ender),
9337 if (OP(ender) == TAIL) {
9342 for ( opt= br + 1; opt < ender ; opt++ )
9344 NEXT_OFF(br)= ender - br;
9352 static const char parens[] = "=!<,>";
9354 if (paren && (p = strchr(parens, paren))) {
9355 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9356 int flag = (p - parens) > 1;
9359 node = SUSPEND, flag = 0;
9360 reginsert(pRExC_state, node,ret, depth+1);
9361 Set_Node_Cur_Length(ret);
9362 Set_Node_Offset(ret, parse_start + 1);
9364 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9368 /* Check for proper termination. */
9370 RExC_flags = oregflags;
9371 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9372 RExC_parse = oregcomp_parse;
9373 vFAIL("Unmatched (");
9376 else if (!paren && RExC_parse < RExC_end) {
9377 if (*RExC_parse == ')') {
9379 vFAIL("Unmatched )");
9382 FAIL("Junk on end of regexp"); /* "Can't happen". */
9383 assert(0); /* NOTREACHED */
9386 if (RExC_in_lookbehind) {
9387 RExC_in_lookbehind--;
9389 if (after_freeze > RExC_npar)
9390 RExC_npar = after_freeze;
9395 - regbranch - one alternative of an | operator
9397 * Implements the concatenation operator.
9400 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9404 regnode *chain = NULL;
9406 I32 flags = 0, c = 0;
9407 GET_RE_DEBUG_FLAGS_DECL;
9409 PERL_ARGS_ASSERT_REGBRANCH;
9411 DEBUG_PARSE("brnc");
9416 if (!SIZE_ONLY && RExC_extralen)
9417 ret = reganode(pRExC_state, BRANCHJ,0);
9419 ret = reg_node(pRExC_state, BRANCH);
9420 Set_Node_Length(ret, 1);
9424 if (!first && SIZE_ONLY)
9425 RExC_extralen += 1; /* BRANCHJ */
9427 *flagp = WORST; /* Tentatively. */
9430 nextchar(pRExC_state);
9431 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9433 latest = regpiece(pRExC_state, &flags,depth+1);
9434 if (latest == NULL) {
9435 if (flags & TRYAGAIN)
9439 else if (ret == NULL)
9441 *flagp |= flags&(HASWIDTH|POSTPONED);
9442 if (chain == NULL) /* First piece. */
9443 *flagp |= flags&SPSTART;
9446 REGTAIL(pRExC_state, chain, latest);
9451 if (chain == NULL) { /* Loop ran zero times. */
9452 chain = reg_node(pRExC_state, NOTHING);
9457 *flagp |= flags&SIMPLE;
9464 - regpiece - something followed by possible [*+?]
9466 * Note that the branching code sequences used for ? and the general cases
9467 * of * and + are somewhat optimized: they use the same NOTHING node as
9468 * both the endmarker for their branch list and the body of the last branch.
9469 * It might seem that this node could be dispensed with entirely, but the
9470 * endmarker role is not redundant.
9473 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9480 const char * const origparse = RExC_parse;
9482 I32 max = REG_INFTY;
9483 #ifdef RE_TRACK_PATTERN_OFFSETS
9486 const char *maxpos = NULL;
9488 /* Save the original in case we change the emitted regop to a FAIL. */
9489 regnode * const orig_emit = RExC_emit;
9491 GET_RE_DEBUG_FLAGS_DECL;
9493 PERL_ARGS_ASSERT_REGPIECE;
9495 DEBUG_PARSE("piec");
9497 ret = regatom(pRExC_state, &flags,depth+1);
9499 if (flags & TRYAGAIN)
9506 if (op == '{' && regcurly(RExC_parse)) {
9508 #ifdef RE_TRACK_PATTERN_OFFSETS
9509 parse_start = RExC_parse; /* MJD */
9511 next = RExC_parse + 1;
9512 while (isDIGIT(*next) || *next == ',') {
9521 if (*next == '}') { /* got one */
9525 min = atoi(RExC_parse);
9529 maxpos = RExC_parse;
9531 if (!max && *maxpos != '0')
9532 max = REG_INFTY; /* meaning "infinity" */
9533 else if (max >= REG_INFTY)
9534 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9536 nextchar(pRExC_state);
9537 if (max < min) { /* If can't match, warn and optimize to fail
9540 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9542 /* We can't back off the size because we have to reserve
9543 * enough space for all the things we are about to throw
9544 * away, but we can shrink it by the ammount we are about
9546 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9549 RExC_emit = orig_emit;
9551 ret = reg_node(pRExC_state, OPFAIL);
9556 if ((flags&SIMPLE)) {
9557 RExC_naughty += 2 + RExC_naughty / 2;
9558 reginsert(pRExC_state, CURLY, ret, depth+1);
9559 Set_Node_Offset(ret, parse_start+1); /* MJD */
9560 Set_Node_Cur_Length(ret);
9563 regnode * const w = reg_node(pRExC_state, WHILEM);
9566 REGTAIL(pRExC_state, ret, w);
9567 if (!SIZE_ONLY && RExC_extralen) {
9568 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9569 reginsert(pRExC_state, NOTHING,ret, depth+1);
9570 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9572 reginsert(pRExC_state, CURLYX,ret, depth+1);
9574 Set_Node_Offset(ret, parse_start+1);
9575 Set_Node_Length(ret,
9576 op == '{' ? (RExC_parse - parse_start) : 1);
9578 if (!SIZE_ONLY && RExC_extralen)
9579 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9580 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9582 RExC_whilem_seen++, RExC_extralen += 3;
9583 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9592 ARG1_SET(ret, (U16)min);
9593 ARG2_SET(ret, (U16)max);
9605 #if 0 /* Now runtime fix should be reliable. */
9607 /* if this is reinstated, don't forget to put this back into perldiag:
9609 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9611 (F) The part of the regexp subject to either the * or + quantifier
9612 could match an empty string. The {#} shows in the regular
9613 expression about where the problem was discovered.
9617 if (!(flags&HASWIDTH) && op != '?')
9618 vFAIL("Regexp *+ operand could be empty");
9621 #ifdef RE_TRACK_PATTERN_OFFSETS
9622 parse_start = RExC_parse;
9624 nextchar(pRExC_state);
9626 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9628 if (op == '*' && (flags&SIMPLE)) {
9629 reginsert(pRExC_state, STAR, ret, depth+1);
9633 else if (op == '*') {
9637 else if (op == '+' && (flags&SIMPLE)) {
9638 reginsert(pRExC_state, PLUS, ret, depth+1);
9642 else if (op == '+') {
9646 else if (op == '?') {
9651 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9652 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9653 ckWARN3reg(RExC_parse,
9654 "%.*s matches null string many times",
9655 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9657 ReREFCNT_inc(RExC_rx_sv);
9660 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9661 nextchar(pRExC_state);
9662 reginsert(pRExC_state, MINMOD, ret, depth+1);
9663 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9665 #ifndef REG_ALLOW_MINMOD_SUSPEND
9668 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9670 nextchar(pRExC_state);
9671 ender = reg_node(pRExC_state, SUCCEED);
9672 REGTAIL(pRExC_state, ret, ender);
9673 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9675 ender = reg_node(pRExC_state, TAIL);
9676 REGTAIL(pRExC_state, ret, ender);
9680 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9682 vFAIL("Nested quantifiers");
9689 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9692 /* This is expected to be called by a parser routine that has recognized '\N'
9693 and needs to handle the rest. RExC_parse is expected to point at the first
9694 char following the N at the time of the call. On successful return,
9695 RExC_parse has been updated to point to just after the sequence identified
9696 by this routine, and <*flagp> has been updated.
9698 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9701 \N may begin either a named sequence, or if outside a character class, mean
9702 to match a non-newline. For non single-quoted regexes, the tokenizer has
9703 attempted to decide which, and in the case of a named sequence, converted it
9704 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9705 where c1... are the characters in the sequence. For single-quoted regexes,
9706 the tokenizer passes the \N sequence through unchanged; this code will not
9707 attempt to determine this nor expand those, instead raising a syntax error.
9708 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9709 or there is no '}', it signals that this \N occurrence means to match a
9712 Only the \N{U+...} form should occur in a character class, for the same
9713 reason that '.' inside a character class means to just match a period: it
9714 just doesn't make sense.
9716 The function raises an error (via vFAIL), and doesn't return for various
9717 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9718 success; it returns FALSE otherwise.
9720 If <valuep> is non-null, it means the caller can accept an input sequence
9721 consisting of a just a single code point; <*valuep> is set to that value
9722 if the input is such.
9724 If <node_p> is non-null it signifies that the caller can accept any other
9725 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9727 1) \N means not-a-NL: points to a newly created REG_ANY node;
9728 2) \N{}: points to a new NOTHING node;
9729 3) otherwise: points to a new EXACT node containing the resolved
9731 Note that FALSE is returned for single code point sequences if <valuep> is
9735 char * endbrace; /* '}' following the name */
9737 char *endchar; /* Points to '.' or '}' ending cur char in the input
9739 bool has_multiple_chars; /* true if the input stream contains a sequence of
9740 more than one character */
9742 GET_RE_DEBUG_FLAGS_DECL;
9744 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9748 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9750 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9751 * modifier. The other meaning does not */
9752 p = (RExC_flags & RXf_PMf_EXTENDED)
9753 ? regwhite( pRExC_state, RExC_parse )
9756 /* Disambiguate between \N meaning a named character versus \N meaning
9757 * [^\n]. The former is assumed when it can't be the latter. */
9758 if (*p != '{' || regcurly(p)) {
9761 /* no bare \N in a charclass */
9762 if (in_char_class) {
9763 vFAIL("\\N in a character class must be a named character: \\N{...}");
9767 nextchar(pRExC_state);
9768 *node_p = reg_node(pRExC_state, REG_ANY);
9769 *flagp |= HASWIDTH|SIMPLE;
9772 Set_Node_Length(*node_p, 1); /* MJD */
9776 /* Here, we have decided it should be a named character or sequence */
9778 /* The test above made sure that the next real character is a '{', but
9779 * under the /x modifier, it could be separated by space (or a comment and
9780 * \n) and this is not allowed (for consistency with \x{...} and the
9781 * tokenizer handling of \N{NAME}). */
9782 if (*RExC_parse != '{') {
9783 vFAIL("Missing braces on \\N{}");
9786 RExC_parse++; /* Skip past the '{' */
9788 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9789 || ! (endbrace == RExC_parse /* nothing between the {} */
9790 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9791 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9793 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9794 vFAIL("\\N{NAME} must be resolved by the lexer");
9797 if (endbrace == RExC_parse) { /* empty: \N{} */
9800 *node_p = reg_node(pRExC_state,NOTHING);
9802 else if (in_char_class) {
9803 if (SIZE_ONLY && in_char_class) {
9804 ckWARNreg(RExC_parse,
9805 "Ignoring zero length \\N{} in character class"
9813 nextchar(pRExC_state);
9817 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9818 RExC_parse += 2; /* Skip past the 'U+' */
9820 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9822 /* Code points are separated by dots. If none, there is only one code
9823 * point, and is terminated by the brace */
9824 has_multiple_chars = (endchar < endbrace);
9826 if (valuep && (! has_multiple_chars || in_char_class)) {
9827 /* We only pay attention to the first char of
9828 multichar strings being returned in char classes. I kinda wonder
9829 if this makes sense as it does change the behaviour
9830 from earlier versions, OTOH that behaviour was broken
9831 as well. XXX Solution is to recharacterize as
9832 [rest-of-class]|multi1|multi2... */
9834 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9835 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9836 | PERL_SCAN_DISALLOW_PREFIX
9837 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9839 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9841 /* The tokenizer should have guaranteed validity, but it's possible to
9842 * bypass it by using single quoting, so check */
9843 if (length_of_hex == 0
9844 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9846 RExC_parse += length_of_hex; /* Includes all the valid */
9847 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9848 ? UTF8SKIP(RExC_parse)
9850 /* Guard against malformed utf8 */
9851 if (RExC_parse >= endchar) {
9852 RExC_parse = endchar;
9854 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9857 if (in_char_class && has_multiple_chars) {
9858 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9861 RExC_parse = endbrace + 1;
9863 else if (! node_p || ! has_multiple_chars) {
9865 /* Here, the input is legal, but not according to the caller's
9866 * options. We fail without advancing the parse, so that the
9867 * caller can try again */
9873 /* What is done here is to convert this to a sub-pattern of the form
9874 * (?:\x{char1}\x{char2}...)
9875 * and then call reg recursively. That way, it retains its atomicness,
9876 * while not having to worry about special handling that some code
9877 * points may have. toke.c has converted the original Unicode values
9878 * to native, so that we can just pass on the hex values unchanged. We
9879 * do have to set a flag to keep recoding from happening in the
9882 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9884 char *orig_end = RExC_end;
9887 while (RExC_parse < endbrace) {
9889 /* Convert to notation the rest of the code understands */
9890 sv_catpv(substitute_parse, "\\x{");
9891 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9892 sv_catpv(substitute_parse, "}");
9894 /* Point to the beginning of the next character in the sequence. */
9895 RExC_parse = endchar + 1;
9896 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9898 sv_catpv(substitute_parse, ")");
9900 RExC_parse = SvPV(substitute_parse, len);
9902 /* Don't allow empty number */
9904 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9906 RExC_end = RExC_parse + len;
9908 /* The values are Unicode, and therefore not subject to recoding */
9909 RExC_override_recoding = 1;
9911 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9912 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9914 RExC_parse = endbrace;
9915 RExC_end = orig_end;
9916 RExC_override_recoding = 0;
9918 nextchar(pRExC_state);
9928 * It returns the code point in utf8 for the value in *encp.
9929 * value: a code value in the source encoding
9930 * encp: a pointer to an Encode object
9932 * If the result from Encode is not a single character,
9933 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9936 S_reg_recode(pTHX_ const char value, SV **encp)
9939 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9940 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9941 const STRLEN newlen = SvCUR(sv);
9942 UV uv = UNICODE_REPLACEMENT;
9944 PERL_ARGS_ASSERT_REG_RECODE;
9948 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9951 if (!newlen || numlen != newlen) {
9952 uv = UNICODE_REPLACEMENT;
9958 PERL_STATIC_INLINE U8
9959 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9963 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9969 op = get_regex_charset(RExC_flags);
9970 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9971 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9972 been, so there is no hole */
9978 PERL_STATIC_INLINE void
9979 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9981 /* This knows the details about sizing an EXACTish node, setting flags for
9982 * it (by setting <*flagp>, and potentially populating it with a single
9985 * If <len> (the length in bytes) is non-zero, this function assumes that
9986 * the node has already been populated, and just does the sizing. In this
9987 * case <code_point> should be the final code point that has already been
9988 * placed into the node. This value will be ignored except that under some
9989 * circumstances <*flagp> is set based on it.
9991 * If <len> is zero, the function assumes that the node is to contain only
9992 * the single character given by <code_point> and calculates what <len>
9993 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9994 * additionally will populate the node's STRING with <code_point>, if <len>
9995 * is 0. In both cases <*flagp> is appropriately set
9997 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9998 * folded (the latter only when the rules indicate it can match 'ss') */
10000 bool len_passed_in = cBOOL(len != 0);
10001 U8 character[UTF8_MAXBYTES_CASE+1];
10003 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10005 if (! len_passed_in) {
10008 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
10011 uvchr_to_utf8( character, code_point);
10012 len = UTF8SKIP(character);
10016 || code_point != LATIN_SMALL_LETTER_SHARP_S
10017 || ASCII_FOLD_RESTRICTED
10018 || ! AT_LEAST_UNI_SEMANTICS)
10020 *character = (U8) code_point;
10025 *(character + 1) = 's';
10031 RExC_size += STR_SZ(len);
10034 RExC_emit += STR_SZ(len);
10035 STR_LEN(node) = len;
10036 if (! len_passed_in) {
10037 Copy((char *) character, STRING(node), len, char);
10041 *flagp |= HASWIDTH;
10043 /* A single character node is SIMPLE, except for the special-cased SHARP S
10045 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10046 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10047 || ! FOLD || ! DEPENDS_SEMANTICS))
10054 - regatom - the lowest level
10056 Try to identify anything special at the start of the pattern. If there
10057 is, then handle it as required. This may involve generating a single regop,
10058 such as for an assertion; or it may involve recursing, such as to
10059 handle a () structure.
10061 If the string doesn't start with something special then we gobble up
10062 as much literal text as we can.
10064 Once we have been able to handle whatever type of thing started the
10065 sequence, we return.
10067 Note: we have to be careful with escapes, as they can be both literal
10068 and special, and in the case of \10 and friends, context determines which.
10070 A summary of the code structure is:
10072 switch (first_byte) {
10073 cases for each special:
10074 handle this special;
10077 switch (2nd byte) {
10078 cases for each unambiguous special:
10079 handle this special;
10081 cases for each ambigous special/literal:
10083 if (special) handle here
10085 default: // unambiguously literal:
10088 default: // is a literal char
10091 create EXACTish node for literal;
10092 while (more input and node isn't full) {
10093 switch (input_byte) {
10094 cases for each special;
10095 make sure parse pointer is set so that the next call to
10096 regatom will see this special first
10097 goto loopdone; // EXACTish node terminated by prev. char
10099 append char to EXACTISH node;
10101 get next input byte;
10105 return the generated node;
10107 Specifically there are two separate switches for handling
10108 escape sequences, with the one for handling literal escapes requiring
10109 a dummy entry for all of the special escapes that are actually handled
10114 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10117 regnode *ret = NULL;
10119 char *parse_start = RExC_parse;
10121 GET_RE_DEBUG_FLAGS_DECL;
10122 DEBUG_PARSE("atom");
10123 *flagp = WORST; /* Tentatively. */
10125 PERL_ARGS_ASSERT_REGATOM;
10128 switch ((U8)*RExC_parse) {
10130 RExC_seen_zerolen++;
10131 nextchar(pRExC_state);
10132 if (RExC_flags & RXf_PMf_MULTILINE)
10133 ret = reg_node(pRExC_state, MBOL);
10134 else if (RExC_flags & RXf_PMf_SINGLELINE)
10135 ret = reg_node(pRExC_state, SBOL);
10137 ret = reg_node(pRExC_state, BOL);
10138 Set_Node_Length(ret, 1); /* MJD */
10141 nextchar(pRExC_state);
10143 RExC_seen_zerolen++;
10144 if (RExC_flags & RXf_PMf_MULTILINE)
10145 ret = reg_node(pRExC_state, MEOL);
10146 else if (RExC_flags & RXf_PMf_SINGLELINE)
10147 ret = reg_node(pRExC_state, SEOL);
10149 ret = reg_node(pRExC_state, EOL);
10150 Set_Node_Length(ret, 1); /* MJD */
10153 nextchar(pRExC_state);
10154 if (RExC_flags & RXf_PMf_SINGLELINE)
10155 ret = reg_node(pRExC_state, SANY);
10157 ret = reg_node(pRExC_state, REG_ANY);
10158 *flagp |= HASWIDTH|SIMPLE;
10160 Set_Node_Length(ret, 1); /* MJD */
10164 char * const oregcomp_parse = ++RExC_parse;
10165 ret = regclass(pRExC_state, flagp,depth+1);
10166 if (*RExC_parse != ']') {
10167 RExC_parse = oregcomp_parse;
10168 vFAIL("Unmatched [");
10170 nextchar(pRExC_state);
10171 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10175 nextchar(pRExC_state);
10176 ret = reg(pRExC_state, 1, &flags,depth+1);
10178 if (flags & TRYAGAIN) {
10179 if (RExC_parse == RExC_end) {
10180 /* Make parent create an empty node if needed. */
10181 *flagp |= TRYAGAIN;
10188 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10192 if (flags & TRYAGAIN) {
10193 *flagp |= TRYAGAIN;
10196 vFAIL("Internal urp");
10197 /* Supposed to be caught earlier. */
10203 vFAIL("Quantifier follows nothing");
10208 This switch handles escape sequences that resolve to some kind
10209 of special regop and not to literal text. Escape sequnces that
10210 resolve to literal text are handled below in the switch marked
10213 Every entry in this switch *must* have a corresponding entry
10214 in the literal escape switch. However, the opposite is not
10215 required, as the default for this switch is to jump to the
10216 literal text handling code.
10218 switch ((U8)*++RExC_parse) {
10219 /* Special Escapes */
10221 RExC_seen_zerolen++;
10222 ret = reg_node(pRExC_state, SBOL);
10224 goto finish_meta_pat;
10226 ret = reg_node(pRExC_state, GPOS);
10227 RExC_seen |= REG_SEEN_GPOS;
10229 goto finish_meta_pat;
10231 RExC_seen_zerolen++;
10232 ret = reg_node(pRExC_state, KEEPS);
10234 /* XXX:dmq : disabling in-place substitution seems to
10235 * be necessary here to avoid cases of memory corruption, as
10236 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10238 RExC_seen |= REG_SEEN_LOOKBEHIND;
10239 goto finish_meta_pat;
10241 ret = reg_node(pRExC_state, SEOL);
10243 RExC_seen_zerolen++; /* Do not optimize RE away */
10244 goto finish_meta_pat;
10246 ret = reg_node(pRExC_state, EOS);
10248 RExC_seen_zerolen++; /* Do not optimize RE away */
10249 goto finish_meta_pat;
10251 ret = reg_node(pRExC_state, CANY);
10252 RExC_seen |= REG_SEEN_CANY;
10253 *flagp |= HASWIDTH|SIMPLE;
10254 goto finish_meta_pat;
10256 ret = reg_node(pRExC_state, CLUMP);
10257 *flagp |= HASWIDTH;
10258 goto finish_meta_pat;
10260 op = ALNUM + get_regex_charset(RExC_flags);
10261 if (op > ALNUMA) { /* /aa is same as /a */
10264 ret = reg_node(pRExC_state, op);
10265 *flagp |= HASWIDTH|SIMPLE;
10266 goto finish_meta_pat;
10268 op = NALNUM + get_regex_charset(RExC_flags);
10269 if (op > NALNUMA) { /* /aa is same as /a */
10272 ret = reg_node(pRExC_state, op);
10273 *flagp |= HASWIDTH|SIMPLE;
10274 goto finish_meta_pat;
10276 RExC_seen_zerolen++;
10277 RExC_seen |= REG_SEEN_LOOKBEHIND;
10278 op = BOUND + get_regex_charset(RExC_flags);
10279 if (op > BOUNDA) { /* /aa is same as /a */
10282 ret = reg_node(pRExC_state, op);
10283 FLAGS(ret) = get_regex_charset(RExC_flags);
10285 goto finish_meta_pat;
10287 RExC_seen_zerolen++;
10288 RExC_seen |= REG_SEEN_LOOKBEHIND;
10289 op = NBOUND + get_regex_charset(RExC_flags);
10290 if (op > NBOUNDA) { /* /aa is same as /a */
10293 ret = reg_node(pRExC_state, op);
10294 FLAGS(ret) = get_regex_charset(RExC_flags);
10296 goto finish_meta_pat;
10298 op = SPACE + get_regex_charset(RExC_flags);
10299 if (op > SPACEA) { /* /aa is same as /a */
10302 ret = reg_node(pRExC_state, op);
10303 *flagp |= HASWIDTH|SIMPLE;
10304 goto finish_meta_pat;
10306 op = NSPACE + get_regex_charset(RExC_flags);
10307 if (op > NSPACEA) { /* /aa is same as /a */
10310 ret = reg_node(pRExC_state, op);
10311 *flagp |= HASWIDTH|SIMPLE;
10312 goto finish_meta_pat;
10320 U8 offset = get_regex_charset(RExC_flags);
10321 if (offset == REGEX_UNICODE_CHARSET) {
10322 offset = REGEX_DEPENDS_CHARSET;
10324 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10325 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10329 ret = reg_node(pRExC_state, op);
10330 *flagp |= HASWIDTH|SIMPLE;
10331 goto finish_meta_pat;
10333 ret = reg_node(pRExC_state, LNBREAK);
10334 *flagp |= HASWIDTH|SIMPLE;
10335 goto finish_meta_pat;
10337 ret = reg_node(pRExC_state, HORIZWS);
10338 *flagp |= HASWIDTH|SIMPLE;
10339 goto finish_meta_pat;
10341 ret = reg_node(pRExC_state, NHORIZWS);
10342 *flagp |= HASWIDTH|SIMPLE;
10343 goto finish_meta_pat;
10345 ret = reg_node(pRExC_state, VERTWS);
10346 *flagp |= HASWIDTH|SIMPLE;
10347 goto finish_meta_pat;
10349 ret = reg_node(pRExC_state, NVERTWS);
10350 *flagp |= HASWIDTH|SIMPLE;
10352 nextchar(pRExC_state);
10353 Set_Node_Length(ret, 2); /* MJD */
10358 char* const oldregxend = RExC_end;
10360 char* parse_start = RExC_parse - 2;
10363 if (RExC_parse[1] == '{') {
10364 /* a lovely hack--pretend we saw [\pX] instead */
10365 RExC_end = strchr(RExC_parse, '}');
10367 const U8 c = (U8)*RExC_parse;
10369 RExC_end = oldregxend;
10370 vFAIL2("Missing right brace on \\%c{}", c);
10375 RExC_end = RExC_parse + 2;
10376 if (RExC_end > oldregxend)
10377 RExC_end = oldregxend;
10381 ret = regclass(pRExC_state, flagp,depth+1);
10383 RExC_end = oldregxend;
10386 Set_Node_Offset(ret, parse_start + 2);
10387 Set_Node_Cur_Length(ret);
10388 nextchar(pRExC_state);
10392 /* Handle \N and \N{NAME} with multiple code points here and not
10393 * below because it can be multicharacter. join_exact() will join
10394 * them up later on. Also this makes sure that things like
10395 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10396 * The options to the grok function call causes it to fail if the
10397 * sequence is just a single code point. We then go treat it as
10398 * just another character in the current EXACT node, and hence it
10399 * gets uniform treatment with all the other characters. The
10400 * special treatment for quantifiers is not needed for such single
10401 * character sequences */
10403 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10408 case 'k': /* Handle \k<NAME> and \k'NAME' */
10411 char ch= RExC_parse[1];
10412 if (ch != '<' && ch != '\'' && ch != '{') {
10414 vFAIL2("Sequence %.2s... not terminated",parse_start);
10416 /* this pretty much dupes the code for (?P=...) in reg(), if
10417 you change this make sure you change that */
10418 char* name_start = (RExC_parse += 2);
10420 SV *sv_dat = reg_scan_name(pRExC_state,
10421 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10422 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10423 if (RExC_parse == name_start || *RExC_parse != ch)
10424 vFAIL2("Sequence %.3s... not terminated",parse_start);
10427 num = add_data( pRExC_state, 1, "S" );
10428 RExC_rxi->data->data[num]=(void*)sv_dat;
10429 SvREFCNT_inc_simple_void(sv_dat);
10433 ret = reganode(pRExC_state,
10436 : (ASCII_FOLD_RESTRICTED)
10438 : (AT_LEAST_UNI_SEMANTICS)
10444 *flagp |= HASWIDTH;
10446 /* override incorrect value set in reganode MJD */
10447 Set_Node_Offset(ret, parse_start+1);
10448 Set_Node_Cur_Length(ret); /* MJD */
10449 nextchar(pRExC_state);
10455 case '1': case '2': case '3': case '4':
10456 case '5': case '6': case '7': case '8': case '9':
10459 bool isg = *RExC_parse == 'g';
10464 if (*RExC_parse == '{') {
10468 if (*RExC_parse == '-') {
10472 if (hasbrace && !isDIGIT(*RExC_parse)) {
10473 if (isrel) RExC_parse--;
10475 goto parse_named_seq;
10477 num = atoi(RExC_parse);
10478 if (isg && num == 0)
10479 vFAIL("Reference to invalid group 0");
10481 num = RExC_npar - num;
10483 vFAIL("Reference to nonexistent or unclosed group");
10485 if (!isg && num > 9 && num >= RExC_npar)
10486 /* Probably a character specified in octal, e.g. \35 */
10489 char * const parse_start = RExC_parse - 1; /* MJD */
10490 while (isDIGIT(*RExC_parse))
10492 if (parse_start == RExC_parse - 1)
10493 vFAIL("Unterminated \\g... pattern");
10495 if (*RExC_parse != '}')
10496 vFAIL("Unterminated \\g{...} pattern");
10500 if (num > (I32)RExC_rx->nparens)
10501 vFAIL("Reference to nonexistent group");
10504 ret = reganode(pRExC_state,
10507 : (ASCII_FOLD_RESTRICTED)
10509 : (AT_LEAST_UNI_SEMANTICS)
10515 *flagp |= HASWIDTH;
10517 /* override incorrect value set in reganode MJD */
10518 Set_Node_Offset(ret, parse_start+1);
10519 Set_Node_Cur_Length(ret); /* MJD */
10521 nextchar(pRExC_state);
10526 if (RExC_parse >= RExC_end)
10527 FAIL("Trailing \\");
10530 /* Do not generate "unrecognized" warnings here, we fall
10531 back into the quick-grab loop below */
10538 if (RExC_flags & RXf_PMf_EXTENDED) {
10539 if ( reg_skipcomment( pRExC_state ) )
10546 parse_start = RExC_parse - 1;
10555 #define MAX_NODE_STRING_SIZE 127
10556 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10558 U8 upper_parse = MAX_NODE_STRING_SIZE;
10561 bool next_is_quantifier;
10562 char * oldp = NULL;
10564 /* If a folding node contains only code points that don't
10565 * participate in folds, it can be changed into an EXACT node,
10566 * which allows the optimizer more things to look for */
10570 node_type = compute_EXACTish(pRExC_state);
10571 ret = reg_node(pRExC_state, node_type);
10573 /* In pass1, folded, we use a temporary buffer instead of the
10574 * actual node, as the node doesn't exist yet */
10575 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10581 /* We do the EXACTFish to EXACT node only if folding, and not if in
10582 * locale, as whether a character folds or not isn't known until
10584 maybe_exact = FOLD && ! LOC;
10586 /* XXX The node can hold up to 255 bytes, yet this only goes to
10587 * 127. I (khw) do not know why. Keeping it somewhat less than
10588 * 255 allows us to not have to worry about overflow due to
10589 * converting to utf8 and fold expansion, but that value is
10590 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10591 * split up by this limit into a single one using the real max of
10592 * 255. Even at 127, this breaks under rare circumstances. If
10593 * folding, we do not want to split a node at a character that is a
10594 * non-final in a multi-char fold, as an input string could just
10595 * happen to want to match across the node boundary. The join
10596 * would solve that problem if the join actually happens. But a
10597 * series of more than two nodes in a row each of 127 would cause
10598 * the first join to succeed to get to 254, but then there wouldn't
10599 * be room for the next one, which could at be one of those split
10600 * multi-char folds. I don't know of any fool-proof solution. One
10601 * could back off to end with only a code point that isn't such a
10602 * non-final, but it is possible for there not to be any in the
10604 for (p = RExC_parse - 1;
10605 len < upper_parse && p < RExC_end;
10610 if (RExC_flags & RXf_PMf_EXTENDED)
10611 p = regwhite( pRExC_state, p );
10622 /* Literal Escapes Switch
10624 This switch is meant to handle escape sequences that
10625 resolve to a literal character.
10627 Every escape sequence that represents something
10628 else, like an assertion or a char class, is handled
10629 in the switch marked 'Special Escapes' above in this
10630 routine, but also has an entry here as anything that
10631 isn't explicitly mentioned here will be treated as
10632 an unescaped equivalent literal.
10635 switch ((U8)*++p) {
10636 /* These are all the special escapes. */
10637 case 'A': /* Start assertion */
10638 case 'b': case 'B': /* Word-boundary assertion*/
10639 case 'C': /* Single char !DANGEROUS! */
10640 case 'd': case 'D': /* digit class */
10641 case 'g': case 'G': /* generic-backref, pos assertion */
10642 case 'h': case 'H': /* HORIZWS */
10643 case 'k': case 'K': /* named backref, keep marker */
10644 case 'p': case 'P': /* Unicode property */
10645 case 'R': /* LNBREAK */
10646 case 's': case 'S': /* space class */
10647 case 'v': case 'V': /* VERTWS */
10648 case 'w': case 'W': /* word class */
10649 case 'X': /* eXtended Unicode "combining character sequence" */
10650 case 'z': case 'Z': /* End of line/string assertion */
10654 /* Anything after here is an escape that resolves to a
10655 literal. (Except digits, which may or may not)
10661 case 'N': /* Handle a single-code point named character. */
10662 /* The options cause it to fail if a multiple code
10663 * point sequence. Handle those in the switch() above
10665 RExC_parse = p + 1;
10666 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10667 flagp, depth, FALSE))
10669 RExC_parse = p = oldp;
10673 if (ender > 0xff) {
10690 ender = ASCII_TO_NATIVE('\033');
10694 ender = ASCII_TO_NATIVE('\007');
10699 STRLEN brace_len = len;
10701 const char* error_msg;
10703 bool valid = grok_bslash_o(p,
10710 RExC_parse = p; /* going to die anyway; point
10711 to exact spot of failure */
10718 if (PL_encoding && ender < 0x100) {
10719 goto recode_encoding;
10721 if (ender > 0xff) {
10728 STRLEN brace_len = len;
10730 const char* error_msg;
10732 bool valid = grok_bslash_x(p,
10739 RExC_parse = p; /* going to die anyway; point
10740 to exact spot of failure */
10746 if (PL_encoding && ender < 0x100) {
10747 goto recode_encoding;
10749 if (ender > 0xff) {
10756 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10758 case '0': case '1': case '2': case '3':case '4':
10759 case '5': case '6': case '7':
10761 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10763 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10765 ender = grok_oct(p, &numlen, &flags, NULL);
10766 if (ender > 0xff) {
10775 if (PL_encoding && ender < 0x100)
10776 goto recode_encoding;
10779 if (! RExC_override_recoding) {
10780 SV* enc = PL_encoding;
10781 ender = reg_recode((const char)(U8)ender, &enc);
10782 if (!enc && SIZE_ONLY)
10783 ckWARNreg(p, "Invalid escape in the specified encoding");
10789 FAIL("Trailing \\");
10792 if (!SIZE_ONLY&& isALNUMC(*p)) {
10793 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10795 goto normal_default;
10799 /* Currently we don't warn when the lbrace is at the start
10800 * of a construct. This catches it in the middle of a
10801 * literal string, or when its the first thing after
10802 * something like "\b" */
10804 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10806 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10811 if (UTF8_IS_START(*p) && UTF) {
10813 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10814 &numlen, UTF8_ALLOW_DEFAULT);
10820 } /* End of switch on the literal */
10822 /* Here, have looked at the literal character and <ender>
10823 * contains its ordinal, <p> points to the character after it
10826 if ( RExC_flags & RXf_PMf_EXTENDED)
10827 p = regwhite( pRExC_state, p );
10829 /* If the next thing is a quantifier, it applies to this
10830 * character only, which means that this character has to be in
10831 * its own node and can't just be appended to the string in an
10832 * existing node, so if there are already other characters in
10833 * the node, close the node with just them, and set up to do
10834 * this character again next time through, when it will be the
10835 * only thing in its new node */
10836 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10844 /* See comments for join_exact() as to why we fold
10845 * this non-UTF at compile time */
10846 || (node_type == EXACTFU
10847 && ender == LATIN_SMALL_LETTER_SHARP_S))
10851 /* Prime the casefolded buffer. Locale rules, which
10852 * apply only to code points < 256, aren't known until
10853 * execution, so for them, just output the original
10854 * character using utf8. If we start to fold non-UTF
10855 * patterns, be sure to update join_exact() */
10856 if (LOC && ender < 256) {
10857 if (UNI_IS_INVARIANT(ender)) {
10861 *s = UTF8_TWO_BYTE_HI(ender);
10862 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10867 UV folded = _to_uni_fold_flags(
10872 | ((LOC) ? FOLD_FLAGS_LOCALE
10873 : (ASCII_FOLD_RESTRICTED)
10874 ? FOLD_FLAGS_NOMIX_ASCII
10878 /* If this node only contains non-folding code
10879 * points so far, see if this new one is also
10882 if (folded != ender) {
10883 maybe_exact = FALSE;
10886 /* Here the fold is the original; we have
10887 * to check further to see if anything
10889 if (! PL_utf8_foldable) {
10890 SV* swash = swash_init("utf8",
10892 &PL_sv_undef, 1, 0);
10894 _get_swash_invlist(swash);
10895 SvREFCNT_dec(swash);
10897 if (_invlist_contains_cp(PL_utf8_foldable,
10900 maybe_exact = FALSE;
10908 /* The loop increments <len> each time, as all but this
10909 * path (and the one just below for UTF) through it add
10910 * a single byte to the EXACTish node. But this one
10911 * has changed len to be the correct final value, so
10912 * subtract one to cancel out the increment that
10914 len += foldlen - 1;
10918 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10922 const STRLEN unilen = reguni(pRExC_state, ender, s);
10928 /* See comment just above for - 1 */
10932 REGC((char)ender, s++);
10935 if (next_is_quantifier) {
10937 /* Here, the next input is a quantifier, and to get here,
10938 * the current character is the only one in the node.
10939 * Also, here <len> doesn't include the final byte for this
10945 } /* End of loop through literal characters */
10947 /* Here we have either exhausted the input or ran out of room in
10948 * the node. (If we encountered a character that can't be in the
10949 * node, transfer is made directly to <loopdone>, and so we
10950 * wouldn't have fallen off the end of the loop.) In the latter
10951 * case, we artificially have to split the node into two, because
10952 * we just don't have enough space to hold everything. This
10953 * creates a problem if the final character participates in a
10954 * multi-character fold in the non-final position, as a match that
10955 * should have occurred won't, due to the way nodes are matched,
10956 * and our artificial boundary. So back off until we find a non-
10957 * problematic character -- one that isn't at the beginning or
10958 * middle of such a fold. (Either it doesn't participate in any
10959 * folds, or appears only in the final position of all the folds it
10960 * does participate in.) A better solution with far fewer false
10961 * positives, and that would fill the nodes more completely, would
10962 * be to actually have available all the multi-character folds to
10963 * test against, and to back-off only far enough to be sure that
10964 * this node isn't ending with a partial one. <upper_parse> is set
10965 * further below (if we need to reparse the node) to include just
10966 * up through that final non-problematic character that this code
10967 * identifies, so when it is set to less than the full node, we can
10968 * skip the rest of this */
10969 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10971 const STRLEN full_len = len;
10973 assert(len >= MAX_NODE_STRING_SIZE);
10975 /* Here, <s> points to the final byte of the final character.
10976 * Look backwards through the string until find a non-
10977 * problematic character */
10981 /* These two have no multi-char folds to non-UTF characters
10983 if (ASCII_FOLD_RESTRICTED || LOC) {
10987 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10991 if (! PL_NonL1NonFinalFold) {
10992 PL_NonL1NonFinalFold = _new_invlist_C_array(
10993 NonL1_Perl_Non_Final_Folds_invlist);
10996 /* Point to the first byte of the final character */
10997 s = (char *) utf8_hop((U8 *) s, -1);
10999 while (s >= s0) { /* Search backwards until find
11000 non-problematic char */
11001 if (UTF8_IS_INVARIANT(*s)) {
11003 /* There are no ascii characters that participate
11004 * in multi-char folds under /aa. In EBCDIC, the
11005 * non-ascii invariants are all control characters,
11006 * so don't ever participate in any folds. */
11007 if (ASCII_FOLD_RESTRICTED
11008 || ! IS_NON_FINAL_FOLD(*s))
11013 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11015 /* No Latin1 characters participate in multi-char
11016 * folds under /l */
11018 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
11024 else if (! _invlist_contains_cp(
11025 PL_NonL1NonFinalFold,
11026 valid_utf8_to_uvchr((U8 *) s, NULL)))
11031 /* Here, the current character is problematic in that
11032 * it does occur in the non-final position of some
11033 * fold, so try the character before it, but have to
11034 * special case the very first byte in the string, so
11035 * we don't read outside the string */
11036 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11037 } /* End of loop backwards through the string */
11039 /* If there were only problematic characters in the string,
11040 * <s> will point to before s0, in which case the length
11041 * should be 0, otherwise include the length of the
11042 * non-problematic character just found */
11043 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11046 /* Here, have found the final character, if any, that is
11047 * non-problematic as far as ending the node without splitting
11048 * it across a potential multi-char fold. <len> contains the
11049 * number of bytes in the node up-to and including that
11050 * character, or is 0 if there is no such character, meaning
11051 * the whole node contains only problematic characters. In
11052 * this case, give up and just take the node as-is. We can't
11058 /* Here, the node does contain some characters that aren't
11059 * problematic. If one such is the final character in the
11060 * node, we are done */
11061 if (len == full_len) {
11064 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11066 /* If the final character is problematic, but the
11067 * penultimate is not, back-off that last character to
11068 * later start a new node with it */
11073 /* Here, the final non-problematic character is earlier
11074 * in the input than the penultimate character. What we do
11075 * is reparse from the beginning, going up only as far as
11076 * this final ok one, thus guaranteeing that the node ends
11077 * in an acceptable character. The reason we reparse is
11078 * that we know how far in the character is, but we don't
11079 * know how to correlate its position with the input parse.
11080 * An alternate implementation would be to build that
11081 * correlation as we go along during the original parse,
11082 * but that would entail extra work for every node, whereas
11083 * this code gets executed only when the string is too
11084 * large for the node, and the final two characters are
11085 * problematic, an infrequent occurrence. Yet another
11086 * possible strategy would be to save the tail of the
11087 * string, and the next time regatom is called, initialize
11088 * with that. The problem with this is that unless you
11089 * back off one more character, you won't be guaranteed
11090 * regatom will get called again, unless regbranch,
11091 * regpiece ... are also changed. If you do back off that
11092 * extra character, so that there is input guaranteed to
11093 * force calling regatom, you can't handle the case where
11094 * just the first character in the node is acceptable. I
11095 * (khw) decided to try this method which doesn't have that
11096 * pitfall; if performance issues are found, we can do a
11097 * combination of the current approach plus that one */
11103 } /* End of verifying node ends with an appropriate char */
11105 loopdone: /* Jumped to when encounters something that shouldn't be in
11108 /* If 'maybe_exact' is still set here, means there are no
11109 * code points in the node that participate in folds */
11110 if (FOLD && maybe_exact) {
11114 /* I (khw) don't know if you can get here with zero length, but the
11115 * old code handled this situation by creating a zero-length EXACT
11116 * node. Might as well be NOTHING instead */
11121 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11124 RExC_parse = p - 1;
11125 Set_Node_Cur_Length(ret); /* MJD */
11126 nextchar(pRExC_state);
11128 /* len is STRLEN which is unsigned, need to copy to signed */
11131 vFAIL("Internal disaster");
11134 } /* End of label 'defchar:' */
11136 } /* End of giant switch on input character */
11142 S_regwhite( RExC_state_t *pRExC_state, char *p )
11144 const char *e = RExC_end;
11146 PERL_ARGS_ASSERT_REGWHITE;
11151 else if (*p == '#') {
11154 if (*p++ == '\n') {
11160 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11168 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11169 Character classes ([:foo:]) can also be negated ([:^foo:]).
11170 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11171 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11172 but trigger failures because they are currently unimplemented. */
11174 #define POSIXCC_DONE(c) ((c) == ':')
11175 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11176 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11178 PERL_STATIC_INLINE I32
11179 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
11182 I32 namedclass = OOB_NAMEDCLASS;
11184 PERL_ARGS_ASSERT_REGPPOSIXCC;
11186 if (value == '[' && RExC_parse + 1 < RExC_end &&
11187 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11188 POSIXCC(UCHARAT(RExC_parse))) {
11189 const char c = UCHARAT(RExC_parse);
11190 char* const s = RExC_parse++;
11192 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11194 if (RExC_parse == RExC_end)
11195 /* Grandfather lone [:, [=, [. */
11198 const char* const t = RExC_parse++; /* skip over the c */
11201 if (UCHARAT(RExC_parse) == ']') {
11202 const char *posixcc = s + 1;
11203 RExC_parse++; /* skip over the ending ] */
11206 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11207 const I32 skip = t - posixcc;
11209 /* Initially switch on the length of the name. */
11212 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11213 namedclass = ANYOF_WORDCHAR;
11216 /* Names all of length 5. */
11217 /* alnum alpha ascii blank cntrl digit graph lower
11218 print punct space upper */
11219 /* Offset 4 gives the best switch position. */
11220 switch (posixcc[4]) {
11222 if (memEQ(posixcc, "alph", 4)) /* alpha */
11223 namedclass = ANYOF_ALPHA;
11226 if (memEQ(posixcc, "spac", 4)) /* space */
11227 namedclass = ANYOF_PSXSPC;
11230 if (memEQ(posixcc, "grap", 4)) /* graph */
11231 namedclass = ANYOF_GRAPH;
11234 if (memEQ(posixcc, "asci", 4)) /* ascii */
11235 namedclass = ANYOF_ASCII;
11238 if (memEQ(posixcc, "blan", 4)) /* blank */
11239 namedclass = ANYOF_BLANK;
11242 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11243 namedclass = ANYOF_CNTRL;
11246 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11247 namedclass = ANYOF_ALNUMC;
11250 if (memEQ(posixcc, "lowe", 4)) /* lower */
11251 namedclass = ANYOF_LOWER;
11252 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11253 namedclass = ANYOF_UPPER;
11256 if (memEQ(posixcc, "digi", 4)) /* digit */
11257 namedclass = ANYOF_DIGIT;
11258 else if (memEQ(posixcc, "prin", 4)) /* print */
11259 namedclass = ANYOF_PRINT;
11260 else if (memEQ(posixcc, "punc", 4)) /* punct */
11261 namedclass = ANYOF_PUNCT;
11266 if (memEQ(posixcc, "xdigit", 6))
11267 namedclass = ANYOF_XDIGIT;
11271 if (namedclass == OOB_NAMEDCLASS)
11272 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11275 /* The #defines are structured so each complement is +1 to
11276 * the normal one */
11280 assert (posixcc[skip] == ':');
11281 assert (posixcc[skip+1] == ']');
11282 } else if (!SIZE_ONLY) {
11283 /* [[=foo=]] and [[.foo.]] are still future. */
11285 /* adjust RExC_parse so the warning shows after
11286 the class closes */
11287 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11289 SvREFCNT_dec(free_me);
11290 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11293 /* Maternal grandfather:
11294 * "[:" ending in ":" but not in ":]" */
11303 /* Generate the code to add a full posix character <class> to the bracketed
11304 * character class given by <node>. (<node> is needed only under locale rules)
11305 * destlist is the inversion list for non-locale rules that this class is
11307 * sourcelist is the ASCII-range inversion list to add under /a rules
11308 * Xsourcelist is the full Unicode range list to use otherwise. */
11309 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11311 SV* scratch_list = NULL; \
11313 /* Set this class in the node for runtime matching */ \
11314 ANYOF_CLASS_SET(node, class); \
11316 /* For above Latin1 code points, we use the full Unicode range */ \
11317 _invlist_intersection(PL_AboveLatin1, \
11320 /* And set the output to it, adding instead if there already is an \
11321 * output. Checking if <destlist> is NULL first saves an extra \
11322 * clone. Its reference count will be decremented at the next \
11323 * union, etc, or if this is the only instance, at the end of the \
11325 if (! destlist) { \
11326 destlist = scratch_list; \
11329 _invlist_union(destlist, scratch_list, &destlist); \
11330 SvREFCNT_dec(scratch_list); \
11334 /* For non-locale, just add it to any existing list */ \
11335 _invlist_union(destlist, \
11336 (AT_LEAST_ASCII_RESTRICTED) \
11342 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11344 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11346 SV* scratch_list = NULL; \
11347 ANYOF_CLASS_SET(node, class); \
11348 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11349 if (! destlist) { \
11350 destlist = scratch_list; \
11353 _invlist_union(destlist, scratch_list, &destlist); \
11354 SvREFCNT_dec(scratch_list); \
11358 _invlist_union_complement_2nd(destlist, \
11359 (AT_LEAST_ASCII_RESTRICTED) \
11363 /* Under /d, everything in the upper half of the Latin1 range \
11364 * matches this complement */ \
11365 if (DEPENDS_SEMANTICS) { \
11366 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11370 /* Generate the code to add a posix character <class> to the bracketed
11371 * character class given by <node>. (<node> is needed only under locale rules)
11372 * destlist is the inversion list for non-locale rules that this class is
11374 * sourcelist is the ASCII-range inversion list to add under /a rules
11375 * l1_sourcelist is the Latin1 range list to use otherwise.
11376 * Xpropertyname is the name to add to <run_time_list> of the property to
11377 * specify the code points above Latin1 that will have to be
11378 * determined at run-time
11379 * run_time_list is a SV* that contains text names of properties that are to
11380 * be computed at run time. This concatenates <Xpropertyname>
11381 * to it, appropriately
11382 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11384 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11385 l1_sourcelist, Xpropertyname, run_time_list) \
11386 /* First, resolve whether to use the ASCII-only list or the L1 \
11388 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11389 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11390 Xpropertyname, run_time_list)
11392 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11393 Xpropertyname, run_time_list) \
11394 /* If not /a matching, there are going to be code points we will have \
11395 * to defer to runtime to look-up */ \
11396 if (! AT_LEAST_ASCII_RESTRICTED) { \
11397 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11400 ANYOF_CLASS_SET(node, class); \
11403 _invlist_union(destlist, sourcelist, &destlist); \
11406 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11407 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11409 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11410 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11411 if (AT_LEAST_ASCII_RESTRICTED) { \
11412 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11415 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11416 matches_above_unicode = TRUE; \
11418 ANYOF_CLASS_SET(node, namedclass); \
11421 SV* scratch_list = NULL; \
11422 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11423 if (! destlist) { \
11424 destlist = scratch_list; \
11427 _invlist_union(destlist, scratch_list, &destlist); \
11428 SvREFCNT_dec(scratch_list); \
11430 if (DEPENDS_SEMANTICS) { \
11431 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11436 /* The names of properties whose definitions are not known at compile time are
11437 * stored in this SV, after a constant heading. So if the length has been
11438 * changed since initialization, then there is a run-time definition. */
11439 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11441 /* This converts the named class defined in regcomp.h to its equivalent class
11442 * number defined in handy.h. */
11443 #define namedclass_to_classnum(class) ((class) / 2)
11446 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11448 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11449 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11450 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11451 * multi-character folds: it will be rewritten following the paradigm of
11452 * this example, where the <multi-fold>s are characters which fold to
11453 * multiple character sequences:
11454 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11455 * gets effectively rewritten as:
11456 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11457 * reg() gets called (recursively) on the rewritten version, and this
11458 * function will return what it constructs. (Actually the <multi-fold>s
11459 * aren't physically removed from the [abcdefghi], it's just that they are
11460 * ignored in the recursion by means of a flag:
11461 * <RExC_in_multi_char_class>.)
11463 * ANYOF nodes contain a bit map for the first 256 characters, with the
11464 * corresponding bit set if that character is in the list. For characters
11465 * above 255, a range list or swash is used. There are extra bits for \w,
11466 * etc. in locale ANYOFs, as what these match is not determinable at
11471 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11473 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11476 IV namedclass = OOB_NAMEDCLASS;
11477 char *rangebegin = NULL;
11478 bool need_class = 0;
11480 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11481 than just initialized. */
11482 SV* properties = NULL; /* Code points that match \p{} \P{} */
11483 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11484 extended beyond the Latin1 range */
11485 UV element_count = 0; /* Number of distinct elements in the class.
11486 Optimizations may be possible if this is tiny */
11487 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11488 character; used under /i */
11491 /* Unicode properties are stored in a swash; this holds the current one
11492 * being parsed. If this swash is the only above-latin1 component of the
11493 * character class, an optimization is to pass it directly on to the
11494 * execution engine. Otherwise, it is set to NULL to indicate that there
11495 * are other things in the class that have to be dealt with at execution
11497 SV* swash = NULL; /* Code points that match \p{} \P{} */
11499 /* Set if a component of this character class is user-defined; just passed
11500 * on to the engine */
11501 bool has_user_defined_property = FALSE;
11503 /* inversion list of code points this node matches only when the target
11504 * string is in UTF-8. (Because is under /d) */
11505 SV* depends_list = NULL;
11507 /* inversion list of code points this node matches. For much of the
11508 * function, it includes only those that match regardless of the utf8ness
11509 * of the target string */
11510 SV* cp_list = NULL;
11513 /* In a range, counts how many 0-2 of the ends of it came from literals,
11514 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11515 UV literal_endpoint = 0;
11517 bool invert = FALSE; /* Is this class to be complemented */
11519 /* Is there any thing like \W or [:^digit:] that matches above the legal
11520 * Unicode range? */
11521 bool runtime_posix_matches_above_Unicode = FALSE;
11523 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11524 case we need to change the emitted regop to an EXACT. */
11525 const char * orig_parse = RExC_parse;
11526 const I32 orig_size = RExC_size;
11527 GET_RE_DEBUG_FLAGS_DECL;
11529 PERL_ARGS_ASSERT_REGCLASS;
11531 PERL_UNUSED_ARG(depth);
11534 DEBUG_PARSE("clas");
11536 /* Assume we are going to generate an ANYOF node. */
11537 ret = reganode(pRExC_state, ANYOF, 0);
11540 ANYOF_FLAGS(ret) = 0;
11543 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11550 RExC_size += ANYOF_SKIP;
11551 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11554 RExC_emit += ANYOF_SKIP;
11556 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11558 listsv = newSVpvs("# comment\n");
11559 initial_listsv_len = SvCUR(listsv);
11562 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11564 if (!SIZE_ONLY && POSIXCC(nextvalue))
11566 const char *s = RExC_parse;
11567 const char c = *s++;
11569 while (isALNUM(*s))
11571 if (*s && c == *s && s[1] == ']') {
11572 SAVEFREESV(RExC_rx_sv);
11573 SAVEFREESV(listsv);
11575 "POSIX syntax [%c %c] belongs inside character classes",
11577 ReREFCNT_inc(RExC_rx_sv);
11578 SvREFCNT_inc_simple_void_NN(listsv);
11582 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11583 if (UCHARAT(RExC_parse) == ']')
11584 goto charclassloop;
11587 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11591 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11592 save_value = value;
11593 save_prevvalue = prevvalue;
11596 rangebegin = RExC_parse;
11600 value = utf8n_to_uvchr((U8*)RExC_parse,
11601 RExC_end - RExC_parse,
11602 &numlen, UTF8_ALLOW_DEFAULT);
11603 RExC_parse += numlen;
11606 value = UCHARAT(RExC_parse++);
11608 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11609 if (value == '[' && POSIXCC(nextvalue))
11610 namedclass = regpposixcc(pRExC_state, value, listsv);
11611 else if (value == '\\') {
11613 value = utf8n_to_uvchr((U8*)RExC_parse,
11614 RExC_end - RExC_parse,
11615 &numlen, UTF8_ALLOW_DEFAULT);
11616 RExC_parse += numlen;
11619 value = UCHARAT(RExC_parse++);
11620 /* Some compilers cannot handle switching on 64-bit integer
11621 * values, therefore value cannot be an UV. Yes, this will
11622 * be a problem later if we want switch on Unicode.
11623 * A similar issue a little bit later when switching on
11624 * namedclass. --jhi */
11625 switch ((I32)value) {
11626 case 'w': namedclass = ANYOF_WORDCHAR; break;
11627 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11628 case 's': namedclass = ANYOF_SPACE; break;
11629 case 'S': namedclass = ANYOF_NSPACE; break;
11630 case 'd': namedclass = ANYOF_DIGIT; break;
11631 case 'D': namedclass = ANYOF_NDIGIT; break;
11632 case 'v': namedclass = ANYOF_VERTWS; break;
11633 case 'V': namedclass = ANYOF_NVERTWS; break;
11634 case 'h': namedclass = ANYOF_HORIZWS; break;
11635 case 'H': namedclass = ANYOF_NHORIZWS; break;
11636 case 'N': /* Handle \N{NAME} in class */
11638 /* We only pay attention to the first char of
11639 multichar strings being returned. I kinda wonder
11640 if this makes sense as it does change the behaviour
11641 from earlier versions, OTOH that behaviour was broken
11643 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11644 TRUE /* => charclass */))
11655 /* This routine will handle any undefined properties */
11656 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11658 if (RExC_parse >= RExC_end)
11659 vFAIL2("Empty \\%c{}", (U8)value);
11660 if (*RExC_parse == '{') {
11661 const U8 c = (U8)value;
11662 e = strchr(RExC_parse++, '}');
11664 vFAIL2("Missing right brace on \\%c{}", c);
11665 while (isSPACE(UCHARAT(RExC_parse)))
11667 if (e == RExC_parse)
11668 vFAIL2("Empty \\%c{}", c);
11669 n = e - RExC_parse;
11670 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11681 if (UCHARAT(RExC_parse) == '^') {
11684 value = value == 'p' ? 'P' : 'p'; /* toggle */
11685 while (isSPACE(UCHARAT(RExC_parse))) {
11690 /* Try to get the definition of the property into
11691 * <invlist>. If /i is in effect, the effective property
11692 * will have its name be <__NAME_i>. The design is
11693 * discussed in commit
11694 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11695 Newx(name, n + sizeof("_i__\n"), char);
11697 sprintf(name, "%s%.*s%s\n",
11698 (FOLD) ? "__" : "",
11704 /* Look up the property name, and get its swash and
11705 * inversion list, if the property is found */
11707 SvREFCNT_dec(swash);
11709 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11712 NULL, /* No inversion list */
11715 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11717 SvREFCNT_dec(swash);
11721 /* Here didn't find it. It could be a user-defined
11722 * property that will be available at run-time. Add it
11723 * to the list to look up then */
11724 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11725 (value == 'p' ? '+' : '!'),
11727 has_user_defined_property = TRUE;
11729 /* We don't know yet, so have to assume that the
11730 * property could match something in the Latin1 range,
11731 * hence something that isn't utf8. Note that this
11732 * would cause things in <depends_list> to match
11733 * inappropriately, except that any \p{}, including
11734 * this one forces Unicode semantics, which means there
11735 * is <no depends_list> */
11736 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11740 /* Here, did get the swash and its inversion list. If
11741 * the swash is from a user-defined property, then this
11742 * whole character class should be regarded as such */
11743 has_user_defined_property =
11745 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11747 /* Invert if asking for the complement */
11748 if (value == 'P') {
11749 _invlist_union_complement_2nd(properties,
11753 /* The swash can't be used as-is, because we've
11754 * inverted things; delay removing it to here after
11755 * have copied its invlist above */
11756 SvREFCNT_dec(swash);
11760 _invlist_union(properties, invlist, &properties);
11765 RExC_parse = e + 1;
11766 namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
11768 /* \p means they want Unicode semantics */
11769 RExC_uni_semantics = 1;
11772 case 'n': value = '\n'; break;
11773 case 'r': value = '\r'; break;
11774 case 't': value = '\t'; break;
11775 case 'f': value = '\f'; break;
11776 case 'b': value = '\b'; break;
11777 case 'e': value = ASCII_TO_NATIVE('\033');break;
11778 case 'a': value = ASCII_TO_NATIVE('\007');break;
11780 RExC_parse--; /* function expects to be pointed at the 'o' */
11782 const char* error_msg;
11783 bool valid = grok_bslash_o(RExC_parse,
11788 RExC_parse += numlen;
11793 if (PL_encoding && value < 0x100) {
11794 goto recode_encoding;
11798 RExC_parse--; /* function expects to be pointed at the 'x' */
11800 const char* error_msg;
11801 bool valid = grok_bslash_x(RExC_parse,
11806 RExC_parse += numlen;
11811 if (PL_encoding && value < 0x100)
11812 goto recode_encoding;
11815 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11817 case '0': case '1': case '2': case '3': case '4':
11818 case '5': case '6': case '7':
11820 /* Take 1-3 octal digits */
11821 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11823 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11824 RExC_parse += numlen;
11825 if (PL_encoding && value < 0x100)
11826 goto recode_encoding;
11830 if (! RExC_override_recoding) {
11831 SV* enc = PL_encoding;
11832 value = reg_recode((const char)(U8)value, &enc);
11833 if (!enc && SIZE_ONLY)
11834 ckWARNreg(RExC_parse,
11835 "Invalid escape in the specified encoding");
11839 /* Allow \_ to not give an error */
11840 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11841 SAVEFREESV(RExC_rx_sv);
11842 SAVEFREESV(listsv);
11843 ckWARN2reg(RExC_parse,
11844 "Unrecognized escape \\%c in character class passed through",
11846 ReREFCNT_inc(RExC_rx_sv);
11847 SvREFCNT_inc_simple_void_NN(listsv);
11851 } /* end of \blah */
11854 literal_endpoint++;
11857 /* What matches in a locale is not known until runtime. This
11858 * includes what the Posix classes (like \w, [:space:]) match.
11859 * Room must be reserved (one time per class) to store such
11860 * classes, either if Perl is compiled so that locale nodes always
11861 * should have this space, or if there is such class info to be
11862 * stored. The space will contain a bit for each named class that
11863 * is to be matched against. This isn't needed for \p{} and
11864 * pseudo-classes, as they are not affected by locale, and hence
11865 * are dealt with separately */
11868 && (ANYOF_LOCALE == ANYOF_CLASS
11869 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11873 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11876 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11877 ANYOF_CLASS_ZERO(ret);
11879 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11882 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11884 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11885 * literal, as is the character that began the false range, i.e.
11886 * the 'a' in the examples */
11890 RExC_parse >= rangebegin ?
11891 RExC_parse - rangebegin : 0;
11892 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11893 SAVEFREESV(listsv);
11894 ckWARN4reg(RExC_parse,
11895 "False [] range \"%*.*s\"",
11897 ReREFCNT_inc(RExC_rx_sv);
11898 SvREFCNT_inc_simple_void_NN(listsv);
11899 cp_list = add_cp_to_invlist(cp_list, '-');
11900 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11903 range = 0; /* this was not a true range */
11904 element_count += 2; /* So counts for three values */
11908 switch ((I32)namedclass) {
11910 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11911 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11912 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11914 case ANYOF_NALNUMC:
11915 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11916 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11917 runtime_posix_matches_above_Unicode);
11920 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11921 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11924 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11925 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11926 runtime_posix_matches_above_Unicode);
11931 ANYOF_CLASS_SET(ret, namedclass);
11934 #endif /* Not isascii(); just use the hard-coded definition for it */
11935 _invlist_union(posixes, PL_ASCII, &posixes);
11940 ANYOF_CLASS_SET(ret, namedclass);
11944 _invlist_union_complement_2nd(posixes,
11945 PL_ASCII, &posixes);
11946 if (DEPENDS_SEMANTICS) {
11947 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11954 if (hasISBLANK || ! LOC) {
11955 DO_POSIX(ret, namedclass, posixes,
11956 PL_PosixBlank, PL_XPosixBlank);
11958 else { /* There is no isblank() and we are in locale: We
11959 use the ASCII range and the above-Latin1 range
11961 SV* scratch_list = NULL;
11963 /* Include all above-Latin1 blanks */
11964 _invlist_intersection(PL_AboveLatin1,
11967 /* Add it to the running total of posix classes */
11969 posixes = scratch_list;
11972 _invlist_union(posixes, scratch_list, &posixes);
11973 SvREFCNT_dec(scratch_list);
11975 /* Add the ASCII-range blanks to the running total. */
11976 _invlist_union(posixes, PL_PosixBlank, &posixes);
11980 if (hasISBLANK || ! LOC) {
11981 DO_N_POSIX(ret, namedclass, posixes,
11982 PL_PosixBlank, PL_XPosixBlank);
11984 else { /* There is no isblank() and we are in locale */
11985 SV* scratch_list = NULL;
11987 /* Include all above-Latin1 non-blanks */
11988 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11991 /* Add them to the running total of posix classes */
11992 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11995 posixes = scratch_list;
11998 _invlist_union(posixes, scratch_list, &posixes);
11999 SvREFCNT_dec(scratch_list);
12002 /* Get the list of all non-ASCII-blanks in Latin 1, and
12003 * add them to the running total */
12004 _invlist_subtract(PL_Latin1, PL_PosixBlank,
12006 _invlist_union(posixes, scratch_list, &posixes);
12007 SvREFCNT_dec(scratch_list);
12011 DO_POSIX(ret, namedclass, posixes,
12012 PL_PosixCntrl, PL_XPosixCntrl);
12015 DO_N_POSIX(ret, namedclass, posixes,
12016 PL_PosixCntrl, PL_XPosixCntrl);
12019 /* There are no digits in the Latin1 range outside of
12020 * ASCII, so call the macro that doesn't have to resolve
12022 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
12023 PL_PosixDigit, "XPosixDigit", listsv);
12026 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12027 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
12028 runtime_posix_matches_above_Unicode);
12031 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12032 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
12035 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12036 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
12037 runtime_posix_matches_above_Unicode);
12039 case ANYOF_HORIZWS:
12040 /* For these, we use the cp_list, as /d doesn't make a
12041 * difference in what these match. There would be problems
12042 * if these characters had folds other than themselves, as
12043 * cp_list is subject to folding. It turns out that \h
12044 * is just a synonym for XPosixBlank */
12045 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12047 case ANYOF_NHORIZWS:
12048 _invlist_union_complement_2nd(cp_list,
12049 PL_XPosixBlank, &cp_list);
12053 { /* These require special handling, as they differ under
12054 folding, matching Cased there (which in the ASCII range
12055 is the same as Alpha */
12061 if (FOLD && ! LOC) {
12062 ascii_source = PL_PosixAlpha;
12063 l1_source = PL_L1Cased;
12067 ascii_source = PL_PosixLower;
12068 l1_source = PL_L1PosixLower;
12069 Xname = "XPosixLower";
12071 if (namedclass == ANYOF_LOWER) {
12072 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12073 ascii_source, l1_source, Xname, listsv);
12076 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12077 posixes, ascii_source, l1_source, Xname, listsv,
12078 runtime_posix_matches_above_Unicode);
12083 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12084 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12087 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12088 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12089 runtime_posix_matches_above_Unicode);
12092 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12093 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12096 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12097 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12098 runtime_posix_matches_above_Unicode);
12101 DO_POSIX(ret, namedclass, posixes,
12102 PL_PosixSpace, PL_XPosixSpace);
12104 case ANYOF_NPSXSPC:
12105 DO_N_POSIX(ret, namedclass, posixes,
12106 PL_PosixSpace, PL_XPosixSpace);
12109 DO_POSIX(ret, namedclass, posixes,
12110 PL_PerlSpace, PL_XPerlSpace);
12113 DO_N_POSIX(ret, namedclass, posixes,
12114 PL_PerlSpace, PL_XPerlSpace);
12116 case ANYOF_UPPER: /* Same as LOWER, above */
12123 if (FOLD && ! LOC) {
12124 ascii_source = PL_PosixAlpha;
12125 l1_source = PL_L1Cased;
12129 ascii_source = PL_PosixUpper;
12130 l1_source = PL_L1PosixUpper;
12131 Xname = "XPosixUpper";
12133 if (namedclass == ANYOF_UPPER) {
12134 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12135 ascii_source, l1_source, Xname, listsv);
12138 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12139 posixes, ascii_source, l1_source, Xname, listsv,
12140 runtime_posix_matches_above_Unicode);
12144 case ANYOF_WORDCHAR:
12145 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12146 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12148 case ANYOF_NWORDCHAR:
12149 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12150 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12151 runtime_posix_matches_above_Unicode);
12154 /* For these, we use the cp_list, as /d doesn't make a
12155 * difference in what these match. There would be problems
12156 * if these characters had folds other than themselves, as
12157 * cp_list is subject to folding */
12158 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12160 case ANYOF_NVERTWS:
12161 _invlist_union_complement_2nd(cp_list,
12162 PL_VertSpace, &cp_list);
12165 DO_POSIX(ret, namedclass, posixes,
12166 PL_PosixXDigit, PL_XPosixXDigit);
12168 case ANYOF_NXDIGIT:
12169 DO_N_POSIX(ret, namedclass, posixes,
12170 PL_PosixXDigit, PL_XPosixXDigit);
12172 case ANYOF_UNIPROP: /* this is to handle \p and \P */
12175 vFAIL("Invalid [::] class");
12179 continue; /* Go get next character */
12181 } /* end of namedclass \blah */
12184 if (prevvalue > value) /* b-a */ {
12185 const int w = RExC_parse - rangebegin;
12186 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12187 range = 0; /* not a valid range */
12191 prevvalue = value; /* save the beginning of the potential range */
12192 if (RExC_parse+1 < RExC_end
12193 && *RExC_parse == '-'
12194 && RExC_parse[1] != ']')
12198 /* a bad range like \w-, [:word:]- ? */
12199 if (namedclass > OOB_NAMEDCLASS) {
12200 if (ckWARN(WARN_REGEXP)) {
12202 RExC_parse >= rangebegin ?
12203 RExC_parse - rangebegin : 0;
12205 "False [] range \"%*.*s\"",
12209 cp_list = add_cp_to_invlist(cp_list, '-');
12213 range = 1; /* yeah, it's a range! */
12214 continue; /* but do it the next time */
12218 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12221 /* non-Latin1 code point implies unicode semantics. Must be set in
12222 * pass1 so is there for the whole of pass 2 */
12224 RExC_uni_semantics = 1;
12227 /* Ready to process either the single value, or the completed range.
12228 * For single-valued non-inverted ranges, we consider the possibility
12229 * of multi-char folds. (We made a conscious decision to not do this
12230 * for the other cases because it can often lead to non-intuitive
12231 * results. For example, you have the peculiar case that:
12232 * "s s" =~ /^[^\xDF]+$/i => Y
12233 * "ss" =~ /^[^\xDF]+$/i => N
12235 * See [perl #89750] */
12236 if (FOLD && ! invert && value == prevvalue) {
12237 if (value == LATIN_SMALL_LETTER_SHARP_S
12238 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12241 /* Here <value> is indeed a multi-char fold. Get what it is */
12243 U8 foldbuf[UTF8_MAXBYTES_CASE];
12246 UV folded = _to_uni_fold_flags(
12251 | ((LOC) ? FOLD_FLAGS_LOCALE
12252 : (ASCII_FOLD_RESTRICTED)
12253 ? FOLD_FLAGS_NOMIX_ASCII
12257 /* Here, <folded> should be the first character of the
12258 * multi-char fold of <value>, with <foldbuf> containing the
12259 * whole thing. But, if this fold is not allowed (because of
12260 * the flags), <fold> will be the same as <value>, and should
12261 * be processed like any other character, so skip the special
12263 if (folded != value) {
12265 /* Skip if we are recursed, currently parsing the class
12266 * again. Otherwise add this character to the list of
12267 * multi-char folds. */
12268 if (! RExC_in_multi_char_class) {
12269 AV** this_array_ptr;
12271 STRLEN cp_count = utf8_length(foldbuf,
12272 foldbuf + foldlen);
12273 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12275 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12278 if (! multi_char_matches) {
12279 multi_char_matches = newAV();
12282 /* <multi_char_matches> is actually an array of arrays.
12283 * There will be one or two top-level elements: [2],
12284 * and/or [3]. The [2] element is an array, each
12285 * element thereof is a character which folds to two
12286 * characters; likewise for [3]. (Unicode guarantees a
12287 * maximum of 3 characters in any fold.) When we
12288 * rewrite the character class below, we will do so
12289 * such that the longest folds are written first, so
12290 * that it prefers the longest matching strings first.
12291 * This is done even if it turns out that any
12292 * quantifier is non-greedy, out of programmer
12293 * laziness. Tom Christiansen has agreed that this is
12294 * ok. This makes the test for the ligature 'ffi' come
12295 * before the test for 'ff' */
12296 if (av_exists(multi_char_matches, cp_count)) {
12297 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12299 this_array = *this_array_ptr;
12302 this_array = newAV();
12303 av_store(multi_char_matches, cp_count,
12306 av_push(this_array, multi_fold);
12309 /* This element should not be processed further in this
12312 value = save_value;
12313 prevvalue = save_prevvalue;
12319 /* Deal with this element of the class */
12322 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12324 UV* this_range = _new_invlist(1);
12325 _append_range_to_invlist(this_range, prevvalue, value);
12327 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12328 * If this range was specified using something like 'i-j', we want
12329 * to include only the 'i' and the 'j', and not anything in
12330 * between, so exclude non-ASCII, non-alphabetics from it.
12331 * However, if the range was specified with something like
12332 * [\x89-\x91] or [\x89-j], all code points within it should be
12333 * included. literal_endpoint==2 means both ends of the range used
12334 * a literal character, not \x{foo} */
12335 if (literal_endpoint == 2
12336 && (prevvalue >= 'a' && value <= 'z')
12337 || (prevvalue >= 'A' && value <= 'Z'))
12339 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12340 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12342 _invlist_union(cp_list, this_range, &cp_list);
12343 literal_endpoint = 0;
12347 range = 0; /* this range (if it was one) is done now */
12348 } /* End of loop through all the text within the brackets */
12350 /* If anything in the class expands to more than one character, we have to
12351 * deal with them by building up a substitute parse string, and recursively
12352 * calling reg() on it, instead of proceeding */
12353 if (multi_char_matches) {
12354 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12357 char *save_end = RExC_end;
12358 char *save_parse = RExC_parse;
12359 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12364 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12365 because too confusing */
12367 sv_catpv(substitute_parse, "(?:");
12371 /* Look at the longest folds first */
12372 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12374 if (av_exists(multi_char_matches, cp_count)) {
12375 AV** this_array_ptr;
12378 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12380 while ((this_sequence = av_pop(*this_array_ptr)) !=
12383 if (! first_time) {
12384 sv_catpv(substitute_parse, "|");
12386 first_time = FALSE;
12388 sv_catpv(substitute_parse, SvPVX(this_sequence));
12393 /* If the character class contains anything else besides these
12394 * multi-character folds, have to include it in recursive parsing */
12395 if (element_count) {
12396 sv_catpv(substitute_parse, "|[");
12397 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12398 sv_catpv(substitute_parse, "]");
12401 sv_catpv(substitute_parse, ")");
12404 /* This is a way to get the parse to skip forward a whole named
12405 * sequence instead of matching the 2nd character when it fails the
12407 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12411 RExC_parse = SvPV(substitute_parse, len);
12412 RExC_end = RExC_parse + len;
12413 RExC_in_multi_char_class = 1;
12414 RExC_emit = (regnode *)orig_emit;
12416 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12418 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
12420 RExC_parse = save_parse;
12421 RExC_end = save_end;
12422 RExC_in_multi_char_class = 0;
12423 SvREFCNT_dec(multi_char_matches);
12424 SvREFCNT_dec(listsv);
12428 /* If the character class contains only a single element, it may be
12429 * optimizable into another node type which is smaller and runs faster.
12430 * Check if this is the case for this class */
12431 if (element_count == 1) {
12435 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12436 [:digit:] or \p{foo} */
12438 /* Certain named classes have equivalents that can appear outside a
12439 * character class, e.g. \w, \H. We use these instead of a
12440 * character class. */
12441 switch ((I32)namedclass) {
12444 /* The first group is for node types that depend on the charset
12445 * modifier to the regex. We first calculate the base node
12446 * type, and if it should be inverted */
12448 case ANYOF_NWORDCHAR:
12451 case ANYOF_WORDCHAR:
12453 goto join_charset_classes;
12460 goto join_charset_classes;
12468 join_charset_classes:
12470 /* Now that we have the base node type, we take advantage
12471 * of the enum ordering of the charset modifiers to get the
12472 * exact node type, For example the base SPACE also has
12473 * SPACEL, SPACEU, and SPACEA */
12475 offset = get_regex_charset(RExC_flags);
12477 /* /aa is the same as /a for these */
12478 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12479 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12481 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12482 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12487 /* The number of varieties of each of these is the same,
12488 * hence, so is the delta between the normal and
12489 * complemented nodes */
12491 op += NALNUM - ALNUM;
12493 *flagp |= HASWIDTH|SIMPLE;
12496 /* The second group doesn't depend of the charset modifiers.
12497 * We just have normal and complemented */
12498 case ANYOF_NHORIZWS:
12501 case ANYOF_HORIZWS:
12503 op = (invert) ? NHORIZWS : HORIZWS;
12504 *flagp |= HASWIDTH|SIMPLE;
12507 case ANYOF_NVERTWS:
12511 op = (invert) ? NVERTWS : VERTWS;
12512 *flagp |= HASWIDTH|SIMPLE;
12515 case ANYOF_UNIPROP:
12522 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12527 /* A generic posix class. All the /a ones can be handled
12528 * by the POSIXA opcode. And all are closed under folding
12529 * in the ASCII range, so FOLD doesn't matter */
12530 if (AT_LEAST_ASCII_RESTRICTED
12531 || (! LOC && namedclass == ANYOF_ASCII))
12533 /* The odd numbered ones are the complements of the
12534 * next-lower even number one */
12535 if (namedclass % 2 == 1) {
12539 arg = namedclass_to_classnum(namedclass);
12540 op = (invert) ? NPOSIXA : POSIXA;
12545 else if (value == prevvalue) {
12547 /* Here, the class consists of just a single code point */
12550 if (! LOC && value == '\n') {
12551 op = REG_ANY; /* Optimize [^\n] */
12552 *flagp |= HASWIDTH|SIMPLE;
12556 else if (value < 256 || UTF) {
12558 /* Optimize a single value into an EXACTish node, but not if it
12559 * would require converting the pattern to UTF-8. */
12560 op = compute_EXACTish(pRExC_state);
12562 } /* Otherwise is a range */
12563 else if (! LOC) { /* locale could vary these */
12564 if (prevvalue == '0') {
12565 if (value == '9') {
12566 op = (invert) ? NDIGITA : DIGITA;
12567 *flagp |= HASWIDTH|SIMPLE;
12572 /* Here, we have changed <op> away from its initial value iff we found
12573 * an optimization */
12576 /* Throw away this ANYOF regnode, and emit the calculated one,
12577 * which should correspond to the beginning, not current, state of
12579 const char * cur_parse = RExC_parse;
12580 RExC_parse = (char *)orig_parse;
12584 /* To get locale nodes to not use the full ANYOF size would
12585 * require moving the code above that writes the portions
12586 * of it that aren't in other nodes to after this point.
12587 * e.g. ANYOF_CLASS_SET */
12588 RExC_size = orig_size;
12592 RExC_emit = (regnode *)orig_emit;
12595 ret = reg_node(pRExC_state, op);
12597 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
12601 *flagp |= HASWIDTH|SIMPLE;
12603 else if (PL_regkind[op] == EXACT) {
12604 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12607 RExC_parse = (char *) cur_parse;
12609 SvREFCNT_dec(posixes);
12610 SvREFCNT_dec(listsv);
12611 SvREFCNT_dec(cp_list);
12618 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12620 /* If folding, we calculate all characters that could fold to or from the
12621 * ones already on the list */
12622 if (FOLD && cp_list) {
12623 UV start, end; /* End points of code point ranges */
12625 SV* fold_intersection = NULL;
12627 /* If the highest code point is within Latin1, we can use the
12628 * compiled-in Alphas list, and not have to go out to disk. This
12629 * yields two false positives, the masculine and feminine ordinal
12630 * indicators, which are weeded out below using the
12631 * IS_IN_SOME_FOLD_L1() macro */
12632 if (invlist_highest(cp_list) < 256) {
12633 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12637 /* Here, there are non-Latin1 code points, so we will have to go
12638 * fetch the list of all the characters that participate in folds
12640 if (! PL_utf8_foldable) {
12641 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12642 &PL_sv_undef, 1, 0);
12643 PL_utf8_foldable = _get_swash_invlist(swash);
12644 SvREFCNT_dec(swash);
12647 /* This is a hash that for a particular fold gives all characters
12648 * that are involved in it */
12649 if (! PL_utf8_foldclosures) {
12651 /* If we were unable to find any folds, then we likely won't be
12652 * able to find the closures. So just create an empty list.
12653 * Folding will effectively be restricted to the non-Unicode
12654 * rules hard-coded into Perl. (This case happens legitimately
12655 * during compilation of Perl itself before the Unicode tables
12656 * are generated) */
12657 if (_invlist_len(PL_utf8_foldable) == 0) {
12658 PL_utf8_foldclosures = newHV();
12661 /* If the folds haven't been read in, call a fold function
12663 if (! PL_utf8_tofold) {
12664 U8 dummy[UTF8_MAXBYTES+1];
12666 /* This string is just a short named one above \xff */
12667 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12668 assert(PL_utf8_tofold); /* Verify that worked */
12670 PL_utf8_foldclosures =
12671 _swash_inversion_hash(PL_utf8_tofold);
12675 /* Only the characters in this class that participate in folds need
12676 * be checked. Get the intersection of this class and all the
12677 * possible characters that are foldable. This can quickly narrow
12678 * down a large class */
12679 _invlist_intersection(PL_utf8_foldable, cp_list,
12680 &fold_intersection);
12683 /* Now look at the foldable characters in this class individually */
12684 invlist_iterinit(fold_intersection);
12685 while (invlist_iternext(fold_intersection, &start, &end)) {
12688 /* Locale folding for Latin1 characters is deferred until runtime */
12689 if (LOC && start < 256) {
12693 /* Look at every character in the range */
12694 for (j = start; j <= end; j++) {
12696 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12702 /* We have the latin1 folding rules hard-coded here so that
12703 * an innocent-looking character class, like /[ks]/i won't
12704 * have to go out to disk to find the possible matches.
12705 * XXX It would be better to generate these via regen, in
12706 * case a new version of the Unicode standard adds new
12707 * mappings, though that is not really likely, and may be
12708 * caught by the default: case of the switch below. */
12710 if (IS_IN_SOME_FOLD_L1(j)) {
12712 /* ASCII is always matched; non-ASCII is matched only
12713 * under Unicode rules */
12714 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12716 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12720 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12724 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12725 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12727 /* Certain Latin1 characters have matches outside
12728 * Latin1. To get here, <j> is one of those
12729 * characters. None of these matches is valid for
12730 * ASCII characters under /aa, which is why the 'if'
12731 * just above excludes those. These matches only
12732 * happen when the target string is utf8. The code
12733 * below adds the single fold closures for <j> to the
12734 * inversion list. */
12739 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12743 cp_list = add_cp_to_invlist(cp_list,
12744 LATIN_SMALL_LETTER_LONG_S);
12747 cp_list = add_cp_to_invlist(cp_list,
12748 GREEK_CAPITAL_LETTER_MU);
12749 cp_list = add_cp_to_invlist(cp_list,
12750 GREEK_SMALL_LETTER_MU);
12752 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12753 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12755 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12757 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12758 cp_list = add_cp_to_invlist(cp_list,
12759 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12761 case LATIN_SMALL_LETTER_SHARP_S:
12762 cp_list = add_cp_to_invlist(cp_list,
12763 LATIN_CAPITAL_LETTER_SHARP_S);
12765 case 'F': case 'f':
12766 case 'I': case 'i':
12767 case 'L': case 'l':
12768 case 'T': case 't':
12769 case 'A': case 'a':
12770 case 'H': case 'h':
12771 case 'J': case 'j':
12772 case 'N': case 'n':
12773 case 'W': case 'w':
12774 case 'Y': case 'y':
12775 /* These all are targets of multi-character
12776 * folds from code points that require UTF8 to
12777 * express, so they can't match unless the
12778 * target string is in UTF-8, so no action here
12779 * is necessary, as regexec.c properly handles
12780 * the general case for UTF-8 matching and
12781 * multi-char folds */
12784 /* Use deprecated warning to increase the
12785 * chances of this being output */
12786 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12793 /* Here is an above Latin1 character. We don't have the rules
12794 * hard-coded for it. First, get its fold. This is the simple
12795 * fold, as the multi-character folds have been handled earlier
12796 * and separated out */
12797 _to_uni_fold_flags(j, foldbuf, &foldlen,
12799 ? FOLD_FLAGS_LOCALE
12800 : (ASCII_FOLD_RESTRICTED)
12801 ? FOLD_FLAGS_NOMIX_ASCII
12804 /* Single character fold of above Latin1. Add everything in
12805 * its fold closure to the list that this node should match.
12806 * The fold closures data structure is a hash with the keys
12807 * being the UTF-8 of every character that is folded to, like
12808 * 'k', and the values each an array of all code points that
12809 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12810 * Multi-character folds are not included */
12811 if ((listp = hv_fetch(PL_utf8_foldclosures,
12812 (char *) foldbuf, foldlen, FALSE)))
12814 AV* list = (AV*) *listp;
12816 for (k = 0; k <= av_len(list); k++) {
12817 SV** c_p = av_fetch(list, k, FALSE);
12820 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12824 /* /aa doesn't allow folds between ASCII and non-; /l
12825 * doesn't allow them between above and below 256 */
12826 if ((ASCII_FOLD_RESTRICTED
12827 && (isASCII(c) != isASCII(j)))
12828 || (LOC && ((c < 256) != (j < 256))))
12833 /* Folds involving non-ascii Latin1 characters
12834 * under /d are added to a separate list */
12835 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12837 cp_list = add_cp_to_invlist(cp_list, c);
12840 depends_list = add_cp_to_invlist(depends_list, c);
12846 SvREFCNT_dec(fold_intersection);
12849 /* And combine the result (if any) with any inversion list from posix
12850 * classes. The lists are kept separate up to now because we don't want to
12851 * fold the classes (folding of those is automatically handled by the swash
12852 * fetching code) */
12854 if (! DEPENDS_SEMANTICS) {
12856 _invlist_union(cp_list, posixes, &cp_list);
12857 SvREFCNT_dec(posixes);
12864 /* Under /d, we put into a separate list the Latin1 things that
12865 * match only when the target string is utf8 */
12866 SV* nonascii_but_latin1_properties = NULL;
12867 _invlist_intersection(posixes, PL_Latin1,
12868 &nonascii_but_latin1_properties);
12869 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12870 &nonascii_but_latin1_properties);
12871 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12874 _invlist_union(cp_list, posixes, &cp_list);
12875 SvREFCNT_dec(posixes);
12881 if (depends_list) {
12882 _invlist_union(depends_list, nonascii_but_latin1_properties,
12884 SvREFCNT_dec(nonascii_but_latin1_properties);
12887 depends_list = nonascii_but_latin1_properties;
12892 /* And combine the result (if any) with any inversion list from properties.
12893 * The lists are kept separate up to now so that we can distinguish the two
12894 * in regards to matching above-Unicode. A run-time warning is generated
12895 * if a Unicode property is matched against a non-Unicode code point. But,
12896 * we allow user-defined properties to match anything, without any warning,
12897 * and we also suppress the warning if there is a portion of the character
12898 * class that isn't a Unicode property, and which matches above Unicode, \W
12899 * or [\x{110000}] for example.
12900 * (Note that in this case, unlike the Posix one above, there is no
12901 * <depends_list>, because having a Unicode property forces Unicode
12904 bool warn_super = ! has_user_defined_property;
12907 /* If it matters to the final outcome, see if a non-property
12908 * component of the class matches above Unicode. If so, the
12909 * warning gets suppressed. This is true even if just a single
12910 * such code point is specified, as though not strictly correct if
12911 * another such code point is matched against, the fact that they
12912 * are using above-Unicode code points indicates they should know
12913 * the issues involved */
12915 bool non_prop_matches_above_Unicode =
12916 runtime_posix_matches_above_Unicode
12917 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12919 non_prop_matches_above_Unicode =
12920 ! non_prop_matches_above_Unicode;
12922 warn_super = ! non_prop_matches_above_Unicode;
12925 _invlist_union(properties, cp_list, &cp_list);
12926 SvREFCNT_dec(properties);
12929 cp_list = properties;
12933 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12937 /* Here, we have calculated what code points should be in the character
12940 * Now we can see about various optimizations. Fold calculation (which we
12941 * did above) needs to take place before inversion. Otherwise /[^k]/i
12942 * would invert to include K, which under /i would match k, which it
12943 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12944 * folded until runtime */
12946 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12947 * at compile time. Besides not inverting folded locale now, we can't
12948 * invert if there are things such as \w, which aren't known until runtime
12951 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12953 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12955 _invlist_invert(cp_list);
12957 /* Any swash can't be used as-is, because we've inverted things */
12959 SvREFCNT_dec(swash);
12963 /* Clear the invert flag since have just done it here */
12967 /* If we didn't do folding, it's because some information isn't available
12968 * until runtime; set the run-time fold flag for these. (We don't have to
12969 * worry about properties folding, as that is taken care of by the swash
12973 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12976 /* Some character classes are equivalent to other nodes. Such nodes take
12977 * up less room and generally fewer operations to execute than ANYOF nodes.
12978 * Above, we checked for and optimized into some such equivalents for
12979 * certain common classes that are easy to test. Getting to this point in
12980 * the code means that the class didn't get optimized there. Since this
12981 * code is only executed in Pass 2, it is too late to save space--it has
12982 * been allocated in Pass 1, and currently isn't given back. But turning
12983 * things into an EXACTish node can allow the optimizer to join it to any
12984 * adjacent such nodes. And if the class is equivalent to things like /./,
12985 * expensive run-time swashes can be avoided. Now that we have more
12986 * complete information, we can find things necessarily missed by the
12987 * earlier code. I (khw) am not sure how much to look for here. It would
12988 * be easy, but perhaps too slow, to check any candidates against all the
12989 * node types they could possibly match using _invlistEQ(). */
12994 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12995 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12998 U8 op = END; /* The optimzation node-type */
12999 const char * cur_parse= RExC_parse;
13001 invlist_iterinit(cp_list);
13002 if (! invlist_iternext(cp_list, &start, &end)) {
13004 /* Here, the list is empty. This happens, for example, when a
13005 * Unicode property is the only thing in the character class, and
13006 * it doesn't match anything. (perluniprops.pod notes such
13009 *flagp |= HASWIDTH|SIMPLE;
13011 else if (start == end) { /* The range is a single code point */
13012 if (! invlist_iternext(cp_list, &start, &end)
13014 /* Don't do this optimization if it would require changing
13015 * the pattern to UTF-8 */
13016 && (start < 256 || UTF))
13018 /* Here, the list contains a single code point. Can optimize
13019 * into an EXACT node */
13028 /* A locale node under folding with one code point can be
13029 * an EXACTFL, as its fold won't be calculated until
13035 /* Here, we are generally folding, but there is only one
13036 * code point to match. If we have to, we use an EXACT
13037 * node, but it would be better for joining with adjacent
13038 * nodes in the optimization pass if we used the same
13039 * EXACTFish node that any such are likely to be. We can
13040 * do this iff the code point doesn't participate in any
13041 * folds. For example, an EXACTF of a colon is the same as
13042 * an EXACT one, since nothing folds to or from a colon. */
13044 if (IS_IN_SOME_FOLD_L1(value)) {
13049 if (! PL_utf8_foldable) {
13050 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13051 &PL_sv_undef, 1, 0);
13052 PL_utf8_foldable = _get_swash_invlist(swash);
13053 SvREFCNT_dec(swash);
13055 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13060 /* If we haven't found the node type, above, it means we
13061 * can use the prevailing one */
13063 op = compute_EXACTish(pRExC_state);
13068 else if (start == 0) {
13069 if (end == UV_MAX) {
13071 *flagp |= HASWIDTH|SIMPLE;
13074 else if (end == '\n' - 1
13075 && invlist_iternext(cp_list, &start, &end)
13076 && start == '\n' + 1 && end == UV_MAX)
13079 *flagp |= HASWIDTH|SIMPLE;
13085 RExC_parse = (char *)orig_parse;
13086 RExC_emit = (regnode *)orig_emit;
13088 ret = reg_node(pRExC_state, op);
13090 RExC_parse = (char *)cur_parse;
13092 if (PL_regkind[op] == EXACT) {
13093 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13096 SvREFCNT_dec(cp_list);
13097 SvREFCNT_dec(listsv);
13102 /* Here, <cp_list> contains all the code points we can determine at
13103 * compile time that match under all conditions. Go through it, and
13104 * for things that belong in the bitmap, put them there, and delete from
13105 * <cp_list>. While we are at it, see if everything above 255 is in the
13106 * list, and if so, set a flag to speed up execution */
13107 ANYOF_BITMAP_ZERO(ret);
13110 /* This gets set if we actually need to modify things */
13111 bool change_invlist = FALSE;
13115 /* Start looking through <cp_list> */
13116 invlist_iterinit(cp_list);
13117 while (invlist_iternext(cp_list, &start, &end)) {
13121 if (end == UV_MAX && start <= 256) {
13122 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13125 /* Quit if are above what we should change */
13130 change_invlist = TRUE;
13132 /* Set all the bits in the range, up to the max that we are doing */
13133 high = (end < 255) ? end : 255;
13134 for (i = start; i <= (int) high; i++) {
13135 if (! ANYOF_BITMAP_TEST(ret, i)) {
13136 ANYOF_BITMAP_SET(ret, i);
13143 /* Done with loop; remove any code points that are in the bitmap from
13145 if (change_invlist) {
13146 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13149 /* If have completely emptied it, remove it completely */
13150 if (_invlist_len(cp_list) == 0) {
13151 SvREFCNT_dec(cp_list);
13157 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13160 /* Here, the bitmap has been populated with all the Latin1 code points that
13161 * always match. Can now add to the overall list those that match only
13162 * when the target string is UTF-8 (<depends_list>). */
13163 if (depends_list) {
13165 _invlist_union(cp_list, depends_list, &cp_list);
13166 SvREFCNT_dec(depends_list);
13169 cp_list = depends_list;
13173 /* If there is a swash and more than one element, we can't use the swash in
13174 * the optimization below. */
13175 if (swash && element_count > 1) {
13176 SvREFCNT_dec(swash);
13181 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13183 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13184 SvREFCNT_dec(listsv);
13187 /* av[0] stores the character class description in its textual form:
13188 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13189 * appropriate swash, and is also useful for dumping the regnode.
13190 * av[1] if NULL, is a placeholder to later contain the swash computed
13191 * from av[0]. But if no further computation need be done, the
13192 * swash is stored there now.
13193 * av[2] stores the cp_list inversion list for use in addition or
13194 * instead of av[0]; used only if av[1] is NULL
13195 * av[3] is set if any component of the class is from a user-defined
13196 * property; used only if av[1] is NULL */
13197 AV * const av = newAV();
13200 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13202 : (SvREFCNT_dec(listsv), &PL_sv_undef));
13204 av_store(av, 1, swash);
13205 SvREFCNT_dec(cp_list);
13208 av_store(av, 1, NULL);
13210 av_store(av, 2, cp_list);
13211 av_store(av, 3, newSVuv(has_user_defined_property));
13215 rv = newRV_noinc(MUTABLE_SV(av));
13216 n = add_data(pRExC_state, 1, "s");
13217 RExC_rxi->data->data[n] = (void*)rv;
13221 *flagp |= HASWIDTH|SIMPLE;
13224 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13227 /* reg_skipcomment()
13229 Absorbs an /x style # comments from the input stream.
13230 Returns true if there is more text remaining in the stream.
13231 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13232 terminates the pattern without including a newline.
13234 Note its the callers responsibility to ensure that we are
13235 actually in /x mode
13240 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13244 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13246 while (RExC_parse < RExC_end)
13247 if (*RExC_parse++ == '\n') {
13252 /* we ran off the end of the pattern without ending
13253 the comment, so we have to add an \n when wrapping */
13254 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13262 Advances the parse position, and optionally absorbs
13263 "whitespace" from the inputstream.
13265 Without /x "whitespace" means (?#...) style comments only,
13266 with /x this means (?#...) and # comments and whitespace proper.
13268 Returns the RExC_parse point from BEFORE the scan occurs.
13270 This is the /x friendly way of saying RExC_parse++.
13274 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13276 char* const retval = RExC_parse++;
13278 PERL_ARGS_ASSERT_NEXTCHAR;
13281 if (RExC_end - RExC_parse >= 3
13282 && *RExC_parse == '('
13283 && RExC_parse[1] == '?'
13284 && RExC_parse[2] == '#')
13286 while (*RExC_parse != ')') {
13287 if (RExC_parse == RExC_end)
13288 FAIL("Sequence (?#... not terminated");
13294 if (RExC_flags & RXf_PMf_EXTENDED) {
13295 if (isSPACE(*RExC_parse)) {
13299 else if (*RExC_parse == '#') {
13300 if ( reg_skipcomment( pRExC_state ) )
13309 - reg_node - emit a node
13311 STATIC regnode * /* Location. */
13312 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13316 regnode * const ret = RExC_emit;
13317 GET_RE_DEBUG_FLAGS_DECL;
13319 PERL_ARGS_ASSERT_REG_NODE;
13322 SIZE_ALIGN(RExC_size);
13326 if (RExC_emit >= RExC_emit_bound)
13327 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13328 op, RExC_emit, RExC_emit_bound);
13330 NODE_ALIGN_FILL(ret);
13332 FILL_ADVANCE_NODE(ptr, op);
13333 #ifdef RE_TRACK_PATTERN_OFFSETS
13334 if (RExC_offsets) { /* MJD */
13335 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13336 "reg_node", __LINE__,
13338 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13339 ? "Overwriting end of array!\n" : "OK",
13340 (UV)(RExC_emit - RExC_emit_start),
13341 (UV)(RExC_parse - RExC_start),
13342 (UV)RExC_offsets[0]));
13343 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13351 - reganode - emit a node with an argument
13353 STATIC regnode * /* Location. */
13354 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13358 regnode * const ret = RExC_emit;
13359 GET_RE_DEBUG_FLAGS_DECL;
13361 PERL_ARGS_ASSERT_REGANODE;
13364 SIZE_ALIGN(RExC_size);
13369 assert(2==regarglen[op]+1);
13371 Anything larger than this has to allocate the extra amount.
13372 If we changed this to be:
13374 RExC_size += (1 + regarglen[op]);
13376 then it wouldn't matter. Its not clear what side effect
13377 might come from that so its not done so far.
13382 if (RExC_emit >= RExC_emit_bound)
13383 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13384 op, RExC_emit, RExC_emit_bound);
13386 NODE_ALIGN_FILL(ret);
13388 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13389 #ifdef RE_TRACK_PATTERN_OFFSETS
13390 if (RExC_offsets) { /* MJD */
13391 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13395 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13396 "Overwriting end of array!\n" : "OK",
13397 (UV)(RExC_emit - RExC_emit_start),
13398 (UV)(RExC_parse - RExC_start),
13399 (UV)RExC_offsets[0]));
13400 Set_Cur_Node_Offset;
13408 - reguni - emit (if appropriate) a Unicode character
13411 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13415 PERL_ARGS_ASSERT_REGUNI;
13417 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13421 - reginsert - insert an operator in front of already-emitted operand
13423 * Means relocating the operand.
13426 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13432 const int offset = regarglen[(U8)op];
13433 const int size = NODE_STEP_REGNODE + offset;
13434 GET_RE_DEBUG_FLAGS_DECL;
13436 PERL_ARGS_ASSERT_REGINSERT;
13437 PERL_UNUSED_ARG(depth);
13438 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13439 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13448 if (RExC_open_parens) {
13450 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13451 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13452 if ( RExC_open_parens[paren] >= opnd ) {
13453 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13454 RExC_open_parens[paren] += size;
13456 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13458 if ( RExC_close_parens[paren] >= opnd ) {
13459 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13460 RExC_close_parens[paren] += size;
13462 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13467 while (src > opnd) {
13468 StructCopy(--src, --dst, regnode);
13469 #ifdef RE_TRACK_PATTERN_OFFSETS
13470 if (RExC_offsets) { /* MJD 20010112 */
13471 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13475 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13476 ? "Overwriting end of array!\n" : "OK",
13477 (UV)(src - RExC_emit_start),
13478 (UV)(dst - RExC_emit_start),
13479 (UV)RExC_offsets[0]));
13480 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13481 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13487 place = opnd; /* Op node, where operand used to be. */
13488 #ifdef RE_TRACK_PATTERN_OFFSETS
13489 if (RExC_offsets) { /* MJD */
13490 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13494 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13495 ? "Overwriting end of array!\n" : "OK",
13496 (UV)(place - RExC_emit_start),
13497 (UV)(RExC_parse - RExC_start),
13498 (UV)RExC_offsets[0]));
13499 Set_Node_Offset(place, RExC_parse);
13500 Set_Node_Length(place, 1);
13503 src = NEXTOPER(place);
13504 FILL_ADVANCE_NODE(place, op);
13505 Zero(src, offset, regnode);
13509 - regtail - set the next-pointer at the end of a node chain of p to val.
13510 - SEE ALSO: regtail_study
13512 /* TODO: All three parms should be const */
13514 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13518 GET_RE_DEBUG_FLAGS_DECL;
13520 PERL_ARGS_ASSERT_REGTAIL;
13522 PERL_UNUSED_ARG(depth);
13528 /* Find last node. */
13531 regnode * const temp = regnext(scan);
13533 SV * const mysv=sv_newmortal();
13534 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13535 regprop(RExC_rx, mysv, scan);
13536 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13537 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13538 (temp == NULL ? "->" : ""),
13539 (temp == NULL ? PL_reg_name[OP(val)] : "")
13547 if (reg_off_by_arg[OP(scan)]) {
13548 ARG_SET(scan, val - scan);
13551 NEXT_OFF(scan) = val - scan;
13557 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13558 - Look for optimizable sequences at the same time.
13559 - currently only looks for EXACT chains.
13561 This is experimental code. The idea is to use this routine to perform
13562 in place optimizations on branches and groups as they are constructed,
13563 with the long term intention of removing optimization from study_chunk so
13564 that it is purely analytical.
13566 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13567 to control which is which.
13570 /* TODO: All four parms should be const */
13573 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13578 #ifdef EXPERIMENTAL_INPLACESCAN
13581 GET_RE_DEBUG_FLAGS_DECL;
13583 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13589 /* Find last node. */
13593 regnode * const temp = regnext(scan);
13594 #ifdef EXPERIMENTAL_INPLACESCAN
13595 if (PL_regkind[OP(scan)] == EXACT) {
13596 bool has_exactf_sharp_s; /* Unexamined in this routine */
13597 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13602 switch (OP(scan)) {
13608 case EXACTFU_TRICKYFOLD:
13610 if( exact == PSEUDO )
13612 else if ( exact != OP(scan) )
13621 SV * const mysv=sv_newmortal();
13622 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13623 regprop(RExC_rx, mysv, scan);
13624 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13625 SvPV_nolen_const(mysv),
13626 REG_NODE_NUM(scan),
13627 PL_reg_name[exact]);
13634 SV * const mysv_val=sv_newmortal();
13635 DEBUG_PARSE_MSG("");
13636 regprop(RExC_rx, mysv_val, val);
13637 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13638 SvPV_nolen_const(mysv_val),
13639 (IV)REG_NODE_NUM(val),
13643 if (reg_off_by_arg[OP(scan)]) {
13644 ARG_SET(scan, val - scan);
13647 NEXT_OFF(scan) = val - scan;
13655 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13659 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13665 for (bit=0; bit<32; bit++) {
13666 if (flags & (1<<bit)) {
13667 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13670 if (!set++ && lead)
13671 PerlIO_printf(Perl_debug_log, "%s",lead);
13672 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13675 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13676 if (!set++ && lead) {
13677 PerlIO_printf(Perl_debug_log, "%s",lead);
13680 case REGEX_UNICODE_CHARSET:
13681 PerlIO_printf(Perl_debug_log, "UNICODE");
13683 case REGEX_LOCALE_CHARSET:
13684 PerlIO_printf(Perl_debug_log, "LOCALE");
13686 case REGEX_ASCII_RESTRICTED_CHARSET:
13687 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13689 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13690 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13693 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13699 PerlIO_printf(Perl_debug_log, "\n");
13701 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13707 Perl_regdump(pTHX_ const regexp *r)
13711 SV * const sv = sv_newmortal();
13712 SV *dsv= sv_newmortal();
13713 RXi_GET_DECL(r,ri);
13714 GET_RE_DEBUG_FLAGS_DECL;
13716 PERL_ARGS_ASSERT_REGDUMP;
13718 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13720 /* Header fields of interest. */
13721 if (r->anchored_substr) {
13722 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13723 RE_SV_DUMPLEN(r->anchored_substr), 30);
13724 PerlIO_printf(Perl_debug_log,
13725 "anchored %s%s at %"IVdf" ",
13726 s, RE_SV_TAIL(r->anchored_substr),
13727 (IV)r->anchored_offset);
13728 } else if (r->anchored_utf8) {
13729 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13730 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13731 PerlIO_printf(Perl_debug_log,
13732 "anchored utf8 %s%s at %"IVdf" ",
13733 s, RE_SV_TAIL(r->anchored_utf8),
13734 (IV)r->anchored_offset);
13736 if (r->float_substr) {
13737 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13738 RE_SV_DUMPLEN(r->float_substr), 30);
13739 PerlIO_printf(Perl_debug_log,
13740 "floating %s%s at %"IVdf"..%"UVuf" ",
13741 s, RE_SV_TAIL(r->float_substr),
13742 (IV)r->float_min_offset, (UV)r->float_max_offset);
13743 } else if (r->float_utf8) {
13744 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13745 RE_SV_DUMPLEN(r->float_utf8), 30);
13746 PerlIO_printf(Perl_debug_log,
13747 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13748 s, RE_SV_TAIL(r->float_utf8),
13749 (IV)r->float_min_offset, (UV)r->float_max_offset);
13751 if (r->check_substr || r->check_utf8)
13752 PerlIO_printf(Perl_debug_log,
13754 (r->check_substr == r->float_substr
13755 && r->check_utf8 == r->float_utf8
13756 ? "(checking floating" : "(checking anchored"));
13757 if (r->extflags & RXf_NOSCAN)
13758 PerlIO_printf(Perl_debug_log, " noscan");
13759 if (r->extflags & RXf_CHECK_ALL)
13760 PerlIO_printf(Perl_debug_log, " isall");
13761 if (r->check_substr || r->check_utf8)
13762 PerlIO_printf(Perl_debug_log, ") ");
13764 if (ri->regstclass) {
13765 regprop(r, sv, ri->regstclass);
13766 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13768 if (r->extflags & RXf_ANCH) {
13769 PerlIO_printf(Perl_debug_log, "anchored");
13770 if (r->extflags & RXf_ANCH_BOL)
13771 PerlIO_printf(Perl_debug_log, "(BOL)");
13772 if (r->extflags & RXf_ANCH_MBOL)
13773 PerlIO_printf(Perl_debug_log, "(MBOL)");
13774 if (r->extflags & RXf_ANCH_SBOL)
13775 PerlIO_printf(Perl_debug_log, "(SBOL)");
13776 if (r->extflags & RXf_ANCH_GPOS)
13777 PerlIO_printf(Perl_debug_log, "(GPOS)");
13778 PerlIO_putc(Perl_debug_log, ' ');
13780 if (r->extflags & RXf_GPOS_SEEN)
13781 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13782 if (r->intflags & PREGf_SKIP)
13783 PerlIO_printf(Perl_debug_log, "plus ");
13784 if (r->intflags & PREGf_IMPLICIT)
13785 PerlIO_printf(Perl_debug_log, "implicit ");
13786 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13787 if (r->extflags & RXf_EVAL_SEEN)
13788 PerlIO_printf(Perl_debug_log, "with eval ");
13789 PerlIO_printf(Perl_debug_log, "\n");
13790 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13792 PERL_ARGS_ASSERT_REGDUMP;
13793 PERL_UNUSED_CONTEXT;
13794 PERL_UNUSED_ARG(r);
13795 #endif /* DEBUGGING */
13799 - regprop - printable representation of opcode
13801 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13804 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13805 if (flags & ANYOF_INVERT) \
13806 /*make sure the invert info is in each */ \
13807 sv_catpvs(sv, "^"); \
13813 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13819 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13820 static const char * const anyofs[] = {
13821 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
13822 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 || _CC_ALNUMC != 7 \
13823 || _CC_GRAPH != 8 || _CC_SPACE != 9 || _CC_BLANK != 10 \
13824 || _CC_XDIGIT != 11 || _CC_PSXSPC != 12 || _CC_CNTRL != 13 \
13825 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
13826 #error Need to adjust order of anyofs[]
13861 RXi_GET_DECL(prog,progi);
13862 GET_RE_DEBUG_FLAGS_DECL;
13864 PERL_ARGS_ASSERT_REGPROP;
13868 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13869 /* It would be nice to FAIL() here, but this may be called from
13870 regexec.c, and it would be hard to supply pRExC_state. */
13871 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13872 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13874 k = PL_regkind[OP(o)];
13877 sv_catpvs(sv, " ");
13878 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13879 * is a crude hack but it may be the best for now since
13880 * we have no flag "this EXACTish node was UTF-8"
13882 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13883 PERL_PV_ESCAPE_UNI_DETECT |
13884 PERL_PV_ESCAPE_NONASCII |
13885 PERL_PV_PRETTY_ELLIPSES |
13886 PERL_PV_PRETTY_LTGT |
13887 PERL_PV_PRETTY_NOCLEAR
13889 } else if (k == TRIE) {
13890 /* print the details of the trie in dumpuntil instead, as
13891 * progi->data isn't available here */
13892 const char op = OP(o);
13893 const U32 n = ARG(o);
13894 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13895 (reg_ac_data *)progi->data->data[n] :
13897 const reg_trie_data * const trie
13898 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13900 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13901 DEBUG_TRIE_COMPILE_r(
13902 Perl_sv_catpvf(aTHX_ sv,
13903 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13904 (UV)trie->startstate,
13905 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13906 (UV)trie->wordcount,
13909 (UV)TRIE_CHARCOUNT(trie),
13910 (UV)trie->uniquecharcount
13913 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13915 int rangestart = -1;
13916 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13917 sv_catpvs(sv, "[");
13918 for (i = 0; i <= 256; i++) {
13919 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13920 if (rangestart == -1)
13922 } else if (rangestart != -1) {
13923 if (i <= rangestart + 3)
13924 for (; rangestart < i; rangestart++)
13925 put_byte(sv, rangestart);
13927 put_byte(sv, rangestart);
13928 sv_catpvs(sv, "-");
13929 put_byte(sv, i - 1);
13934 sv_catpvs(sv, "]");
13937 } else if (k == CURLY) {
13938 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13939 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13940 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13942 else if (k == WHILEM && o->flags) /* Ordinal/of */
13943 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13944 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13945 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13946 if ( RXp_PAREN_NAMES(prog) ) {
13947 if ( k != REF || (OP(o) < NREF)) {
13948 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13949 SV **name= av_fetch(list, ARG(o), 0 );
13951 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13954 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13955 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13956 I32 *nums=(I32*)SvPVX(sv_dat);
13957 SV **name= av_fetch(list, nums[0], 0 );
13960 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13961 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13962 (n ? "," : ""), (IV)nums[n]);
13964 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13968 } else if (k == GOSUB)
13969 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13970 else if (k == VERB) {
13972 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13973 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13974 } else if (k == LOGICAL)
13975 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13976 else if (k == ANYOF) {
13977 int i, rangestart = -1;
13978 const U8 flags = ANYOF_FLAGS(o);
13982 if (flags & ANYOF_LOCALE)
13983 sv_catpvs(sv, "{loc}");
13984 if (flags & ANYOF_LOC_FOLD)
13985 sv_catpvs(sv, "{i}");
13986 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13987 if (flags & ANYOF_INVERT)
13988 sv_catpvs(sv, "^");
13990 /* output what the standard cp 0-255 bitmap matches */
13991 for (i = 0; i <= 256; i++) {
13992 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13993 if (rangestart == -1)
13995 } else if (rangestart != -1) {
13996 if (i <= rangestart + 3)
13997 for (; rangestart < i; rangestart++)
13998 put_byte(sv, rangestart);
14000 put_byte(sv, rangestart);
14001 sv_catpvs(sv, "-");
14002 put_byte(sv, i - 1);
14009 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14010 /* output any special charclass tests (used entirely under use locale) */
14011 if (ANYOF_CLASS_TEST_ANY_SET(o))
14012 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
14013 if (ANYOF_CLASS_TEST(o,i)) {
14014 sv_catpv(sv, anyofs[i]);
14018 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14020 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14021 sv_catpvs(sv, "{non-utf8-latin1-all}");
14024 /* output information about the unicode matching */
14025 if (flags & ANYOF_UNICODE_ALL)
14026 sv_catpvs(sv, "{unicode_all}");
14027 else if (ANYOF_NONBITMAP(o))
14028 sv_catpvs(sv, "{unicode}");
14029 if (flags & ANYOF_NONBITMAP_NON_UTF8)
14030 sv_catpvs(sv, "{outside bitmap}");
14032 if (ANYOF_NONBITMAP(o)) {
14033 SV *lv; /* Set if there is something outside the bit map */
14034 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
14035 bool byte_output = FALSE; /* If something in the bitmap has been
14038 if (lv && lv != &PL_sv_undef) {
14040 U8 s[UTF8_MAXBYTES_CASE+1];
14042 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
14043 uvchr_to_utf8(s, i);
14046 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
14050 && swash_fetch(sw, s, TRUE))
14052 if (rangestart == -1)
14054 } else if (rangestart != -1) {
14055 byte_output = TRUE;
14056 if (i <= rangestart + 3)
14057 for (; rangestart < i; rangestart++) {
14058 put_byte(sv, rangestart);
14061 put_byte(sv, rangestart);
14062 sv_catpvs(sv, "-");
14071 char *s = savesvpv(lv);
14072 char * const origs = s;
14074 while (*s && *s != '\n')
14078 const char * const t = ++s;
14081 sv_catpvs(sv, " ");
14087 /* Truncate very long output */
14088 if (s - origs > 256) {
14089 Perl_sv_catpvf(aTHX_ sv,
14091 (int) (s - origs - 1),
14097 else if (*s == '\t') {
14116 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14118 else if (k == POSIXD || k == NPOSIXD) {
14119 U8 index = FLAGS(o) * 2;
14120 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14121 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14124 sv_catpv(sv, anyofs[index]);
14127 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14128 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14130 PERL_UNUSED_CONTEXT;
14131 PERL_UNUSED_ARG(sv);
14132 PERL_UNUSED_ARG(o);
14133 PERL_UNUSED_ARG(prog);
14134 #endif /* DEBUGGING */
14138 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14139 { /* Assume that RE_INTUIT is set */
14141 struct regexp *const prog = ReANY(r);
14142 GET_RE_DEBUG_FLAGS_DECL;
14144 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14145 PERL_UNUSED_CONTEXT;
14149 const char * const s = SvPV_nolen_const(prog->check_substr
14150 ? prog->check_substr : prog->check_utf8);
14152 if (!PL_colorset) reginitcolors();
14153 PerlIO_printf(Perl_debug_log,
14154 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14156 prog->check_substr ? "" : "utf8 ",
14157 PL_colors[5],PL_colors[0],
14160 (strlen(s) > 60 ? "..." : ""));
14163 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14169 handles refcounting and freeing the perl core regexp structure. When
14170 it is necessary to actually free the structure the first thing it
14171 does is call the 'free' method of the regexp_engine associated to
14172 the regexp, allowing the handling of the void *pprivate; member
14173 first. (This routine is not overridable by extensions, which is why
14174 the extensions free is called first.)
14176 See regdupe and regdupe_internal if you change anything here.
14178 #ifndef PERL_IN_XSUB_RE
14180 Perl_pregfree(pTHX_ REGEXP *r)
14186 Perl_pregfree2(pTHX_ REGEXP *rx)
14189 struct regexp *const r = ReANY(rx);
14190 GET_RE_DEBUG_FLAGS_DECL;
14192 PERL_ARGS_ASSERT_PREGFREE2;
14194 if (r->mother_re) {
14195 ReREFCNT_dec(r->mother_re);
14197 CALLREGFREE_PVT(rx); /* free the private data */
14198 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14199 Safefree(r->xpv_len_u.xpvlenu_pv);
14202 SvREFCNT_dec(r->anchored_substr);
14203 SvREFCNT_dec(r->anchored_utf8);
14204 SvREFCNT_dec(r->float_substr);
14205 SvREFCNT_dec(r->float_utf8);
14206 Safefree(r->substrs);
14208 RX_MATCH_COPY_FREE(rx);
14209 #ifdef PERL_ANY_COW
14210 SvREFCNT_dec(r->saved_copy);
14213 SvREFCNT_dec(r->qr_anoncv);
14214 rx->sv_u.svu_rx = 0;
14219 This is a hacky workaround to the structural issue of match results
14220 being stored in the regexp structure which is in turn stored in
14221 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14222 could be PL_curpm in multiple contexts, and could require multiple
14223 result sets being associated with the pattern simultaneously, such
14224 as when doing a recursive match with (??{$qr})
14226 The solution is to make a lightweight copy of the regexp structure
14227 when a qr// is returned from the code executed by (??{$qr}) this
14228 lightweight copy doesn't actually own any of its data except for
14229 the starp/end and the actual regexp structure itself.
14235 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14237 struct regexp *ret;
14238 struct regexp *const r = ReANY(rx);
14239 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
14241 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14244 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14246 SvOK_off((SV *)ret_x);
14248 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
14249 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
14250 made both spots point to the same regexp body.) */
14251 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
14252 assert(!SvPVX(ret_x));
14253 ret_x->sv_u.svu_rx = temp->sv_any;
14254 temp->sv_any = NULL;
14255 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
14256 SvREFCNT_dec(temp);
14257 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
14258 ing below will not set it. */
14259 SvCUR_set(ret_x, SvCUR(rx));
14262 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
14263 sv_force_normal(sv) is called. */
14265 ret = ReANY(ret_x);
14267 SvFLAGS(ret_x) |= SvUTF8(rx);
14268 /* We share the same string buffer as the original regexp, on which we
14269 hold a reference count, incremented when mother_re is set below.
14270 The string pointer is copied here, being part of the regexp struct.
14272 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14273 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14275 const I32 npar = r->nparens+1;
14276 Newx(ret->offs, npar, regexp_paren_pair);
14277 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14280 Newx(ret->substrs, 1, struct reg_substr_data);
14281 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14283 SvREFCNT_inc_void(ret->anchored_substr);
14284 SvREFCNT_inc_void(ret->anchored_utf8);
14285 SvREFCNT_inc_void(ret->float_substr);
14286 SvREFCNT_inc_void(ret->float_utf8);
14288 /* check_substr and check_utf8, if non-NULL, point to either their
14289 anchored or float namesakes, and don't hold a second reference. */
14291 RX_MATCH_COPIED_off(ret_x);
14292 #ifdef PERL_ANY_COW
14293 ret->saved_copy = NULL;
14295 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
14296 SvREFCNT_inc_void(ret->qr_anoncv);
14302 /* regfree_internal()
14304 Free the private data in a regexp. This is overloadable by
14305 extensions. Perl takes care of the regexp structure in pregfree(),
14306 this covers the *pprivate pointer which technically perl doesn't
14307 know about, however of course we have to handle the
14308 regexp_internal structure when no extension is in use.
14310 Note this is called before freeing anything in the regexp
14315 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14318 struct regexp *const r = ReANY(rx);
14319 RXi_GET_DECL(r,ri);
14320 GET_RE_DEBUG_FLAGS_DECL;
14322 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14328 SV *dsv= sv_newmortal();
14329 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14330 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14331 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14332 PL_colors[4],PL_colors[5],s);
14335 #ifdef RE_TRACK_PATTERN_OFFSETS
14337 Safefree(ri->u.offsets); /* 20010421 MJD */
14339 if (ri->code_blocks) {
14341 for (n = 0; n < ri->num_code_blocks; n++)
14342 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14343 Safefree(ri->code_blocks);
14347 int n = ri->data->count;
14350 /* If you add a ->what type here, update the comment in regcomp.h */
14351 switch (ri->data->what[n]) {
14357 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14360 Safefree(ri->data->data[n]);
14366 { /* Aho Corasick add-on structure for a trie node.
14367 Used in stclass optimization only */
14369 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14371 refcount = --aho->refcount;
14374 PerlMemShared_free(aho->states);
14375 PerlMemShared_free(aho->fail);
14376 /* do this last!!!! */
14377 PerlMemShared_free(ri->data->data[n]);
14378 PerlMemShared_free(ri->regstclass);
14384 /* trie structure. */
14386 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14388 refcount = --trie->refcount;
14391 PerlMemShared_free(trie->charmap);
14392 PerlMemShared_free(trie->states);
14393 PerlMemShared_free(trie->trans);
14395 PerlMemShared_free(trie->bitmap);
14397 PerlMemShared_free(trie->jump);
14398 PerlMemShared_free(trie->wordinfo);
14399 /* do this last!!!! */
14400 PerlMemShared_free(ri->data->data[n]);
14405 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14408 Safefree(ri->data->what);
14409 Safefree(ri->data);
14415 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14416 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14417 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14420 re_dup - duplicate a regexp.
14422 This routine is expected to clone a given regexp structure. It is only
14423 compiled under USE_ITHREADS.
14425 After all of the core data stored in struct regexp is duplicated
14426 the regexp_engine.dupe method is used to copy any private data
14427 stored in the *pprivate pointer. This allows extensions to handle
14428 any duplication it needs to do.
14430 See pregfree() and regfree_internal() if you change anything here.
14432 #if defined(USE_ITHREADS)
14433 #ifndef PERL_IN_XSUB_RE
14435 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14439 const struct regexp *r = ReANY(sstr);
14440 struct regexp *ret = ReANY(dstr);
14442 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14444 npar = r->nparens+1;
14445 Newx(ret->offs, npar, regexp_paren_pair);
14446 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14448 /* no need to copy these */
14449 Newx(ret->swap, npar, regexp_paren_pair);
14452 if (ret->substrs) {
14453 /* Do it this way to avoid reading from *r after the StructCopy().
14454 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14455 cache, it doesn't matter. */
14456 const bool anchored = r->check_substr
14457 ? r->check_substr == r->anchored_substr
14458 : r->check_utf8 == r->anchored_utf8;
14459 Newx(ret->substrs, 1, struct reg_substr_data);
14460 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14462 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14463 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14464 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14465 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14467 /* check_substr and check_utf8, if non-NULL, point to either their
14468 anchored or float namesakes, and don't hold a second reference. */
14470 if (ret->check_substr) {
14472 assert(r->check_utf8 == r->anchored_utf8);
14473 ret->check_substr = ret->anchored_substr;
14474 ret->check_utf8 = ret->anchored_utf8;
14476 assert(r->check_substr == r->float_substr);
14477 assert(r->check_utf8 == r->float_utf8);
14478 ret->check_substr = ret->float_substr;
14479 ret->check_utf8 = ret->float_utf8;
14481 } else if (ret->check_utf8) {
14483 ret->check_utf8 = ret->anchored_utf8;
14485 ret->check_utf8 = ret->float_utf8;
14490 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14491 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14494 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14496 if (RX_MATCH_COPIED(dstr))
14497 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14499 ret->subbeg = NULL;
14500 #ifdef PERL_ANY_COW
14501 ret->saved_copy = NULL;
14504 /* Whether mother_re be set or no, we need to copy the string. We
14505 cannot refrain from copying it when the storage points directly to
14506 our mother regexp, because that's
14507 1: a buffer in a different thread
14508 2: something we no longer hold a reference on
14509 so we need to copy it locally. */
14510 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
14511 ret->mother_re = NULL;
14514 #endif /* PERL_IN_XSUB_RE */
14519 This is the internal complement to regdupe() which is used to copy
14520 the structure pointed to by the *pprivate pointer in the regexp.
14521 This is the core version of the extension overridable cloning hook.
14522 The regexp structure being duplicated will be copied by perl prior
14523 to this and will be provided as the regexp *r argument, however
14524 with the /old/ structures pprivate pointer value. Thus this routine
14525 may override any copying normally done by perl.
14527 It returns a pointer to the new regexp_internal structure.
14531 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14534 struct regexp *const r = ReANY(rx);
14535 regexp_internal *reti;
14537 RXi_GET_DECL(r,ri);
14539 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14543 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14544 Copy(ri->program, reti->program, len+1, regnode);
14546 reti->num_code_blocks = ri->num_code_blocks;
14547 if (ri->code_blocks) {
14549 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14550 struct reg_code_block);
14551 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14552 struct reg_code_block);
14553 for (n = 0; n < ri->num_code_blocks; n++)
14554 reti->code_blocks[n].src_regex = (REGEXP*)
14555 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14558 reti->code_blocks = NULL;
14560 reti->regstclass = NULL;
14563 struct reg_data *d;
14564 const int count = ri->data->count;
14567 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14568 char, struct reg_data);
14569 Newx(d->what, count, U8);
14572 for (i = 0; i < count; i++) {
14573 d->what[i] = ri->data->what[i];
14574 switch (d->what[i]) {
14575 /* see also regcomp.h and regfree_internal() */
14576 case 'a': /* actually an AV, but the dup function is identical. */
14580 case 'u': /* actually an HV, but the dup function is identical. */
14581 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14584 /* This is cheating. */
14585 Newx(d->data[i], 1, struct regnode_charclass_class);
14586 StructCopy(ri->data->data[i], d->data[i],
14587 struct regnode_charclass_class);
14588 reti->regstclass = (regnode*)d->data[i];
14591 /* Trie stclasses are readonly and can thus be shared
14592 * without duplication. We free the stclass in pregfree
14593 * when the corresponding reg_ac_data struct is freed.
14595 reti->regstclass= ri->regstclass;
14599 ((reg_trie_data*)ri->data->data[i])->refcount++;
14604 d->data[i] = ri->data->data[i];
14607 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14616 reti->name_list_idx = ri->name_list_idx;
14618 #ifdef RE_TRACK_PATTERN_OFFSETS
14619 if (ri->u.offsets) {
14620 Newx(reti->u.offsets, 2*len+1, U32);
14621 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14624 SetProgLen(reti,len);
14627 return (void*)reti;
14630 #endif /* USE_ITHREADS */
14632 #ifndef PERL_IN_XSUB_RE
14635 - regnext - dig the "next" pointer out of a node
14638 Perl_regnext(pTHX_ regnode *p)
14646 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14647 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14650 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14659 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14662 STRLEN l1 = strlen(pat1);
14663 STRLEN l2 = strlen(pat2);
14666 const char *message;
14668 PERL_ARGS_ASSERT_RE_CROAK2;
14674 Copy(pat1, buf, l1 , char);
14675 Copy(pat2, buf + l1, l2 , char);
14676 buf[l1 + l2] = '\n';
14677 buf[l1 + l2 + 1] = '\0';
14679 /* ANSI variant takes additional second argument */
14680 va_start(args, pat2);
14684 msv = vmess(buf, &args);
14686 message = SvPV_const(msv,l1);
14689 Copy(message, buf, l1 , char);
14690 buf[l1-1] = '\0'; /* Overwrite \n */
14691 Perl_croak(aTHX_ "%s", buf);
14694 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14696 #ifndef PERL_IN_XSUB_RE
14698 Perl_save_re_context(pTHX)
14702 struct re_save_state *state;
14704 SAVEVPTR(PL_curcop);
14705 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14707 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14708 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14709 SSPUSHUV(SAVEt_RE_STATE);
14711 Copy(&PL_reg_state, state, 1, struct re_save_state);
14713 PL_reg_oldsaved = NULL;
14714 PL_reg_oldsavedlen = 0;
14715 PL_reg_oldsavedoffset = 0;
14716 PL_reg_oldsavedcoffset = 0;
14717 PL_reg_maxiter = 0;
14718 PL_reg_leftiter = 0;
14719 PL_reg_poscache = NULL;
14720 PL_reg_poscache_size = 0;
14721 #ifdef PERL_ANY_COW
14725 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14727 const REGEXP * const rx = PM_GETRE(PL_curpm);
14730 for (i = 1; i <= RX_NPARENS(rx); i++) {
14731 char digits[TYPE_CHARS(long)];
14732 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14733 GV *const *const gvp
14734 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14737 GV * const gv = *gvp;
14738 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14750 S_put_byte(pTHX_ SV *sv, int c)
14752 PERL_ARGS_ASSERT_PUT_BYTE;
14754 /* Our definition of isPRINT() ignores locales, so only bytes that are
14755 not part of UTF-8 are considered printable. I assume that the same
14756 holds for UTF-EBCDIC.
14757 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14758 which Wikipedia says:
14760 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14761 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14762 identical, to the ASCII delete (DEL) or rubout control character.
14763 ) So the old condition can be simplified to !isPRINT(c) */
14766 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14769 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14773 const char string = c;
14774 if (c == '-' || c == ']' || c == '\\' || c == '^')
14775 sv_catpvs(sv, "\\");
14776 sv_catpvn(sv, &string, 1);
14781 #define CLEAR_OPTSTART \
14782 if (optstart) STMT_START { \
14783 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14787 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14789 STATIC const regnode *
14790 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14791 const regnode *last, const regnode *plast,
14792 SV* sv, I32 indent, U32 depth)
14795 U8 op = PSEUDO; /* Arbitrary non-END op. */
14796 const regnode *next;
14797 const regnode *optstart= NULL;
14799 RXi_GET_DECL(r,ri);
14800 GET_RE_DEBUG_FLAGS_DECL;
14802 PERL_ARGS_ASSERT_DUMPUNTIL;
14804 #ifdef DEBUG_DUMPUNTIL
14805 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14806 last ? last-start : 0,plast ? plast-start : 0);
14809 if (plast && plast < last)
14812 while (PL_regkind[op] != END && (!last || node < last)) {
14813 /* While that wasn't END last time... */
14816 if (op == CLOSE || op == WHILEM)
14818 next = regnext((regnode *)node);
14821 if (OP(node) == OPTIMIZED) {
14822 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14829 regprop(r, sv, node);
14830 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14831 (int)(2*indent + 1), "", SvPVX_const(sv));
14833 if (OP(node) != OPTIMIZED) {
14834 if (next == NULL) /* Next ptr. */
14835 PerlIO_printf(Perl_debug_log, " (0)");
14836 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14837 PerlIO_printf(Perl_debug_log, " (FAIL)");
14839 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14840 (void)PerlIO_putc(Perl_debug_log, '\n');
14844 if (PL_regkind[(U8)op] == BRANCHJ) {
14847 const regnode *nnode = (OP(next) == LONGJMP
14848 ? regnext((regnode *)next)
14850 if (last && nnode > last)
14852 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14855 else if (PL_regkind[(U8)op] == BRANCH) {
14857 DUMPUNTIL(NEXTOPER(node), next);
14859 else if ( PL_regkind[(U8)op] == TRIE ) {
14860 const regnode *this_trie = node;
14861 const char op = OP(node);
14862 const U32 n = ARG(node);
14863 const reg_ac_data * const ac = op>=AHOCORASICK ?
14864 (reg_ac_data *)ri->data->data[n] :
14866 const reg_trie_data * const trie =
14867 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14869 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14871 const regnode *nextbranch= NULL;
14874 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14875 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14877 PerlIO_printf(Perl_debug_log, "%*s%s ",
14878 (int)(2*(indent+3)), "",
14879 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14880 PL_colors[0], PL_colors[1],
14881 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14882 PERL_PV_PRETTY_ELLIPSES |
14883 PERL_PV_PRETTY_LTGT
14888 U16 dist= trie->jump[word_idx+1];
14889 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14890 (UV)((dist ? this_trie + dist : next) - start));
14893 nextbranch= this_trie + trie->jump[0];
14894 DUMPUNTIL(this_trie + dist, nextbranch);
14896 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14897 nextbranch= regnext((regnode *)nextbranch);
14899 PerlIO_printf(Perl_debug_log, "\n");
14902 if (last && next > last)
14907 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14908 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14909 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14911 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14913 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14915 else if ( op == PLUS || op == STAR) {
14916 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14918 else if (PL_regkind[(U8)op] == ANYOF) {
14919 /* arglen 1 + class block */
14920 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14921 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14922 node = NEXTOPER(node);
14924 else if (PL_regkind[(U8)op] == EXACT) {
14925 /* Literal string, where present. */
14926 node += NODE_SZ_STR(node) - 1;
14927 node = NEXTOPER(node);
14930 node = NEXTOPER(node);
14931 node += regarglen[(U8)op];
14933 if (op == CURLYX || op == OPEN)
14937 #ifdef DEBUG_DUMPUNTIL
14938 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14943 #endif /* DEBUGGING */
14947 * c-indentation-style: bsd
14948 * c-basic-offset: 4
14949 * indent-tabs-mode: nil
14952 * ex: set ts=8 sts=4 sw=4 et: