5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
158 char *starttry; /* -Dr: where regtry was called. */
159 #define RExC_starttry (pRExC_state->starttry)
161 SV *runtime_code_qr; /* qr with the runtime code blocks */
163 const char *lastparse;
165 AV *paren_name_list; /* idx -> name */
166 #define RExC_lastparse (pRExC_state->lastparse)
167 #define RExC_lastnum (pRExC_state->lastnum)
168 #define RExC_paren_name_list (pRExC_state->paren_name_list)
172 #define RExC_flags (pRExC_state->flags)
173 #define RExC_pm_flags (pRExC_state->pm_flags)
174 #define RExC_precomp (pRExC_state->precomp)
175 #define RExC_rx_sv (pRExC_state->rx_sv)
176 #define RExC_rx (pRExC_state->rx)
177 #define RExC_rxi (pRExC_state->rxi)
178 #define RExC_start (pRExC_state->start)
179 #define RExC_end (pRExC_state->end)
180 #define RExC_parse (pRExC_state->parse)
181 #define RExC_whilem_seen (pRExC_state->whilem_seen)
182 #ifdef RE_TRACK_PATTERN_OFFSETS
183 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
185 #define RExC_emit (pRExC_state->emit)
186 #define RExC_emit_start (pRExC_state->emit_start)
187 #define RExC_emit_bound (pRExC_state->emit_bound)
188 #define RExC_naughty (pRExC_state->naughty)
189 #define RExC_sawback (pRExC_state->sawback)
190 #define RExC_seen (pRExC_state->seen)
191 #define RExC_size (pRExC_state->size)
192 #define RExC_npar (pRExC_state->npar)
193 #define RExC_nestroot (pRExC_state->nestroot)
194 #define RExC_extralen (pRExC_state->extralen)
195 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196 #define RExC_utf8 (pRExC_state->utf8)
197 #define RExC_uni_semantics (pRExC_state->uni_semantics)
198 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
199 #define RExC_open_parens (pRExC_state->open_parens)
200 #define RExC_close_parens (pRExC_state->close_parens)
201 #define RExC_opend (pRExC_state->opend)
202 #define RExC_paren_names (pRExC_state->paren_names)
203 #define RExC_recurse (pRExC_state->recurse)
204 #define RExC_recurse_count (pRExC_state->recurse_count)
205 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
206 #define RExC_contains_locale (pRExC_state->contains_locale)
207 #define RExC_override_recoding (pRExC_state->override_recoding)
210 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
212 ((*s) == '{' && regcurly(s)))
215 #undef SPSTART /* dratted cpp namespace... */
218 * Flags to be passed up and down.
220 #define WORST 0 /* Worst case. */
221 #define HASWIDTH 0x01 /* Known to match non-null strings. */
223 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
224 * character, and if utf8, must be invariant. Note that this is not the same
225 * thing as REGNODE_SIMPLE */
227 #define SPSTART 0x04 /* Starts with * or +. */
228 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
229 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
231 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
233 /* whether trie related optimizations are enabled */
234 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
235 #define TRIE_STUDY_OPT
236 #define FULL_TRIE_STUDY
242 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
243 #define PBITVAL(paren) (1 << ((paren) & 7))
244 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
245 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
246 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
248 /* If not already in utf8, do a longjmp back to the beginning */
249 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
250 #define REQUIRE_UTF8 STMT_START { \
251 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
254 /* About scan_data_t.
256 During optimisation we recurse through the regexp program performing
257 various inplace (keyhole style) optimisations. In addition study_chunk
258 and scan_commit populate this data structure with information about
259 what strings MUST appear in the pattern. We look for the longest
260 string that must appear at a fixed location, and we look for the
261 longest string that may appear at a floating location. So for instance
266 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
267 strings (because they follow a .* construct). study_chunk will identify
268 both FOO and BAR as being the longest fixed and floating strings respectively.
270 The strings can be composites, for instance
274 will result in a composite fixed substring 'foo'.
276 For each string some basic information is maintained:
278 - offset or min_offset
279 This is the position the string must appear at, or not before.
280 It also implicitly (when combined with minlenp) tells us how many
281 characters must match before the string we are searching for.
282 Likewise when combined with minlenp and the length of the string it
283 tells us how many characters must appear after the string we have
287 Only used for floating strings. This is the rightmost point that
288 the string can appear at. If set to I32 max it indicates that the
289 string can occur infinitely far to the right.
292 A pointer to the minimum length of the pattern that the string
293 was found inside. This is important as in the case of positive
294 lookahead or positive lookbehind we can have multiple patterns
299 The minimum length of the pattern overall is 3, the minimum length
300 of the lookahead part is 3, but the minimum length of the part that
301 will actually match is 1. So 'FOO's minimum length is 3, but the
302 minimum length for the F is 1. This is important as the minimum length
303 is used to determine offsets in front of and behind the string being
304 looked for. Since strings can be composites this is the length of the
305 pattern at the time it was committed with a scan_commit. Note that
306 the length is calculated by study_chunk, so that the minimum lengths
307 are not known until the full pattern has been compiled, thus the
308 pointer to the value.
312 In the case of lookbehind the string being searched for can be
313 offset past the start point of the final matching string.
314 If this value was just blithely removed from the min_offset it would
315 invalidate some of the calculations for how many chars must match
316 before or after (as they are derived from min_offset and minlen and
317 the length of the string being searched for).
318 When the final pattern is compiled and the data is moved from the
319 scan_data_t structure into the regexp structure the information
320 about lookbehind is factored in, with the information that would
321 have been lost precalculated in the end_shift field for the
324 The fields pos_min and pos_delta are used to store the minimum offset
325 and the delta to the maximum offset at the current point in the pattern.
329 typedef struct scan_data_t {
330 /*I32 len_min; unused */
331 /*I32 len_delta; unused */
335 I32 last_end; /* min value, <0 unless valid. */
338 SV **longest; /* Either &l_fixed, or &l_float. */
339 SV *longest_fixed; /* longest fixed string found in pattern */
340 I32 offset_fixed; /* offset where it starts */
341 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
342 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
343 SV *longest_float; /* longest floating string found in pattern */
344 I32 offset_float_min; /* earliest point in string it can appear */
345 I32 offset_float_max; /* latest point in string it can appear */
346 I32 *minlen_float; /* pointer to the minlen relevant to the string */
347 I32 lookbehind_float; /* is the position of the string modified by LB */
351 struct regnode_charclass_class *start_class;
355 * Forward declarations for pregcomp()'s friends.
358 static const scan_data_t zero_scan_data =
359 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
361 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
362 #define SF_BEFORE_SEOL 0x0001
363 #define SF_BEFORE_MEOL 0x0002
364 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
365 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
368 # define SF_FIX_SHIFT_EOL (0+2)
369 # define SF_FL_SHIFT_EOL (0+4)
371 # define SF_FIX_SHIFT_EOL (+2)
372 # define SF_FL_SHIFT_EOL (+4)
375 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
376 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
378 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
379 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
380 #define SF_IS_INF 0x0040
381 #define SF_HAS_PAR 0x0080
382 #define SF_IN_PAR 0x0100
383 #define SF_HAS_EVAL 0x0200
384 #define SCF_DO_SUBSTR 0x0400
385 #define SCF_DO_STCLASS_AND 0x0800
386 #define SCF_DO_STCLASS_OR 0x1000
387 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
388 #define SCF_WHILEM_VISITED_POS 0x2000
390 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
391 #define SCF_SEEN_ACCEPT 0x8000
393 #define UTF cBOOL(RExC_utf8)
395 /* The enums for all these are ordered so things work out correctly */
396 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
397 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
398 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
399 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
400 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
401 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
402 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
404 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
406 #define OOB_UNICODE 12345678
407 #define OOB_NAMEDCLASS -1
409 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
410 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
413 /* length of regex to show in messages that don't mark a position within */
414 #define RegexLengthToShowInErrorMessages 127
417 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
418 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
419 * op/pragma/warn/regcomp.
421 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
422 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
424 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
427 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
428 * arg. Show regex, up to a maximum length. If it's too long, chop and add
431 #define _FAIL(code) STMT_START { \
432 const char *ellipses = ""; \
433 IV len = RExC_end - RExC_precomp; \
436 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
437 if (len > RegexLengthToShowInErrorMessages) { \
438 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
439 len = RegexLengthToShowInErrorMessages - 10; \
445 #define FAIL(msg) _FAIL( \
446 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
447 msg, (int)len, RExC_precomp, ellipses))
449 #define FAIL2(msg,arg) _FAIL( \
450 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
451 arg, (int)len, RExC_precomp, ellipses))
454 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
456 #define Simple_vFAIL(m) STMT_START { \
457 const IV offset = RExC_parse - RExC_precomp; \
458 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
459 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
465 #define vFAIL(m) STMT_START { \
467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
472 * Like Simple_vFAIL(), but accepts two arguments.
474 #define Simple_vFAIL2(m,a1) STMT_START { \
475 const IV offset = RExC_parse - RExC_precomp; \
476 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
477 (int)offset, RExC_precomp, RExC_precomp + offset); \
481 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
483 #define vFAIL2(m,a1) STMT_START { \
485 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
486 Simple_vFAIL2(m, a1); \
491 * Like Simple_vFAIL(), but accepts three arguments.
493 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
494 const IV offset = RExC_parse - RExC_precomp; \
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
502 #define vFAIL3(m,a1,a2) STMT_START { \
504 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
505 Simple_vFAIL3(m, a1, a2); \
509 * Like Simple_vFAIL(), but accepts four arguments.
511 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
512 const IV offset = RExC_parse - RExC_precomp; \
513 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
517 #define ckWARNreg(loc,m) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
520 (int)offset, RExC_precomp, RExC_precomp + offset); \
523 #define ckWARNregdep(loc,m) STMT_START { \
524 const IV offset = loc - RExC_precomp; \
525 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
527 (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define ckWARN2regdep(loc,m, a1) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
534 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define ckWARN2reg(loc, m, a1) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
543 #define vWARN3(loc, m, a1, a2) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
555 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
556 const IV offset = loc - RExC_precomp; \
557 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
558 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
561 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
564 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
567 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
568 const IV offset = loc - RExC_precomp; \
569 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
570 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
574 /* Allow for side effects in s */
575 #define REGC(c,s) STMT_START { \
576 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
579 /* Macros for recording node offsets. 20001227 mjd@plover.com
580 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
581 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
582 * Element 0 holds the number n.
583 * Position is 1 indexed.
585 #ifndef RE_TRACK_PATTERN_OFFSETS
586 #define Set_Node_Offset_To_R(node,byte)
587 #define Set_Node_Offset(node,byte)
588 #define Set_Cur_Node_Offset
589 #define Set_Node_Length_To_R(node,len)
590 #define Set_Node_Length(node,len)
591 #define Set_Node_Cur_Length(node)
592 #define Node_Offset(n)
593 #define Node_Length(n)
594 #define Set_Node_Offset_Length(node,offset,len)
595 #define ProgLen(ri) ri->u.proglen
596 #define SetProgLen(ri,x) ri->u.proglen = x
598 #define ProgLen(ri) ri->u.offsets[0]
599 #define SetProgLen(ri,x) ri->u.offsets[0] = x
600 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
602 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
603 __LINE__, (int)(node), (int)(byte))); \
605 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
607 RExC_offsets[2*(node)-1] = (byte); \
612 #define Set_Node_Offset(node,byte) \
613 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
614 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
616 #define Set_Node_Length_To_R(node,len) STMT_START { \
618 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
619 __LINE__, (int)(node), (int)(len))); \
621 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
623 RExC_offsets[2*(node)] = (len); \
628 #define Set_Node_Length(node,len) \
629 Set_Node_Length_To_R((node)-RExC_emit_start, len)
630 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
631 #define Set_Node_Cur_Length(node) \
632 Set_Node_Length(node, RExC_parse - parse_start)
634 /* Get offsets and lengths */
635 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
636 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
638 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
639 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
640 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
644 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
645 #define EXPERIMENTAL_INPLACESCAN
646 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
648 #define DEBUG_STUDYDATA(str,data,depth) \
649 DEBUG_OPTIMISE_MORE_r(if(data){ \
650 PerlIO_printf(Perl_debug_log, \
651 "%*s" str "Pos:%"IVdf"/%"IVdf \
652 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
653 (int)(depth)*2, "", \
654 (IV)((data)->pos_min), \
655 (IV)((data)->pos_delta), \
656 (UV)((data)->flags), \
657 (IV)((data)->whilem_c), \
658 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
659 is_inf ? "INF " : "" \
661 if ((data)->last_found) \
662 PerlIO_printf(Perl_debug_log, \
663 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
664 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
665 SvPVX_const((data)->last_found), \
666 (IV)((data)->last_end), \
667 (IV)((data)->last_start_min), \
668 (IV)((data)->last_start_max), \
669 ((data)->longest && \
670 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
671 SvPVX_const((data)->longest_fixed), \
672 (IV)((data)->offset_fixed), \
673 ((data)->longest && \
674 (data)->longest==&((data)->longest_float)) ? "*" : "", \
675 SvPVX_const((data)->longest_float), \
676 (IV)((data)->offset_float_min), \
677 (IV)((data)->offset_float_max) \
679 PerlIO_printf(Perl_debug_log,"\n"); \
682 static void clear_re(pTHX_ void *r);
684 /* Mark that we cannot extend a found fixed substring at this point.
685 Update the longest found anchored substring and the longest found
686 floating substrings if needed. */
689 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
691 const STRLEN l = CHR_SVLEN(data->last_found);
692 const STRLEN old_l = CHR_SVLEN(*data->longest);
693 GET_RE_DEBUG_FLAGS_DECL;
695 PERL_ARGS_ASSERT_SCAN_COMMIT;
697 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
698 SvSetMagicSV(*data->longest, data->last_found);
699 if (*data->longest == data->longest_fixed) {
700 data->offset_fixed = l ? data->last_start_min : data->pos_min;
701 if (data->flags & SF_BEFORE_EOL)
703 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
705 data->flags &= ~SF_FIX_BEFORE_EOL;
706 data->minlen_fixed=minlenp;
707 data->lookbehind_fixed=0;
709 else { /* *data->longest == data->longest_float */
710 data->offset_float_min = l ? data->last_start_min : data->pos_min;
711 data->offset_float_max = (l
712 ? data->last_start_max
713 : data->pos_min + data->pos_delta);
714 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
715 data->offset_float_max = I32_MAX;
716 if (data->flags & SF_BEFORE_EOL)
718 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
720 data->flags &= ~SF_FL_BEFORE_EOL;
721 data->minlen_float=minlenp;
722 data->lookbehind_float=0;
725 SvCUR_set(data->last_found, 0);
727 SV * const sv = data->last_found;
728 if (SvUTF8(sv) && SvMAGICAL(sv)) {
729 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
735 data->flags &= ~SF_BEFORE_EOL;
736 DEBUG_STUDYDATA("commit: ",data,0);
739 /* Can match anything (initialization) */
741 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
743 PERL_ARGS_ASSERT_CL_ANYTHING;
745 ANYOF_BITMAP_SETALL(cl);
746 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
747 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
749 /* If any portion of the regex is to operate under locale rules,
750 * initialization includes it. The reason this isn't done for all regexes
751 * is that the optimizer was written under the assumption that locale was
752 * all-or-nothing. Given the complexity and lack of documentation in the
753 * optimizer, and that there are inadequate test cases for locale, so many
754 * parts of it may not work properly, it is safest to avoid locale unless
756 if (RExC_contains_locale) {
757 ANYOF_CLASS_SETALL(cl); /* /l uses class */
758 cl->flags |= ANYOF_LOCALE;
761 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
765 /* Can match anything (initialization) */
767 S_cl_is_anything(const struct regnode_charclass_class *cl)
771 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
773 for (value = 0; value <= ANYOF_MAX; value += 2)
774 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
776 if (!(cl->flags & ANYOF_UNICODE_ALL))
778 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
783 /* Can match anything (initialization) */
785 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
787 PERL_ARGS_ASSERT_CL_INIT;
789 Zero(cl, 1, struct regnode_charclass_class);
791 cl_anything(pRExC_state, cl);
792 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
795 /* These two functions currently do the exact same thing */
796 #define cl_init_zero S_cl_init
798 /* 'AND' a given class with another one. Can create false positives. 'cl'
799 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
800 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
802 S_cl_and(struct regnode_charclass_class *cl,
803 const struct regnode_charclass_class *and_with)
805 PERL_ARGS_ASSERT_CL_AND;
807 assert(and_with->type == ANYOF);
809 /* I (khw) am not sure all these restrictions are necessary XXX */
810 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
811 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
812 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
813 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
814 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
817 if (and_with->flags & ANYOF_INVERT)
818 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
819 cl->bitmap[i] &= ~and_with->bitmap[i];
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] &= and_with->bitmap[i];
823 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
825 if (and_with->flags & ANYOF_INVERT) {
827 /* Here, the and'ed node is inverted. Get the AND of the flags that
828 * aren't affected by the inversion. Those that are affected are
829 * handled individually below */
830 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
831 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
832 cl->flags |= affected_flags;
834 /* We currently don't know how to deal with things that aren't in the
835 * bitmap, but we know that the intersection is no greater than what
836 * is already in cl, so let there be false positives that get sorted
837 * out after the synthetic start class succeeds, and the node is
838 * matched for real. */
840 /* The inversion of these two flags indicate that the resulting
841 * intersection doesn't have them */
842 if (and_with->flags & ANYOF_UNICODE_ALL) {
843 cl->flags &= ~ANYOF_UNICODE_ALL;
845 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
846 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
849 else { /* and'd node is not inverted */
850 U8 outside_bitmap_but_not_utf8; /* Temp variable */
852 if (! ANYOF_NONBITMAP(and_with)) {
854 /* Here 'and_with' doesn't match anything outside the bitmap
855 * (except possibly ANYOF_UNICODE_ALL), which means the
856 * intersection can't either, except for ANYOF_UNICODE_ALL, in
857 * which case we don't know what the intersection is, but it's no
858 * greater than what cl already has, so can just leave it alone,
859 * with possible false positives */
860 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
861 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
862 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
865 else if (! ANYOF_NONBITMAP(cl)) {
867 /* Here, 'and_with' does match something outside the bitmap, and cl
868 * doesn't have a list of things to match outside the bitmap. If
869 * cl can match all code points above 255, the intersection will
870 * be those above-255 code points that 'and_with' matches. If cl
871 * can't match all Unicode code points, it means that it can't
872 * match anything outside the bitmap (since the 'if' that got us
873 * into this block tested for that), so we leave the bitmap empty.
875 if (cl->flags & ANYOF_UNICODE_ALL) {
876 ARG_SET(cl, ARG(and_with));
878 /* and_with's ARG may match things that don't require UTF8.
879 * And now cl's will too, in spite of this being an 'and'. See
880 * the comments below about the kludge */
881 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
885 /* Here, both 'and_with' and cl match something outside the
886 * bitmap. Currently we do not do the intersection, so just match
887 * whatever cl had at the beginning. */
891 /* Take the intersection of the two sets of flags. However, the
892 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
893 * kludge around the fact that this flag is not treated like the others
894 * which are initialized in cl_anything(). The way the optimizer works
895 * is that the synthetic start class (SSC) is initialized to match
896 * anything, and then the first time a real node is encountered, its
897 * values are AND'd with the SSC's with the result being the values of
898 * the real node. However, there are paths through the optimizer where
899 * the AND never gets called, so those initialized bits are set
900 * inappropriately, which is not usually a big deal, as they just cause
901 * false positives in the SSC, which will just mean a probably
902 * imperceptible slow down in execution. However this bit has a
903 * higher false positive consequence in that it can cause utf8.pm,
904 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
905 * bigger slowdown and also causes significant extra memory to be used.
906 * In order to prevent this, the code now takes a different tack. The
907 * bit isn't set unless some part of the regular expression needs it,
908 * but once set it won't get cleared. This means that these extra
909 * modules won't get loaded unless there was some path through the
910 * pattern that would have required them anyway, and so any false
911 * positives that occur by not ANDing them out when they could be
912 * aren't as severe as they would be if we treated this bit like all
914 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
915 & ANYOF_NONBITMAP_NON_UTF8;
916 cl->flags &= and_with->flags;
917 cl->flags |= outside_bitmap_but_not_utf8;
921 /* 'OR' a given class with another one. Can create false positives. 'cl'
922 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
923 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
925 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
927 PERL_ARGS_ASSERT_CL_OR;
929 if (or_with->flags & ANYOF_INVERT) {
931 /* Here, the or'd node is to be inverted. This means we take the
932 * complement of everything not in the bitmap, but currently we don't
933 * know what that is, so give up and match anything */
934 if (ANYOF_NONBITMAP(or_with)) {
935 cl_anything(pRExC_state, cl);
938 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
939 * <= (B1 | !B2) | (CL1 | !CL2)
940 * which is wasteful if CL2 is small, but we ignore CL2:
941 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
942 * XXXX Can we handle case-fold? Unclear:
943 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
944 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
946 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
947 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
948 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
951 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
952 cl->bitmap[i] |= ~or_with->bitmap[i];
953 } /* XXXX: logic is complicated otherwise */
955 cl_anything(pRExC_state, cl);
958 /* And, we can just take the union of the flags that aren't affected
959 * by the inversion */
960 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
962 /* For the remaining flags:
963 ANYOF_UNICODE_ALL and inverted means to not match anything above
964 255, which means that the union with cl should just be
965 what cl has in it, so can ignore this flag
966 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
967 is 127-255 to match them, but then invert that, so the
968 union with cl should just be what cl has in it, so can
971 } else { /* 'or_with' is not inverted */
972 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
973 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
974 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
975 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
978 /* OR char bitmap and class bitmap separately */
979 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
980 cl->bitmap[i] |= or_with->bitmap[i];
981 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
982 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
983 cl->classflags[i] |= or_with->classflags[i];
984 cl->flags |= ANYOF_CLASS;
987 else { /* XXXX: logic is complicated, leave it along for a moment. */
988 cl_anything(pRExC_state, cl);
991 if (ANYOF_NONBITMAP(or_with)) {
993 /* Use the added node's outside-the-bit-map match if there isn't a
994 * conflict. If there is a conflict (both nodes match something
995 * outside the bitmap, but what they match outside is not the same
996 * pointer, and hence not easily compared until XXX we extend
997 * inversion lists this far), give up and allow the start class to
998 * match everything outside the bitmap. If that stuff is all above
999 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1000 if (! ANYOF_NONBITMAP(cl)) {
1001 ARG_SET(cl, ARG(or_with));
1003 else if (ARG(cl) != ARG(or_with)) {
1005 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1006 cl_anything(pRExC_state, cl);
1009 cl->flags |= ANYOF_UNICODE_ALL;
1014 /* Take the union */
1015 cl->flags |= or_with->flags;
1019 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1020 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1021 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1022 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1027 dump_trie(trie,widecharmap,revcharmap)
1028 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1029 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1031 These routines dump out a trie in a somewhat readable format.
1032 The _interim_ variants are used for debugging the interim
1033 tables that are used to generate the final compressed
1034 representation which is what dump_trie expects.
1036 Part of the reason for their existence is to provide a form
1037 of documentation as to how the different representations function.
1042 Dumps the final compressed table form of the trie to Perl_debug_log.
1043 Used for debugging make_trie().
1047 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1048 AV *revcharmap, U32 depth)
1051 SV *sv=sv_newmortal();
1052 int colwidth= widecharmap ? 6 : 4;
1054 GET_RE_DEBUG_FLAGS_DECL;
1056 PERL_ARGS_ASSERT_DUMP_TRIE;
1058 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1059 (int)depth * 2 + 2,"",
1060 "Match","Base","Ofs" );
1062 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1063 SV ** const tmp = av_fetch( revcharmap, state, 0);
1065 PerlIO_printf( Perl_debug_log, "%*s",
1067 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1068 PL_colors[0], PL_colors[1],
1069 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1070 PERL_PV_ESCAPE_FIRSTCHAR
1075 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1076 (int)depth * 2 + 2,"");
1078 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1079 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1080 PerlIO_printf( Perl_debug_log, "\n");
1082 for( state = 1 ; state < trie->statecount ; state++ ) {
1083 const U32 base = trie->states[ state ].trans.base;
1085 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1087 if ( trie->states[ state ].wordnum ) {
1088 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1090 PerlIO_printf( Perl_debug_log, "%6s", "" );
1093 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1098 while( ( base + ofs < trie->uniquecharcount ) ||
1099 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1100 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1103 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1105 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1106 if ( ( base + ofs >= trie->uniquecharcount ) &&
1107 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1108 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1110 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1112 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1114 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1118 PerlIO_printf( Perl_debug_log, "]");
1121 PerlIO_printf( Perl_debug_log, "\n" );
1123 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1124 for (word=1; word <= trie->wordcount; word++) {
1125 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1126 (int)word, (int)(trie->wordinfo[word].prev),
1127 (int)(trie->wordinfo[word].len));
1129 PerlIO_printf(Perl_debug_log, "\n" );
1132 Dumps a fully constructed but uncompressed trie in list form.
1133 List tries normally only are used for construction when the number of
1134 possible chars (trie->uniquecharcount) is very high.
1135 Used for debugging make_trie().
1138 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1139 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1143 SV *sv=sv_newmortal();
1144 int colwidth= widecharmap ? 6 : 4;
1145 GET_RE_DEBUG_FLAGS_DECL;
1147 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1149 /* print out the table precompression. */
1150 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1151 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1152 "------:-----+-----------------\n" );
1154 for( state=1 ; state < next_alloc ; state ++ ) {
1157 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1158 (int)depth * 2 + 2,"", (UV)state );
1159 if ( ! trie->states[ state ].wordnum ) {
1160 PerlIO_printf( Perl_debug_log, "%5s| ","");
1162 PerlIO_printf( Perl_debug_log, "W%4x| ",
1163 trie->states[ state ].wordnum
1166 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1167 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1169 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1171 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1172 PL_colors[0], PL_colors[1],
1173 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1174 PERL_PV_ESCAPE_FIRSTCHAR
1176 TRIE_LIST_ITEM(state,charid).forid,
1177 (UV)TRIE_LIST_ITEM(state,charid).newstate
1180 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1181 (int)((depth * 2) + 14), "");
1184 PerlIO_printf( Perl_debug_log, "\n");
1189 Dumps a fully constructed but uncompressed trie in table form.
1190 This is the normal DFA style state transition table, with a few
1191 twists to facilitate compression later.
1192 Used for debugging make_trie().
1195 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1196 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1201 SV *sv=sv_newmortal();
1202 int colwidth= widecharmap ? 6 : 4;
1203 GET_RE_DEBUG_FLAGS_DECL;
1205 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1208 print out the table precompression so that we can do a visual check
1209 that they are identical.
1212 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1214 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1215 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1217 PerlIO_printf( Perl_debug_log, "%*s",
1219 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1220 PL_colors[0], PL_colors[1],
1221 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1222 PERL_PV_ESCAPE_FIRSTCHAR
1228 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1230 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1231 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1234 PerlIO_printf( Perl_debug_log, "\n" );
1236 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1238 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1239 (int)depth * 2 + 2,"",
1240 (UV)TRIE_NODENUM( state ) );
1242 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1243 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1245 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1247 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1249 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1250 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1252 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1253 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1261 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1262 startbranch: the first branch in the whole branch sequence
1263 first : start branch of sequence of branch-exact nodes.
1264 May be the same as startbranch
1265 last : Thing following the last branch.
1266 May be the same as tail.
1267 tail : item following the branch sequence
1268 count : words in the sequence
1269 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1270 depth : indent depth
1272 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1274 A trie is an N'ary tree where the branches are determined by digital
1275 decomposition of the key. IE, at the root node you look up the 1st character and
1276 follow that branch repeat until you find the end of the branches. Nodes can be
1277 marked as "accepting" meaning they represent a complete word. Eg:
1281 would convert into the following structure. Numbers represent states, letters
1282 following numbers represent valid transitions on the letter from that state, if
1283 the number is in square brackets it represents an accepting state, otherwise it
1284 will be in parenthesis.
1286 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1290 (1) +-i->(6)-+-s->[7]
1292 +-s->(3)-+-h->(4)-+-e->[5]
1294 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1296 This shows that when matching against the string 'hers' we will begin at state 1
1297 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1298 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1299 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1300 single traverse. We store a mapping from accepting to state to which word was
1301 matched, and then when we have multiple possibilities we try to complete the
1302 rest of the regex in the order in which they occured in the alternation.
1304 The only prior NFA like behaviour that would be changed by the TRIE support is
1305 the silent ignoring of duplicate alternations which are of the form:
1307 / (DUPE|DUPE) X? (?{ ... }) Y /x
1309 Thus EVAL blocks following a trie may be called a different number of times with
1310 and without the optimisation. With the optimisations dupes will be silently
1311 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1312 the following demonstrates:
1314 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1316 which prints out 'word' three times, but
1318 'words'=~/(word|word|word)(?{ print $1 })S/
1320 which doesnt print it out at all. This is due to other optimisations kicking in.
1322 Example of what happens on a structural level:
1324 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1326 1: CURLYM[1] {1,32767}(18)
1337 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1338 and should turn into:
1340 1: CURLYM[1] {1,32767}(18)
1342 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1350 Cases where tail != last would be like /(?foo|bar)baz/:
1360 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1361 and would end up looking like:
1364 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1371 d = uvuni_to_utf8_flags(d, uv, 0);
1373 is the recommended Unicode-aware way of saying
1378 #define TRIE_STORE_REVCHAR(val) \
1381 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1382 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1383 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1384 SvCUR_set(zlopp, kapow - flrbbbbb); \
1387 av_push(revcharmap, zlopp); \
1389 char ooooff = (char)val; \
1390 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1394 #define TRIE_READ_CHAR STMT_START { \
1397 /* if it is UTF then it is either already folded, or does not need folding */ \
1398 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1400 else if (folder == PL_fold_latin1) { \
1401 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1402 if ( foldlen > 0 ) { \
1403 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1409 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1410 skiplen = UNISKIP(uvc); \
1411 foldlen -= skiplen; \
1412 scan = foldbuf + skiplen; \
1415 /* raw data, will be folded later if needed */ \
1423 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1424 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1425 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1426 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1428 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1429 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1430 TRIE_LIST_CUR( state )++; \
1433 #define TRIE_LIST_NEW(state) STMT_START { \
1434 Newxz( trie->states[ state ].trans.list, \
1435 4, reg_trie_trans_le ); \
1436 TRIE_LIST_CUR( state ) = 1; \
1437 TRIE_LIST_LEN( state ) = 4; \
1440 #define TRIE_HANDLE_WORD(state) STMT_START { \
1441 U16 dupe= trie->states[ state ].wordnum; \
1442 regnode * const noper_next = regnext( noper ); \
1445 /* store the word for dumping */ \
1447 if (OP(noper) != NOTHING) \
1448 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1450 tmp = newSVpvn_utf8( "", 0, UTF ); \
1451 av_push( trie_words, tmp ); \
1455 trie->wordinfo[curword].prev = 0; \
1456 trie->wordinfo[curword].len = wordlen; \
1457 trie->wordinfo[curword].accept = state; \
1459 if ( noper_next < tail ) { \
1461 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1462 trie->jump[curword] = (U16)(noper_next - convert); \
1464 jumper = noper_next; \
1466 nextbranch= regnext(cur); \
1470 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1471 /* chain, so that when the bits of chain are later */\
1472 /* linked together, the dups appear in the chain */\
1473 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1474 trie->wordinfo[dupe].prev = curword; \
1476 /* we haven't inserted this word yet. */ \
1477 trie->states[ state ].wordnum = curword; \
1482 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1483 ( ( base + charid >= ucharcount \
1484 && base + charid < ubound \
1485 && state == trie->trans[ base - ucharcount + charid ].check \
1486 && trie->trans[ base - ucharcount + charid ].next ) \
1487 ? trie->trans[ base - ucharcount + charid ].next \
1488 : ( state==1 ? special : 0 ) \
1492 #define MADE_JUMP_TRIE 2
1493 #define MADE_EXACT_TRIE 4
1496 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1499 /* first pass, loop through and scan words */
1500 reg_trie_data *trie;
1501 HV *widecharmap = NULL;
1502 AV *revcharmap = newAV();
1504 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1509 regnode *jumper = NULL;
1510 regnode *nextbranch = NULL;
1511 regnode *convert = NULL;
1512 U32 *prev_states; /* temp array mapping each state to previous one */
1513 /* we just use folder as a flag in utf8 */
1514 const U8 * folder = NULL;
1517 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1518 AV *trie_words = NULL;
1519 /* along with revcharmap, this only used during construction but both are
1520 * useful during debugging so we store them in the struct when debugging.
1523 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1524 STRLEN trie_charcount=0;
1526 SV *re_trie_maxbuff;
1527 GET_RE_DEBUG_FLAGS_DECL;
1529 PERL_ARGS_ASSERT_MAKE_TRIE;
1531 PERL_UNUSED_ARG(depth);
1538 case EXACTFU_TRICKYFOLD:
1539 case EXACTFU: folder = PL_fold_latin1; break;
1540 case EXACTF: folder = PL_fold; break;
1541 case EXACTFL: folder = PL_fold_locale; break;
1542 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1545 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1547 trie->startstate = 1;
1548 trie->wordcount = word_count;
1549 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1550 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1552 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1553 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1554 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1557 trie_words = newAV();
1560 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1561 if (!SvIOK(re_trie_maxbuff)) {
1562 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1564 DEBUG_TRIE_COMPILE_r({
1565 PerlIO_printf( Perl_debug_log,
1566 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1567 (int)depth * 2 + 2, "",
1568 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1569 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1573 /* Find the node we are going to overwrite */
1574 if ( first == startbranch && OP( last ) != BRANCH ) {
1575 /* whole branch chain */
1578 /* branch sub-chain */
1579 convert = NEXTOPER( first );
1582 /* -- First loop and Setup --
1584 We first traverse the branches and scan each word to determine if it
1585 contains widechars, and how many unique chars there are, this is
1586 important as we have to build a table with at least as many columns as we
1589 We use an array of integers to represent the character codes 0..255
1590 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1591 native representation of the character value as the key and IV's for the
1594 *TODO* If we keep track of how many times each character is used we can
1595 remap the columns so that the table compression later on is more
1596 efficient in terms of memory by ensuring the most common value is in the
1597 middle and the least common are on the outside. IMO this would be better
1598 than a most to least common mapping as theres a decent chance the most
1599 common letter will share a node with the least common, meaning the node
1600 will not be compressible. With a middle is most common approach the worst
1601 case is when we have the least common nodes twice.
1605 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1606 regnode *noper = NEXTOPER( cur );
1607 const U8 *uc = (U8*)STRING( noper );
1608 const U8 *e = uc + STR_LEN( noper );
1610 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1612 const U8 *scan = (U8*)NULL;
1613 U32 wordlen = 0; /* required init */
1615 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1617 if (OP(noper) == NOTHING) {
1618 regnode *noper_next= regnext(noper);
1619 if (noper_next != tail && OP(noper_next) == flags) {
1621 uc= (U8*)STRING(noper);
1622 e= uc + STR_LEN(noper);
1623 trie->minlen= STR_LEN(noper);
1630 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1631 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1632 regardless of encoding */
1633 if (OP( noper ) == EXACTFU_SS) {
1634 /* false positives are ok, so just set this */
1635 TRIE_BITMAP_SET(trie,0xDF);
1638 for ( ; uc < e ; uc += len ) {
1639 TRIE_CHARCOUNT(trie)++;
1644 U8 folded= folder[ (U8) uvc ];
1645 if ( !trie->charmap[ folded ] ) {
1646 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1647 TRIE_STORE_REVCHAR( folded );
1650 if ( !trie->charmap[ uvc ] ) {
1651 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1652 TRIE_STORE_REVCHAR( uvc );
1655 /* store the codepoint in the bitmap, and its folded
1657 TRIE_BITMAP_SET(trie, uvc);
1659 /* store the folded codepoint */
1660 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1663 /* store first byte of utf8 representation of
1664 variant codepoints */
1665 if (! UNI_IS_INVARIANT(uvc)) {
1666 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1669 set_bit = 0; /* We've done our bit :-) */
1674 widecharmap = newHV();
1676 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1679 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1681 if ( !SvTRUE( *svpp ) ) {
1682 sv_setiv( *svpp, ++trie->uniquecharcount );
1683 TRIE_STORE_REVCHAR(uvc);
1687 if( cur == first ) {
1688 trie->minlen = chars;
1689 trie->maxlen = chars;
1690 } else if (chars < trie->minlen) {
1691 trie->minlen = chars;
1692 } else if (chars > trie->maxlen) {
1693 trie->maxlen = chars;
1695 if (OP( noper ) == EXACTFU_SS) {
1696 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1697 if (trie->minlen > 1)
1700 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1701 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1702 * - We assume that any such sequence might match a 2 byte string */
1703 if (trie->minlen > 2 )
1707 } /* end first pass */
1708 DEBUG_TRIE_COMPILE_r(
1709 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1710 (int)depth * 2 + 2,"",
1711 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1712 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1713 (int)trie->minlen, (int)trie->maxlen )
1717 We now know what we are dealing with in terms of unique chars and
1718 string sizes so we can calculate how much memory a naive
1719 representation using a flat table will take. If it's over a reasonable
1720 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1721 conservative but potentially much slower representation using an array
1724 At the end we convert both representations into the same compressed
1725 form that will be used in regexec.c for matching with. The latter
1726 is a form that cannot be used to construct with but has memory
1727 properties similar to the list form and access properties similar
1728 to the table form making it both suitable for fast searches and
1729 small enough that its feasable to store for the duration of a program.
1731 See the comment in the code where the compressed table is produced
1732 inplace from the flat tabe representation for an explanation of how
1733 the compression works.
1738 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1741 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1743 Second Pass -- Array Of Lists Representation
1745 Each state will be represented by a list of charid:state records
1746 (reg_trie_trans_le) the first such element holds the CUR and LEN
1747 points of the allocated array. (See defines above).
1749 We build the initial structure using the lists, and then convert
1750 it into the compressed table form which allows faster lookups
1751 (but cant be modified once converted).
1754 STRLEN transcount = 1;
1756 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1757 "%*sCompiling trie using list compiler\n",
1758 (int)depth * 2 + 2, ""));
1760 trie->states = (reg_trie_state *)
1761 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1762 sizeof(reg_trie_state) );
1766 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1768 regnode *noper = NEXTOPER( cur );
1769 U8 *uc = (U8*)STRING( noper );
1770 const U8 *e = uc + STR_LEN( noper );
1771 U32 state = 1; /* required init */
1772 U16 charid = 0; /* sanity init */
1773 U8 *scan = (U8*)NULL; /* sanity init */
1774 STRLEN foldlen = 0; /* required init */
1775 U32 wordlen = 0; /* required init */
1776 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1779 if (OP(noper) == NOTHING) {
1780 regnode *noper_next= regnext(noper);
1781 if (noper_next != tail && OP(noper_next) == flags) {
1783 uc= (U8*)STRING(noper);
1784 e= uc + STR_LEN(noper);
1788 if (OP(noper) != NOTHING) {
1789 for ( ; uc < e ; uc += len ) {
1794 charid = trie->charmap[ uvc ];
1796 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1800 charid=(U16)SvIV( *svpp );
1803 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1810 if ( !trie->states[ state ].trans.list ) {
1811 TRIE_LIST_NEW( state );
1813 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1814 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1815 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1820 newstate = next_alloc++;
1821 prev_states[newstate] = state;
1822 TRIE_LIST_PUSH( state, charid, newstate );
1827 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1831 TRIE_HANDLE_WORD(state);
1833 } /* end second pass */
1835 /* next alloc is the NEXT state to be allocated */
1836 trie->statecount = next_alloc;
1837 trie->states = (reg_trie_state *)
1838 PerlMemShared_realloc( trie->states,
1840 * sizeof(reg_trie_state) );
1842 /* and now dump it out before we compress it */
1843 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1844 revcharmap, next_alloc,
1848 trie->trans = (reg_trie_trans *)
1849 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1856 for( state=1 ; state < next_alloc ; state ++ ) {
1860 DEBUG_TRIE_COMPILE_MORE_r(
1861 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1865 if (trie->states[state].trans.list) {
1866 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1870 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1871 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1872 if ( forid < minid ) {
1874 } else if ( forid > maxid ) {
1878 if ( transcount < tp + maxid - minid + 1) {
1880 trie->trans = (reg_trie_trans *)
1881 PerlMemShared_realloc( trie->trans,
1883 * sizeof(reg_trie_trans) );
1884 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1886 base = trie->uniquecharcount + tp - minid;
1887 if ( maxid == minid ) {
1889 for ( ; zp < tp ; zp++ ) {
1890 if ( ! trie->trans[ zp ].next ) {
1891 base = trie->uniquecharcount + zp - minid;
1892 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1893 trie->trans[ zp ].check = state;
1899 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1900 trie->trans[ tp ].check = state;
1905 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1906 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1907 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1908 trie->trans[ tid ].check = state;
1910 tp += ( maxid - minid + 1 );
1912 Safefree(trie->states[ state ].trans.list);
1915 DEBUG_TRIE_COMPILE_MORE_r(
1916 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1919 trie->states[ state ].trans.base=base;
1921 trie->lasttrans = tp + 1;
1925 Second Pass -- Flat Table Representation.
1927 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1928 We know that we will need Charcount+1 trans at most to store the data
1929 (one row per char at worst case) So we preallocate both structures
1930 assuming worst case.
1932 We then construct the trie using only the .next slots of the entry
1935 We use the .check field of the first entry of the node temporarily to
1936 make compression both faster and easier by keeping track of how many non
1937 zero fields are in the node.
1939 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1942 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1943 number representing the first entry of the node, and state as a
1944 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1945 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1946 are 2 entrys per node. eg:
1954 The table is internally in the right hand, idx form. However as we also
1955 have to deal with the states array which is indexed by nodenum we have to
1956 use TRIE_NODENUM() to convert.
1959 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1960 "%*sCompiling trie using table compiler\n",
1961 (int)depth * 2 + 2, ""));
1963 trie->trans = (reg_trie_trans *)
1964 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1965 * trie->uniquecharcount + 1,
1966 sizeof(reg_trie_trans) );
1967 trie->states = (reg_trie_state *)
1968 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1969 sizeof(reg_trie_state) );
1970 next_alloc = trie->uniquecharcount + 1;
1973 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1975 regnode *noper = NEXTOPER( cur );
1976 const U8 *uc = (U8*)STRING( noper );
1977 const U8 *e = uc + STR_LEN( noper );
1979 U32 state = 1; /* required init */
1981 U16 charid = 0; /* sanity init */
1982 U32 accept_state = 0; /* sanity init */
1983 U8 *scan = (U8*)NULL; /* sanity init */
1985 STRLEN foldlen = 0; /* required init */
1986 U32 wordlen = 0; /* required init */
1988 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1990 if (OP(noper) == NOTHING) {
1991 regnode *noper_next= regnext(noper);
1992 if (noper_next != tail && OP(noper_next) == flags) {
1994 uc= (U8*)STRING(noper);
1995 e= uc + STR_LEN(noper);
1999 if ( OP(noper) != NOTHING ) {
2000 for ( ; uc < e ; uc += len ) {
2005 charid = trie->charmap[ uvc ];
2007 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2008 charid = svpp ? (U16)SvIV(*svpp) : 0;
2012 if ( !trie->trans[ state + charid ].next ) {
2013 trie->trans[ state + charid ].next = next_alloc;
2014 trie->trans[ state ].check++;
2015 prev_states[TRIE_NODENUM(next_alloc)]
2016 = TRIE_NODENUM(state);
2017 next_alloc += trie->uniquecharcount;
2019 state = trie->trans[ state + charid ].next;
2021 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2023 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2026 accept_state = TRIE_NODENUM( state );
2027 TRIE_HANDLE_WORD(accept_state);
2029 } /* end second pass */
2031 /* and now dump it out before we compress it */
2032 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2034 next_alloc, depth+1));
2038 * Inplace compress the table.*
2040 For sparse data sets the table constructed by the trie algorithm will
2041 be mostly 0/FAIL transitions or to put it another way mostly empty.
2042 (Note that leaf nodes will not contain any transitions.)
2044 This algorithm compresses the tables by eliminating most such
2045 transitions, at the cost of a modest bit of extra work during lookup:
2047 - Each states[] entry contains a .base field which indicates the
2048 index in the state[] array wheres its transition data is stored.
2050 - If .base is 0 there are no valid transitions from that node.
2052 - If .base is nonzero then charid is added to it to find an entry in
2055 -If trans[states[state].base+charid].check!=state then the
2056 transition is taken to be a 0/Fail transition. Thus if there are fail
2057 transitions at the front of the node then the .base offset will point
2058 somewhere inside the previous nodes data (or maybe even into a node
2059 even earlier), but the .check field determines if the transition is
2063 The following process inplace converts the table to the compressed
2064 table: We first do not compress the root node 1,and mark all its
2065 .check pointers as 1 and set its .base pointer as 1 as well. This
2066 allows us to do a DFA construction from the compressed table later,
2067 and ensures that any .base pointers we calculate later are greater
2070 - We set 'pos' to indicate the first entry of the second node.
2072 - We then iterate over the columns of the node, finding the first and
2073 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2074 and set the .check pointers accordingly, and advance pos
2075 appropriately and repreat for the next node. Note that when we copy
2076 the next pointers we have to convert them from the original
2077 NODEIDX form to NODENUM form as the former is not valid post
2080 - If a node has no transitions used we mark its base as 0 and do not
2081 advance the pos pointer.
2083 - If a node only has one transition we use a second pointer into the
2084 structure to fill in allocated fail transitions from other states.
2085 This pointer is independent of the main pointer and scans forward
2086 looking for null transitions that are allocated to a state. When it
2087 finds one it writes the single transition into the "hole". If the
2088 pointer doesnt find one the single transition is appended as normal.
2090 - Once compressed we can Renew/realloc the structures to release the
2093 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2094 specifically Fig 3.47 and the associated pseudocode.
2098 const U32 laststate = TRIE_NODENUM( next_alloc );
2101 trie->statecount = laststate;
2103 for ( state = 1 ; state < laststate ; state++ ) {
2105 const U32 stateidx = TRIE_NODEIDX( state );
2106 const U32 o_used = trie->trans[ stateidx ].check;
2107 U32 used = trie->trans[ stateidx ].check;
2108 trie->trans[ stateidx ].check = 0;
2110 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2111 if ( flag || trie->trans[ stateidx + charid ].next ) {
2112 if ( trie->trans[ stateidx + charid ].next ) {
2114 for ( ; zp < pos ; zp++ ) {
2115 if ( ! trie->trans[ zp ].next ) {
2119 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2120 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2121 trie->trans[ zp ].check = state;
2122 if ( ++zp > pos ) pos = zp;
2129 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2131 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2132 trie->trans[ pos ].check = state;
2137 trie->lasttrans = pos + 1;
2138 trie->states = (reg_trie_state *)
2139 PerlMemShared_realloc( trie->states, laststate
2140 * sizeof(reg_trie_state) );
2141 DEBUG_TRIE_COMPILE_MORE_r(
2142 PerlIO_printf( Perl_debug_log,
2143 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2144 (int)depth * 2 + 2,"",
2145 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2148 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2151 } /* end table compress */
2153 DEBUG_TRIE_COMPILE_MORE_r(
2154 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2155 (int)depth * 2 + 2, "",
2156 (UV)trie->statecount,
2157 (UV)trie->lasttrans)
2159 /* resize the trans array to remove unused space */
2160 trie->trans = (reg_trie_trans *)
2161 PerlMemShared_realloc( trie->trans, trie->lasttrans
2162 * sizeof(reg_trie_trans) );
2164 { /* Modify the program and insert the new TRIE node */
2165 U8 nodetype =(U8)(flags & 0xFF);
2169 regnode *optimize = NULL;
2170 #ifdef RE_TRACK_PATTERN_OFFSETS
2173 U32 mjd_nodelen = 0;
2174 #endif /* RE_TRACK_PATTERN_OFFSETS */
2175 #endif /* DEBUGGING */
2177 This means we convert either the first branch or the first Exact,
2178 depending on whether the thing following (in 'last') is a branch
2179 or not and whther first is the startbranch (ie is it a sub part of
2180 the alternation or is it the whole thing.)
2181 Assuming its a sub part we convert the EXACT otherwise we convert
2182 the whole branch sequence, including the first.
2184 /* Find the node we are going to overwrite */
2185 if ( first != startbranch || OP( last ) == BRANCH ) {
2186 /* branch sub-chain */
2187 NEXT_OFF( first ) = (U16)(last - first);
2188 #ifdef RE_TRACK_PATTERN_OFFSETS
2190 mjd_offset= Node_Offset((convert));
2191 mjd_nodelen= Node_Length((convert));
2194 /* whole branch chain */
2196 #ifdef RE_TRACK_PATTERN_OFFSETS
2199 const regnode *nop = NEXTOPER( convert );
2200 mjd_offset= Node_Offset((nop));
2201 mjd_nodelen= Node_Length((nop));
2205 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2206 (int)depth * 2 + 2, "",
2207 (UV)mjd_offset, (UV)mjd_nodelen)
2210 /* But first we check to see if there is a common prefix we can
2211 split out as an EXACT and put in front of the TRIE node. */
2212 trie->startstate= 1;
2213 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2215 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2219 const U32 base = trie->states[ state ].trans.base;
2221 if ( trie->states[state].wordnum )
2224 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2225 if ( ( base + ofs >= trie->uniquecharcount ) &&
2226 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2227 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2229 if ( ++count > 1 ) {
2230 SV **tmp = av_fetch( revcharmap, ofs, 0);
2231 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2232 if ( state == 1 ) break;
2234 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2236 PerlIO_printf(Perl_debug_log,
2237 "%*sNew Start State=%"UVuf" Class: [",
2238 (int)depth * 2 + 2, "",
2241 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2242 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2244 TRIE_BITMAP_SET(trie,*ch);
2246 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2248 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2252 TRIE_BITMAP_SET(trie,*ch);
2254 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2255 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2261 SV **tmp = av_fetch( revcharmap, idx, 0);
2263 char *ch = SvPV( *tmp, len );
2265 SV *sv=sv_newmortal();
2266 PerlIO_printf( Perl_debug_log,
2267 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2268 (int)depth * 2 + 2, "",
2270 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2271 PL_colors[0], PL_colors[1],
2272 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2273 PERL_PV_ESCAPE_FIRSTCHAR
2278 OP( convert ) = nodetype;
2279 str=STRING(convert);
2282 STR_LEN(convert) += len;
2288 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2293 trie->prefixlen = (state-1);
2295 regnode *n = convert+NODE_SZ_STR(convert);
2296 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2297 trie->startstate = state;
2298 trie->minlen -= (state - 1);
2299 trie->maxlen -= (state - 1);
2301 /* At least the UNICOS C compiler choked on this
2302 * being argument to DEBUG_r(), so let's just have
2305 #ifdef PERL_EXT_RE_BUILD
2311 regnode *fix = convert;
2312 U32 word = trie->wordcount;
2314 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2315 while( ++fix < n ) {
2316 Set_Node_Offset_Length(fix, 0, 0);
2319 SV ** const tmp = av_fetch( trie_words, word, 0 );
2321 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2322 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2324 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2332 NEXT_OFF(convert) = (U16)(tail - convert);
2333 DEBUG_r(optimize= n);
2339 if ( trie->maxlen ) {
2340 NEXT_OFF( convert ) = (U16)(tail - convert);
2341 ARG_SET( convert, data_slot );
2342 /* Store the offset to the first unabsorbed branch in
2343 jump[0], which is otherwise unused by the jump logic.
2344 We use this when dumping a trie and during optimisation. */
2346 trie->jump[0] = (U16)(nextbranch - convert);
2348 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2349 * and there is a bitmap
2350 * and the first "jump target" node we found leaves enough room
2351 * then convert the TRIE node into a TRIEC node, with the bitmap
2352 * embedded inline in the opcode - this is hypothetically faster.
2354 if ( !trie->states[trie->startstate].wordnum
2356 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2358 OP( convert ) = TRIEC;
2359 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2360 PerlMemShared_free(trie->bitmap);
2363 OP( convert ) = TRIE;
2365 /* store the type in the flags */
2366 convert->flags = nodetype;
2370 + regarglen[ OP( convert ) ];
2372 /* XXX We really should free up the resource in trie now,
2373 as we won't use them - (which resources?) dmq */
2375 /* needed for dumping*/
2376 DEBUG_r(if (optimize) {
2377 regnode *opt = convert;
2379 while ( ++opt < optimize) {
2380 Set_Node_Offset_Length(opt,0,0);
2383 Try to clean up some of the debris left after the
2386 while( optimize < jumper ) {
2387 mjd_nodelen += Node_Length((optimize));
2388 OP( optimize ) = OPTIMIZED;
2389 Set_Node_Offset_Length(optimize,0,0);
2392 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2394 } /* end node insert */
2396 /* Finish populating the prev field of the wordinfo array. Walk back
2397 * from each accept state until we find another accept state, and if
2398 * so, point the first word's .prev field at the second word. If the
2399 * second already has a .prev field set, stop now. This will be the
2400 * case either if we've already processed that word's accept state,
2401 * or that state had multiple words, and the overspill words were
2402 * already linked up earlier.
2409 for (word=1; word <= trie->wordcount; word++) {
2411 if (trie->wordinfo[word].prev)
2413 state = trie->wordinfo[word].accept;
2415 state = prev_states[state];
2418 prev = trie->states[state].wordnum;
2422 trie->wordinfo[word].prev = prev;
2424 Safefree(prev_states);
2428 /* and now dump out the compressed format */
2429 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2431 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2433 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2434 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2436 SvREFCNT_dec(revcharmap);
2440 : trie->startstate>1
2446 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2448 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2450 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2451 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2454 We find the fail state for each state in the trie, this state is the longest proper
2455 suffix of the current state's 'word' that is also a proper prefix of another word in our
2456 trie. State 1 represents the word '' and is thus the default fail state. This allows
2457 the DFA not to have to restart after its tried and failed a word at a given point, it
2458 simply continues as though it had been matching the other word in the first place.
2460 'abcdgu'=~/abcdefg|cdgu/
2461 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2462 fail, which would bring us to the state representing 'd' in the second word where we would
2463 try 'g' and succeed, proceeding to match 'cdgu'.
2465 /* add a fail transition */
2466 const U32 trie_offset = ARG(source);
2467 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2469 const U32 ucharcount = trie->uniquecharcount;
2470 const U32 numstates = trie->statecount;
2471 const U32 ubound = trie->lasttrans + ucharcount;
2475 U32 base = trie->states[ 1 ].trans.base;
2478 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2479 GET_RE_DEBUG_FLAGS_DECL;
2481 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2483 PERL_UNUSED_ARG(depth);
2487 ARG_SET( stclass, data_slot );
2488 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2489 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2490 aho->trie=trie_offset;
2491 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2492 Copy( trie->states, aho->states, numstates, reg_trie_state );
2493 Newxz( q, numstates, U32);
2494 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2497 /* initialize fail[0..1] to be 1 so that we always have
2498 a valid final fail state */
2499 fail[ 0 ] = fail[ 1 ] = 1;
2501 for ( charid = 0; charid < ucharcount ; charid++ ) {
2502 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2504 q[ q_write ] = newstate;
2505 /* set to point at the root */
2506 fail[ q[ q_write++ ] ]=1;
2509 while ( q_read < q_write) {
2510 const U32 cur = q[ q_read++ % numstates ];
2511 base = trie->states[ cur ].trans.base;
2513 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2514 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2516 U32 fail_state = cur;
2519 fail_state = fail[ fail_state ];
2520 fail_base = aho->states[ fail_state ].trans.base;
2521 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2523 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2524 fail[ ch_state ] = fail_state;
2525 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2527 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2529 q[ q_write++ % numstates] = ch_state;
2533 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2534 when we fail in state 1, this allows us to use the
2535 charclass scan to find a valid start char. This is based on the principle
2536 that theres a good chance the string being searched contains lots of stuff
2537 that cant be a start char.
2539 fail[ 0 ] = fail[ 1 ] = 0;
2540 DEBUG_TRIE_COMPILE_r({
2541 PerlIO_printf(Perl_debug_log,
2542 "%*sStclass Failtable (%"UVuf" states): 0",
2543 (int)(depth * 2), "", (UV)numstates
2545 for( q_read=1; q_read<numstates; q_read++ ) {
2546 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2548 PerlIO_printf(Perl_debug_log, "\n");
2551 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2556 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2557 * These need to be revisited when a newer toolchain becomes available.
2559 #if defined(__sparc64__) && defined(__GNUC__)
2560 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2561 # undef SPARC64_GCC_WORKAROUND
2562 # define SPARC64_GCC_WORKAROUND 1
2566 #define DEBUG_PEEP(str,scan,depth) \
2567 DEBUG_OPTIMISE_r({if (scan){ \
2568 SV * const mysv=sv_newmortal(); \
2569 regnode *Next = regnext(scan); \
2570 regprop(RExC_rx, mysv, scan); \
2571 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2572 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2573 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2577 /* The below joins as many adjacent EXACTish nodes as possible into a single
2578 * one, and looks for problematic sequences of characters whose folds vs.
2579 * non-folds have sufficiently different lengths, that the optimizer would be
2580 * fooled into rejecting legitimate matches of them, and the trie construction
2581 * code can't cope with them. The joining is only done if:
2582 * 1) there is room in the current conglomerated node to entirely contain the
2584 * 2) they are the exact same node type
2586 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2587 * these get optimized out
2589 * If there are problematic code sequences, *min_subtract is set to the delta
2590 * that the minimum size of the node can be less than its actual size. And,
2591 * the node type of the result is changed to reflect that it contains these
2594 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2595 * and contains LATIN SMALL LETTER SHARP S
2597 * This is as good a place as any to discuss the design of handling these
2598 * problematic sequences. It's been wrong in Perl for a very long time. There
2599 * are three code points in Unicode whose folded lengths differ so much from
2600 * the un-folded lengths that it causes problems for the optimizer and trie
2601 * construction. Why only these are problematic, and not others where lengths
2602 * also differ is something I (khw) do not understand. New versions of Unicode
2603 * might add more such code points. Hopefully the logic in fold_grind.t that
2604 * figures out what to test (in part by verifying that each size-combination
2605 * gets tested) will catch any that do come along, so they can be added to the
2606 * special handling below. The chances of new ones are actually rather small,
2607 * as most, if not all, of the world's scripts that have casefolding have
2608 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2609 * made to allow compatibility with pre-existing standards, and almost all of
2610 * those have already been dealt with. These would otherwise be the most
2611 * likely candidates for generating further tricky sequences. In other words,
2612 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2613 * with pre-existing standards, and there aren't many of those left.
2615 * The previous designs for dealing with these involved assigning a special
2616 * node for them. This approach doesn't work, as evidenced by this example:
2617 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2618 * Both these fold to "sss", but if the pattern is parsed to create a node of
2619 * that would match just the \xDF, it won't be able to handle the case where a
2620 * successful match would have to cross the node's boundary. The new approach
2621 * that hopefully generally solves the problem generates an EXACTFU_SS node
2624 * There are a number of components to the approach (a lot of work for just
2625 * three code points!):
2626 * 1) This routine examines each EXACTFish node that could contain the
2627 * problematic sequences. It returns in *min_subtract how much to
2628 * subtract from the the actual length of the string to get a real minimum
2629 * for one that could match it. This number is usually 0 except for the
2630 * problematic sequences. This delta is used by the caller to adjust the
2631 * min length of the match, and the delta between min and max, so that the
2632 * optimizer doesn't reject these possibilities based on size constraints.
2633 * 2) These sequences are not currently correctly handled by the trie code
2634 * either, so it changes the joined node type to ops that are not handled
2635 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2636 * 3) This is sufficient for the two Greek sequences (described below), but
2637 * the one involving the Sharp s (\xDF) needs more. The node type
2638 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2639 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2640 * case where there is a possible fold length change. That means that a
2641 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2642 * itself with length changes, and so can be processed faster. regexec.c
2643 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2644 * is pre-folded by regcomp.c. This saves effort in regex matching.
2645 * However, probably mostly for historical reasons, the pre-folding isn't
2646 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2647 * nodes, as what they fold to isn't known until runtime.) The fold
2648 * possibilities for the non-UTF8 patterns are quite simple, except for
2649 * the sharp s. All the ones that don't involve a UTF-8 target string
2650 * are members of a fold-pair, and arrays are set up for all of them
2651 * that quickly find the other member of the pair. It might actually
2652 * be faster to pre-fold these, but it isn't currently done, except for
2653 * the sharp s. Code elsewhere in this file makes sure that it gets
2654 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2655 * issues described in the next item.
2656 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2657 * 'ss' or not is not knowable at compile time. It will match iff the
2658 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2659 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2660 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2661 * described in item 3). An assumption that the optimizer part of
2662 * regexec.c (probably unwittingly) makes is that a character in the
2663 * pattern corresponds to at most a single character in the target string.
2664 * (And I do mean character, and not byte here, unlike other parts of the
2665 * documentation that have never been updated to account for multibyte
2666 * Unicode.) This assumption is wrong only in this case, as all other
2667 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2668 * virtue of having this file pre-fold UTF-8 patterns. I'm
2669 * reluctant to try to change this assumption, so instead the code punts.
2670 * This routine examines EXACTF nodes for the sharp s, and returns a
2671 * boolean indicating whether or not the node is an EXACTF node that
2672 * contains a sharp s. When it is true, the caller sets a flag that later
2673 * causes the optimizer in this file to not set values for the floating
2674 * and fixed string lengths, and thus avoids the optimizer code in
2675 * regexec.c that makes the invalid assumption. Thus, there is no
2676 * optimization based on string lengths for EXACTF nodes that contain the
2677 * sharp s. This only happens for /id rules (which means the pattern
2681 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2682 if (PL_regkind[OP(scan)] == EXACT) \
2683 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2686 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2687 /* Merge several consecutive EXACTish nodes into one. */
2688 regnode *n = regnext(scan);
2690 regnode *next = scan + NODE_SZ_STR(scan);
2694 regnode *stop = scan;
2695 GET_RE_DEBUG_FLAGS_DECL;
2697 PERL_UNUSED_ARG(depth);
2700 PERL_ARGS_ASSERT_JOIN_EXACT;
2701 #ifndef EXPERIMENTAL_INPLACESCAN
2702 PERL_UNUSED_ARG(flags);
2703 PERL_UNUSED_ARG(val);
2705 DEBUG_PEEP("join",scan,depth);
2707 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2708 * EXACT ones that are mergeable to the current one. */
2710 && (PL_regkind[OP(n)] == NOTHING
2711 || (stringok && OP(n) == OP(scan)))
2713 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2716 if (OP(n) == TAIL || n > next)
2718 if (PL_regkind[OP(n)] == NOTHING) {
2719 DEBUG_PEEP("skip:",n,depth);
2720 NEXT_OFF(scan) += NEXT_OFF(n);
2721 next = n + NODE_STEP_REGNODE;
2728 else if (stringok) {
2729 const unsigned int oldl = STR_LEN(scan);
2730 regnode * const nnext = regnext(n);
2732 if (oldl + STR_LEN(n) > U8_MAX)
2735 DEBUG_PEEP("merg",n,depth);
2738 NEXT_OFF(scan) += NEXT_OFF(n);
2739 STR_LEN(scan) += STR_LEN(n);
2740 next = n + NODE_SZ_STR(n);
2741 /* Now we can overwrite *n : */
2742 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2750 #ifdef EXPERIMENTAL_INPLACESCAN
2751 if (flags && !NEXT_OFF(n)) {
2752 DEBUG_PEEP("atch", val, depth);
2753 if (reg_off_by_arg[OP(n)]) {
2754 ARG_SET(n, val - n);
2757 NEXT_OFF(n) = val - n;
2765 *has_exactf_sharp_s = FALSE;
2767 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2768 * can now analyze for sequences of problematic code points. (Prior to
2769 * this final joining, sequences could have been split over boundaries, and
2770 * hence missed). The sequences only happen in folding, hence for any
2771 * non-EXACT EXACTish node */
2772 if (OP(scan) != EXACT) {
2774 U8 * s0 = (U8*) STRING(scan);
2775 U8 * const s_end = s0 + STR_LEN(scan);
2777 /* The below is perhaps overboard, but this allows us to save a test
2778 * each time through the loop at the expense of a mask. This is
2779 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2780 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2781 * This uses an exclusive 'or' to find that bit and then inverts it to
2782 * form a mask, with just a single 0, in the bit position where 'S' and
2784 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2785 const U8 s_masked = 's' & S_or_s_mask;
2787 /* One pass is made over the node's string looking for all the
2788 * possibilities. to avoid some tests in the loop, there are two main
2789 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2793 /* There are two problematic Greek code points in Unicode
2796 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2797 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2803 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2804 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2806 * This means that in case-insensitive matching (or "loose
2807 * matching", as Unicode calls it), an EXACTF of length six (the
2808 * UTF-8 encoded byte length of the above casefolded versions) can
2809 * match a target string of length two (the byte length of UTF-8
2810 * encoded U+0390 or U+03B0). This would rather mess up the
2811 * minimum length computation. (there are other code points that
2812 * also fold to these two sequences, but the delta is smaller)
2814 * If these sequences are found, the minimum length is decreased by
2815 * four (six minus two).
2817 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2818 * LETTER SHARP S. We decrease the min length by 1 for each
2819 * occurrence of 'ss' found */
2821 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2822 # define U390_first_byte 0xb4
2823 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2824 # define U3B0_first_byte 0xb5
2825 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2827 # define U390_first_byte 0xce
2828 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2829 # define U3B0_first_byte 0xcf
2830 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2832 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2833 yields a net of 0 */
2834 /* Examine the string for one of the problematic sequences */
2836 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2837 * sequence we are looking for is 2 */
2841 /* Look for the first byte in each problematic sequence */
2843 /* We don't have to worry about other things that fold to
2844 * 's' (such as the long s, U+017F), as all above-latin1
2845 * code points have been pre-folded */
2849 /* Current character is an 's' or 'S'. If next one is
2850 * as well, we have the dreaded sequence */
2851 if (((*(s+1) & S_or_s_mask) == s_masked)
2852 /* These two node types don't have special handling
2854 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2857 OP(scan) = EXACTFU_SS;
2858 s++; /* No need to look at this character again */
2862 case U390_first_byte:
2863 if (s_end - s >= len
2865 /* The 1's are because are skipping comparing the
2867 && memEQ(s + 1, U390_tail, len - 1))
2869 goto greek_sequence;
2873 case U3B0_first_byte:
2874 if (! (s_end - s >= len
2875 && memEQ(s + 1, U3B0_tail, len - 1)))
2882 /* This can't currently be handled by trie's, so change
2883 * the node type to indicate this. If EXACTFA and
2884 * EXACTFL were ever to be handled by trie's, this
2885 * would have to be changed. If this node has already
2886 * been changed to EXACTFU_SS in this loop, leave it as
2887 * is. (I (khw) think it doesn't matter in regexec.c
2888 * for UTF patterns, but no need to change it */
2889 if (OP(scan) == EXACTFU) {
2890 OP(scan) = EXACTFU_TRICKYFOLD;
2892 s += 6; /* We already know what this sequence is. Skip
2898 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2900 /* Here, the pattern is not UTF-8. We need to look only for the
2901 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2902 * in the final position. Otherwise we can stop looking 1 byte
2903 * earlier because have to find both the first and second 's' */
2904 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2906 for (s = s0; s < upper; s++) {
2911 && ((*(s+1) & S_or_s_mask) == s_masked))
2915 /* EXACTF nodes need to know that the minimum
2916 * length changed so that a sharp s in the string
2917 * can match this ss in the pattern, but they
2918 * remain EXACTF nodes, as they are not trie'able,
2919 * so don't have to invent a new node type to
2920 * exclude them from the trie code */
2921 if (OP(scan) != EXACTF) {
2922 OP(scan) = EXACTFU_SS;
2927 case LATIN_SMALL_LETTER_SHARP_S:
2928 if (OP(scan) == EXACTF) {
2929 *has_exactf_sharp_s = TRUE;
2938 /* Allow dumping but overwriting the collection of skipped
2939 * ops and/or strings with fake optimized ops */
2940 n = scan + NODE_SZ_STR(scan);
2948 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2952 /* REx optimizer. Converts nodes into quicker variants "in place".
2953 Finds fixed substrings. */
2955 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2956 to the position after last scanned or to NULL. */
2958 #define INIT_AND_WITHP \
2959 assert(!and_withp); \
2960 Newx(and_withp,1,struct regnode_charclass_class); \
2961 SAVEFREEPV(and_withp)
2963 /* this is a chain of data about sub patterns we are processing that
2964 need to be handled separately/specially in study_chunk. Its so
2965 we can simulate recursion without losing state. */
2967 typedef struct scan_frame {
2968 regnode *last; /* last node to process in this frame */
2969 regnode *next; /* next node to process when last is reached */
2970 struct scan_frame *prev; /*previous frame*/
2971 I32 stop; /* what stopparen do we use */
2975 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2977 #define CASE_SYNST_FNC(nAmE) \
2979 if (flags & SCF_DO_STCLASS_AND) { \
2980 for (value = 0; value < 256; value++) \
2981 if (!is_ ## nAmE ## _cp(value)) \
2982 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2985 for (value = 0; value < 256; value++) \
2986 if (is_ ## nAmE ## _cp(value)) \
2987 ANYOF_BITMAP_SET(data->start_class, value); \
2991 if (flags & SCF_DO_STCLASS_AND) { \
2992 for (value = 0; value < 256; value++) \
2993 if (is_ ## nAmE ## _cp(value)) \
2994 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2997 for (value = 0; value < 256; value++) \
2998 if (!is_ ## nAmE ## _cp(value)) \
2999 ANYOF_BITMAP_SET(data->start_class, value); \
3006 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3007 I32 *minlenp, I32 *deltap,
3012 struct regnode_charclass_class *and_withp,
3013 U32 flags, U32 depth)
3014 /* scanp: Start here (read-write). */
3015 /* deltap: Write maxlen-minlen here. */
3016 /* last: Stop before this one. */
3017 /* data: string data about the pattern */
3018 /* stopparen: treat close N as END */
3019 /* recursed: which subroutines have we recursed into */
3020 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3023 I32 min = 0, pars = 0, code;
3024 regnode *scan = *scanp, *next;
3026 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3027 int is_inf_internal = 0; /* The studied chunk is infinite */
3028 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3029 scan_data_t data_fake;
3030 SV *re_trie_maxbuff = NULL;
3031 regnode *first_non_open = scan;
3032 I32 stopmin = I32_MAX;
3033 scan_frame *frame = NULL;
3034 GET_RE_DEBUG_FLAGS_DECL;
3036 PERL_ARGS_ASSERT_STUDY_CHUNK;
3039 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3043 while (first_non_open && OP(first_non_open) == OPEN)
3044 first_non_open=regnext(first_non_open);
3049 while ( scan && OP(scan) != END && scan < last ){
3050 UV min_subtract = 0; /* How much to subtract from the minimum node
3051 length to get a real minimum (because the
3052 folded version may be shorter) */
3053 bool has_exactf_sharp_s = FALSE;
3054 /* Peephole optimizer: */
3055 DEBUG_STUDYDATA("Peep:", data,depth);
3056 DEBUG_PEEP("Peep",scan,depth);
3058 /* Its not clear to khw or hv why this is done here, and not in the
3059 * clauses that deal with EXACT nodes. khw's guess is that it's
3060 * because of a previous design */
3061 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3063 /* Follow the next-chain of the current node and optimize
3064 away all the NOTHINGs from it. */
3065 if (OP(scan) != CURLYX) {
3066 const int max = (reg_off_by_arg[OP(scan)]
3068 /* I32 may be smaller than U16 on CRAYs! */
3069 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3070 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3074 /* Skip NOTHING and LONGJMP. */
3075 while ((n = regnext(n))
3076 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3077 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3078 && off + noff < max)
3080 if (reg_off_by_arg[OP(scan)])
3083 NEXT_OFF(scan) = off;
3088 /* The principal pseudo-switch. Cannot be a switch, since we
3089 look into several different things. */
3090 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3091 || OP(scan) == IFTHEN) {
3092 next = regnext(scan);
3094 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3096 if (OP(next) == code || code == IFTHEN) {
3097 /* NOTE - There is similar code to this block below for handling
3098 TRIE nodes on a re-study. If you change stuff here check there
3100 I32 max1 = 0, min1 = I32_MAX, num = 0;
3101 struct regnode_charclass_class accum;
3102 regnode * const startbranch=scan;
3104 if (flags & SCF_DO_SUBSTR)
3105 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3106 if (flags & SCF_DO_STCLASS)
3107 cl_init_zero(pRExC_state, &accum);
3109 while (OP(scan) == code) {
3110 I32 deltanext, minnext, f = 0, fake;
3111 struct regnode_charclass_class this_class;
3114 data_fake.flags = 0;
3116 data_fake.whilem_c = data->whilem_c;
3117 data_fake.last_closep = data->last_closep;
3120 data_fake.last_closep = &fake;
3122 data_fake.pos_delta = delta;
3123 next = regnext(scan);
3124 scan = NEXTOPER(scan);
3126 scan = NEXTOPER(scan);
3127 if (flags & SCF_DO_STCLASS) {
3128 cl_init(pRExC_state, &this_class);
3129 data_fake.start_class = &this_class;
3130 f = SCF_DO_STCLASS_AND;
3132 if (flags & SCF_WHILEM_VISITED_POS)
3133 f |= SCF_WHILEM_VISITED_POS;
3135 /* we suppose the run is continuous, last=next...*/
3136 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3138 stopparen, recursed, NULL, f,depth+1);
3141 if (max1 < minnext + deltanext)
3142 max1 = minnext + deltanext;
3143 if (deltanext == I32_MAX)
3144 is_inf = is_inf_internal = 1;
3146 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3148 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3149 if ( stopmin > minnext)
3150 stopmin = min + min1;
3151 flags &= ~SCF_DO_SUBSTR;
3153 data->flags |= SCF_SEEN_ACCEPT;
3156 if (data_fake.flags & SF_HAS_EVAL)
3157 data->flags |= SF_HAS_EVAL;
3158 data->whilem_c = data_fake.whilem_c;
3160 if (flags & SCF_DO_STCLASS)
3161 cl_or(pRExC_state, &accum, &this_class);
3163 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3165 if (flags & SCF_DO_SUBSTR) {
3166 data->pos_min += min1;
3167 data->pos_delta += max1 - min1;
3168 if (max1 != min1 || is_inf)
3169 data->longest = &(data->longest_float);
3172 delta += max1 - min1;
3173 if (flags & SCF_DO_STCLASS_OR) {
3174 cl_or(pRExC_state, data->start_class, &accum);
3176 cl_and(data->start_class, and_withp);
3177 flags &= ~SCF_DO_STCLASS;
3180 else if (flags & SCF_DO_STCLASS_AND) {
3182 cl_and(data->start_class, &accum);
3183 flags &= ~SCF_DO_STCLASS;
3186 /* Switch to OR mode: cache the old value of
3187 * data->start_class */
3189 StructCopy(data->start_class, and_withp,
3190 struct regnode_charclass_class);
3191 flags &= ~SCF_DO_STCLASS_AND;
3192 StructCopy(&accum, data->start_class,
3193 struct regnode_charclass_class);
3194 flags |= SCF_DO_STCLASS_OR;
3195 data->start_class->flags |= ANYOF_EOS;
3199 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3202 Assuming this was/is a branch we are dealing with: 'scan' now
3203 points at the item that follows the branch sequence, whatever
3204 it is. We now start at the beginning of the sequence and look
3211 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3213 If we can find such a subsequence we need to turn the first
3214 element into a trie and then add the subsequent branch exact
3215 strings to the trie.
3219 1. patterns where the whole set of branches can be converted.
3221 2. patterns where only a subset can be converted.
3223 In case 1 we can replace the whole set with a single regop
3224 for the trie. In case 2 we need to keep the start and end
3227 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3228 becomes BRANCH TRIE; BRANCH X;
3230 There is an additional case, that being where there is a
3231 common prefix, which gets split out into an EXACT like node
3232 preceding the TRIE node.
3234 If x(1..n)==tail then we can do a simple trie, if not we make
3235 a "jump" trie, such that when we match the appropriate word
3236 we "jump" to the appropriate tail node. Essentially we turn
3237 a nested if into a case structure of sorts.
3242 if (!re_trie_maxbuff) {
3243 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3244 if (!SvIOK(re_trie_maxbuff))
3245 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3247 if ( SvIV(re_trie_maxbuff)>=0 ) {
3249 regnode *first = (regnode *)NULL;
3250 regnode *last = (regnode *)NULL;
3251 regnode *tail = scan;
3256 SV * const mysv = sv_newmortal(); /* for dumping */
3258 /* var tail is used because there may be a TAIL
3259 regop in the way. Ie, the exacts will point to the
3260 thing following the TAIL, but the last branch will
3261 point at the TAIL. So we advance tail. If we
3262 have nested (?:) we may have to move through several
3266 while ( OP( tail ) == TAIL ) {
3267 /* this is the TAIL generated by (?:) */
3268 tail = regnext( tail );
3272 DEBUG_TRIE_COMPILE_r({
3273 regprop(RExC_rx, mysv, tail );
3274 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3275 (int)depth * 2 + 2, "",
3276 "Looking for TRIE'able sequences. Tail node is: ",
3277 SvPV_nolen_const( mysv )
3283 Step through the branches
3284 cur represents each branch,
3285 noper is the first thing to be matched as part of that branch
3286 noper_next is the regnext() of that node.
3288 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3289 via a "jump trie" but we also support building with NOJUMPTRIE,
3290 which restricts the trie logic to structures like /FOO|BAR/.
3292 If noper is a trieable nodetype then the branch is a possible optimization
3293 target. If we are building under NOJUMPTRIE then we require that noper_next
3294 is the same as scan (our current position in the regex program).
3296 Once we have two or more consecutive such branches we can create a
3297 trie of the EXACT's contents and stitch it in place into the program.
3299 If the sequence represents all of the branches in the alternation we
3300 replace the entire thing with a single TRIE node.
3302 Otherwise when it is a subsequence we need to stitch it in place and
3303 replace only the relevant branches. This means the first branch has
3304 to remain as it is used by the alternation logic, and its next pointer,
3305 and needs to be repointed at the item on the branch chain following
3306 the last branch we have optimized away.
3308 This could be either a BRANCH, in which case the subsequence is internal,
3309 or it could be the item following the branch sequence in which case the
3310 subsequence is at the end (which does not necessarily mean the first node
3311 is the start of the alternation).
3313 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3316 ----------------+-----------
3320 EXACTFU_SS | EXACTFU
3321 EXACTFU_TRICKYFOLD | EXACTFU
3326 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3327 ( EXACT == (X) ) ? EXACT : \
3328 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3331 /* dont use tail as the end marker for this traverse */
3332 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3333 regnode * const noper = NEXTOPER( cur );
3334 U8 noper_type = OP( noper );
3335 U8 noper_trietype = TRIE_TYPE( noper_type );
3336 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3337 regnode * const noper_next = regnext( noper );
3338 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3339 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3342 DEBUG_TRIE_COMPILE_r({
3343 regprop(RExC_rx, mysv, cur);
3344 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3345 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3347 regprop(RExC_rx, mysv, noper);
3348 PerlIO_printf( Perl_debug_log, " -> %s",
3349 SvPV_nolen_const(mysv));
3352 regprop(RExC_rx, mysv, noper_next );
3353 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3354 SvPV_nolen_const(mysv));
3356 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3357 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3358 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3362 /* Is noper a trieable nodetype that can be merged with the
3363 * current trie (if there is one)? */
3367 ( noper_trietype == NOTHING)
3368 || ( trietype == NOTHING )
3369 || ( trietype == noper_trietype )
3372 && noper_next == tail
3376 /* Handle mergable triable node
3377 * Either we are the first node in a new trieable sequence,
3378 * in which case we do some bookkeeping, otherwise we update
3379 * the end pointer. */
3382 if ( noper_trietype == NOTHING ) {
3383 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3384 regnode * const noper_next = regnext( noper );
3385 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3386 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3389 if ( noper_next_trietype ) {
3390 trietype = noper_next_trietype;
3391 } else if (noper_next_type) {
3392 /* a NOTHING regop is 1 regop wide. We need at least two
3393 * for a trie so we can't merge this in */
3397 trietype = noper_trietype;
3400 if ( trietype == NOTHING )
3401 trietype = noper_trietype;
3406 } /* end handle mergable triable node */
3408 /* handle unmergable node -
3409 * noper may either be a triable node which can not be tried
3410 * together with the current trie, or a non triable node */
3412 /* If last is set and trietype is not NOTHING then we have found
3413 * at least two triable branch sequences in a row of a similar
3414 * trietype so we can turn them into a trie. If/when we
3415 * allow NOTHING to start a trie sequence this condition will be
3416 * required, and it isn't expensive so we leave it in for now. */
3417 if ( trietype != NOTHING )
3418 make_trie( pRExC_state,
3419 startbranch, first, cur, tail, count,
3420 trietype, depth+1 );
3421 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3425 && noper_next == tail
3428 /* noper is triable, so we can start a new trie sequence */
3431 trietype = noper_trietype;
3433 /* if we already saw a first but the current node is not triable then we have
3434 * to reset the first information. */
3439 } /* end handle unmergable node */
3440 } /* loop over branches */
3441 DEBUG_TRIE_COMPILE_r({
3442 regprop(RExC_rx, mysv, cur);
3443 PerlIO_printf( Perl_debug_log,
3444 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3445 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3449 if ( trietype != NOTHING ) {
3450 /* the last branch of the sequence was part of a trie,
3451 * so we have to construct it here outside of the loop
3453 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3454 #ifdef TRIE_STUDY_OPT
3455 if ( ((made == MADE_EXACT_TRIE &&
3456 startbranch == first)
3457 || ( first_non_open == first )) &&
3459 flags |= SCF_TRIE_RESTUDY;
3460 if ( startbranch == first
3463 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3468 /* at this point we know whatever we have is a NOTHING sequence/branch
3469 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3471 if ( startbranch == first ) {
3473 /* the entire thing is a NOTHING sequence, something like this:
3474 * (?:|) So we can turn it into a plain NOTHING op. */
3475 DEBUG_TRIE_COMPILE_r({
3476 regprop(RExC_rx, mysv, cur);
3477 PerlIO_printf( Perl_debug_log,
3478 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3479 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3482 OP(startbranch)= NOTHING;
3483 NEXT_OFF(startbranch)= tail - startbranch;
3484 for ( opt= startbranch + 1; opt < tail ; opt++ )
3488 } /* end if ( last) */
3489 } /* TRIE_MAXBUF is non zero */
3494 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3495 scan = NEXTOPER(NEXTOPER(scan));
3496 } else /* single branch is optimized. */
3497 scan = NEXTOPER(scan);
3499 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3500 scan_frame *newframe = NULL;
3505 if (OP(scan) != SUSPEND) {
3506 /* set the pointer */
3507 if (OP(scan) == GOSUB) {
3509 RExC_recurse[ARG2L(scan)] = scan;
3510 start = RExC_open_parens[paren-1];
3511 end = RExC_close_parens[paren-1];
3514 start = RExC_rxi->program + 1;
3518 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3519 SAVEFREEPV(recursed);
3521 if (!PAREN_TEST(recursed,paren+1)) {
3522 PAREN_SET(recursed,paren+1);
3523 Newx(newframe,1,scan_frame);
3525 if (flags & SCF_DO_SUBSTR) {
3526 SCAN_COMMIT(pRExC_state,data,minlenp);
3527 data->longest = &(data->longest_float);
3529 is_inf = is_inf_internal = 1;
3530 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3531 cl_anything(pRExC_state, data->start_class);
3532 flags &= ~SCF_DO_STCLASS;
3535 Newx(newframe,1,scan_frame);
3538 end = regnext(scan);
3543 SAVEFREEPV(newframe);
3544 newframe->next = regnext(scan);
3545 newframe->last = last;
3546 newframe->stop = stopparen;
3547 newframe->prev = frame;
3557 else if (OP(scan) == EXACT) {
3558 I32 l = STR_LEN(scan);
3561 const U8 * const s = (U8*)STRING(scan);
3562 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3563 l = utf8_length(s, s + l);
3565 uc = *((U8*)STRING(scan));
3568 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3569 /* The code below prefers earlier match for fixed
3570 offset, later match for variable offset. */
3571 if (data->last_end == -1) { /* Update the start info. */
3572 data->last_start_min = data->pos_min;
3573 data->last_start_max = is_inf
3574 ? I32_MAX : data->pos_min + data->pos_delta;
3576 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3578 SvUTF8_on(data->last_found);
3580 SV * const sv = data->last_found;
3581 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3582 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3583 if (mg && mg->mg_len >= 0)
3584 mg->mg_len += utf8_length((U8*)STRING(scan),
3585 (U8*)STRING(scan)+STR_LEN(scan));
3587 data->last_end = data->pos_min + l;
3588 data->pos_min += l; /* As in the first entry. */
3589 data->flags &= ~SF_BEFORE_EOL;
3591 if (flags & SCF_DO_STCLASS_AND) {
3592 /* Check whether it is compatible with what we know already! */
3596 /* If compatible, we or it in below. It is compatible if is
3597 * in the bitmp and either 1) its bit or its fold is set, or 2)
3598 * it's for a locale. Even if there isn't unicode semantics
3599 * here, at runtime there may be because of matching against a
3600 * utf8 string, so accept a possible false positive for
3601 * latin1-range folds */
3603 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3604 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3605 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3606 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3611 ANYOF_CLASS_ZERO(data->start_class);
3612 ANYOF_BITMAP_ZERO(data->start_class);
3614 ANYOF_BITMAP_SET(data->start_class, uc);
3615 else if (uc >= 0x100) {
3618 /* Some Unicode code points fold to the Latin1 range; as
3619 * XXX temporary code, instead of figuring out if this is
3620 * one, just assume it is and set all the start class bits
3621 * that could be some such above 255 code point's fold
3622 * which will generate fals positives. As the code
3623 * elsewhere that does compute the fold settles down, it
3624 * can be extracted out and re-used here */
3625 for (i = 0; i < 256; i++){
3626 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3627 ANYOF_BITMAP_SET(data->start_class, i);
3631 data->start_class->flags &= ~ANYOF_EOS;
3633 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3635 else if (flags & SCF_DO_STCLASS_OR) {
3636 /* false positive possible if the class is case-folded */
3638 ANYOF_BITMAP_SET(data->start_class, uc);
3640 data->start_class->flags |= ANYOF_UNICODE_ALL;
3641 data->start_class->flags &= ~ANYOF_EOS;
3642 cl_and(data->start_class, and_withp);
3644 flags &= ~SCF_DO_STCLASS;
3646 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3647 I32 l = STR_LEN(scan);
3648 UV uc = *((U8*)STRING(scan));
3650 /* Search for fixed substrings supports EXACT only. */
3651 if (flags & SCF_DO_SUBSTR) {
3653 SCAN_COMMIT(pRExC_state, data, minlenp);
3656 const U8 * const s = (U8 *)STRING(scan);
3657 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3658 l = utf8_length(s, s + l);
3660 else if (has_exactf_sharp_s) {
3661 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3663 min += l - min_subtract;
3667 delta += min_subtract;
3668 if (flags & SCF_DO_SUBSTR) {
3669 data->pos_min += l - min_subtract;
3670 if (data->pos_min < 0) {
3673 data->pos_delta += min_subtract;
3675 data->longest = &(data->longest_float);
3678 if (flags & SCF_DO_STCLASS_AND) {
3679 /* Check whether it is compatible with what we know already! */
3682 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3683 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3684 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3688 ANYOF_CLASS_ZERO(data->start_class);
3689 ANYOF_BITMAP_ZERO(data->start_class);
3691 ANYOF_BITMAP_SET(data->start_class, uc);
3692 data->start_class->flags &= ~ANYOF_EOS;
3693 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3694 if (OP(scan) == EXACTFL) {
3695 /* XXX This set is probably no longer necessary, and
3696 * probably wrong as LOCALE now is on in the initial
3698 data->start_class->flags |= ANYOF_LOCALE;
3702 /* Also set the other member of the fold pair. In case
3703 * that unicode semantics is called for at runtime, use
3704 * the full latin1 fold. (Can't do this for locale,
3705 * because not known until runtime) */
3706 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3708 /* All other (EXACTFL handled above) folds except under
3709 * /iaa that include s, S, and sharp_s also may include
3711 if (OP(scan) != EXACTFA) {
3712 if (uc == 's' || uc == 'S') {
3713 ANYOF_BITMAP_SET(data->start_class,
3714 LATIN_SMALL_LETTER_SHARP_S);
3716 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3717 ANYOF_BITMAP_SET(data->start_class, 's');
3718 ANYOF_BITMAP_SET(data->start_class, 'S');
3723 else if (uc >= 0x100) {
3725 for (i = 0; i < 256; i++){
3726 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3727 ANYOF_BITMAP_SET(data->start_class, i);
3732 else if (flags & SCF_DO_STCLASS_OR) {
3733 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3734 /* false positive possible if the class is case-folded.
3735 Assume that the locale settings are the same... */
3737 ANYOF_BITMAP_SET(data->start_class, uc);
3738 if (OP(scan) != EXACTFL) {
3740 /* And set the other member of the fold pair, but
3741 * can't do that in locale because not known until
3743 ANYOF_BITMAP_SET(data->start_class,
3744 PL_fold_latin1[uc]);
3746 /* All folds except under /iaa that include s, S,
3747 * and sharp_s also may include the others */
3748 if (OP(scan) != EXACTFA) {
3749 if (uc == 's' || uc == 'S') {
3750 ANYOF_BITMAP_SET(data->start_class,
3751 LATIN_SMALL_LETTER_SHARP_S);
3753 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3754 ANYOF_BITMAP_SET(data->start_class, 's');
3755 ANYOF_BITMAP_SET(data->start_class, 'S');
3760 data->start_class->flags &= ~ANYOF_EOS;
3762 cl_and(data->start_class, and_withp);
3764 flags &= ~SCF_DO_STCLASS;
3766 else if (REGNODE_VARIES(OP(scan))) {
3767 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3768 I32 f = flags, pos_before = 0;
3769 regnode * const oscan = scan;
3770 struct regnode_charclass_class this_class;
3771 struct regnode_charclass_class *oclass = NULL;
3772 I32 next_is_eval = 0;
3774 switch (PL_regkind[OP(scan)]) {
3775 case WHILEM: /* End of (?:...)* . */
3776 scan = NEXTOPER(scan);
3779 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3780 next = NEXTOPER(scan);
3781 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3783 maxcount = REG_INFTY;
3784 next = regnext(scan);
3785 scan = NEXTOPER(scan);
3789 if (flags & SCF_DO_SUBSTR)
3794 if (flags & SCF_DO_STCLASS) {
3796 maxcount = REG_INFTY;
3797 next = regnext(scan);
3798 scan = NEXTOPER(scan);
3801 is_inf = is_inf_internal = 1;
3802 scan = regnext(scan);
3803 if (flags & SCF_DO_SUBSTR) {
3804 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3805 data->longest = &(data->longest_float);
3807 goto optimize_curly_tail;
3809 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3810 && (scan->flags == stopparen))
3815 mincount = ARG1(scan);
3816 maxcount = ARG2(scan);
3818 next = regnext(scan);
3819 if (OP(scan) == CURLYX) {
3820 I32 lp = (data ? *(data->last_closep) : 0);
3821 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3823 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3824 next_is_eval = (OP(scan) == EVAL);
3826 if (flags & SCF_DO_SUBSTR) {
3827 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3828 pos_before = data->pos_min;
3832 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3834 data->flags |= SF_IS_INF;
3836 if (flags & SCF_DO_STCLASS) {
3837 cl_init(pRExC_state, &this_class);
3838 oclass = data->start_class;
3839 data->start_class = &this_class;
3840 f |= SCF_DO_STCLASS_AND;
3841 f &= ~SCF_DO_STCLASS_OR;
3843 /* Exclude from super-linear cache processing any {n,m}
3844 regops for which the combination of input pos and regex
3845 pos is not enough information to determine if a match
3848 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3849 regex pos at the \s*, the prospects for a match depend not
3850 only on the input position but also on how many (bar\s*)
3851 repeats into the {4,8} we are. */
3852 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3853 f &= ~SCF_WHILEM_VISITED_POS;
3855 /* This will finish on WHILEM, setting scan, or on NULL: */
3856 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3857 last, data, stopparen, recursed, NULL,
3859 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3861 if (flags & SCF_DO_STCLASS)
3862 data->start_class = oclass;
3863 if (mincount == 0 || minnext == 0) {
3864 if (flags & SCF_DO_STCLASS_OR) {
3865 cl_or(pRExC_state, data->start_class, &this_class);
3867 else if (flags & SCF_DO_STCLASS_AND) {
3868 /* Switch to OR mode: cache the old value of
3869 * data->start_class */
3871 StructCopy(data->start_class, and_withp,
3872 struct regnode_charclass_class);
3873 flags &= ~SCF_DO_STCLASS_AND;
3874 StructCopy(&this_class, data->start_class,
3875 struct regnode_charclass_class);
3876 flags |= SCF_DO_STCLASS_OR;
3877 data->start_class->flags |= ANYOF_EOS;
3879 } else { /* Non-zero len */
3880 if (flags & SCF_DO_STCLASS_OR) {
3881 cl_or(pRExC_state, data->start_class, &this_class);
3882 cl_and(data->start_class, and_withp);
3884 else if (flags & SCF_DO_STCLASS_AND)
3885 cl_and(data->start_class, &this_class);
3886 flags &= ~SCF_DO_STCLASS;
3888 if (!scan) /* It was not CURLYX, but CURLY. */
3890 if ( /* ? quantifier ok, except for (?{ ... }) */
3891 (next_is_eval || !(mincount == 0 && maxcount == 1))
3892 && (minnext == 0) && (deltanext == 0)
3893 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3894 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3896 ckWARNreg(RExC_parse,
3897 "Quantifier unexpected on zero-length expression");
3900 min += minnext * mincount;
3901 is_inf_internal |= ((maxcount == REG_INFTY
3902 && (minnext + deltanext) > 0)
3903 || deltanext == I32_MAX);
3904 is_inf |= is_inf_internal;
3905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3907 /* Try powerful optimization CURLYX => CURLYN. */
3908 if ( OP(oscan) == CURLYX && data
3909 && data->flags & SF_IN_PAR
3910 && !(data->flags & SF_HAS_EVAL)
3911 && !deltanext && minnext == 1 ) {
3912 /* Try to optimize to CURLYN. */
3913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3914 regnode * const nxt1 = nxt;
3921 if (!REGNODE_SIMPLE(OP(nxt))
3922 && !(PL_regkind[OP(nxt)] == EXACT
3923 && STR_LEN(nxt) == 1))
3929 if (OP(nxt) != CLOSE)
3931 if (RExC_open_parens) {
3932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3935 /* Now we know that nxt2 is the only contents: */
3936 oscan->flags = (U8)ARG(nxt);
3938 OP(nxt1) = NOTHING; /* was OPEN. */
3941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3943 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3945 OP(nxt + 1) = OPTIMIZED; /* was count. */
3946 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3951 /* Try optimization CURLYX => CURLYM. */
3952 if ( OP(oscan) == CURLYX && data
3953 && !(data->flags & SF_HAS_PAR)
3954 && !(data->flags & SF_HAS_EVAL)
3955 && !deltanext /* atom is fixed width */
3956 && minnext != 0 /* CURLYM can't handle zero width */
3958 /* XXXX How to optimize if data == 0? */
3959 /* Optimize to a simpler form. */
3960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3965 && (OP(nxt2) != WHILEM))
3967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3968 /* Need to optimize away parenths. */
3969 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3970 /* Set the parenth number. */
3971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3973 oscan->flags = (U8)ARG(nxt);
3974 if (RExC_open_parens) {
3975 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3976 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3978 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3979 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3982 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3983 OP(nxt + 1) = OPTIMIZED; /* was count. */
3984 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3985 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3988 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3989 regnode *nnxt = regnext(nxt1);
3991 if (reg_off_by_arg[OP(nxt1)])
3992 ARG_SET(nxt1, nxt2 - nxt1);
3993 else if (nxt2 - nxt1 < U16_MAX)
3994 NEXT_OFF(nxt1) = nxt2 - nxt1;
3996 OP(nxt) = NOTHING; /* Cannot beautify */
4001 /* Optimize again: */
4002 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4003 NULL, stopparen, recursed, NULL, 0,depth+1);
4008 else if ((OP(oscan) == CURLYX)
4009 && (flags & SCF_WHILEM_VISITED_POS)
4010 /* See the comment on a similar expression above.
4011 However, this time it's not a subexpression
4012 we care about, but the expression itself. */
4013 && (maxcount == REG_INFTY)
4014 && data && ++data->whilem_c < 16) {
4015 /* This stays as CURLYX, we can put the count/of pair. */
4016 /* Find WHILEM (as in regexec.c) */
4017 regnode *nxt = oscan + NEXT_OFF(oscan);
4019 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4021 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4022 | (RExC_whilem_seen << 4)); /* On WHILEM */
4024 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4026 if (flags & SCF_DO_SUBSTR) {
4027 SV *last_str = NULL;
4028 int counted = mincount != 0;
4030 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4031 #if defined(SPARC64_GCC_WORKAROUND)
4034 const char *s = NULL;
4037 if (pos_before >= data->last_start_min)
4040 b = data->last_start_min;
4043 s = SvPV_const(data->last_found, l);
4044 old = b - data->last_start_min;
4047 I32 b = pos_before >= data->last_start_min
4048 ? pos_before : data->last_start_min;
4050 const char * const s = SvPV_const(data->last_found, l);
4051 I32 old = b - data->last_start_min;
4055 old = utf8_hop((U8*)s, old) - (U8*)s;
4057 /* Get the added string: */
4058 last_str = newSVpvn_utf8(s + old, l, UTF);
4059 if (deltanext == 0 && pos_before == b) {
4060 /* What was added is a constant string */
4062 SvGROW(last_str, (mincount * l) + 1);
4063 repeatcpy(SvPVX(last_str) + l,
4064 SvPVX_const(last_str), l, mincount - 1);
4065 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4066 /* Add additional parts. */
4067 SvCUR_set(data->last_found,
4068 SvCUR(data->last_found) - l);
4069 sv_catsv(data->last_found, last_str);
4071 SV * sv = data->last_found;
4073 SvUTF8(sv) && SvMAGICAL(sv) ?
4074 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4075 if (mg && mg->mg_len >= 0)
4076 mg->mg_len += CHR_SVLEN(last_str) - l;
4078 data->last_end += l * (mincount - 1);
4081 /* start offset must point into the last copy */
4082 data->last_start_min += minnext * (mincount - 1);
4083 data->last_start_max += is_inf ? I32_MAX
4084 : (maxcount - 1) * (minnext + data->pos_delta);
4087 /* It is counted once already... */
4088 data->pos_min += minnext * (mincount - counted);
4089 data->pos_delta += - counted * deltanext +
4090 (minnext + deltanext) * maxcount - minnext * mincount;
4091 if (mincount != maxcount) {
4092 /* Cannot extend fixed substrings found inside
4094 SCAN_COMMIT(pRExC_state,data,minlenp);
4095 if (mincount && last_str) {
4096 SV * const sv = data->last_found;
4097 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4098 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4102 sv_setsv(sv, last_str);
4103 data->last_end = data->pos_min;
4104 data->last_start_min =
4105 data->pos_min - CHR_SVLEN(last_str);
4106 data->last_start_max = is_inf
4108 : data->pos_min + data->pos_delta
4109 - CHR_SVLEN(last_str);
4111 data->longest = &(data->longest_float);
4113 SvREFCNT_dec(last_str);
4115 if (data && (fl & SF_HAS_EVAL))
4116 data->flags |= SF_HAS_EVAL;
4117 optimize_curly_tail:
4118 if (OP(oscan) != CURLYX) {
4119 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4121 NEXT_OFF(oscan) += NEXT_OFF(next);
4124 default: /* REF, ANYOFV, and CLUMP only? */
4125 if (flags & SCF_DO_SUBSTR) {
4126 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4127 data->longest = &(data->longest_float);
4129 is_inf = is_inf_internal = 1;
4130 if (flags & SCF_DO_STCLASS_OR)
4131 cl_anything(pRExC_state, data->start_class);
4132 flags &= ~SCF_DO_STCLASS;
4136 else if (OP(scan) == LNBREAK) {
4137 if (flags & SCF_DO_STCLASS) {
4139 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4140 if (flags & SCF_DO_STCLASS_AND) {
4141 for (value = 0; value < 256; value++)
4142 if (!is_VERTWS_cp(value))
4143 ANYOF_BITMAP_CLEAR(data->start_class, value);
4146 for (value = 0; value < 256; value++)
4147 if (is_VERTWS_cp(value))
4148 ANYOF_BITMAP_SET(data->start_class, value);
4150 if (flags & SCF_DO_STCLASS_OR)
4151 cl_and(data->start_class, and_withp);
4152 flags &= ~SCF_DO_STCLASS;
4156 if (flags & SCF_DO_SUBSTR) {
4157 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4159 data->pos_delta += 1;
4160 data->longest = &(data->longest_float);
4163 else if (REGNODE_SIMPLE(OP(scan))) {
4166 if (flags & SCF_DO_SUBSTR) {
4167 SCAN_COMMIT(pRExC_state,data,minlenp);
4171 if (flags & SCF_DO_STCLASS) {
4172 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4174 /* Some of the logic below assumes that switching
4175 locale on will only add false positives. */
4176 switch (PL_regkind[OP(scan)]) {
4180 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4181 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4182 cl_anything(pRExC_state, data->start_class);
4185 if (OP(scan) == SANY)
4187 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4188 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4189 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4190 cl_anything(pRExC_state, data->start_class);
4192 if (flags & SCF_DO_STCLASS_AND || !value)
4193 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4196 if (flags & SCF_DO_STCLASS_AND)
4197 cl_and(data->start_class,
4198 (struct regnode_charclass_class*)scan);
4200 cl_or(pRExC_state, data->start_class,
4201 (struct regnode_charclass_class*)scan);
4204 if (flags & SCF_DO_STCLASS_AND) {
4205 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4206 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4207 if (OP(scan) == ALNUMU) {
4208 for (value = 0; value < 256; value++) {
4209 if (!isWORDCHAR_L1(value)) {
4210 ANYOF_BITMAP_CLEAR(data->start_class, value);
4214 for (value = 0; value < 256; value++) {
4215 if (!isALNUM(value)) {
4216 ANYOF_BITMAP_CLEAR(data->start_class, value);
4223 if (data->start_class->flags & ANYOF_LOCALE)
4224 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4226 /* Even if under locale, set the bits for non-locale
4227 * in case it isn't a true locale-node. This will
4228 * create false positives if it truly is locale */
4229 if (OP(scan) == ALNUMU) {
4230 for (value = 0; value < 256; value++) {
4231 if (isWORDCHAR_L1(value)) {
4232 ANYOF_BITMAP_SET(data->start_class, value);
4236 for (value = 0; value < 256; value++) {
4237 if (isALNUM(value)) {
4238 ANYOF_BITMAP_SET(data->start_class, value);
4245 if (flags & SCF_DO_STCLASS_AND) {
4246 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4247 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4248 if (OP(scan) == NALNUMU) {
4249 for (value = 0; value < 256; value++) {
4250 if (isWORDCHAR_L1(value)) {
4251 ANYOF_BITMAP_CLEAR(data->start_class, value);
4255 for (value = 0; value < 256; value++) {
4256 if (isALNUM(value)) {
4257 ANYOF_BITMAP_CLEAR(data->start_class, value);
4264 if (data->start_class->flags & ANYOF_LOCALE)
4265 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4267 /* Even if under locale, set the bits for non-locale in
4268 * case it isn't a true locale-node. This will create
4269 * false positives if it truly is locale */
4270 if (OP(scan) == NALNUMU) {
4271 for (value = 0; value < 256; value++) {
4272 if (! isWORDCHAR_L1(value)) {
4273 ANYOF_BITMAP_SET(data->start_class, value);
4277 for (value = 0; value < 256; value++) {
4278 if (! isALNUM(value)) {
4279 ANYOF_BITMAP_SET(data->start_class, value);
4286 if (flags & SCF_DO_STCLASS_AND) {
4287 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4288 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4289 if (OP(scan) == SPACEU) {
4290 for (value = 0; value < 256; value++) {
4291 if (!isSPACE_L1(value)) {
4292 ANYOF_BITMAP_CLEAR(data->start_class, value);
4296 for (value = 0; value < 256; value++) {
4297 if (!isSPACE(value)) {
4298 ANYOF_BITMAP_CLEAR(data->start_class, value);
4305 if (data->start_class->flags & ANYOF_LOCALE) {
4306 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4308 if (OP(scan) == SPACEU) {
4309 for (value = 0; value < 256; value++) {
4310 if (isSPACE_L1(value)) {
4311 ANYOF_BITMAP_SET(data->start_class, value);
4315 for (value = 0; value < 256; value++) {
4316 if (isSPACE(value)) {
4317 ANYOF_BITMAP_SET(data->start_class, value);
4324 if (flags & SCF_DO_STCLASS_AND) {
4325 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4326 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4327 if (OP(scan) == NSPACEU) {
4328 for (value = 0; value < 256; value++) {
4329 if (isSPACE_L1(value)) {
4330 ANYOF_BITMAP_CLEAR(data->start_class, value);
4334 for (value = 0; value < 256; value++) {
4335 if (isSPACE(value)) {
4336 ANYOF_BITMAP_CLEAR(data->start_class, value);
4343 if (data->start_class->flags & ANYOF_LOCALE)
4344 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4345 if (OP(scan) == NSPACEU) {
4346 for (value = 0; value < 256; value++) {
4347 if (!isSPACE_L1(value)) {
4348 ANYOF_BITMAP_SET(data->start_class, value);
4353 for (value = 0; value < 256; value++) {
4354 if (!isSPACE(value)) {
4355 ANYOF_BITMAP_SET(data->start_class, value);
4362 if (flags & SCF_DO_STCLASS_AND) {
4363 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4364 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4365 for (value = 0; value < 256; value++)
4366 if (!isDIGIT(value))
4367 ANYOF_BITMAP_CLEAR(data->start_class, value);
4371 if (data->start_class->flags & ANYOF_LOCALE)
4372 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4373 for (value = 0; value < 256; value++)
4375 ANYOF_BITMAP_SET(data->start_class, value);
4379 if (flags & SCF_DO_STCLASS_AND) {
4380 if (!(data->start_class->flags & ANYOF_LOCALE))
4381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4382 for (value = 0; value < 256; value++)
4384 ANYOF_BITMAP_CLEAR(data->start_class, value);
4387 if (data->start_class->flags & ANYOF_LOCALE)
4388 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4389 for (value = 0; value < 256; value++)
4390 if (!isDIGIT(value))
4391 ANYOF_BITMAP_SET(data->start_class, value);
4394 CASE_SYNST_FNC(VERTWS);
4395 CASE_SYNST_FNC(HORIZWS);
4398 if (flags & SCF_DO_STCLASS_OR)
4399 cl_and(data->start_class, and_withp);
4400 flags &= ~SCF_DO_STCLASS;
4403 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4404 data->flags |= (OP(scan) == MEOL
4408 else if ( PL_regkind[OP(scan)] == BRANCHJ
4409 /* Lookbehind, or need to calculate parens/evals/stclass: */
4410 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4411 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4412 if ( OP(scan) == UNLESSM &&
4414 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4415 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4418 regnode *upto= regnext(scan);
4420 SV * const mysv_val=sv_newmortal();
4421 DEBUG_STUDYDATA("OPFAIL",data,depth);
4423 /*DEBUG_PARSE_MSG("opfail");*/
4424 regprop(RExC_rx, mysv_val, upto);
4425 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4426 SvPV_nolen_const(mysv_val),
4427 (IV)REG_NODE_NUM(upto),
4432 NEXT_OFF(scan) = upto - scan;
4433 for (opt= scan + 1; opt < upto ; opt++)
4434 OP(opt) = OPTIMIZED;
4438 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4439 || OP(scan) == UNLESSM )
4441 /* Negative Lookahead/lookbehind
4442 In this case we can't do fixed string optimisation.
4445 I32 deltanext, minnext, fake = 0;
4447 struct regnode_charclass_class intrnl;
4450 data_fake.flags = 0;
4452 data_fake.whilem_c = data->whilem_c;
4453 data_fake.last_closep = data->last_closep;
4456 data_fake.last_closep = &fake;
4457 data_fake.pos_delta = delta;
4458 if ( flags & SCF_DO_STCLASS && !scan->flags
4459 && OP(scan) == IFMATCH ) { /* Lookahead */
4460 cl_init(pRExC_state, &intrnl);
4461 data_fake.start_class = &intrnl;
4462 f |= SCF_DO_STCLASS_AND;
4464 if (flags & SCF_WHILEM_VISITED_POS)
4465 f |= SCF_WHILEM_VISITED_POS;
4466 next = regnext(scan);
4467 nscan = NEXTOPER(NEXTOPER(scan));
4468 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4469 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4472 FAIL("Variable length lookbehind not implemented");
4474 else if (minnext > (I32)U8_MAX) {
4475 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4477 scan->flags = (U8)minnext;
4480 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4482 if (data_fake.flags & SF_HAS_EVAL)
4483 data->flags |= SF_HAS_EVAL;
4484 data->whilem_c = data_fake.whilem_c;
4486 if (f & SCF_DO_STCLASS_AND) {
4487 if (flags & SCF_DO_STCLASS_OR) {
4488 /* OR before, AND after: ideally we would recurse with
4489 * data_fake to get the AND applied by study of the
4490 * remainder of the pattern, and then derecurse;
4491 * *** HACK *** for now just treat as "no information".
4492 * See [perl #56690].
4494 cl_init(pRExC_state, data->start_class);
4496 /* AND before and after: combine and continue */
4497 const int was = (data->start_class->flags & ANYOF_EOS);
4499 cl_and(data->start_class, &intrnl);
4501 data->start_class->flags |= ANYOF_EOS;
4505 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4507 /* Positive Lookahead/lookbehind
4508 In this case we can do fixed string optimisation,
4509 but we must be careful about it. Note in the case of
4510 lookbehind the positions will be offset by the minimum
4511 length of the pattern, something we won't know about
4512 until after the recurse.
4514 I32 deltanext, fake = 0;
4516 struct regnode_charclass_class intrnl;
4518 /* We use SAVEFREEPV so that when the full compile
4519 is finished perl will clean up the allocated
4520 minlens when it's all done. This way we don't
4521 have to worry about freeing them when we know
4522 they wont be used, which would be a pain.
4525 Newx( minnextp, 1, I32 );
4526 SAVEFREEPV(minnextp);
4529 StructCopy(data, &data_fake, scan_data_t);
4530 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4533 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4534 data_fake.last_found=newSVsv(data->last_found);
4538 data_fake.last_closep = &fake;
4539 data_fake.flags = 0;
4540 data_fake.pos_delta = delta;
4542 data_fake.flags |= SF_IS_INF;
4543 if ( flags & SCF_DO_STCLASS && !scan->flags
4544 && OP(scan) == IFMATCH ) { /* Lookahead */
4545 cl_init(pRExC_state, &intrnl);
4546 data_fake.start_class = &intrnl;
4547 f |= SCF_DO_STCLASS_AND;
4549 if (flags & SCF_WHILEM_VISITED_POS)
4550 f |= SCF_WHILEM_VISITED_POS;
4551 next = regnext(scan);
4552 nscan = NEXTOPER(NEXTOPER(scan));
4554 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4555 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4558 FAIL("Variable length lookbehind not implemented");
4560 else if (*minnextp > (I32)U8_MAX) {
4561 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4563 scan->flags = (U8)*minnextp;
4568 if (f & SCF_DO_STCLASS_AND) {
4569 const int was = (data->start_class->flags & ANYOF_EOS);
4571 cl_and(data->start_class, &intrnl);
4573 data->start_class->flags |= ANYOF_EOS;
4576 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4578 if (data_fake.flags & SF_HAS_EVAL)
4579 data->flags |= SF_HAS_EVAL;
4580 data->whilem_c = data_fake.whilem_c;
4581 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4582 if (RExC_rx->minlen<*minnextp)
4583 RExC_rx->minlen=*minnextp;
4584 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4585 SvREFCNT_dec(data_fake.last_found);
4587 if ( data_fake.minlen_fixed != minlenp )
4589 data->offset_fixed= data_fake.offset_fixed;
4590 data->minlen_fixed= data_fake.minlen_fixed;
4591 data->lookbehind_fixed+= scan->flags;
4593 if ( data_fake.minlen_float != minlenp )
4595 data->minlen_float= data_fake.minlen_float;
4596 data->offset_float_min=data_fake.offset_float_min;
4597 data->offset_float_max=data_fake.offset_float_max;
4598 data->lookbehind_float+= scan->flags;
4605 else if (OP(scan) == OPEN) {
4606 if (stopparen != (I32)ARG(scan))
4609 else if (OP(scan) == CLOSE) {
4610 if (stopparen == (I32)ARG(scan)) {
4613 if ((I32)ARG(scan) == is_par) {
4614 next = regnext(scan);
4616 if ( next && (OP(next) != WHILEM) && next < last)
4617 is_par = 0; /* Disable optimization */
4620 *(data->last_closep) = ARG(scan);
4622 else if (OP(scan) == EVAL) {
4624 data->flags |= SF_HAS_EVAL;
4626 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4627 if (flags & SCF_DO_SUBSTR) {
4628 SCAN_COMMIT(pRExC_state,data,minlenp);
4629 flags &= ~SCF_DO_SUBSTR;
4631 if (data && OP(scan)==ACCEPT) {
4632 data->flags |= SCF_SEEN_ACCEPT;
4637 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4639 if (flags & SCF_DO_SUBSTR) {
4640 SCAN_COMMIT(pRExC_state,data,minlenp);
4641 data->longest = &(data->longest_float);
4643 is_inf = is_inf_internal = 1;
4644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4645 cl_anything(pRExC_state, data->start_class);
4646 flags &= ~SCF_DO_STCLASS;
4648 else if (OP(scan) == GPOS) {
4649 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4650 !(delta || is_inf || (data && data->pos_delta)))
4652 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4653 RExC_rx->extflags |= RXf_ANCH_GPOS;
4654 if (RExC_rx->gofs < (U32)min)
4655 RExC_rx->gofs = min;
4657 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4661 #ifdef TRIE_STUDY_OPT
4662 #ifdef FULL_TRIE_STUDY
4663 else if (PL_regkind[OP(scan)] == TRIE) {
4664 /* NOTE - There is similar code to this block above for handling
4665 BRANCH nodes on the initial study. If you change stuff here
4667 regnode *trie_node= scan;
4668 regnode *tail= regnext(scan);
4669 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4670 I32 max1 = 0, min1 = I32_MAX;
4671 struct regnode_charclass_class accum;
4673 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4674 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4675 if (flags & SCF_DO_STCLASS)
4676 cl_init_zero(pRExC_state, &accum);
4682 const regnode *nextbranch= NULL;
4685 for ( word=1 ; word <= trie->wordcount ; word++)
4687 I32 deltanext=0, minnext=0, f = 0, fake;
4688 struct regnode_charclass_class this_class;
4690 data_fake.flags = 0;
4692 data_fake.whilem_c = data->whilem_c;
4693 data_fake.last_closep = data->last_closep;
4696 data_fake.last_closep = &fake;
4697 data_fake.pos_delta = delta;
4698 if (flags & SCF_DO_STCLASS) {
4699 cl_init(pRExC_state, &this_class);
4700 data_fake.start_class = &this_class;
4701 f = SCF_DO_STCLASS_AND;
4703 if (flags & SCF_WHILEM_VISITED_POS)
4704 f |= SCF_WHILEM_VISITED_POS;
4706 if (trie->jump[word]) {
4708 nextbranch = trie_node + trie->jump[0];
4709 scan= trie_node + trie->jump[word];
4710 /* We go from the jump point to the branch that follows
4711 it. Note this means we need the vestigal unused branches
4712 even though they arent otherwise used.
4714 minnext = study_chunk(pRExC_state, &scan, minlenp,
4715 &deltanext, (regnode *)nextbranch, &data_fake,
4716 stopparen, recursed, NULL, f,depth+1);
4718 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4719 nextbranch= regnext((regnode*)nextbranch);
4721 if (min1 > (I32)(minnext + trie->minlen))
4722 min1 = minnext + trie->minlen;
4723 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4724 max1 = minnext + deltanext + trie->maxlen;
4725 if (deltanext == I32_MAX)
4726 is_inf = is_inf_internal = 1;
4728 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4730 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4731 if ( stopmin > min + min1)
4732 stopmin = min + min1;
4733 flags &= ~SCF_DO_SUBSTR;
4735 data->flags |= SCF_SEEN_ACCEPT;
4738 if (data_fake.flags & SF_HAS_EVAL)
4739 data->flags |= SF_HAS_EVAL;
4740 data->whilem_c = data_fake.whilem_c;
4742 if (flags & SCF_DO_STCLASS)
4743 cl_or(pRExC_state, &accum, &this_class);
4746 if (flags & SCF_DO_SUBSTR) {
4747 data->pos_min += min1;
4748 data->pos_delta += max1 - min1;
4749 if (max1 != min1 || is_inf)
4750 data->longest = &(data->longest_float);
4753 delta += max1 - min1;
4754 if (flags & SCF_DO_STCLASS_OR) {
4755 cl_or(pRExC_state, data->start_class, &accum);
4757 cl_and(data->start_class, and_withp);
4758 flags &= ~SCF_DO_STCLASS;
4761 else if (flags & SCF_DO_STCLASS_AND) {
4763 cl_and(data->start_class, &accum);
4764 flags &= ~SCF_DO_STCLASS;
4767 /* Switch to OR mode: cache the old value of
4768 * data->start_class */
4770 StructCopy(data->start_class, and_withp,
4771 struct regnode_charclass_class);
4772 flags &= ~SCF_DO_STCLASS_AND;
4773 StructCopy(&accum, data->start_class,
4774 struct regnode_charclass_class);
4775 flags |= SCF_DO_STCLASS_OR;
4776 data->start_class->flags |= ANYOF_EOS;
4783 else if (PL_regkind[OP(scan)] == TRIE) {
4784 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4787 min += trie->minlen;
4788 delta += (trie->maxlen - trie->minlen);
4789 flags &= ~SCF_DO_STCLASS; /* xxx */
4790 if (flags & SCF_DO_SUBSTR) {
4791 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4792 data->pos_min += trie->minlen;
4793 data->pos_delta += (trie->maxlen - trie->minlen);
4794 if (trie->maxlen != trie->minlen)
4795 data->longest = &(data->longest_float);
4797 if (trie->jump) /* no more substrings -- for now /grr*/
4798 flags &= ~SCF_DO_SUBSTR;
4800 #endif /* old or new */
4801 #endif /* TRIE_STUDY_OPT */
4803 /* Else: zero-length, ignore. */
4804 scan = regnext(scan);
4809 stopparen = frame->stop;
4810 frame = frame->prev;
4811 goto fake_study_recurse;
4816 DEBUG_STUDYDATA("pre-fin:",data,depth);
4819 *deltap = is_inf_internal ? I32_MAX : delta;
4820 if (flags & SCF_DO_SUBSTR && is_inf)
4821 data->pos_delta = I32_MAX - data->pos_min;
4822 if (is_par > (I32)U8_MAX)
4824 if (is_par && pars==1 && data) {
4825 data->flags |= SF_IN_PAR;
4826 data->flags &= ~SF_HAS_PAR;
4828 else if (pars && data) {
4829 data->flags |= SF_HAS_PAR;
4830 data->flags &= ~SF_IN_PAR;
4832 if (flags & SCF_DO_STCLASS_OR)
4833 cl_and(data->start_class, and_withp);
4834 if (flags & SCF_TRIE_RESTUDY)
4835 data->flags |= SCF_TRIE_RESTUDY;
4837 DEBUG_STUDYDATA("post-fin:",data,depth);
4839 return min < stopmin ? min : stopmin;
4843 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4845 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4847 PERL_ARGS_ASSERT_ADD_DATA;
4849 Renewc(RExC_rxi->data,
4850 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4851 char, struct reg_data);
4853 Renew(RExC_rxi->data->what, count + n, U8);
4855 Newx(RExC_rxi->data->what, n, U8);
4856 RExC_rxi->data->count = count + n;
4857 Copy(s, RExC_rxi->data->what + count, n, U8);
4861 /*XXX: todo make this not included in a non debugging perl */
4862 #ifndef PERL_IN_XSUB_RE
4864 Perl_reginitcolors(pTHX)
4867 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4869 char *t = savepv(s);
4873 t = strchr(t, '\t');
4879 PL_colors[i] = t = (char *)"";
4884 PL_colors[i++] = (char *)"";
4891 #ifdef TRIE_STUDY_OPT
4892 #define CHECK_RESTUDY_GOTO \
4894 (data.flags & SCF_TRIE_RESTUDY) \
4898 #define CHECK_RESTUDY_GOTO
4902 * pregcomp - compile a regular expression into internal code
4904 * Decides which engine's compiler to call based on the hint currently in
4908 #ifndef PERL_IN_XSUB_RE
4910 /* return the currently in-scope regex engine (or the default if none) */
4912 regexp_engine const *
4913 Perl_current_re_engine(pTHX)
4917 if (IN_PERL_COMPILETIME) {
4918 HV * const table = GvHV(PL_hintgv);
4922 return &PL_core_reg_engine;
4923 ptr = hv_fetchs(table, "regcomp", FALSE);
4924 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4925 return &PL_core_reg_engine;
4926 return INT2PTR(regexp_engine*,SvIV(*ptr));
4930 if (!PL_curcop->cop_hints_hash)
4931 return &PL_core_reg_engine;
4932 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4933 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4934 return &PL_core_reg_engine;
4935 return INT2PTR(regexp_engine*,SvIV(ptr));
4941 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4944 regexp_engine const *eng = current_re_engine();
4945 GET_RE_DEBUG_FLAGS_DECL;
4947 PERL_ARGS_ASSERT_PREGCOMP;
4949 /* Dispatch a request to compile a regexp to correct regexp engine. */
4951 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4954 return CALLREGCOMP_ENG(eng, pattern, flags);
4958 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4959 * pattern rather than a list of OPs */
4962 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4964 SV *pat = pattern; /* defeat constness! */
4965 PERL_ARGS_ASSERT_RE_COMPILE;
4966 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4967 NULL, NULL, rx_flags, 0);
4970 /* see if there are any run-time code blocks in the pattern.
4971 * False positives are allowed */
4974 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4975 U32 pm_flags, char *pat, STRLEN plen)
4980 /* avoid infinitely recursing when we recompile the pattern parcelled up
4981 * as qr'...'. A single constant qr// string can't have have any
4982 * run-time component in it, and thus, no runtime code. (A non-qr
4983 * string, however, can, e.g. $x =~ '(?{})') */
4984 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4987 for (s = 0; s < plen; s++) {
4988 if (n < pRExC_state->num_code_blocks
4989 && s == pRExC_state->code_blocks[n].start)
4991 s = pRExC_state->code_blocks[n].end;
4995 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4997 if (pat[s] == '(' && pat[s+1] == '?' &&
4998 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
5005 /* Handle run-time code blocks. We will already have compiled any direct
5006 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5007 * copy of it, but with any literal code blocks blanked out and
5008 * appropriate chars escaped; then feed it into
5010 * eval "qr'modified_pattern'"
5014 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5018 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5020 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5021 * and merge them with any code blocks of the original regexp.
5023 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5024 * instead, just save the qr and return FALSE; this tells our caller that
5025 * the original pattern needs upgrading to utf8.
5029 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5030 char *pat, STRLEN plen)
5034 GET_RE_DEBUG_FLAGS_DECL;
5036 if (pRExC_state->runtime_code_qr) {
5037 /* this is the second time we've been called; this should
5038 * only happen if the main pattern got upgraded to utf8
5039 * during compilation; re-use the qr we compiled first time
5040 * round (which should be utf8 too)
5042 qr = pRExC_state->runtime_code_qr;
5043 pRExC_state->runtime_code_qr = NULL;
5044 assert(RExC_utf8 && SvUTF8(qr));
5050 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5054 /* determine how many extra chars we need for ' and \ escaping */
5055 for (s = 0; s < plen; s++) {
5056 if (pat[s] == '\'' || pat[s] == '\\')
5060 Newx(newpat, newlen, char);
5062 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5064 for (s = 0; s < plen; s++) {
5065 if (n < pRExC_state->num_code_blocks
5066 && s == pRExC_state->code_blocks[n].start)
5068 /* blank out literal code block */
5069 assert(pat[s] == '(');
5070 while (s <= pRExC_state->code_blocks[n].end) {
5078 if (pat[s] == '\'' || pat[s] == '\\')
5083 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5087 PerlIO_printf(Perl_debug_log,
5088 "%sre-parsing pattern for runtime code:%s %s\n",
5089 PL_colors[4],PL_colors[5],newpat);
5092 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5098 PUSHSTACKi(PERLSI_REQUIRE);
5099 /* this causes the toker to collapse \\ into \ when parsing
5100 * qr''; normally only q'' does this. It also alters hints
5102 PL_reg_state.re_reparsing = TRUE;
5103 eval_sv(sv, G_SCALAR);
5109 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5110 assert(SvROK(qr_ref));
5112 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5113 /* the leaving below frees the tmp qr_ref.
5114 * Give qr a life of its own */
5122 if (!RExC_utf8 && SvUTF8(qr)) {
5123 /* first time through; the pattern got upgraded; save the
5124 * qr for the next time through */
5125 assert(!pRExC_state->runtime_code_qr);
5126 pRExC_state->runtime_code_qr = qr;
5131 /* extract any code blocks within the returned qr// */
5134 /* merge the main (r1) and run-time (r2) code blocks into one */
5136 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5137 struct reg_code_block *new_block, *dst;
5138 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5141 if (!r2->num_code_blocks) /* we guessed wrong */
5145 r1->num_code_blocks + r2->num_code_blocks,
5146 struct reg_code_block);
5149 while ( i1 < r1->num_code_blocks
5150 || i2 < r2->num_code_blocks)
5152 struct reg_code_block *src;
5155 if (i1 == r1->num_code_blocks) {
5156 src = &r2->code_blocks[i2++];
5159 else if (i2 == r2->num_code_blocks)
5160 src = &r1->code_blocks[i1++];
5161 else if ( r1->code_blocks[i1].start
5162 < r2->code_blocks[i2].start)
5164 src = &r1->code_blocks[i1++];
5165 assert(src->end < r2->code_blocks[i2].start);
5168 assert( r1->code_blocks[i1].start
5169 > r2->code_blocks[i2].start);
5170 src = &r2->code_blocks[i2++];
5172 assert(src->end < r1->code_blocks[i1].start);
5175 assert(pat[src->start] == '(');
5176 assert(pat[src->end] == ')');
5177 dst->start = src->start;
5178 dst->end = src->end;
5179 dst->block = src->block;
5180 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5184 r1->num_code_blocks += r2->num_code_blocks;
5185 Safefree(r1->code_blocks);
5186 r1->code_blocks = new_block;
5195 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5196 * regular expression into internal code.
5197 * The pattern may be passed either as:
5198 * a list of SVs (patternp plus pat_count)
5199 * a list of OPs (expr)
5200 * If both are passed, the SV list is used, but the OP list indicates
5201 * which SVs are actually pre-compiled code blocks
5203 * The SVs in the list have magic and qr overloading applied to them (and
5204 * the list may be modified in-place with replacement SVs in the latter
5207 * If the pattern hasn't changed from old_re, then old_re will be
5210 * eng is the current engine. If that engine has an op_comp method, then
5211 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5212 * do the initial concatenation of arguments and pass on to the external
5215 * If is_bare_re is not null, set it to a boolean indicating whether the
5216 * arg list reduced (after overloading) to a single bare regex which has
5217 * been returned (i.e. /$qr/).
5219 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5221 * pm_flags contains the PMf_* flags, typically based on those from the
5222 * pm_flags field of the related PMOP. Currently we're only interested in
5223 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5225 * We can't allocate space until we know how big the compiled form will be,
5226 * but we can't compile it (and thus know how big it is) until we've got a
5227 * place to put the code. So we cheat: we compile it twice, once with code
5228 * generation turned off and size counting turned on, and once "for real".
5229 * This also means that we don't allocate space until we are sure that the
5230 * thing really will compile successfully, and we never have to move the
5231 * code and thus invalidate pointers into it. (Note that it has to be in
5232 * one piece because free() must be able to free it all.) [NB: not true in perl]
5234 * Beware that the optimization-preparation code in here knows about some
5235 * of the structure of the compiled regexp. [I'll say.]
5239 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5240 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5241 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5246 register regexp_internal *ri;
5256 /* these are all flags - maybe they should be turned
5257 * into a single int with different bit masks */
5258 I32 sawlookahead = 0;
5261 bool used_setjump = FALSE;
5262 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5263 bool code_is_utf8 = 0;
5264 bool VOL recompile = 0;
5265 bool runtime_code = 0;
5269 RExC_state_t RExC_state;
5270 RExC_state_t * const pRExC_state = &RExC_state;
5271 #ifdef TRIE_STUDY_OPT
5273 RExC_state_t copyRExC_state;
5275 GET_RE_DEBUG_FLAGS_DECL;
5277 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5279 DEBUG_r(if (!PL_colorset) reginitcolors());
5281 #ifndef PERL_IN_XSUB_RE
5282 /* Initialize these here instead of as-needed, as is quick and avoids
5283 * having to test them each time otherwise */
5284 if (! PL_AboveLatin1) {
5285 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5286 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5287 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5289 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5290 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5292 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5293 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5295 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5296 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5298 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5300 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5301 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5303 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5305 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5306 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5308 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5309 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5311 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5312 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5314 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5315 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5317 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5318 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5320 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5321 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5323 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5324 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5326 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5327 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5329 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5331 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5332 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5334 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5335 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5339 pRExC_state->code_blocks = NULL;
5340 pRExC_state->num_code_blocks = 0;
5343 *is_bare_re = FALSE;
5345 if (expr && (expr->op_type == OP_LIST ||
5346 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5348 /* is the source UTF8, and how many code blocks are there? */
5352 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5353 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5355 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5356 /* count of DO blocks */
5360 pRExC_state->num_code_blocks = ncode;
5361 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5366 /* handle a list of SVs */
5370 /* apply magic and RE overloading to each arg */
5371 for (svp = patternp; svp < patternp + pat_count; svp++) {
5374 if (SvROK(rx) && SvAMAGIC(rx)) {
5375 SV *sv = AMG_CALLunary(rx, regexp_amg);
5379 if (SvTYPE(sv) != SVt_REGEXP)
5380 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5386 if (pat_count > 1) {
5387 /* concat multiple args and find any code block indexes */
5392 STRLEN orig_patlen = 0;
5394 if (pRExC_state->num_code_blocks) {
5395 o = cLISTOPx(expr)->op_first;
5396 assert(o->op_type == OP_PUSHMARK);
5400 pat = newSVpvn("", 0);
5403 /* determine if the pattern is going to be utf8 (needed
5404 * in advance to align code block indices correctly).
5405 * XXX This could fail to be detected for an arg with
5406 * overloading but not concat overloading; but the main effect
5407 * in this obscure case is to need a 'use re eval' for a
5408 * literal code block */
5409 for (svp = patternp; svp < patternp + pat_count; svp++) {
5416 for (svp = patternp; svp < patternp + pat_count; svp++) {
5417 SV *sv, *msv = *svp;
5421 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5422 assert(n < pRExC_state->num_code_blocks);
5423 pRExC_state->code_blocks[n].start = SvCUR(pat);
5424 pRExC_state->code_blocks[n].block = o;
5425 pRExC_state->code_blocks[n].src_regex = NULL;
5428 o = o->op_sibling; /* skip CONST */
5434 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5435 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5438 /* overloading involved: all bets are off over literal
5439 * code. Pretend we haven't seen it */
5440 pRExC_state->num_code_blocks -= n;
5446 while (SvAMAGIC(msv)
5447 && (sv = AMG_CALLunary(msv, string_amg))
5453 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5455 orig_patlen = SvCUR(pat);
5456 sv_catsv_nomg(pat, msv);
5459 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5462 /* extract any code blocks within any embedded qr//'s */
5463 if (rx && SvTYPE(rx) == SVt_REGEXP
5464 && RX_ENGINE((REGEXP*)rx)->op_comp)
5467 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5468 if (ri->num_code_blocks) {
5470 /* the presence of an embedded qr// with code means
5471 * we should always recompile: the text of the
5472 * qr// may not have changed, but it may be a
5473 * different closure than last time */
5475 Renew(pRExC_state->code_blocks,
5476 pRExC_state->num_code_blocks + ri->num_code_blocks,
5477 struct reg_code_block);
5478 pRExC_state->num_code_blocks += ri->num_code_blocks;
5479 for (i=0; i < ri->num_code_blocks; i++) {
5480 struct reg_code_block *src, *dst;
5481 STRLEN offset = orig_patlen
5482 + ((struct regexp *)SvANY(rx))->pre_prefix;
5483 assert(n < pRExC_state->num_code_blocks);
5484 src = &ri->code_blocks[i];
5485 dst = &pRExC_state->code_blocks[n];
5486 dst->start = src->start + offset;
5487 dst->end = src->end + offset;
5488 dst->block = src->block;
5489 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5503 while (SvAMAGIC(pat)
5504 && (sv = AMG_CALLunary(pat, string_amg))
5512 /* handle bare regex: foo =~ $re */
5517 if (SvTYPE(re) == SVt_REGEXP) {
5521 Safefree(pRExC_state->code_blocks);
5527 /* not a list of SVs, so must be a list of OPs */
5529 if (expr->op_type == OP_LIST) {
5534 pat = newSVpvn("", 0);
5539 /* given a list of CONSTs and DO blocks in expr, append all
5540 * the CONSTs to pat, and record the start and end of each
5541 * code block in code_blocks[] (each DO{} op is followed by an
5542 * OP_CONST containing the corresponding literal '(?{...})
5545 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5546 if (o->op_type == OP_CONST) {
5547 sv_catsv(pat, cSVOPo_sv);
5549 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5553 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5554 assert(i+1 < pRExC_state->num_code_blocks);
5555 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5556 pRExC_state->code_blocks[i].block = o;
5557 pRExC_state->code_blocks[i].src_regex = NULL;
5563 assert(expr->op_type == OP_CONST);
5564 pat = cSVOPx_sv(expr);
5568 exp = SvPV_nomg(pat, plen);
5570 if (!eng->op_comp) {
5571 if ((SvUTF8(pat) && IN_BYTES)
5572 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5574 /* make a temporary copy; either to convert to bytes,
5575 * or to avoid repeating get-magic / overloaded stringify */
5576 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5577 (IN_BYTES ? 0 : SvUTF8(pat)));
5579 Safefree(pRExC_state->code_blocks);
5580 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5583 /* ignore the utf8ness if the pattern is 0 length */
5584 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5585 RExC_uni_semantics = 0;
5586 RExC_contains_locale = 0;
5587 pRExC_state->runtime_code_qr = NULL;
5589 /****************** LONG JUMP TARGET HERE***********************/
5590 /* Longjmp back to here if have to switch in midstream to utf8 */
5591 if (! RExC_orig_utf8) {
5592 JMPENV_PUSH(jump_ret);
5593 used_setjump = TRUE;
5596 if (jump_ret == 0) { /* First time through */
5600 SV *dsv= sv_newmortal();
5601 RE_PV_QUOTED_DECL(s, RExC_utf8,
5602 dsv, exp, plen, 60);
5603 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5604 PL_colors[4],PL_colors[5],s);
5607 else { /* longjumped back */
5610 STRLEN s = 0, d = 0;
5613 /* If the cause for the longjmp was other than changing to utf8, pop
5614 * our own setjmp, and longjmp to the correct handler */
5615 if (jump_ret != UTF8_LONGJMP) {
5617 JMPENV_JUMP(jump_ret);
5622 /* It's possible to write a regexp in ascii that represents Unicode
5623 codepoints outside of the byte range, such as via \x{100}. If we
5624 detect such a sequence we have to convert the entire pattern to utf8
5625 and then recompile, as our sizing calculation will have been based
5626 on 1 byte == 1 character, but we will need to use utf8 to encode
5627 at least some part of the pattern, and therefore must convert the whole
5630 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5631 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5633 /* upgrade pattern to UTF8, and if there are code blocks,
5634 * recalculate the indices.
5635 * This is essentially an unrolled Perl_bytes_to_utf8() */
5637 src = (U8*)SvPV_nomg(pat, plen);
5638 Newx(dst, plen * 2 + 1, U8);
5641 const UV uv = NATIVE_TO_ASCII(src[s]);
5642 if (UNI_IS_INVARIANT(uv))
5643 dst[d] = (U8)UTF_TO_NATIVE(uv);
5645 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5646 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5648 if (n < pRExC_state->num_code_blocks) {
5649 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5650 pRExC_state->code_blocks[n].start = d;
5651 assert(dst[d] == '(');
5654 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5655 pRExC_state->code_blocks[n].end = d;
5656 assert(dst[d] == ')');
5669 RExC_orig_utf8 = RExC_utf8 = 1;
5672 /* return old regex if pattern hasn't changed */
5676 && !!RX_UTF8(old_re) == !!RExC_utf8
5677 && RX_PRECOMP(old_re)
5678 && RX_PRELEN(old_re) == plen
5679 && memEQ(RX_PRECOMP(old_re), exp, plen))
5681 /* with runtime code, always recompile */
5682 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5684 if (!runtime_code) {
5685 ReREFCNT_inc(old_re);
5689 Safefree(pRExC_state->code_blocks);
5693 else if ((pm_flags & PMf_USE_RE_EVAL)
5694 /* this second condition covers the non-regex literal case,
5695 * i.e. $foo =~ '(?{})'. */
5696 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5697 && (PL_hints & HINT_RE_EVAL))
5699 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5702 #ifdef TRIE_STUDY_OPT
5706 rx_flags = orig_rx_flags;
5708 if (initial_charset == REGEX_LOCALE_CHARSET) {
5709 RExC_contains_locale = 1;
5711 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5713 /* Set to use unicode semantics if the pattern is in utf8 and has the
5714 * 'depends' charset specified, as it means unicode when utf8 */
5715 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5719 RExC_flags = rx_flags;
5720 RExC_pm_flags = pm_flags;
5723 if (PL_tainting && PL_tainted)
5724 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5726 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5727 /* whoops, we have a non-utf8 pattern, whilst run-time code
5728 * got compiled as utf8. Try again with a utf8 pattern */
5729 JMPENV_JUMP(UTF8_LONGJMP);
5732 assert(!pRExC_state->runtime_code_qr);
5737 RExC_in_lookbehind = 0;
5738 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5740 RExC_override_recoding = 0;
5742 /* First pass: determine size, legality. */
5750 RExC_emit = &PL_regdummy;
5751 RExC_whilem_seen = 0;
5752 RExC_open_parens = NULL;
5753 RExC_close_parens = NULL;
5755 RExC_paren_names = NULL;
5757 RExC_paren_name_list = NULL;
5759 RExC_recurse = NULL;
5760 RExC_recurse_count = 0;
5761 pRExC_state->code_index = 0;
5763 #if 0 /* REGC() is (currently) a NOP at the first pass.
5764 * Clever compilers notice this and complain. --jhi */
5765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5768 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5770 RExC_lastparse=NULL;
5772 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5773 RExC_precomp = NULL;
5774 Safefree(pRExC_state->code_blocks);
5778 /* Here, finished first pass. Get rid of any added setjmp */
5784 PerlIO_printf(Perl_debug_log,
5785 "Required size %"IVdf" nodes\n"
5786 "Starting second pass (creation)\n",
5789 RExC_lastparse=NULL;
5792 /* The first pass could have found things that force Unicode semantics */
5793 if ((RExC_utf8 || RExC_uni_semantics)
5794 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5796 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5799 /* Small enough for pointer-storage convention?
5800 If extralen==0, this means that we will not need long jumps. */
5801 if (RExC_size >= 0x10000L && RExC_extralen)
5802 RExC_size += RExC_extralen;
5805 if (RExC_whilem_seen > 15)
5806 RExC_whilem_seen = 15;
5808 /* Allocate space and zero-initialize. Note, the two step process
5809 of zeroing when in debug mode, thus anything assigned has to
5810 happen after that */
5811 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5812 r = (struct regexp*)SvANY(rx);
5813 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5814 char, regexp_internal);
5815 if ( r == NULL || ri == NULL )
5816 FAIL("Regexp out of space");
5818 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5819 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5821 /* bulk initialize base fields with 0. */
5822 Zero(ri, sizeof(regexp_internal), char);
5825 /* non-zero initialization begins here */
5828 r->extflags = rx_flags;
5829 if (pm_flags & PMf_IS_QR) {
5830 ri->code_blocks = pRExC_state->code_blocks;
5831 ri->num_code_blocks = pRExC_state->num_code_blocks;
5834 SAVEFREEPV(pRExC_state->code_blocks);
5837 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5838 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5840 /* The caret is output if there are any defaults: if not all the STD
5841 * flags are set, or if no character set specifier is needed */
5843 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5845 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5846 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5847 >> RXf_PMf_STD_PMMOD_SHIFT);
5848 const char *fptr = STD_PAT_MODS; /*"msix"*/
5850 /* Allocate for the worst case, which is all the std flags are turned
5851 * on. If more precision is desired, we could do a population count of
5852 * the flags set. This could be done with a small lookup table, or by
5853 * shifting, masking and adding, or even, when available, assembly
5854 * language for a machine-language population count.
5855 * We never output a minus, as all those are defaults, so are
5856 * covered by the caret */
5857 const STRLEN wraplen = plen + has_p + has_runon
5858 + has_default /* If needs a caret */
5860 /* If needs a character set specifier */
5861 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5862 + (sizeof(STD_PAT_MODS) - 1)
5863 + (sizeof("(?:)") - 1);
5865 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5868 SvFLAGS(rx) |= SVf_UTF8;
5871 /* If a default, cover it using the caret */
5873 *p++= DEFAULT_PAT_MOD;
5877 const char* const name = get_regex_charset_name(r->extflags, &len);
5878 Copy(name, p, len, char);
5882 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5885 while((ch = *fptr++)) {
5893 Copy(RExC_precomp, p, plen, char);
5894 assert ((RX_WRAPPED(rx) - p) < 16);
5895 r->pre_prefix = p - RX_WRAPPED(rx);
5901 SvCUR_set(rx, p - SvPVX_const(rx));
5905 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5907 if (RExC_seen & REG_SEEN_RECURSE) {
5908 Newxz(RExC_open_parens, RExC_npar,regnode *);
5909 SAVEFREEPV(RExC_open_parens);
5910 Newxz(RExC_close_parens,RExC_npar,regnode *);
5911 SAVEFREEPV(RExC_close_parens);
5914 /* Useful during FAIL. */
5915 #ifdef RE_TRACK_PATTERN_OFFSETS
5916 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5918 "%s %"UVuf" bytes for offset annotations.\n",
5919 ri->u.offsets ? "Got" : "Couldn't get",
5920 (UV)((2*RExC_size+1) * sizeof(U32))));
5922 SetProgLen(ri,RExC_size);
5927 /* Second pass: emit code. */
5928 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5929 RExC_pm_flags = pm_flags;
5934 RExC_emit_start = ri->program;
5935 RExC_emit = ri->program;
5936 RExC_emit_bound = ri->program + RExC_size + 1;
5937 pRExC_state->code_index = 0;
5939 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5940 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5944 /* XXXX To minimize changes to RE engine we always allocate
5945 3-units-long substrs field. */
5946 Newx(r->substrs, 1, struct reg_substr_data);
5947 if (RExC_recurse_count) {
5948 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5949 SAVEFREEPV(RExC_recurse);
5953 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5954 Zero(r->substrs, 1, struct reg_substr_data);
5956 #ifdef TRIE_STUDY_OPT
5958 StructCopy(&zero_scan_data, &data, scan_data_t);
5959 copyRExC_state = RExC_state;
5962 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5964 RExC_state = copyRExC_state;
5965 if (seen & REG_TOP_LEVEL_BRANCHES)
5966 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5968 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5969 if (data.last_found) {
5970 SvREFCNT_dec(data.longest_fixed);
5971 SvREFCNT_dec(data.longest_float);
5972 SvREFCNT_dec(data.last_found);
5974 StructCopy(&zero_scan_data, &data, scan_data_t);
5977 StructCopy(&zero_scan_data, &data, scan_data_t);
5980 /* Dig out information for optimizations. */
5981 r->extflags = RExC_flags; /* was pm_op */
5982 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5985 SvUTF8_on(rx); /* Unicode in it? */
5986 ri->regstclass = NULL;
5987 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5988 r->intflags |= PREGf_NAUGHTY;
5989 scan = ri->program + 1; /* First BRANCH. */
5991 /* testing for BRANCH here tells us whether there is "must appear"
5992 data in the pattern. If there is then we can use it for optimisations */
5993 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5995 STRLEN longest_float_length, longest_fixed_length;
5996 struct regnode_charclass_class ch_class; /* pointed to by data */
5998 I32 last_close = 0; /* pointed to by data */
5999 regnode *first= scan;
6000 regnode *first_next= regnext(first);
6002 * Skip introductions and multiplicators >= 1
6003 * so that we can extract the 'meat' of the pattern that must
6004 * match in the large if() sequence following.
6005 * NOTE that EXACT is NOT covered here, as it is normally
6006 * picked up by the optimiser separately.
6008 * This is unfortunate as the optimiser isnt handling lookahead
6009 * properly currently.
6012 while ((OP(first) == OPEN && (sawopen = 1)) ||
6013 /* An OR of *one* alternative - should not happen now. */
6014 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6015 /* for now we can't handle lookbehind IFMATCH*/
6016 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6017 (OP(first) == PLUS) ||
6018 (OP(first) == MINMOD) ||
6019 /* An {n,m} with n>0 */
6020 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6021 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6024 * the only op that could be a regnode is PLUS, all the rest
6025 * will be regnode_1 or regnode_2.
6028 if (OP(first) == PLUS)
6031 first += regarglen[OP(first)];
6033 first = NEXTOPER(first);
6034 first_next= regnext(first);
6037 /* Starting-point info. */
6039 DEBUG_PEEP("first:",first,0);
6040 /* Ignore EXACT as we deal with it later. */
6041 if (PL_regkind[OP(first)] == EXACT) {
6042 if (OP(first) == EXACT)
6043 NOOP; /* Empty, get anchored substr later. */
6045 ri->regstclass = first;
6048 else if (PL_regkind[OP(first)] == TRIE &&
6049 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6052 /* this can happen only on restudy */
6053 if ( OP(first) == TRIE ) {
6054 struct regnode_1 *trieop = (struct regnode_1 *)
6055 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6056 StructCopy(first,trieop,struct regnode_1);
6057 trie_op=(regnode *)trieop;
6059 struct regnode_charclass *trieop = (struct regnode_charclass *)
6060 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6061 StructCopy(first,trieop,struct regnode_charclass);
6062 trie_op=(regnode *)trieop;
6065 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6066 ri->regstclass = trie_op;
6069 else if (REGNODE_SIMPLE(OP(first)))
6070 ri->regstclass = first;
6071 else if (PL_regkind[OP(first)] == BOUND ||
6072 PL_regkind[OP(first)] == NBOUND)
6073 ri->regstclass = first;
6074 else if (PL_regkind[OP(first)] == BOL) {
6075 r->extflags |= (OP(first) == MBOL
6077 : (OP(first) == SBOL
6080 first = NEXTOPER(first);
6083 else if (OP(first) == GPOS) {
6084 r->extflags |= RXf_ANCH_GPOS;
6085 first = NEXTOPER(first);
6088 else if ((!sawopen || !RExC_sawback) &&
6089 (OP(first) == STAR &&
6090 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6091 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6093 /* turn .* into ^.* with an implied $*=1 */
6095 (OP(NEXTOPER(first)) == REG_ANY)
6098 r->extflags |= type;
6099 r->intflags |= PREGf_IMPLICIT;
6100 first = NEXTOPER(first);
6103 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6104 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6105 /* x+ must match at the 1st pos of run of x's */
6106 r->intflags |= PREGf_SKIP;
6108 /* Scan is after the zeroth branch, first is atomic matcher. */
6109 #ifdef TRIE_STUDY_OPT
6112 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6113 (IV)(first - scan + 1))
6117 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6118 (IV)(first - scan + 1))
6124 * If there's something expensive in the r.e., find the
6125 * longest literal string that must appear and make it the
6126 * regmust. Resolve ties in favor of later strings, since
6127 * the regstart check works with the beginning of the r.e.
6128 * and avoiding duplication strengthens checking. Not a
6129 * strong reason, but sufficient in the absence of others.
6130 * [Now we resolve ties in favor of the earlier string if
6131 * it happens that c_offset_min has been invalidated, since the
6132 * earlier string may buy us something the later one won't.]
6135 data.longest_fixed = newSVpvs("");
6136 data.longest_float = newSVpvs("");
6137 data.last_found = newSVpvs("");
6138 data.longest = &(data.longest_fixed);
6140 if (!ri->regstclass) {
6141 cl_init(pRExC_state, &ch_class);
6142 data.start_class = &ch_class;
6143 stclass_flag = SCF_DO_STCLASS_AND;
6144 } else /* XXXX Check for BOUND? */
6146 data.last_closep = &last_close;
6148 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6149 &data, -1, NULL, NULL,
6150 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6156 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6157 && data.last_start_min == 0 && data.last_end > 0
6158 && !RExC_seen_zerolen
6159 && !(RExC_seen & REG_SEEN_VERBARG)
6160 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6161 r->extflags |= RXf_CHECK_ALL;
6162 scan_commit(pRExC_state, &data,&minlen,0);
6163 SvREFCNT_dec(data.last_found);
6165 /* Note that code very similar to this but for anchored string
6166 follows immediately below, changes may need to be made to both.
6169 longest_float_length = CHR_SVLEN(data.longest_float);
6170 if (longest_float_length
6171 || (data.flags & SF_FL_BEFORE_EOL
6172 && (!(data.flags & SF_FL_BEFORE_MEOL)
6173 || (RExC_flags & RXf_PMf_MULTILINE))))
6177 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6178 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6179 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6180 && data.offset_fixed == data.offset_float_min
6181 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6182 goto remove_float; /* As in (a)+. */
6184 /* copy the information about the longest float from the reg_scan_data
6185 over to the program. */
6186 if (SvUTF8(data.longest_float)) {
6187 r->float_utf8 = data.longest_float;
6188 r->float_substr = NULL;
6190 r->float_substr = data.longest_float;
6191 r->float_utf8 = NULL;
6193 /* float_end_shift is how many chars that must be matched that
6194 follow this item. We calculate it ahead of time as once the
6195 lookbehind offset is added in we lose the ability to correctly
6197 ml = data.minlen_float ? *(data.minlen_float)
6198 : (I32)longest_float_length;
6199 r->float_end_shift = ml - data.offset_float_min
6200 - longest_float_length + (SvTAIL(data.longest_float) != 0)
6201 + data.lookbehind_float;
6202 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6203 r->float_max_offset = data.offset_float_max;
6204 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6205 r->float_max_offset -= data.lookbehind_float;
6207 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
6208 && (!(data.flags & SF_FL_BEFORE_MEOL)
6209 || (RExC_flags & RXf_PMf_MULTILINE)));
6210 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
6214 r->float_substr = r->float_utf8 = NULL;
6215 SvREFCNT_dec(data.longest_float);
6216 longest_float_length = 0;
6219 /* Note that code very similar to this but for floating string
6220 is immediately above, changes may need to be made to both.
6223 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6225 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
6226 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
6227 && (longest_fixed_length
6228 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
6229 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6230 || (RExC_flags & RXf_PMf_MULTILINE)))) )
6234 /* copy the information about the longest fixed
6235 from the reg_scan_data over to the program. */
6236 if (SvUTF8(data.longest_fixed)) {
6237 r->anchored_utf8 = data.longest_fixed;
6238 r->anchored_substr = NULL;
6240 r->anchored_substr = data.longest_fixed;
6241 r->anchored_utf8 = NULL;
6243 /* fixed_end_shift is how many chars that must be matched that
6244 follow this item. We calculate it ahead of time as once the
6245 lookbehind offset is added in we lose the ability to correctly
6247 ml = data.minlen_fixed ? *(data.minlen_fixed)
6248 : (I32)longest_fixed_length;
6249 r->anchored_end_shift = ml - data.offset_fixed
6250 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
6251 + data.lookbehind_fixed;
6252 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6254 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6255 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6256 || (RExC_flags & RXf_PMf_MULTILINE)));
6257 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6260 r->anchored_substr = r->anchored_utf8 = NULL;
6261 SvREFCNT_dec(data.longest_fixed);
6262 longest_fixed_length = 0;
6265 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6266 ri->regstclass = NULL;
6268 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6270 && !(data.start_class->flags & ANYOF_EOS)
6271 && !cl_is_anything(data.start_class))
6273 const U32 n = add_data(pRExC_state, 1, "f");
6274 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6276 Newx(RExC_rxi->data->data[n], 1,
6277 struct regnode_charclass_class);
6278 StructCopy(data.start_class,
6279 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6280 struct regnode_charclass_class);
6281 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6282 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6283 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6284 regprop(r, sv, (regnode*)data.start_class);
6285 PerlIO_printf(Perl_debug_log,
6286 "synthetic stclass \"%s\".\n",
6287 SvPVX_const(sv));});
6290 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6291 if (longest_fixed_length > longest_float_length) {
6292 r->check_end_shift = r->anchored_end_shift;
6293 r->check_substr = r->anchored_substr;
6294 r->check_utf8 = r->anchored_utf8;
6295 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6296 if (r->extflags & RXf_ANCH_SINGLE)
6297 r->extflags |= RXf_NOSCAN;
6300 r->check_end_shift = r->float_end_shift;
6301 r->check_substr = r->float_substr;
6302 r->check_utf8 = r->float_utf8;
6303 r->check_offset_min = r->float_min_offset;
6304 r->check_offset_max = r->float_max_offset;
6306 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6307 This should be changed ASAP! */
6308 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6309 r->extflags |= RXf_USE_INTUIT;
6310 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6311 r->extflags |= RXf_INTUIT_TAIL;
6313 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6314 if ( (STRLEN)minlen < longest_float_length )
6315 minlen= longest_float_length;
6316 if ( (STRLEN)minlen < longest_fixed_length )
6317 minlen= longest_fixed_length;
6321 /* Several toplevels. Best we can is to set minlen. */
6323 struct regnode_charclass_class ch_class;
6326 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6328 scan = ri->program + 1;
6329 cl_init(pRExC_state, &ch_class);
6330 data.start_class = &ch_class;
6331 data.last_closep = &last_close;
6334 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6335 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6339 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6340 = r->float_substr = r->float_utf8 = NULL;
6342 if (!(data.start_class->flags & ANYOF_EOS)
6343 && !cl_is_anything(data.start_class))
6345 const U32 n = add_data(pRExC_state, 1, "f");
6346 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6348 Newx(RExC_rxi->data->data[n], 1,
6349 struct regnode_charclass_class);
6350 StructCopy(data.start_class,
6351 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6352 struct regnode_charclass_class);
6353 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6354 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6355 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6356 regprop(r, sv, (regnode*)data.start_class);
6357 PerlIO_printf(Perl_debug_log,
6358 "synthetic stclass \"%s\".\n",
6359 SvPVX_const(sv));});
6363 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6364 the "real" pattern. */
6366 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6367 (IV)minlen, (IV)r->minlen);
6369 r->minlenret = minlen;
6370 if (r->minlen < minlen)
6373 if (RExC_seen & REG_SEEN_GPOS)
6374 r->extflags |= RXf_GPOS_SEEN;
6375 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6376 r->extflags |= RXf_LOOKBEHIND_SEEN;
6377 if (pRExC_state->num_code_blocks)
6378 r->extflags |= RXf_EVAL_SEEN;
6379 if (RExC_seen & REG_SEEN_CANY)
6380 r->extflags |= RXf_CANY_SEEN;
6381 if (RExC_seen & REG_SEEN_VERBARG)
6382 r->intflags |= PREGf_VERBARG_SEEN;
6383 if (RExC_seen & REG_SEEN_CUTGROUP)
6384 r->intflags |= PREGf_CUTGROUP_SEEN;
6385 if (pm_flags & PMf_USE_RE_EVAL)
6386 r->intflags |= PREGf_USE_RE_EVAL;
6387 if (RExC_paren_names)
6388 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6390 RXp_PAREN_NAMES(r) = NULL;
6392 #ifdef STUPID_PATTERN_CHECKS
6393 if (RX_PRELEN(rx) == 0)
6394 r->extflags |= RXf_NULL;
6395 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6396 /* XXX: this should happen BEFORE we compile */
6397 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6398 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6399 r->extflags |= RXf_WHITE;
6400 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6401 r->extflags |= RXf_START_ONLY;
6403 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6404 /* XXX: this should happen BEFORE we compile */
6405 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6407 regnode *first = ri->program + 1;
6410 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6411 r->extflags |= RXf_NULL;
6412 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6413 r->extflags |= RXf_START_ONLY;
6414 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6415 && OP(regnext(first)) == END)
6416 r->extflags |= RXf_WHITE;
6420 if (RExC_paren_names) {
6421 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6422 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6425 ri->name_list_idx = 0;
6427 if (RExC_recurse_count) {
6428 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6429 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6430 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6433 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6434 /* assume we don't need to swap parens around before we match */
6437 PerlIO_printf(Perl_debug_log,"Final program:\n");
6440 #ifdef RE_TRACK_PATTERN_OFFSETS
6441 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6442 const U32 len = ri->u.offsets[0];
6444 GET_RE_DEBUG_FLAGS_DECL;
6445 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6446 for (i = 1; i <= len; i++) {
6447 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6448 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6449 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6451 PerlIO_printf(Perl_debug_log, "\n");
6459 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6462 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6464 PERL_UNUSED_ARG(value);
6466 if (flags & RXapif_FETCH) {
6467 return reg_named_buff_fetch(rx, key, flags);
6468 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6469 Perl_croak_no_modify(aTHX);
6471 } else if (flags & RXapif_EXISTS) {
6472 return reg_named_buff_exists(rx, key, flags)
6475 } else if (flags & RXapif_REGNAMES) {
6476 return reg_named_buff_all(rx, flags);
6477 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6478 return reg_named_buff_scalar(rx, flags);
6480 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6486 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6489 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6490 PERL_UNUSED_ARG(lastkey);
6492 if (flags & RXapif_FIRSTKEY)
6493 return reg_named_buff_firstkey(rx, flags);
6494 else if (flags & RXapif_NEXTKEY)
6495 return reg_named_buff_nextkey(rx, flags);
6497 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6503 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6506 AV *retarray = NULL;
6508 struct regexp *const rx = (struct regexp *)SvANY(r);
6510 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6512 if (flags & RXapif_ALL)
6515 if (rx && RXp_PAREN_NAMES(rx)) {
6516 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6519 SV* sv_dat=HeVAL(he_str);
6520 I32 *nums=(I32*)SvPVX(sv_dat);
6521 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6522 if ((I32)(rx->nparens) >= nums[i]
6523 && rx->offs[nums[i]].start != -1
6524 && rx->offs[nums[i]].end != -1)
6527 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6532 ret = newSVsv(&PL_sv_undef);
6535 av_push(retarray, ret);
6538 return newRV_noinc(MUTABLE_SV(retarray));
6545 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6548 struct regexp *const rx = (struct regexp *)SvANY(r);
6550 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6552 if (rx && RXp_PAREN_NAMES(rx)) {
6553 if (flags & RXapif_ALL) {
6554 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6556 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6570 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6572 struct regexp *const rx = (struct regexp *)SvANY(r);
6574 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6576 if ( rx && RXp_PAREN_NAMES(rx) ) {
6577 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6579 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6586 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6588 struct regexp *const rx = (struct regexp *)SvANY(r);
6589 GET_RE_DEBUG_FLAGS_DECL;
6591 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6593 if (rx && RXp_PAREN_NAMES(rx)) {
6594 HV *hv = RXp_PAREN_NAMES(rx);
6596 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6599 SV* sv_dat = HeVAL(temphe);
6600 I32 *nums = (I32*)SvPVX(sv_dat);
6601 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6602 if ((I32)(rx->lastparen) >= nums[i] &&
6603 rx->offs[nums[i]].start != -1 &&
6604 rx->offs[nums[i]].end != -1)
6610 if (parno || flags & RXapif_ALL) {
6611 return newSVhek(HeKEY_hek(temphe));
6619 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6624 struct regexp *const rx = (struct regexp *)SvANY(r);
6626 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6628 if (rx && RXp_PAREN_NAMES(rx)) {
6629 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6630 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6631 } else if (flags & RXapif_ONE) {
6632 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6633 av = MUTABLE_AV(SvRV(ret));
6634 length = av_len(av);
6636 return newSViv(length + 1);
6638 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6642 return &PL_sv_undef;
6646 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6648 struct regexp *const rx = (struct regexp *)SvANY(r);
6651 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6653 if (rx && RXp_PAREN_NAMES(rx)) {
6654 HV *hv= RXp_PAREN_NAMES(rx);
6656 (void)hv_iterinit(hv);
6657 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6660 SV* sv_dat = HeVAL(temphe);
6661 I32 *nums = (I32*)SvPVX(sv_dat);
6662 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6663 if ((I32)(rx->lastparen) >= nums[i] &&
6664 rx->offs[nums[i]].start != -1 &&
6665 rx->offs[nums[i]].end != -1)
6671 if (parno || flags & RXapif_ALL) {
6672 av_push(av, newSVhek(HeKEY_hek(temphe)));
6677 return newRV_noinc(MUTABLE_SV(av));
6681 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6684 struct regexp *const rx = (struct regexp *)SvANY(r);
6689 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6692 sv_setsv(sv,&PL_sv_undef);
6696 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6698 i = rx->offs[0].start;
6702 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6704 s = rx->subbeg + rx->offs[0].end;
6705 i = rx->sublen - rx->offs[0].end;
6708 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6709 (s1 = rx->offs[paren].start) != -1 &&
6710 (t1 = rx->offs[paren].end) != -1)
6714 s = rx->subbeg + s1;
6716 sv_setsv(sv,&PL_sv_undef);
6719 assert(rx->sublen >= (s - rx->subbeg) + i );
6721 const int oldtainted = PL_tainted;
6723 sv_setpvn(sv, s, i);
6724 PL_tainted = oldtainted;
6725 if ( (rx->extflags & RXf_CANY_SEEN)
6726 ? (RXp_MATCH_UTF8(rx)
6727 && (!i || is_utf8_string((U8*)s, i)))
6728 : (RXp_MATCH_UTF8(rx)) )
6735 if (RXp_MATCH_TAINTED(rx)) {
6736 if (SvTYPE(sv) >= SVt_PVMG) {
6737 MAGIC* const mg = SvMAGIC(sv);
6740 SvMAGIC_set(sv, mg->mg_moremagic);
6742 if ((mgt = SvMAGIC(sv))) {
6743 mg->mg_moremagic = mgt;
6744 SvMAGIC_set(sv, mg);
6754 sv_setsv(sv,&PL_sv_undef);
6760 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6761 SV const * const value)
6763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6765 PERL_UNUSED_ARG(rx);
6766 PERL_UNUSED_ARG(paren);
6767 PERL_UNUSED_ARG(value);
6770 Perl_croak_no_modify(aTHX);
6774 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6777 struct regexp *const rx = (struct regexp *)SvANY(r);
6781 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6783 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6785 /* $` / ${^PREMATCH} */
6786 case RX_BUFF_IDX_PREMATCH:
6787 if (rx->offs[0].start != -1) {
6788 i = rx->offs[0].start;
6796 /* $' / ${^POSTMATCH} */
6797 case RX_BUFF_IDX_POSTMATCH:
6798 if (rx->offs[0].end != -1) {
6799 i = rx->sublen - rx->offs[0].end;
6801 s1 = rx->offs[0].end;
6807 /* $& / ${^MATCH}, $1, $2, ... */
6809 if (paren <= (I32)rx->nparens &&
6810 (s1 = rx->offs[paren].start) != -1 &&
6811 (t1 = rx->offs[paren].end) != -1)
6816 if (ckWARN(WARN_UNINITIALIZED))
6817 report_uninit((const SV *)sv);
6822 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6823 const char * const s = rx->subbeg + s1;
6828 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6835 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6837 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6838 PERL_UNUSED_ARG(rx);
6842 return newSVpvs("Regexp");
6845 /* Scans the name of a named buffer from the pattern.
6846 * If flags is REG_RSN_RETURN_NULL returns null.
6847 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6848 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6849 * to the parsed name as looked up in the RExC_paren_names hash.
6850 * If there is an error throws a vFAIL().. type exception.
6853 #define REG_RSN_RETURN_NULL 0
6854 #define REG_RSN_RETURN_NAME 1
6855 #define REG_RSN_RETURN_DATA 2
6858 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6860 char *name_start = RExC_parse;
6862 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6864 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6865 /* skip IDFIRST by using do...while */
6868 RExC_parse += UTF8SKIP(RExC_parse);
6869 } while (isALNUM_utf8((U8*)RExC_parse));
6873 } while (isALNUM(*RExC_parse));
6875 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6876 vFAIL("Group name must start with a non-digit word character");
6880 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6881 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6882 if ( flags == REG_RSN_RETURN_NAME)
6884 else if (flags==REG_RSN_RETURN_DATA) {
6887 if ( ! sv_name ) /* should not happen*/
6888 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6889 if (RExC_paren_names)
6890 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6892 sv_dat = HeVAL(he_str);
6894 vFAIL("Reference to nonexistent named group");
6898 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6899 (unsigned long) flags);
6901 assert(0); /* NOT REACHED */
6906 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6907 int rem=(int)(RExC_end - RExC_parse); \
6916 if (RExC_lastparse!=RExC_parse) \
6917 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6920 iscut ? "..." : "<" \
6923 PerlIO_printf(Perl_debug_log,"%16s",""); \
6926 num = RExC_size + 1; \
6928 num=REG_NODE_NUM(RExC_emit); \
6929 if (RExC_lastnum!=num) \
6930 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6932 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6933 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6934 (int)((depth*2)), "", \
6938 RExC_lastparse=RExC_parse; \
6943 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6944 DEBUG_PARSE_MSG((funcname)); \
6945 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6947 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6948 DEBUG_PARSE_MSG((funcname)); \
6949 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6952 /* This section of code defines the inversion list object and its methods. The
6953 * interfaces are highly subject to change, so as much as possible is static to
6954 * this file. An inversion list is here implemented as a malloc'd C UV array
6955 * with some added info that is placed as UVs at the beginning in a header
6956 * portion. An inversion list for Unicode is an array of code points, sorted
6957 * by ordinal number. The zeroth element is the first code point in the list.
6958 * The 1th element is the first element beyond that not in the list. In other
6959 * words, the first range is
6960 * invlist[0]..(invlist[1]-1)
6961 * The other ranges follow. Thus every element whose index is divisible by two
6962 * marks the beginning of a range that is in the list, and every element not
6963 * divisible by two marks the beginning of a range not in the list. A single
6964 * element inversion list that contains the single code point N generally
6965 * consists of two elements
6968 * (The exception is when N is the highest representable value on the
6969 * machine, in which case the list containing just it would be a single
6970 * element, itself. By extension, if the last range in the list extends to
6971 * infinity, then the first element of that range will be in the inversion list
6972 * at a position that is divisible by two, and is the final element in the
6974 * Taking the complement (inverting) an inversion list is quite simple, if the
6975 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6976 * This implementation reserves an element at the beginning of each inversion list
6977 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6978 * beginning of the list is either that element if 0, or the next one if 1.
6980 * More about inversion lists can be found in "Unicode Demystified"
6981 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6982 * More will be coming when functionality is added later.
6984 * The inversion list data structure is currently implemented as an SV pointing
6985 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6986 * array of UV whose memory management is automatically handled by the existing
6987 * facilities for SV's.
6989 * Some of the methods should always be private to the implementation, and some
6990 * should eventually be made public */
6992 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6993 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6995 /* This is a combination of a version and data structure type, so that one
6996 * being passed in can be validated to be an inversion list of the correct
6997 * vintage. When the structure of the header is changed, a new random number
6998 * in the range 2**31-1 should be generated and the new() method changed to
6999 * insert that at this location. Then, if an auxiliary program doesn't change
7000 * correspondingly, it will be discovered immediately */
7001 #define INVLIST_VERSION_ID_OFFSET 2
7002 #define INVLIST_VERSION_ID 1064334010
7004 /* For safety, when adding new elements, remember to #undef them at the end of
7005 * the inversion list code section */
7007 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
7008 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
7009 * contains the code point U+00000, and begins here. If 1, the inversion list
7010 * doesn't contain U+0000, and it begins at the next UV in the array.
7011 * Inverting an inversion list consists of adding or removing the 0 at the
7012 * beginning of it. By reserving a space for that 0, inversion can be made
7015 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
7017 /* Internally things are UVs */
7018 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7019 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7021 #define INVLIST_INITIAL_LEN 10
7023 PERL_STATIC_INLINE UV*
7024 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7026 /* Returns a pointer to the first element in the inversion list's array.
7027 * This is called upon initialization of an inversion list. Where the
7028 * array begins depends on whether the list has the code point U+0000
7029 * in it or not. The other parameter tells it whether the code that
7030 * follows this call is about to put a 0 in the inversion list or not.
7031 * The first element is either the element with 0, if 0, or the next one,
7034 UV* zero = get_invlist_zero_addr(invlist);
7036 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7039 assert(! *get_invlist_len_addr(invlist));
7041 /* 1^1 = 0; 1^0 = 1 */
7042 *zero = 1 ^ will_have_0;
7043 return zero + *zero;
7046 PERL_STATIC_INLINE UV*
7047 S_invlist_array(pTHX_ SV* const invlist)
7049 /* Returns the pointer to the inversion list's array. Every time the
7050 * length changes, this needs to be called in case malloc or realloc moved
7053 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7055 /* Must not be empty. If these fail, you probably didn't check for <len>
7056 * being non-zero before trying to get the array */
7057 assert(*get_invlist_len_addr(invlist));
7058 assert(*get_invlist_zero_addr(invlist) == 0
7059 || *get_invlist_zero_addr(invlist) == 1);
7061 /* The array begins either at the element reserved for zero if the
7062 * list contains 0 (that element will be set to 0), or otherwise the next
7063 * element (in which case the reserved element will be set to 1). */
7064 return (UV *) (get_invlist_zero_addr(invlist)
7065 + *get_invlist_zero_addr(invlist));
7068 PERL_STATIC_INLINE UV*
7069 S_get_invlist_len_addr(pTHX_ SV* invlist)
7071 /* Return the address of the UV that contains the current number
7072 * of used elements in the inversion list */
7074 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
7076 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
7079 PERL_STATIC_INLINE UV
7080 S_invlist_len(pTHX_ SV* const invlist)
7082 /* Returns the current number of elements stored in the inversion list's
7085 PERL_ARGS_ASSERT_INVLIST_LEN;
7087 return *get_invlist_len_addr(invlist);
7090 PERL_STATIC_INLINE void
7091 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7093 /* Sets the current number of elements stored in the inversion list */
7095 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7097 *get_invlist_len_addr(invlist) = len;
7099 assert(len <= SvLEN(invlist));
7101 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7102 /* If the list contains U+0000, that element is part of the header,
7103 * and should not be counted as part of the array. It will contain
7104 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7106 * SvCUR_set(invlist,
7107 * TO_INTERNAL_SIZE(len
7108 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7109 * But, this is only valid if len is not 0. The consequences of not doing
7110 * this is that the memory allocation code may think that 1 more UV is
7111 * being used than actually is, and so might do an unnecessary grow. That
7112 * seems worth not bothering to make this the precise amount.
7114 * Note that when inverting, SvCUR shouldn't change */
7117 PERL_STATIC_INLINE UV
7118 S_invlist_max(pTHX_ SV* const invlist)
7120 /* Returns the maximum number of elements storable in the inversion list's
7121 * array, without having to realloc() */
7123 PERL_ARGS_ASSERT_INVLIST_MAX;
7125 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7128 PERL_STATIC_INLINE UV*
7129 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7131 /* Return the address of the UV that is reserved to hold 0 if the inversion
7132 * list contains 0. This has to be the last element of the heading, as the
7133 * list proper starts with either it if 0, or the next element if not.
7134 * (But we force it to contain either 0 or 1) */
7136 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7138 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7141 #ifndef PERL_IN_XSUB_RE
7143 Perl__new_invlist(pTHX_ IV initial_size)
7146 /* Return a pointer to a newly constructed inversion list, with enough
7147 * space to store 'initial_size' elements. If that number is negative, a
7148 * system default is used instead */
7152 if (initial_size < 0) {
7153 initial_size = INVLIST_INITIAL_LEN;
7156 /* Allocate the initial space */
7157 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7158 invlist_set_len(new_list, 0);
7160 /* Force iterinit() to be used to get iteration to work */
7161 *get_invlist_iter_addr(new_list) = UV_MAX;
7163 /* This should force a segfault if a method doesn't initialize this
7165 *get_invlist_zero_addr(new_list) = UV_MAX;
7167 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7168 #if HEADER_LENGTH != 4
7169 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7177 S__new_invlist_C_array(pTHX_ UV* list)
7179 /* Return a pointer to a newly constructed inversion list, initialized to
7180 * point to <list>, which has to be in the exact correct inversion list
7181 * form, including internal fields. Thus this is a dangerous routine that
7182 * should not be used in the wrong hands */
7184 SV* invlist = newSV_type(SVt_PV);
7186 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7188 SvPV_set(invlist, (char *) list);
7189 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7190 shouldn't touch it */
7191 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
7193 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7194 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7201 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7203 /* Grow the maximum size of an inversion list */
7205 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7207 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7210 PERL_STATIC_INLINE void
7211 S_invlist_trim(pTHX_ SV* const invlist)
7213 PERL_ARGS_ASSERT_INVLIST_TRIM;
7215 /* Change the length of the inversion list to how many entries it currently
7218 SvPV_shrink_to_cur((SV *) invlist);
7221 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
7223 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
7224 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
7226 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7229 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7231 /* Subject to change or removal. Append the range from 'start' to 'end' at
7232 * the end of the inversion list. The range must be above any existing
7236 UV max = invlist_max(invlist);
7237 UV len = invlist_len(invlist);
7239 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7241 if (len == 0) { /* Empty lists must be initialized */
7242 array = _invlist_array_init(invlist, start == 0);
7245 /* Here, the existing list is non-empty. The current max entry in the
7246 * list is generally the first value not in the set, except when the
7247 * set extends to the end of permissible values, in which case it is
7248 * the first entry in that final set, and so this call is an attempt to
7249 * append out-of-order */
7251 UV final_element = len - 1;
7252 array = invlist_array(invlist);
7253 if (array[final_element] > start
7254 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7256 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7257 array[final_element], start,
7258 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7261 /* Here, it is a legal append. If the new range begins with the first
7262 * value not in the set, it is extending the set, so the new first
7263 * value not in the set is one greater than the newly extended range.
7265 if (array[final_element] == start) {
7266 if (end != UV_MAX) {
7267 array[final_element] = end + 1;
7270 /* But if the end is the maximum representable on the machine,
7271 * just let the range that this would extend to have no end */
7272 invlist_set_len(invlist, len - 1);
7278 /* Here the new range doesn't extend any existing set. Add it */
7280 len += 2; /* Includes an element each for the start and end of range */
7282 /* If overflows the existing space, extend, which may cause the array to be
7285 invlist_extend(invlist, len);
7286 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7287 failure in invlist_array() */
7288 array = invlist_array(invlist);
7291 invlist_set_len(invlist, len);
7294 /* The next item on the list starts the range, the one after that is
7295 * one past the new range. */
7296 array[len - 2] = start;
7297 if (end != UV_MAX) {
7298 array[len - 1] = end + 1;
7301 /* But if the end is the maximum representable on the machine, just let
7302 * the range have no end */
7303 invlist_set_len(invlist, len - 1);
7307 #ifndef PERL_IN_XSUB_RE
7310 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7312 /* Searches the inversion list for the entry that contains the input code
7313 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7314 * return value is the index into the list's array of the range that
7318 IV high = invlist_len(invlist);
7319 const UV * const array = invlist_array(invlist);
7321 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7323 /* If list is empty or the code point is before the first element, return
7325 if (high == 0 || cp < array[0]) {
7329 /* Binary search. What we are looking for is <i> such that
7330 * array[i] <= cp < array[i+1]
7331 * The loop below converges on the i+1. */
7332 while (low < high) {
7333 IV mid = (low + high) / 2;
7334 if (array[mid] <= cp) {
7337 /* We could do this extra test to exit the loop early.
7338 if (cp < array[low]) {
7343 else { /* cp < array[mid] */
7352 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7354 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7355 * but is used when the swash has an inversion list. This makes this much
7356 * faster, as it uses a binary search instead of a linear one. This is
7357 * intimately tied to that function, and perhaps should be in utf8.c,
7358 * except it is intimately tied to inversion lists as well. It assumes
7359 * that <swatch> is all 0's on input */
7362 const IV len = invlist_len(invlist);
7366 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7368 if (len == 0) { /* Empty inversion list */
7372 array = invlist_array(invlist);
7374 /* Find which element it is */
7375 i = invlist_search(invlist, start);
7377 /* We populate from <start> to <end> */
7378 while (current < end) {
7381 /* The inversion list gives the results for every possible code point
7382 * after the first one in the list. Only those ranges whose index is
7383 * even are ones that the inversion list matches. For the odd ones,
7384 * and if the initial code point is not in the list, we have to skip
7385 * forward to the next element */
7386 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7388 if (i >= len) { /* Finished if beyond the end of the array */
7392 if (current >= end) { /* Finished if beyond the end of what we
7397 assert(current >= start);
7399 /* The current range ends one below the next one, except don't go past
7402 upper = (i < len && array[i] < end) ? array[i] : end;
7404 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7405 * for each code point in it */
7406 for (; current < upper; current++) {
7407 const STRLEN offset = (STRLEN)(current - start);
7408 swatch[offset >> 3] |= 1 << (offset & 7);
7411 /* Quit if at the end of the list */
7414 /* But first, have to deal with the highest possible code point on
7415 * the platform. The previous code assumes that <end> is one
7416 * beyond where we want to populate, but that is impossible at the
7417 * platform's infinity, so have to handle it specially */
7418 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7420 const STRLEN offset = (STRLEN)(end - start);
7421 swatch[offset >> 3] |= 1 << (offset & 7);
7426 /* Advance to the next range, which will be for code points not in the
7436 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7438 /* Take the union of two inversion lists and point <output> to it. *output
7439 * should be defined upon input, and if it points to one of the two lists,
7440 * the reference count to that list will be decremented. The first list,
7441 * <a>, may be NULL, in which case a copy of the second list is returned.
7442 * If <complement_b> is TRUE, the union is taken of the complement
7443 * (inversion) of <b> instead of b itself.
7445 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7446 * Richard Gillam, published by Addison-Wesley, and explained at some
7447 * length there. The preface says to incorporate its examples into your
7448 * code at your own risk.
7450 * The algorithm is like a merge sort.
7452 * XXX A potential performance improvement is to keep track as we go along
7453 * if only one of the inputs contributes to the result, meaning the other
7454 * is a subset of that one. In that case, we can skip the final copy and
7455 * return the larger of the input lists, but then outside code might need
7456 * to keep track of whether to free the input list or not */
7458 UV* array_a; /* a's array */
7460 UV len_a; /* length of a's array */
7463 SV* u; /* the resulting union */
7467 UV i_a = 0; /* current index into a's array */
7471 /* running count, as explained in the algorithm source book; items are
7472 * stopped accumulating and are output when the count changes to/from 0.
7473 * The count is incremented when we start a range that's in the set, and
7474 * decremented when we start a range that's not in the set. So its range
7475 * is 0 to 2. Only when the count is zero is something not in the set.
7479 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7482 /* If either one is empty, the union is the other one */
7483 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7490 *output = invlist_clone(b);
7492 _invlist_invert(*output);
7494 } /* else *output already = b; */
7497 else if ((len_b = invlist_len(b)) == 0) {
7502 /* The complement of an empty list is a list that has everything in it,
7503 * so the union with <a> includes everything too */
7508 *output = _new_invlist(1);
7509 _append_range_to_invlist(*output, 0, UV_MAX);
7511 else if (*output != a) {
7512 *output = invlist_clone(a);
7514 /* else *output already = a; */
7518 /* Here both lists exist and are non-empty */
7519 array_a = invlist_array(a);
7520 array_b = invlist_array(b);
7522 /* If are to take the union of 'a' with the complement of b, set it
7523 * up so are looking at b's complement. */
7526 /* To complement, we invert: if the first element is 0, remove it. To
7527 * do this, we just pretend the array starts one later, and clear the
7528 * flag as we don't have to do anything else later */
7529 if (array_b[0] == 0) {
7532 complement_b = FALSE;
7536 /* But if the first element is not zero, we unshift a 0 before the
7537 * array. The data structure reserves a space for that 0 (which
7538 * should be a '1' right now), so physical shifting is unneeded,
7539 * but temporarily change that element to 0. Before exiting the
7540 * routine, we must restore the element to '1' */
7547 /* Size the union for the worst case: that the sets are completely
7549 u = _new_invlist(len_a + len_b);
7551 /* Will contain U+0000 if either component does */
7552 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7553 || (len_b > 0 && array_b[0] == 0));
7555 /* Go through each list item by item, stopping when exhausted one of
7557 while (i_a < len_a && i_b < len_b) {
7558 UV cp; /* The element to potentially add to the union's array */
7559 bool cp_in_set; /* is it in the the input list's set or not */
7561 /* We need to take one or the other of the two inputs for the union.
7562 * Since we are merging two sorted lists, we take the smaller of the
7563 * next items. In case of a tie, we take the one that is in its set
7564 * first. If we took one not in the set first, it would decrement the
7565 * count, possibly to 0 which would cause it to be output as ending the
7566 * range, and the next time through we would take the same number, and
7567 * output it again as beginning the next range. By doing it the
7568 * opposite way, there is no possibility that the count will be
7569 * momentarily decremented to 0, and thus the two adjoining ranges will
7570 * be seamlessly merged. (In a tie and both are in the set or both not
7571 * in the set, it doesn't matter which we take first.) */
7572 if (array_a[i_a] < array_b[i_b]
7573 || (array_a[i_a] == array_b[i_b]
7574 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7576 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7580 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7584 /* Here, have chosen which of the two inputs to look at. Only output
7585 * if the running count changes to/from 0, which marks the
7586 * beginning/end of a range in that's in the set */
7589 array_u[i_u++] = cp;
7596 array_u[i_u++] = cp;
7601 /* Here, we are finished going through at least one of the lists, which
7602 * means there is something remaining in at most one. We check if the list
7603 * that hasn't been exhausted is positioned such that we are in the middle
7604 * of a range in its set or not. (i_a and i_b point to the element beyond
7605 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7606 * is potentially more to output.
7607 * There are four cases:
7608 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7609 * in the union is entirely from the non-exhausted set.
7610 * 2) Both were in their sets, count is 2. Nothing further should
7611 * be output, as everything that remains will be in the exhausted
7612 * list's set, hence in the union; decrementing to 1 but not 0 insures
7614 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7615 * Nothing further should be output because the union includes
7616 * everything from the exhausted set. Not decrementing ensures that.
7617 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7618 * decrementing to 0 insures that we look at the remainder of the
7619 * non-exhausted set */
7620 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7621 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7626 /* The final length is what we've output so far, plus what else is about to
7627 * be output. (If 'count' is non-zero, then the input list we exhausted
7628 * has everything remaining up to the machine's limit in its set, and hence
7629 * in the union, so there will be no further output. */
7632 /* At most one of the subexpressions will be non-zero */
7633 len_u += (len_a - i_a) + (len_b - i_b);
7636 /* Set result to final length, which can change the pointer to array_u, so
7638 if (len_u != invlist_len(u)) {
7639 invlist_set_len(u, len_u);
7641 array_u = invlist_array(u);
7644 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7645 * the other) ended with everything above it not in its set. That means
7646 * that the remaining part of the union is precisely the same as the
7647 * non-exhausted list, so can just copy it unchanged. (If both list were
7648 * exhausted at the same time, then the operations below will be both 0.)
7651 IV copy_count; /* At most one will have a non-zero copy count */
7652 if ((copy_count = len_a - i_a) > 0) {
7653 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7655 else if ((copy_count = len_b - i_b) > 0) {
7656 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7660 /* We may be removing a reference to one of the inputs */
7661 if (a == *output || b == *output) {
7662 SvREFCNT_dec(*output);
7665 /* If we've changed b, restore it */
7675 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7677 /* Take the intersection of two inversion lists and point <i> to it. *i
7678 * should be defined upon input, and if it points to one of the two lists,
7679 * the reference count to that list will be decremented.
7680 * If <complement_b> is TRUE, the result will be the intersection of <a>
7681 * and the complement (or inversion) of <b> instead of <b> directly.
7683 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7684 * Richard Gillam, published by Addison-Wesley, and explained at some
7685 * length there. The preface says to incorporate its examples into your
7686 * code at your own risk. In fact, it had bugs
7688 * The algorithm is like a merge sort, and is essentially the same as the
7692 UV* array_a; /* a's array */
7694 UV len_a; /* length of a's array */
7697 SV* r; /* the resulting intersection */
7701 UV i_a = 0; /* current index into a's array */
7705 /* running count, as explained in the algorithm source book; items are
7706 * stopped accumulating and are output when the count changes to/from 2.
7707 * The count is incremented when we start a range that's in the set, and
7708 * decremented when we start a range that's not in the set. So its range
7709 * is 0 to 2. Only when the count is 2 is something in the intersection.
7713 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7716 /* Special case if either one is empty */
7717 len_a = invlist_len(a);
7718 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7720 if (len_a != 0 && complement_b) {
7722 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7723 * be empty. Here, also we are using 'b's complement, which hence
7724 * must be every possible code point. Thus the intersection is
7727 *i = invlist_clone(a);
7733 /* else *i is already 'a' */
7737 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7738 * intersection must be empty */
7745 *i = _new_invlist(0);
7749 /* Here both lists exist and are non-empty */
7750 array_a = invlist_array(a);
7751 array_b = invlist_array(b);
7753 /* If are to take the intersection of 'a' with the complement of b, set it
7754 * up so are looking at b's complement. */
7757 /* To complement, we invert: if the first element is 0, remove it. To
7758 * do this, we just pretend the array starts one later, and clear the
7759 * flag as we don't have to do anything else later */
7760 if (array_b[0] == 0) {
7763 complement_b = FALSE;
7767 /* But if the first element is not zero, we unshift a 0 before the
7768 * array. The data structure reserves a space for that 0 (which
7769 * should be a '1' right now), so physical shifting is unneeded,
7770 * but temporarily change that element to 0. Before exiting the
7771 * routine, we must restore the element to '1' */
7778 /* Size the intersection for the worst case: that the intersection ends up
7779 * fragmenting everything to be completely disjoint */
7780 r= _new_invlist(len_a + len_b);
7782 /* Will contain U+0000 iff both components do */
7783 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7784 && len_b > 0 && array_b[0] == 0);
7786 /* Go through each list item by item, stopping when exhausted one of
7788 while (i_a < len_a && i_b < len_b) {
7789 UV cp; /* The element to potentially add to the intersection's
7791 bool cp_in_set; /* Is it in the input list's set or not */
7793 /* We need to take one or the other of the two inputs for the
7794 * intersection. Since we are merging two sorted lists, we take the
7795 * smaller of the next items. In case of a tie, we take the one that
7796 * is not in its set first (a difference from the union algorithm). If
7797 * we took one in the set first, it would increment the count, possibly
7798 * to 2 which would cause it to be output as starting a range in the
7799 * intersection, and the next time through we would take that same
7800 * number, and output it again as ending the set. By doing it the
7801 * opposite of this, there is no possibility that the count will be
7802 * momentarily incremented to 2. (In a tie and both are in the set or
7803 * both not in the set, it doesn't matter which we take first.) */
7804 if (array_a[i_a] < array_b[i_b]
7805 || (array_a[i_a] == array_b[i_b]
7806 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7808 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7812 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7816 /* Here, have chosen which of the two inputs to look at. Only output
7817 * if the running count changes to/from 2, which marks the
7818 * beginning/end of a range that's in the intersection */
7822 array_r[i_r++] = cp;
7827 array_r[i_r++] = cp;
7833 /* Here, we are finished going through at least one of the lists, which
7834 * means there is something remaining in at most one. We check if the list
7835 * that has been exhausted is positioned such that we are in the middle
7836 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7837 * the ones we care about.) There are four cases:
7838 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7839 * nothing left in the intersection.
7840 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7841 * above 2. What should be output is exactly that which is in the
7842 * non-exhausted set, as everything it has is also in the intersection
7843 * set, and everything it doesn't have can't be in the intersection
7844 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7845 * gets incremented to 2. Like the previous case, the intersection is
7846 * everything that remains in the non-exhausted set.
7847 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7848 * remains 1. And the intersection has nothing more. */
7849 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7850 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7855 /* The final length is what we've output so far plus what else is in the
7856 * intersection. At most one of the subexpressions below will be non-zero */
7859 len_r += (len_a - i_a) + (len_b - i_b);
7862 /* Set result to final length, which can change the pointer to array_r, so
7864 if (len_r != invlist_len(r)) {
7865 invlist_set_len(r, len_r);
7867 array_r = invlist_array(r);
7870 /* Finish outputting any remaining */
7871 if (count >= 2) { /* At most one will have a non-zero copy count */
7873 if ((copy_count = len_a - i_a) > 0) {
7874 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7876 else if ((copy_count = len_b - i_b) > 0) {
7877 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7881 /* We may be removing a reference to one of the inputs */
7882 if (a == *i || b == *i) {
7886 /* If we've changed b, restore it */
7896 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7898 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7899 * set. A pointer to the inversion list is returned. This may actually be
7900 * a new list, in which case the passed in one has been destroyed. The
7901 * passed in inversion list can be NULL, in which case a new one is created
7902 * with just the one range in it */
7907 if (invlist == NULL) {
7908 invlist = _new_invlist(2);
7912 len = invlist_len(invlist);
7915 /* If comes after the final entry, can just append it to the end */
7917 || start >= invlist_array(invlist)
7918 [invlist_len(invlist) - 1])
7920 _append_range_to_invlist(invlist, start, end);
7924 /* Here, can't just append things, create and return a new inversion list
7925 * which is the union of this range and the existing inversion list */
7926 range_invlist = _new_invlist(2);
7927 _append_range_to_invlist(range_invlist, start, end);
7929 _invlist_union(invlist, range_invlist, &invlist);
7931 /* The temporary can be freed */
7932 SvREFCNT_dec(range_invlist);
7939 PERL_STATIC_INLINE SV*
7940 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7941 return _add_range_to_invlist(invlist, cp, cp);
7944 #ifndef PERL_IN_XSUB_RE
7946 Perl__invlist_invert(pTHX_ SV* const invlist)
7948 /* Complement the input inversion list. This adds a 0 if the list didn't
7949 * have a zero; removes it otherwise. As described above, the data
7950 * structure is set up so that this is very efficient */
7952 UV* len_pos = get_invlist_len_addr(invlist);
7954 PERL_ARGS_ASSERT__INVLIST_INVERT;
7956 /* The inverse of matching nothing is matching everything */
7957 if (*len_pos == 0) {
7958 _append_range_to_invlist(invlist, 0, UV_MAX);
7962 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7963 * zero element was a 0, so it is being removed, so the length decrements
7964 * by 1; and vice-versa. SvCUR is unaffected */
7965 if (*get_invlist_zero_addr(invlist) ^= 1) {
7974 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7976 /* Complement the input inversion list (which must be a Unicode property,
7977 * all of which don't match above the Unicode maximum code point.) And
7978 * Perl has chosen to not have the inversion match above that either. This
7979 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7985 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7987 _invlist_invert(invlist);
7989 len = invlist_len(invlist);
7991 if (len != 0) { /* If empty do nothing */
7992 array = invlist_array(invlist);
7993 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7994 /* Add 0x110000. First, grow if necessary */
7996 if (invlist_max(invlist) < len) {
7997 invlist_extend(invlist, len);
7998 array = invlist_array(invlist);
8000 invlist_set_len(invlist, len);
8001 array[len - 1] = PERL_UNICODE_MAX + 1;
8003 else { /* Remove the 0x110000 */
8004 invlist_set_len(invlist, len - 1);
8012 PERL_STATIC_INLINE SV*
8013 S_invlist_clone(pTHX_ SV* const invlist)
8016 /* Return a new inversion list that is a copy of the input one, which is
8019 /* Need to allocate extra space to accommodate Perl's addition of a
8020 * trailing NUL to SvPV's, since it thinks they are always strings */
8021 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
8022 STRLEN length = SvCUR(invlist);
8024 PERL_ARGS_ASSERT_INVLIST_CLONE;
8026 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8027 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8032 PERL_STATIC_INLINE UV*
8033 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8035 /* Return the address of the UV that contains the current iteration
8038 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8040 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8043 PERL_STATIC_INLINE UV*
8044 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8046 /* Return the address of the UV that contains the version id. */
8048 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8050 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8053 PERL_STATIC_INLINE void
8054 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8056 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8058 *get_invlist_iter_addr(invlist) = 0;
8062 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8064 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8065 * This call sets in <*start> and <*end>, the next range in <invlist>.
8066 * Returns <TRUE> if successful and the next call will return the next
8067 * range; <FALSE> if was already at the end of the list. If the latter,
8068 * <*start> and <*end> are unchanged, and the next call to this function
8069 * will start over at the beginning of the list */
8071 UV* pos = get_invlist_iter_addr(invlist);
8072 UV len = invlist_len(invlist);
8075 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8078 *pos = UV_MAX; /* Force iternit() to be required next time */
8082 array = invlist_array(invlist);
8084 *start = array[(*pos)++];
8090 *end = array[(*pos)++] - 1;
8096 #ifndef PERL_IN_XSUB_RE
8098 Perl__invlist_contents(pTHX_ SV* const invlist)
8100 /* Get the contents of an inversion list into a string SV so that they can
8101 * be printed out. It uses the format traditionally done for debug tracing
8105 SV* output = newSVpvs("\n");
8107 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8109 invlist_iterinit(invlist);
8110 while (invlist_iternext(invlist, &start, &end)) {
8111 if (end == UV_MAX) {
8112 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8114 else if (end != start) {
8115 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8119 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8129 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8131 /* Dumps out the ranges in an inversion list. The string 'header'
8132 * if present is output on a line before the first range */
8136 if (header && strlen(header)) {
8137 PerlIO_printf(Perl_debug_log, "%s\n", header);
8139 invlist_iterinit(invlist);
8140 while (invlist_iternext(invlist, &start, &end)) {
8141 if (end == UV_MAX) {
8142 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8145 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8151 #undef HEADER_LENGTH
8152 #undef INVLIST_INITIAL_LENGTH
8153 #undef TO_INTERNAL_SIZE
8154 #undef FROM_INTERNAL_SIZE
8155 #undef INVLIST_LEN_OFFSET
8156 #undef INVLIST_ZERO_OFFSET
8157 #undef INVLIST_ITER_OFFSET
8158 #undef INVLIST_VERSION_ID
8160 /* End of inversion list object */
8163 - reg - regular expression, i.e. main body or parenthesized thing
8165 * Caller must absorb opening parenthesis.
8167 * Combining parenthesis handling with the base level of regular expression
8168 * is a trifle forced, but the need to tie the tails of the branches to what
8169 * follows makes it hard to avoid.
8171 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8173 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8175 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8179 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8180 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8183 register regnode *ret; /* Will be the head of the group. */
8184 register regnode *br;
8185 register regnode *lastbr;
8186 register regnode *ender = NULL;
8187 register I32 parno = 0;
8189 U32 oregflags = RExC_flags;
8190 bool have_branch = 0;
8192 I32 freeze_paren = 0;
8193 I32 after_freeze = 0;
8195 /* for (?g), (?gc), and (?o) warnings; warning
8196 about (?c) will warn about (?g) -- japhy */
8198 #define WASTED_O 0x01
8199 #define WASTED_G 0x02
8200 #define WASTED_C 0x04
8201 #define WASTED_GC (0x02|0x04)
8202 I32 wastedflags = 0x00;
8204 char * parse_start = RExC_parse; /* MJD */
8205 char * const oregcomp_parse = RExC_parse;
8207 GET_RE_DEBUG_FLAGS_DECL;
8209 PERL_ARGS_ASSERT_REG;
8210 DEBUG_PARSE("reg ");
8212 *flagp = 0; /* Tentatively. */
8215 /* Make an OPEN node, if parenthesized. */
8217 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8218 char *start_verb = RExC_parse;
8219 STRLEN verb_len = 0;
8220 char *start_arg = NULL;
8221 unsigned char op = 0;
8223 int internal_argval = 0; /* internal_argval is only useful if !argok */
8224 while ( *RExC_parse && *RExC_parse != ')' ) {
8225 if ( *RExC_parse == ':' ) {
8226 start_arg = RExC_parse + 1;
8232 verb_len = RExC_parse - start_verb;
8235 while ( *RExC_parse && *RExC_parse != ')' )
8237 if ( *RExC_parse != ')' )
8238 vFAIL("Unterminated verb pattern argument");
8239 if ( RExC_parse == start_arg )
8242 if ( *RExC_parse != ')' )
8243 vFAIL("Unterminated verb pattern");
8246 switch ( *start_verb ) {
8247 case 'A': /* (*ACCEPT) */
8248 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8250 internal_argval = RExC_nestroot;
8253 case 'C': /* (*COMMIT) */
8254 if ( memEQs(start_verb,verb_len,"COMMIT") )
8257 case 'F': /* (*FAIL) */
8258 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8263 case ':': /* (*:NAME) */
8264 case 'M': /* (*MARK:NAME) */
8265 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8270 case 'P': /* (*PRUNE) */
8271 if ( memEQs(start_verb,verb_len,"PRUNE") )
8274 case 'S': /* (*SKIP) */
8275 if ( memEQs(start_verb,verb_len,"SKIP") )
8278 case 'T': /* (*THEN) */
8279 /* [19:06] <TimToady> :: is then */
8280 if ( memEQs(start_verb,verb_len,"THEN") ) {
8282 RExC_seen |= REG_SEEN_CUTGROUP;
8288 vFAIL3("Unknown verb pattern '%.*s'",
8289 verb_len, start_verb);
8292 if ( start_arg && internal_argval ) {
8293 vFAIL3("Verb pattern '%.*s' may not have an argument",
8294 verb_len, start_verb);
8295 } else if ( argok < 0 && !start_arg ) {
8296 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8297 verb_len, start_verb);
8299 ret = reganode(pRExC_state, op, internal_argval);
8300 if ( ! internal_argval && ! SIZE_ONLY ) {
8302 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8303 ARG(ret) = add_data( pRExC_state, 1, "S" );
8304 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8311 if (!internal_argval)
8312 RExC_seen |= REG_SEEN_VERBARG;
8313 } else if ( start_arg ) {
8314 vFAIL3("Verb pattern '%.*s' may not have an argument",
8315 verb_len, start_verb);
8317 ret = reg_node(pRExC_state, op);
8319 nextchar(pRExC_state);
8322 if (*RExC_parse == '?') { /* (?...) */
8323 bool is_logical = 0;
8324 const char * const seqstart = RExC_parse;
8325 bool has_use_defaults = FALSE;
8328 paren = *RExC_parse++;
8329 ret = NULL; /* For look-ahead/behind. */
8332 case 'P': /* (?P...) variants for those used to PCRE/Python */
8333 paren = *RExC_parse++;
8334 if ( paren == '<') /* (?P<...>) named capture */
8336 else if (paren == '>') { /* (?P>name) named recursion */
8337 goto named_recursion;
8339 else if (paren == '=') { /* (?P=...) named backref */
8340 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8341 you change this make sure you change that */
8342 char* name_start = RExC_parse;
8344 SV *sv_dat = reg_scan_name(pRExC_state,
8345 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8346 if (RExC_parse == name_start || *RExC_parse != ')')
8347 vFAIL2("Sequence %.3s... not terminated",parse_start);
8350 num = add_data( pRExC_state, 1, "S" );
8351 RExC_rxi->data->data[num]=(void*)sv_dat;
8352 SvREFCNT_inc_simple_void(sv_dat);
8355 ret = reganode(pRExC_state,
8358 : (MORE_ASCII_RESTRICTED)
8360 : (AT_LEAST_UNI_SEMANTICS)
8368 Set_Node_Offset(ret, parse_start+1);
8369 Set_Node_Cur_Length(ret); /* MJD */
8371 nextchar(pRExC_state);
8375 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8377 case '<': /* (?<...) */
8378 if (*RExC_parse == '!')
8380 else if (*RExC_parse != '=')
8386 case '\'': /* (?'...') */
8387 name_start= RExC_parse;
8388 svname = reg_scan_name(pRExC_state,
8389 SIZE_ONLY ? /* reverse test from the others */
8390 REG_RSN_RETURN_NAME :
8391 REG_RSN_RETURN_NULL);
8392 if (RExC_parse == name_start) {
8394 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8397 if (*RExC_parse != paren)
8398 vFAIL2("Sequence (?%c... not terminated",
8399 paren=='>' ? '<' : paren);
8403 if (!svname) /* shouldn't happen */
8405 "panic: reg_scan_name returned NULL");
8406 if (!RExC_paren_names) {
8407 RExC_paren_names= newHV();
8408 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8410 RExC_paren_name_list= newAV();
8411 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8414 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8416 sv_dat = HeVAL(he_str);
8418 /* croak baby croak */
8420 "panic: paren_name hash element allocation failed");
8421 } else if ( SvPOK(sv_dat) ) {
8422 /* (?|...) can mean we have dupes so scan to check
8423 its already been stored. Maybe a flag indicating
8424 we are inside such a construct would be useful,
8425 but the arrays are likely to be quite small, so
8426 for now we punt -- dmq */
8427 IV count = SvIV(sv_dat);
8428 I32 *pv = (I32*)SvPVX(sv_dat);
8430 for ( i = 0 ; i < count ; i++ ) {
8431 if ( pv[i] == RExC_npar ) {
8437 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8438 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8439 pv[count] = RExC_npar;
8440 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8443 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8444 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8446 SvIV_set(sv_dat, 1);
8449 /* Yes this does cause a memory leak in debugging Perls */
8450 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8451 SvREFCNT_dec(svname);
8454 /*sv_dump(sv_dat);*/
8456 nextchar(pRExC_state);
8458 goto capturing_parens;
8460 RExC_seen |= REG_SEEN_LOOKBEHIND;
8461 RExC_in_lookbehind++;
8463 case '=': /* (?=...) */
8464 RExC_seen_zerolen++;
8466 case '!': /* (?!...) */
8467 RExC_seen_zerolen++;
8468 if (*RExC_parse == ')') {
8469 ret=reg_node(pRExC_state, OPFAIL);
8470 nextchar(pRExC_state);
8474 case '|': /* (?|...) */
8475 /* branch reset, behave like a (?:...) except that
8476 buffers in alternations share the same numbers */
8478 after_freeze = freeze_paren = RExC_npar;
8480 case ':': /* (?:...) */
8481 case '>': /* (?>...) */
8483 case '$': /* (?$...) */
8484 case '@': /* (?@...) */
8485 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8487 case '#': /* (?#...) */
8488 while (*RExC_parse && *RExC_parse != ')')
8490 if (*RExC_parse != ')')
8491 FAIL("Sequence (?#... not terminated");
8492 nextchar(pRExC_state);
8495 case '0' : /* (?0) */
8496 case 'R' : /* (?R) */
8497 if (*RExC_parse != ')')
8498 FAIL("Sequence (?R) not terminated");
8499 ret = reg_node(pRExC_state, GOSTART);
8500 *flagp |= POSTPONED;
8501 nextchar(pRExC_state);
8504 { /* named and numeric backreferences */
8506 case '&': /* (?&NAME) */
8507 parse_start = RExC_parse - 1;
8510 SV *sv_dat = reg_scan_name(pRExC_state,
8511 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8512 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8514 goto gen_recurse_regop;
8515 assert(0); /* NOT REACHED */
8517 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8519 vFAIL("Illegal pattern");
8521 goto parse_recursion;
8523 case '-': /* (?-1) */
8524 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8525 RExC_parse--; /* rewind to let it be handled later */
8529 case '1': case '2': case '3': case '4': /* (?1) */
8530 case '5': case '6': case '7': case '8': case '9':
8533 num = atoi(RExC_parse);
8534 parse_start = RExC_parse - 1; /* MJD */
8535 if (*RExC_parse == '-')
8537 while (isDIGIT(*RExC_parse))
8539 if (*RExC_parse!=')')
8540 vFAIL("Expecting close bracket");
8543 if ( paren == '-' ) {
8545 Diagram of capture buffer numbering.
8546 Top line is the normal capture buffer numbers
8547 Bottom line is the negative indexing as from
8551 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8555 num = RExC_npar + num;
8558 vFAIL("Reference to nonexistent group");
8560 } else if ( paren == '+' ) {
8561 num = RExC_npar + num - 1;
8564 ret = reganode(pRExC_state, GOSUB, num);
8566 if (num > (I32)RExC_rx->nparens) {
8568 vFAIL("Reference to nonexistent group");
8570 ARG2L_SET( ret, RExC_recurse_count++);
8572 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8573 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8577 RExC_seen |= REG_SEEN_RECURSE;
8578 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8579 Set_Node_Offset(ret, parse_start); /* MJD */
8581 *flagp |= POSTPONED;
8582 nextchar(pRExC_state);
8584 } /* named and numeric backreferences */
8585 assert(0); /* NOT REACHED */
8587 case '?': /* (??...) */
8589 if (*RExC_parse != '{') {
8591 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8594 *flagp |= POSTPONED;
8595 paren = *RExC_parse++;
8597 case '{': /* (?{...}) */
8600 struct reg_code_block *cb;
8602 RExC_seen_zerolen++;
8604 if ( !pRExC_state->num_code_blocks
8605 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8606 || pRExC_state->code_blocks[pRExC_state->code_index].start
8607 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8610 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8611 FAIL("panic: Sequence (?{...}): no code block found\n");
8612 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8614 /* this is a pre-compiled code block (?{...}) */
8615 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8616 RExC_parse = RExC_start + cb->end;
8619 if (cb->src_regex) {
8620 n = add_data(pRExC_state, 2, "rl");
8621 RExC_rxi->data->data[n] =
8622 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8623 RExC_rxi->data->data[n+1] = (void*)o;
8626 n = add_data(pRExC_state, 1,
8627 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8628 RExC_rxi->data->data[n] = (void*)o;
8631 pRExC_state->code_index++;
8632 nextchar(pRExC_state);
8636 ret = reg_node(pRExC_state, LOGICAL);
8637 eval = reganode(pRExC_state, EVAL, n);
8640 /* for later propagation into (??{}) return value */
8641 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8643 REGTAIL(pRExC_state, ret, eval);
8644 /* deal with the length of this later - MJD */
8647 ret = reganode(pRExC_state, EVAL, n);
8648 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8649 Set_Node_Offset(ret, parse_start);
8652 case '(': /* (?(?{...})...) and (?(?=...)...) */
8655 if (RExC_parse[0] == '?') { /* (?(?...)) */
8656 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8657 || RExC_parse[1] == '<'
8658 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8661 ret = reg_node(pRExC_state, LOGICAL);
8664 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8668 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8669 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8671 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8672 char *name_start= RExC_parse++;
8674 SV *sv_dat=reg_scan_name(pRExC_state,
8675 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8676 if (RExC_parse == name_start || *RExC_parse != ch)
8677 vFAIL2("Sequence (?(%c... not terminated",
8678 (ch == '>' ? '<' : ch));
8681 num = add_data( pRExC_state, 1, "S" );
8682 RExC_rxi->data->data[num]=(void*)sv_dat;
8683 SvREFCNT_inc_simple_void(sv_dat);
8685 ret = reganode(pRExC_state,NGROUPP,num);
8686 goto insert_if_check_paren;
8688 else if (RExC_parse[0] == 'D' &&
8689 RExC_parse[1] == 'E' &&
8690 RExC_parse[2] == 'F' &&
8691 RExC_parse[3] == 'I' &&
8692 RExC_parse[4] == 'N' &&
8693 RExC_parse[5] == 'E')
8695 ret = reganode(pRExC_state,DEFINEP,0);
8698 goto insert_if_check_paren;
8700 else if (RExC_parse[0] == 'R') {
8703 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8704 parno = atoi(RExC_parse++);
8705 while (isDIGIT(*RExC_parse))
8707 } else if (RExC_parse[0] == '&') {
8710 sv_dat = reg_scan_name(pRExC_state,
8711 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8712 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8714 ret = reganode(pRExC_state,INSUBP,parno);
8715 goto insert_if_check_paren;
8717 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8720 parno = atoi(RExC_parse++);
8722 while (isDIGIT(*RExC_parse))
8724 ret = reganode(pRExC_state, GROUPP, parno);
8726 insert_if_check_paren:
8727 if ((c = *nextchar(pRExC_state)) != ')')
8728 vFAIL("Switch condition not recognized");
8730 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8731 br = regbranch(pRExC_state, &flags, 1,depth+1);
8733 br = reganode(pRExC_state, LONGJMP, 0);
8735 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8736 c = *nextchar(pRExC_state);
8741 vFAIL("(?(DEFINE)....) does not allow branches");
8742 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8743 regbranch(pRExC_state, &flags, 1,depth+1);
8744 REGTAIL(pRExC_state, ret, lastbr);
8747 c = *nextchar(pRExC_state);
8752 vFAIL("Switch (?(condition)... contains too many branches");
8753 ender = reg_node(pRExC_state, TAIL);
8754 REGTAIL(pRExC_state, br, ender);
8756 REGTAIL(pRExC_state, lastbr, ender);
8757 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8760 REGTAIL(pRExC_state, ret, ender);
8761 RExC_size++; /* XXX WHY do we need this?!!
8762 For large programs it seems to be required
8763 but I can't figure out why. -- dmq*/
8767 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8771 RExC_parse--; /* for vFAIL to print correctly */
8772 vFAIL("Sequence (? incomplete");
8774 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8776 has_use_defaults = TRUE;
8777 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8778 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8779 ? REGEX_UNICODE_CHARSET
8780 : REGEX_DEPENDS_CHARSET);
8784 parse_flags: /* (?i) */
8786 U32 posflags = 0, negflags = 0;
8787 U32 *flagsp = &posflags;
8788 char has_charset_modifier = '\0';
8789 regex_charset cs = get_regex_charset(RExC_flags);
8790 if (cs == REGEX_DEPENDS_CHARSET
8791 && (RExC_utf8 || RExC_uni_semantics))
8793 cs = REGEX_UNICODE_CHARSET;
8796 while (*RExC_parse) {
8797 /* && strchr("iogcmsx", *RExC_parse) */
8798 /* (?g), (?gc) and (?o) are useless here
8799 and must be globally applied -- japhy */
8800 switch (*RExC_parse) {
8801 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8802 case LOCALE_PAT_MOD:
8803 if (has_charset_modifier) {
8804 goto excess_modifier;
8806 else if (flagsp == &negflags) {
8809 cs = REGEX_LOCALE_CHARSET;
8810 has_charset_modifier = LOCALE_PAT_MOD;
8811 RExC_contains_locale = 1;
8813 case UNICODE_PAT_MOD:
8814 if (has_charset_modifier) {
8815 goto excess_modifier;
8817 else if (flagsp == &negflags) {
8820 cs = REGEX_UNICODE_CHARSET;
8821 has_charset_modifier = UNICODE_PAT_MOD;
8823 case ASCII_RESTRICT_PAT_MOD:
8824 if (flagsp == &negflags) {
8827 if (has_charset_modifier) {
8828 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8829 goto excess_modifier;
8831 /* Doubled modifier implies more restricted */
8832 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8835 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8837 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8839 case DEPENDS_PAT_MOD:
8840 if (has_use_defaults) {
8841 goto fail_modifiers;
8843 else if (flagsp == &negflags) {
8846 else if (has_charset_modifier) {
8847 goto excess_modifier;
8850 /* The dual charset means unicode semantics if the
8851 * pattern (or target, not known until runtime) are
8852 * utf8, or something in the pattern indicates unicode
8854 cs = (RExC_utf8 || RExC_uni_semantics)
8855 ? REGEX_UNICODE_CHARSET
8856 : REGEX_DEPENDS_CHARSET;
8857 has_charset_modifier = DEPENDS_PAT_MOD;
8861 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8862 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8864 else if (has_charset_modifier == *(RExC_parse - 1)) {
8865 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8868 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8873 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8875 case ONCE_PAT_MOD: /* 'o' */
8876 case GLOBAL_PAT_MOD: /* 'g' */
8877 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8878 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8879 if (! (wastedflags & wflagbit) ) {
8880 wastedflags |= wflagbit;
8883 "Useless (%s%c) - %suse /%c modifier",
8884 flagsp == &negflags ? "?-" : "?",
8886 flagsp == &negflags ? "don't " : "",
8893 case CONTINUE_PAT_MOD: /* 'c' */
8894 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8895 if (! (wastedflags & WASTED_C) ) {
8896 wastedflags |= WASTED_GC;
8899 "Useless (%sc) - %suse /gc modifier",
8900 flagsp == &negflags ? "?-" : "?",
8901 flagsp == &negflags ? "don't " : ""
8906 case KEEPCOPY_PAT_MOD: /* 'p' */
8907 if (flagsp == &negflags) {
8909 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8911 *flagsp |= RXf_PMf_KEEPCOPY;
8915 /* A flag is a default iff it is following a minus, so
8916 * if there is a minus, it means will be trying to
8917 * re-specify a default which is an error */
8918 if (has_use_defaults || flagsp == &negflags) {
8921 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8925 wastedflags = 0; /* reset so (?g-c) warns twice */
8931 RExC_flags |= posflags;
8932 RExC_flags &= ~negflags;
8933 set_regex_charset(&RExC_flags, cs);
8935 oregflags |= posflags;
8936 oregflags &= ~negflags;
8937 set_regex_charset(&oregflags, cs);
8939 nextchar(pRExC_state);
8950 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8955 }} /* one for the default block, one for the switch */
8962 ret = reganode(pRExC_state, OPEN, parno);
8965 RExC_nestroot = parno;
8966 if (RExC_seen & REG_SEEN_RECURSE
8967 && !RExC_open_parens[parno-1])
8969 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8970 "Setting open paren #%"IVdf" to %d\n",
8971 (IV)parno, REG_NODE_NUM(ret)));
8972 RExC_open_parens[parno-1]= ret;
8975 Set_Node_Length(ret, 1); /* MJD */
8976 Set_Node_Offset(ret, RExC_parse); /* MJD */
8984 /* Pick up the branches, linking them together. */
8985 parse_start = RExC_parse; /* MJD */
8986 br = regbranch(pRExC_state, &flags, 1,depth+1);
8988 /* branch_len = (paren != 0); */
8992 if (*RExC_parse == '|') {
8993 if (!SIZE_ONLY && RExC_extralen) {
8994 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8997 reginsert(pRExC_state, BRANCH, br, depth+1);
8998 Set_Node_Length(br, paren != 0);
8999 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9003 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9005 else if (paren == ':') {
9006 *flagp |= flags&SIMPLE;
9008 if (is_open) { /* Starts with OPEN. */
9009 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9011 else if (paren != '?') /* Not Conditional */
9013 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9015 while (*RExC_parse == '|') {
9016 if (!SIZE_ONLY && RExC_extralen) {
9017 ender = reganode(pRExC_state, LONGJMP,0);
9018 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9021 RExC_extralen += 2; /* Account for LONGJMP. */
9022 nextchar(pRExC_state);
9024 if (RExC_npar > after_freeze)
9025 after_freeze = RExC_npar;
9026 RExC_npar = freeze_paren;
9028 br = regbranch(pRExC_state, &flags, 0, depth+1);
9032 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9034 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9037 if (have_branch || paren != ':') {
9038 /* Make a closing node, and hook it on the end. */
9041 ender = reg_node(pRExC_state, TAIL);
9044 ender = reganode(pRExC_state, CLOSE, parno);
9045 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9046 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9047 "Setting close paren #%"IVdf" to %d\n",
9048 (IV)parno, REG_NODE_NUM(ender)));
9049 RExC_close_parens[parno-1]= ender;
9050 if (RExC_nestroot == parno)
9053 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9054 Set_Node_Length(ender,1); /* MJD */
9060 *flagp &= ~HASWIDTH;
9063 ender = reg_node(pRExC_state, SUCCEED);
9066 ender = reg_node(pRExC_state, END);
9068 assert(!RExC_opend); /* there can only be one! */
9073 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9074 SV * const mysv_val1=sv_newmortal();
9075 SV * const mysv_val2=sv_newmortal();
9076 DEBUG_PARSE_MSG("lsbr");
9077 regprop(RExC_rx, mysv_val1, lastbr);
9078 regprop(RExC_rx, mysv_val2, ender);
9079 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9080 SvPV_nolen_const(mysv_val1),
9081 (IV)REG_NODE_NUM(lastbr),
9082 SvPV_nolen_const(mysv_val2),
9083 (IV)REG_NODE_NUM(ender),
9084 (IV)(ender - lastbr)
9087 REGTAIL(pRExC_state, lastbr, ender);
9089 if (have_branch && !SIZE_ONLY) {
9092 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9094 /* Hook the tails of the branches to the closing node. */
9095 for (br = ret; br; br = regnext(br)) {
9096 const U8 op = PL_regkind[OP(br)];
9098 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9099 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9102 else if (op == BRANCHJ) {
9103 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9104 /* for now we always disable this optimisation * /
9105 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9111 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9112 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9113 SV * const mysv_val1=sv_newmortal();
9114 SV * const mysv_val2=sv_newmortal();
9115 DEBUG_PARSE_MSG("NADA");
9116 regprop(RExC_rx, mysv_val1, ret);
9117 regprop(RExC_rx, mysv_val2, ender);
9118 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9119 SvPV_nolen_const(mysv_val1),
9120 (IV)REG_NODE_NUM(ret),
9121 SvPV_nolen_const(mysv_val2),
9122 (IV)REG_NODE_NUM(ender),
9127 if (OP(ender) == TAIL) {
9132 for ( opt= br + 1; opt < ender ; opt++ )
9134 NEXT_OFF(br)= ender - br;
9142 static const char parens[] = "=!<,>";
9144 if (paren && (p = strchr(parens, paren))) {
9145 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9146 int flag = (p - parens) > 1;
9149 node = SUSPEND, flag = 0;
9150 reginsert(pRExC_state, node,ret, depth+1);
9151 Set_Node_Cur_Length(ret);
9152 Set_Node_Offset(ret, parse_start + 1);
9154 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9158 /* Check for proper termination. */
9160 RExC_flags = oregflags;
9161 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9162 RExC_parse = oregcomp_parse;
9163 vFAIL("Unmatched (");
9166 else if (!paren && RExC_parse < RExC_end) {
9167 if (*RExC_parse == ')') {
9169 vFAIL("Unmatched )");
9172 FAIL("Junk on end of regexp"); /* "Can't happen". */
9173 assert(0); /* NOTREACHED */
9176 if (RExC_in_lookbehind) {
9177 RExC_in_lookbehind--;
9179 if (after_freeze > RExC_npar)
9180 RExC_npar = after_freeze;
9185 - regbranch - one alternative of an | operator
9187 * Implements the concatenation operator.
9190 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9193 register regnode *ret;
9194 register regnode *chain = NULL;
9195 register regnode *latest;
9196 I32 flags = 0, c = 0;
9197 GET_RE_DEBUG_FLAGS_DECL;
9199 PERL_ARGS_ASSERT_REGBRANCH;
9201 DEBUG_PARSE("brnc");
9206 if (!SIZE_ONLY && RExC_extralen)
9207 ret = reganode(pRExC_state, BRANCHJ,0);
9209 ret = reg_node(pRExC_state, BRANCH);
9210 Set_Node_Length(ret, 1);
9214 if (!first && SIZE_ONLY)
9215 RExC_extralen += 1; /* BRANCHJ */
9217 *flagp = WORST; /* Tentatively. */
9220 nextchar(pRExC_state);
9221 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9223 latest = regpiece(pRExC_state, &flags,depth+1);
9224 if (latest == NULL) {
9225 if (flags & TRYAGAIN)
9229 else if (ret == NULL)
9231 *flagp |= flags&(HASWIDTH|POSTPONED);
9232 if (chain == NULL) /* First piece. */
9233 *flagp |= flags&SPSTART;
9236 REGTAIL(pRExC_state, chain, latest);
9241 if (chain == NULL) { /* Loop ran zero times. */
9242 chain = reg_node(pRExC_state, NOTHING);
9247 *flagp |= flags&SIMPLE;
9254 - regpiece - something followed by possible [*+?]
9256 * Note that the branching code sequences used for ? and the general cases
9257 * of * and + are somewhat optimized: they use the same NOTHING node as
9258 * both the endmarker for their branch list and the body of the last branch.
9259 * It might seem that this node could be dispensed with entirely, but the
9260 * endmarker role is not redundant.
9263 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9266 register regnode *ret;
9268 register char *next;
9270 const char * const origparse = RExC_parse;
9272 I32 max = REG_INFTY;
9273 #ifdef RE_TRACK_PATTERN_OFFSETS
9276 const char *maxpos = NULL;
9277 GET_RE_DEBUG_FLAGS_DECL;
9279 PERL_ARGS_ASSERT_REGPIECE;
9281 DEBUG_PARSE("piec");
9283 ret = regatom(pRExC_state, &flags,depth+1);
9285 if (flags & TRYAGAIN)
9292 if (op == '{' && regcurly(RExC_parse)) {
9294 #ifdef RE_TRACK_PATTERN_OFFSETS
9295 parse_start = RExC_parse; /* MJD */
9297 next = RExC_parse + 1;
9298 while (isDIGIT(*next) || *next == ',') {
9307 if (*next == '}') { /* got one */
9311 min = atoi(RExC_parse);
9315 maxpos = RExC_parse;
9317 if (!max && *maxpos != '0')
9318 max = REG_INFTY; /* meaning "infinity" */
9319 else if (max >= REG_INFTY)
9320 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9322 nextchar(pRExC_state);
9325 if ((flags&SIMPLE)) {
9326 RExC_naughty += 2 + RExC_naughty / 2;
9327 reginsert(pRExC_state, CURLY, ret, depth+1);
9328 Set_Node_Offset(ret, parse_start+1); /* MJD */
9329 Set_Node_Cur_Length(ret);
9332 regnode * const w = reg_node(pRExC_state, WHILEM);
9335 REGTAIL(pRExC_state, ret, w);
9336 if (!SIZE_ONLY && RExC_extralen) {
9337 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9338 reginsert(pRExC_state, NOTHING,ret, depth+1);
9339 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9341 reginsert(pRExC_state, CURLYX,ret, depth+1);
9343 Set_Node_Offset(ret, parse_start+1);
9344 Set_Node_Length(ret,
9345 op == '{' ? (RExC_parse - parse_start) : 1);
9347 if (!SIZE_ONLY && RExC_extralen)
9348 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9349 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9351 RExC_whilem_seen++, RExC_extralen += 3;
9352 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9361 vFAIL("Can't do {n,m} with n > m");
9363 ARG1_SET(ret, (U16)min);
9364 ARG2_SET(ret, (U16)max);
9376 #if 0 /* Now runtime fix should be reliable. */
9378 /* if this is reinstated, don't forget to put this back into perldiag:
9380 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9382 (F) The part of the regexp subject to either the * or + quantifier
9383 could match an empty string. The {#} shows in the regular
9384 expression about where the problem was discovered.
9388 if (!(flags&HASWIDTH) && op != '?')
9389 vFAIL("Regexp *+ operand could be empty");
9392 #ifdef RE_TRACK_PATTERN_OFFSETS
9393 parse_start = RExC_parse;
9395 nextchar(pRExC_state);
9397 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9399 if (op == '*' && (flags&SIMPLE)) {
9400 reginsert(pRExC_state, STAR, ret, depth+1);
9404 else if (op == '*') {
9408 else if (op == '+' && (flags&SIMPLE)) {
9409 reginsert(pRExC_state, PLUS, ret, depth+1);
9413 else if (op == '+') {
9417 else if (op == '?') {
9422 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9423 ckWARN3reg(RExC_parse,
9424 "%.*s matches null string many times",
9425 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9429 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9430 nextchar(pRExC_state);
9431 reginsert(pRExC_state, MINMOD, ret, depth+1);
9432 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9434 #ifndef REG_ALLOW_MINMOD_SUSPEND
9437 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9439 nextchar(pRExC_state);
9440 ender = reg_node(pRExC_state, SUCCEED);
9441 REGTAIL(pRExC_state, ret, ender);
9442 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9444 ender = reg_node(pRExC_state, TAIL);
9445 REGTAIL(pRExC_state, ret, ender);
9449 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9451 vFAIL("Nested quantifiers");
9458 /* reg_namedseq(pRExC_state,UVp, UV depth)
9460 This is expected to be called by a parser routine that has
9461 recognized '\N' and needs to handle the rest. RExC_parse is
9462 expected to point at the first char following the N at the time
9465 The \N may be inside (indicated by valuep not being NULL) or outside a
9468 \N may begin either a named sequence, or if outside a character class, mean
9469 to match a non-newline. For non single-quoted regexes, the tokenizer has
9470 attempted to decide which, and in the case of a named sequence converted it
9471 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9472 where c1... are the characters in the sequence. For single-quoted regexes,
9473 the tokenizer passes the \N sequence through unchanged; this code will not
9474 attempt to determine this nor expand those. The net effect is that if the
9475 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9476 signals that this \N occurrence means to match a non-newline.
9478 Only the \N{U+...} form should occur in a character class, for the same
9479 reason that '.' inside a character class means to just match a period: it
9480 just doesn't make sense.
9482 If valuep is non-null then it is assumed that we are parsing inside
9483 of a charclass definition and the first codepoint in the resolved
9484 string is returned via *valuep and the routine will return NULL.
9485 In this mode if a multichar string is returned from the charnames
9486 handler, a warning will be issued, and only the first char in the
9487 sequence will be examined. If the string returned is zero length
9488 then the value of *valuep is undefined and NON-NULL will
9489 be returned to indicate failure. (This will NOT be a valid pointer
9492 If valuep is null then it is assumed that we are parsing normal text and a
9493 new EXACT node is inserted into the program containing the resolved string,
9494 and a pointer to the new node is returned. But if the string is zero length
9495 a NOTHING node is emitted instead.
9497 On success RExC_parse is set to the char following the endbrace.
9498 Parsing failures will generate a fatal error via vFAIL(...)
9501 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9503 char * endbrace; /* '}' following the name */
9504 regnode *ret = NULL;
9507 GET_RE_DEBUG_FLAGS_DECL;
9509 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9513 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9514 * modifier. The other meaning does not */
9515 p = (RExC_flags & RXf_PMf_EXTENDED)
9516 ? regwhite( pRExC_state, RExC_parse )
9519 /* Disambiguate between \N meaning a named character versus \N meaning
9520 * [^\n]. The former is assumed when it can't be the latter. */
9521 if (*p != '{' || regcurly(p)) {
9524 /* no bare \N in a charclass */
9525 vFAIL("\\N in a character class must be a named character: \\N{...}");
9527 nextchar(pRExC_state);
9528 ret = reg_node(pRExC_state, REG_ANY);
9529 *flagp |= HASWIDTH|SIMPLE;
9532 Set_Node_Length(ret, 1); /* MJD */
9536 /* Here, we have decided it should be a named sequence */
9538 /* The test above made sure that the next real character is a '{', but
9539 * under the /x modifier, it could be separated by space (or a comment and
9540 * \n) and this is not allowed (for consistency with \x{...} and the
9541 * tokenizer handling of \N{NAME}). */
9542 if (*RExC_parse != '{') {
9543 vFAIL("Missing braces on \\N{}");
9546 RExC_parse++; /* Skip past the '{' */
9548 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9549 || ! (endbrace == RExC_parse /* nothing between the {} */
9550 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9551 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9553 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9554 vFAIL("\\N{NAME} must be resolved by the lexer");
9557 if (endbrace == RExC_parse) { /* empty: \N{} */
9559 RExC_parse = endbrace + 1;
9560 return reg_node(pRExC_state,NOTHING);
9564 ckWARNreg(RExC_parse,
9565 "Ignoring zero length \\N{} in character class"
9567 RExC_parse = endbrace + 1;
9570 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9573 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
9574 RExC_parse += 2; /* Skip past the 'U+' */
9576 if (valuep) { /* In a bracketed char class */
9577 /* We only pay attention to the first char of
9578 multichar strings being returned. I kinda wonder
9579 if this makes sense as it does change the behaviour
9580 from earlier versions, OTOH that behaviour was broken
9581 as well. XXX Solution is to recharacterize as
9582 [rest-of-class]|multi1|multi2... */
9584 STRLEN length_of_hex;
9585 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9586 | PERL_SCAN_DISALLOW_PREFIX
9587 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9589 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9590 if (endchar < endbrace) {
9591 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9594 length_of_hex = (STRLEN)(endchar - RExC_parse);
9595 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9597 /* The tokenizer should have guaranteed validity, but it's possible to
9598 * bypass it by using single quoting, so check */
9599 if (length_of_hex == 0
9600 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9602 RExC_parse += length_of_hex; /* Includes all the valid */
9603 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9604 ? UTF8SKIP(RExC_parse)
9606 /* Guard against malformed utf8 */
9607 if (RExC_parse >= endchar) RExC_parse = endchar;
9608 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9611 RExC_parse = endbrace + 1;
9612 if (endchar == endbrace) return NULL;
9614 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
9616 else { /* Not a char class */
9618 /* What is done here is to convert this to a sub-pattern of the form
9619 * (?:\x{char1}\x{char2}...)
9620 * and then call reg recursively. That way, it retains its atomicness,
9621 * while not having to worry about special handling that some code
9622 * points may have. toke.c has converted the original Unicode values
9623 * to native, so that we can just pass on the hex values unchanged. We
9624 * do have to set a flag to keep recoding from happening in the
9627 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9629 char *endchar; /* Points to '.' or '}' ending cur char in the input
9631 char *orig_end = RExC_end;
9633 while (RExC_parse < endbrace) {
9635 /* Code points are separated by dots. If none, there is only one
9636 * code point, and is terminated by the brace */
9637 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9639 /* Convert to notation the rest of the code understands */
9640 sv_catpv(substitute_parse, "\\x{");
9641 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9642 sv_catpv(substitute_parse, "}");
9644 /* Point to the beginning of the next character in the sequence. */
9645 RExC_parse = endchar + 1;
9647 sv_catpv(substitute_parse, ")");
9649 RExC_parse = SvPV(substitute_parse, len);
9651 /* Don't allow empty number */
9653 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9655 RExC_end = RExC_parse + len;
9657 /* The values are Unicode, and therefore not subject to recoding */
9658 RExC_override_recoding = 1;
9660 ret = reg(pRExC_state, 1, flagp, depth+1);
9662 RExC_parse = endbrace;
9663 RExC_end = orig_end;
9664 RExC_override_recoding = 0;
9666 nextchar(pRExC_state);
9676 * It returns the code point in utf8 for the value in *encp.
9677 * value: a code value in the source encoding
9678 * encp: a pointer to an Encode object
9680 * If the result from Encode is not a single character,
9681 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9684 S_reg_recode(pTHX_ const char value, SV **encp)
9687 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9688 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9689 const STRLEN newlen = SvCUR(sv);
9690 UV uv = UNICODE_REPLACEMENT;
9692 PERL_ARGS_ASSERT_REG_RECODE;
9696 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9699 if (!newlen || numlen != newlen) {
9700 uv = UNICODE_REPLACEMENT;
9708 - regatom - the lowest level
9710 Try to identify anything special at the start of the pattern. If there
9711 is, then handle it as required. This may involve generating a single regop,
9712 such as for an assertion; or it may involve recursing, such as to
9713 handle a () structure.
9715 If the string doesn't start with something special then we gobble up
9716 as much literal text as we can.
9718 Once we have been able to handle whatever type of thing started the
9719 sequence, we return.
9721 Note: we have to be careful with escapes, as they can be both literal
9722 and special, and in the case of \10 and friends, context determines which.
9724 A summary of the code structure is:
9726 switch (first_byte) {
9727 cases for each special:
9728 handle this special;
9732 cases for each unambiguous special:
9733 handle this special;
9735 cases for each ambigous special/literal:
9737 if (special) handle here
9739 default: // unambiguously literal:
9742 default: // is a literal char
9745 create EXACTish node for literal;
9746 while (more input and node isn't full) {
9747 switch (input_byte) {
9748 cases for each special;
9749 make sure parse pointer is set so that the next call to
9750 regatom will see this special first
9751 goto loopdone; // EXACTish node terminated by prev. char
9753 append char to EXACTISH node;
9755 get next input byte;
9759 return the generated node;
9761 Specifically there are two separate switches for handling
9762 escape sequences, with the one for handling literal escapes requiring
9763 a dummy entry for all of the special escapes that are actually handled
9768 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9771 register regnode *ret = NULL;
9773 char *parse_start = RExC_parse;
9775 GET_RE_DEBUG_FLAGS_DECL;
9776 DEBUG_PARSE("atom");
9777 *flagp = WORST; /* Tentatively. */
9779 PERL_ARGS_ASSERT_REGATOM;
9782 switch ((U8)*RExC_parse) {
9784 RExC_seen_zerolen++;
9785 nextchar(pRExC_state);
9786 if (RExC_flags & RXf_PMf_MULTILINE)
9787 ret = reg_node(pRExC_state, MBOL);
9788 else if (RExC_flags & RXf_PMf_SINGLELINE)
9789 ret = reg_node(pRExC_state, SBOL);
9791 ret = reg_node(pRExC_state, BOL);
9792 Set_Node_Length(ret, 1); /* MJD */
9795 nextchar(pRExC_state);
9797 RExC_seen_zerolen++;
9798 if (RExC_flags & RXf_PMf_MULTILINE)
9799 ret = reg_node(pRExC_state, MEOL);
9800 else if (RExC_flags & RXf_PMf_SINGLELINE)
9801 ret = reg_node(pRExC_state, SEOL);
9803 ret = reg_node(pRExC_state, EOL);
9804 Set_Node_Length(ret, 1); /* MJD */
9807 nextchar(pRExC_state);
9808 if (RExC_flags & RXf_PMf_SINGLELINE)
9809 ret = reg_node(pRExC_state, SANY);
9811 ret = reg_node(pRExC_state, REG_ANY);
9812 *flagp |= HASWIDTH|SIMPLE;
9814 Set_Node_Length(ret, 1); /* MJD */
9818 char * const oregcomp_parse = ++RExC_parse;
9819 ret = regclass(pRExC_state,depth+1);
9820 if (*RExC_parse != ']') {
9821 RExC_parse = oregcomp_parse;
9822 vFAIL("Unmatched [");
9824 nextchar(pRExC_state);
9825 *flagp |= HASWIDTH|SIMPLE;
9826 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9830 nextchar(pRExC_state);
9831 ret = reg(pRExC_state, 1, &flags,depth+1);
9833 if (flags & TRYAGAIN) {
9834 if (RExC_parse == RExC_end) {
9835 /* Make parent create an empty node if needed. */
9843 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9847 if (flags & TRYAGAIN) {
9851 vFAIL("Internal urp");
9852 /* Supposed to be caught earlier. */
9858 vFAIL("Quantifier follows nothing");
9863 This switch handles escape sequences that resolve to some kind
9864 of special regop and not to literal text. Escape sequnces that
9865 resolve to literal text are handled below in the switch marked
9868 Every entry in this switch *must* have a corresponding entry
9869 in the literal escape switch. However, the opposite is not
9870 required, as the default for this switch is to jump to the
9871 literal text handling code.
9873 switch ((U8)*++RExC_parse) {
9874 /* Special Escapes */
9876 RExC_seen_zerolen++;
9877 ret = reg_node(pRExC_state, SBOL);
9879 goto finish_meta_pat;
9881 ret = reg_node(pRExC_state, GPOS);
9882 RExC_seen |= REG_SEEN_GPOS;
9884 goto finish_meta_pat;
9886 RExC_seen_zerolen++;
9887 ret = reg_node(pRExC_state, KEEPS);
9889 /* XXX:dmq : disabling in-place substitution seems to
9890 * be necessary here to avoid cases of memory corruption, as
9891 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9893 RExC_seen |= REG_SEEN_LOOKBEHIND;
9894 goto finish_meta_pat;
9896 ret = reg_node(pRExC_state, SEOL);
9898 RExC_seen_zerolen++; /* Do not optimize RE away */
9899 goto finish_meta_pat;
9901 ret = reg_node(pRExC_state, EOS);
9903 RExC_seen_zerolen++; /* Do not optimize RE away */
9904 goto finish_meta_pat;
9906 ret = reg_node(pRExC_state, CANY);
9907 RExC_seen |= REG_SEEN_CANY;
9908 *flagp |= HASWIDTH|SIMPLE;
9909 goto finish_meta_pat;
9911 ret = reg_node(pRExC_state, CLUMP);
9913 goto finish_meta_pat;
9915 switch (get_regex_charset(RExC_flags)) {
9916 case REGEX_LOCALE_CHARSET:
9919 case REGEX_UNICODE_CHARSET:
9922 case REGEX_ASCII_RESTRICTED_CHARSET:
9923 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9926 case REGEX_DEPENDS_CHARSET:
9932 ret = reg_node(pRExC_state, op);
9933 *flagp |= HASWIDTH|SIMPLE;
9934 goto finish_meta_pat;
9936 switch (get_regex_charset(RExC_flags)) {
9937 case REGEX_LOCALE_CHARSET:
9940 case REGEX_UNICODE_CHARSET:
9943 case REGEX_ASCII_RESTRICTED_CHARSET:
9944 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9947 case REGEX_DEPENDS_CHARSET:
9953 ret = reg_node(pRExC_state, op);
9954 *flagp |= HASWIDTH|SIMPLE;
9955 goto finish_meta_pat;
9957 RExC_seen_zerolen++;
9958 RExC_seen |= REG_SEEN_LOOKBEHIND;
9959 switch (get_regex_charset(RExC_flags)) {
9960 case REGEX_LOCALE_CHARSET:
9963 case REGEX_UNICODE_CHARSET:
9966 case REGEX_ASCII_RESTRICTED_CHARSET:
9967 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9970 case REGEX_DEPENDS_CHARSET:
9976 ret = reg_node(pRExC_state, op);
9977 FLAGS(ret) = get_regex_charset(RExC_flags);
9979 goto finish_meta_pat;
9981 RExC_seen_zerolen++;
9982 RExC_seen |= REG_SEEN_LOOKBEHIND;
9983 switch (get_regex_charset(RExC_flags)) {
9984 case REGEX_LOCALE_CHARSET:
9987 case REGEX_UNICODE_CHARSET:
9990 case REGEX_ASCII_RESTRICTED_CHARSET:
9991 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9994 case REGEX_DEPENDS_CHARSET:
10000 ret = reg_node(pRExC_state, op);
10001 FLAGS(ret) = get_regex_charset(RExC_flags);
10003 goto finish_meta_pat;
10005 switch (get_regex_charset(RExC_flags)) {
10006 case REGEX_LOCALE_CHARSET:
10009 case REGEX_UNICODE_CHARSET:
10012 case REGEX_ASCII_RESTRICTED_CHARSET:
10013 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10016 case REGEX_DEPENDS_CHARSET:
10022 ret = reg_node(pRExC_state, op);
10023 *flagp |= HASWIDTH|SIMPLE;
10024 goto finish_meta_pat;
10026 switch (get_regex_charset(RExC_flags)) {
10027 case REGEX_LOCALE_CHARSET:
10030 case REGEX_UNICODE_CHARSET:
10033 case REGEX_ASCII_RESTRICTED_CHARSET:
10034 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10037 case REGEX_DEPENDS_CHARSET:
10043 ret = reg_node(pRExC_state, op);
10044 *flagp |= HASWIDTH|SIMPLE;
10045 goto finish_meta_pat;
10047 switch (get_regex_charset(RExC_flags)) {
10048 case REGEX_LOCALE_CHARSET:
10051 case REGEX_ASCII_RESTRICTED_CHARSET:
10052 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10055 case REGEX_DEPENDS_CHARSET: /* No difference between these */
10056 case REGEX_UNICODE_CHARSET:
10062 ret = reg_node(pRExC_state, op);
10063 *flagp |= HASWIDTH|SIMPLE;
10064 goto finish_meta_pat;
10066 switch (get_regex_charset(RExC_flags)) {
10067 case REGEX_LOCALE_CHARSET:
10070 case REGEX_ASCII_RESTRICTED_CHARSET:
10071 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10074 case REGEX_DEPENDS_CHARSET: /* No difference between these */
10075 case REGEX_UNICODE_CHARSET:
10081 ret = reg_node(pRExC_state, op);
10082 *flagp |= HASWIDTH|SIMPLE;
10083 goto finish_meta_pat;
10085 ret = reg_node(pRExC_state, LNBREAK);
10086 *flagp |= HASWIDTH|SIMPLE;
10087 goto finish_meta_pat;
10089 ret = reg_node(pRExC_state, HORIZWS);
10090 *flagp |= HASWIDTH|SIMPLE;
10091 goto finish_meta_pat;
10093 ret = reg_node(pRExC_state, NHORIZWS);
10094 *flagp |= HASWIDTH|SIMPLE;
10095 goto finish_meta_pat;
10097 ret = reg_node(pRExC_state, VERTWS);
10098 *flagp |= HASWIDTH|SIMPLE;
10099 goto finish_meta_pat;
10101 ret = reg_node(pRExC_state, NVERTWS);
10102 *flagp |= HASWIDTH|SIMPLE;
10104 nextchar(pRExC_state);
10105 Set_Node_Length(ret, 2); /* MJD */
10110 char* const oldregxend = RExC_end;
10112 char* parse_start = RExC_parse - 2;
10115 if (RExC_parse[1] == '{') {
10116 /* a lovely hack--pretend we saw [\pX] instead */
10117 RExC_end = strchr(RExC_parse, '}');
10119 const U8 c = (U8)*RExC_parse;
10121 RExC_end = oldregxend;
10122 vFAIL2("Missing right brace on \\%c{}", c);
10127 RExC_end = RExC_parse + 2;
10128 if (RExC_end > oldregxend)
10129 RExC_end = oldregxend;
10133 ret = regclass(pRExC_state,depth+1);
10135 RExC_end = oldregxend;
10138 Set_Node_Offset(ret, parse_start + 2);
10139 Set_Node_Cur_Length(ret);
10140 nextchar(pRExC_state);
10141 *flagp |= HASWIDTH|SIMPLE;
10145 /* Handle \N and \N{NAME} here and not below because it can be
10146 multicharacter. join_exact() will join them up later on.
10147 Also this makes sure that things like /\N{BLAH}+/ and
10148 \N{BLAH} being multi char Just Happen. dmq*/
10150 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
10152 case 'k': /* Handle \k<NAME> and \k'NAME' */
10155 char ch= RExC_parse[1];
10156 if (ch != '<' && ch != '\'' && ch != '{') {
10158 vFAIL2("Sequence %.2s... not terminated",parse_start);
10160 /* this pretty much dupes the code for (?P=...) in reg(), if
10161 you change this make sure you change that */
10162 char* name_start = (RExC_parse += 2);
10164 SV *sv_dat = reg_scan_name(pRExC_state,
10165 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10166 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10167 if (RExC_parse == name_start || *RExC_parse != ch)
10168 vFAIL2("Sequence %.3s... not terminated",parse_start);
10171 num = add_data( pRExC_state, 1, "S" );
10172 RExC_rxi->data->data[num]=(void*)sv_dat;
10173 SvREFCNT_inc_simple_void(sv_dat);
10177 ret = reganode(pRExC_state,
10180 : (MORE_ASCII_RESTRICTED)
10182 : (AT_LEAST_UNI_SEMANTICS)
10188 *flagp |= HASWIDTH;
10190 /* override incorrect value set in reganode MJD */
10191 Set_Node_Offset(ret, parse_start+1);
10192 Set_Node_Cur_Length(ret); /* MJD */
10193 nextchar(pRExC_state);
10199 case '1': case '2': case '3': case '4':
10200 case '5': case '6': case '7': case '8': case '9':
10203 bool isg = *RExC_parse == 'g';
10208 if (*RExC_parse == '{') {
10212 if (*RExC_parse == '-') {
10216 if (hasbrace && !isDIGIT(*RExC_parse)) {
10217 if (isrel) RExC_parse--;
10219 goto parse_named_seq;
10221 num = atoi(RExC_parse);
10222 if (isg && num == 0)
10223 vFAIL("Reference to invalid group 0");
10225 num = RExC_npar - num;
10227 vFAIL("Reference to nonexistent or unclosed group");
10229 if (!isg && num > 9 && num >= RExC_npar)
10230 /* Probably a character specified in octal, e.g. \35 */
10233 char * const parse_start = RExC_parse - 1; /* MJD */
10234 while (isDIGIT(*RExC_parse))
10236 if (parse_start == RExC_parse - 1)
10237 vFAIL("Unterminated \\g... pattern");
10239 if (*RExC_parse != '}')
10240 vFAIL("Unterminated \\g{...} pattern");
10244 if (num > (I32)RExC_rx->nparens)
10245 vFAIL("Reference to nonexistent group");
10248 ret = reganode(pRExC_state,
10251 : (MORE_ASCII_RESTRICTED)
10253 : (AT_LEAST_UNI_SEMANTICS)
10259 *flagp |= HASWIDTH;
10261 /* override incorrect value set in reganode MJD */
10262 Set_Node_Offset(ret, parse_start+1);
10263 Set_Node_Cur_Length(ret); /* MJD */
10265 nextchar(pRExC_state);
10270 if (RExC_parse >= RExC_end)
10271 FAIL("Trailing \\");
10274 /* Do not generate "unrecognized" warnings here, we fall
10275 back into the quick-grab loop below */
10282 if (RExC_flags & RXf_PMf_EXTENDED) {
10283 if ( reg_skipcomment( pRExC_state ) )
10290 parse_start = RExC_parse - 1;
10295 register STRLEN len;
10300 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10303 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10304 * it is folded to 'ss' even if not utf8 */
10305 bool is_exactfu_sharp_s;
10308 node_type = ((! FOLD) ? EXACT
10311 : (MORE_ASCII_RESTRICTED)
10313 : (AT_LEAST_UNI_SEMANTICS)
10316 ret = reg_node(pRExC_state, node_type);
10319 /* XXX The node can hold up to 255 bytes, yet this only goes to
10320 * 127. I (khw) do not know why. Keeping it somewhat less than
10321 * 255 allows us to not have to worry about overflow due to
10322 * converting to utf8 and fold expansion, but that value is
10323 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10324 * split up by this limit into a single one using the real max of
10325 * 255. Even at 127, this breaks under rare circumstances. If
10326 * folding, we do not want to split a node at a character that is a
10327 * non-final in a multi-char fold, as an input string could just
10328 * happen to want to match across the node boundary. The join
10329 * would solve that problem if the join actually happens. But a
10330 * series of more than two nodes in a row each of 127 would cause
10331 * the first join to succeed to get to 254, but then there wouldn't
10332 * be room for the next one, which could at be one of those split
10333 * multi-char folds. I don't know of any fool-proof solution. One
10334 * could back off to end with only a code point that isn't such a
10335 * non-final, but it is possible for there not to be any in the
10337 for (len = 0, p = RExC_parse - 1;
10338 len < 127 && p < RExC_end;
10341 char * const oldp = p;
10343 if (RExC_flags & RXf_PMf_EXTENDED)
10344 p = regwhite( pRExC_state, p );
10355 /* Literal Escapes Switch
10357 This switch is meant to handle escape sequences that
10358 resolve to a literal character.
10360 Every escape sequence that represents something
10361 else, like an assertion or a char class, is handled
10362 in the switch marked 'Special Escapes' above in this
10363 routine, but also has an entry here as anything that
10364 isn't explicitly mentioned here will be treated as
10365 an unescaped equivalent literal.
10368 switch ((U8)*++p) {
10369 /* These are all the special escapes. */
10370 case 'A': /* Start assertion */
10371 case 'b': case 'B': /* Word-boundary assertion*/
10372 case 'C': /* Single char !DANGEROUS! */
10373 case 'd': case 'D': /* digit class */
10374 case 'g': case 'G': /* generic-backref, pos assertion */
10375 case 'h': case 'H': /* HORIZWS */
10376 case 'k': case 'K': /* named backref, keep marker */
10377 case 'N': /* named char sequence */
10378 case 'p': case 'P': /* Unicode property */
10379 case 'R': /* LNBREAK */
10380 case 's': case 'S': /* space class */
10381 case 'v': case 'V': /* VERTWS */
10382 case 'w': case 'W': /* word class */
10383 case 'X': /* eXtended Unicode "combining character sequence" */
10384 case 'z': case 'Z': /* End of line/string assertion */
10388 /* Anything after here is an escape that resolves to a
10389 literal. (Except digits, which may or may not)
10408 ender = ASCII_TO_NATIVE('\033');
10412 ender = ASCII_TO_NATIVE('\007');
10417 STRLEN brace_len = len;
10419 const char* error_msg;
10421 bool valid = grok_bslash_o(p,
10428 RExC_parse = p; /* going to die anyway; point
10429 to exact spot of failure */
10436 if (PL_encoding && ender < 0x100) {
10437 goto recode_encoding;
10439 if (ender > 0xff) {
10446 STRLEN brace_len = len;
10448 const char* error_msg;
10450 bool valid = grok_bslash_x(p,
10457 RExC_parse = p; /* going to die anyway; point
10458 to exact spot of failure */
10464 if (PL_encoding && ender < 0x100) {
10465 goto recode_encoding;
10467 if (ender > 0xff) {
10474 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10476 case '0': case '1': case '2': case '3':case '4':
10477 case '5': case '6': case '7':
10479 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10481 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10483 ender = grok_oct(p, &numlen, &flags, NULL);
10484 if (ender > 0xff) {
10493 if (PL_encoding && ender < 0x100)
10494 goto recode_encoding;
10497 if (! RExC_override_recoding) {
10498 SV* enc = PL_encoding;
10499 ender = reg_recode((const char)(U8)ender, &enc);
10500 if (!enc && SIZE_ONLY)
10501 ckWARNreg(p, "Invalid escape in the specified encoding");
10507 FAIL("Trailing \\");
10510 if (!SIZE_ONLY&& isALNUMC(*p)) {
10511 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10513 goto normal_default;
10517 /* Currently we don't warn when the lbrace is at the start
10518 * of a construct. This catches it in the middle of a
10519 * literal string, or when its the first thing after
10520 * something like "\b" */
10522 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10524 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10529 if (UTF8_IS_START(*p) && UTF) {
10531 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10532 &numlen, UTF8_ALLOW_DEFAULT);
10538 } /* End of switch on the literal */
10540 is_exactfu_sharp_s = (node_type == EXACTFU
10541 && ender == LATIN_SMALL_LETTER_SHARP_S);
10542 if ( RExC_flags & RXf_PMf_EXTENDED)
10543 p = regwhite( pRExC_state, p );
10544 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10545 /* Prime the casefolded buffer. Locale rules, which apply
10546 * only to code points < 256, aren't known until execution,
10547 * so for them, just output the original character using
10548 * utf8. If we start to fold non-UTF patterns, be sure to
10549 * update join_exact() */
10550 if (LOC && ender < 256) {
10551 if (UNI_IS_INVARIANT(ender)) {
10552 *tmpbuf = (U8) ender;
10555 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10556 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10560 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10562 ender = toLOWER(ender);
10563 *tmpbuf = (U8) ender;
10566 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10568 /* Locale and /aa require more selectivity about the
10569 * fold, so are handled below. Otherwise, here, just
10571 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10574 /* Under locale rules or /aa we are not to mix,
10575 * respectively, ords < 256 or ASCII with non-. So
10576 * reject folds that mix them, using only the
10577 * non-folded code point. So do the fold to a
10578 * temporary, and inspect each character in it. */
10579 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10581 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10582 U8* e = s + foldlen;
10583 bool fold_ok = TRUE;
10587 || (LOC && (UTF8_IS_INVARIANT(*s)
10588 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10596 Copy(trialbuf, tmpbuf, foldlen, U8);
10600 uvuni_to_utf8(tmpbuf, ender);
10601 foldlen = UNISKIP(ender);
10605 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10608 else if (UTF || is_exactfu_sharp_s) {
10610 /* Emit all the Unicode characters. */
10612 for (foldbuf = tmpbuf;
10614 foldlen -= numlen) {
10616 /* tmpbuf has been constructed by us, so we
10617 * know it is valid utf8 */
10618 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10620 const STRLEN unilen = reguni(pRExC_state, ender, s);
10623 /* In EBCDIC the numlen
10624 * and unilen can differ. */
10626 if (numlen >= foldlen)
10630 break; /* "Can't happen." */
10634 const STRLEN unilen = reguni(pRExC_state, ender, s);
10643 REGC((char)ender, s++);
10647 if (UTF || is_exactfu_sharp_s) {
10649 /* Emit all the Unicode characters. */
10651 for (foldbuf = tmpbuf;
10653 foldlen -= numlen) {
10654 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10656 const STRLEN unilen = reguni(pRExC_state, ender, s);
10659 /* In EBCDIC the numlen
10660 * and unilen can differ. */
10662 if (numlen >= foldlen)
10670 const STRLEN unilen = reguni(pRExC_state, ender, s);
10679 REGC((char)ender, s++);
10682 loopdone: /* Jumped to when encounters something that shouldn't be in
10684 RExC_parse = p - 1;
10685 Set_Node_Cur_Length(ret); /* MJD */
10686 nextchar(pRExC_state);
10688 /* len is STRLEN which is unsigned, need to copy to signed */
10691 vFAIL("Internal disaster");
10694 *flagp |= HASWIDTH;
10695 if (len == 1 && UNI_IS_INVARIANT(ender))
10699 RExC_size += STR_SZ(len);
10701 STR_LEN(ret) = len;
10702 RExC_emit += STR_SZ(len);
10710 /* Jumped to when an unrecognized character set is encountered */
10712 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10717 S_regwhite( RExC_state_t *pRExC_state, char *p )
10719 const char *e = RExC_end;
10721 PERL_ARGS_ASSERT_REGWHITE;
10726 else if (*p == '#') {
10729 if (*p++ == '\n') {
10735 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10743 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10744 Character classes ([:foo:]) can also be negated ([:^foo:]).
10745 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10746 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10747 but trigger failures because they are currently unimplemented. */
10749 #define POSIXCC_DONE(c) ((c) == ':')
10750 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10751 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10754 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10757 I32 namedclass = OOB_NAMEDCLASS;
10759 PERL_ARGS_ASSERT_REGPPOSIXCC;
10761 if (value == '[' && RExC_parse + 1 < RExC_end &&
10762 /* I smell either [: or [= or [. -- POSIX has been here, right? */
10763 POSIXCC(UCHARAT(RExC_parse))) {
10764 const char c = UCHARAT(RExC_parse);
10765 char* const s = RExC_parse++;
10767 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10769 if (RExC_parse == RExC_end)
10770 /* Grandfather lone [:, [=, [. */
10773 const char* const t = RExC_parse++; /* skip over the c */
10776 if (UCHARAT(RExC_parse) == ']') {
10777 const char *posixcc = s + 1;
10778 RExC_parse++; /* skip over the ending ] */
10781 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10782 const I32 skip = t - posixcc;
10784 /* Initially switch on the length of the name. */
10787 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10788 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10791 /* Names all of length 5. */
10792 /* alnum alpha ascii blank cntrl digit graph lower
10793 print punct space upper */
10794 /* Offset 4 gives the best switch position. */
10795 switch (posixcc[4]) {
10797 if (memEQ(posixcc, "alph", 4)) /* alpha */
10798 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10801 if (memEQ(posixcc, "spac", 4)) /* space */
10802 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10805 if (memEQ(posixcc, "grap", 4)) /* graph */
10806 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10809 if (memEQ(posixcc, "asci", 4)) /* ascii */
10810 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10813 if (memEQ(posixcc, "blan", 4)) /* blank */
10814 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10817 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10818 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10821 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10822 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10825 if (memEQ(posixcc, "lowe", 4)) /* lower */
10826 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10827 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10828 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10831 if (memEQ(posixcc, "digi", 4)) /* digit */
10832 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10833 else if (memEQ(posixcc, "prin", 4)) /* print */
10834 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10835 else if (memEQ(posixcc, "punc", 4)) /* punct */
10836 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10841 if (memEQ(posixcc, "xdigit", 6))
10842 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10846 if (namedclass == OOB_NAMEDCLASS)
10847 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10849 assert (posixcc[skip] == ':');
10850 assert (posixcc[skip+1] == ']');
10851 } else if (!SIZE_ONLY) {
10852 /* [[=foo=]] and [[.foo.]] are still future. */
10854 /* adjust RExC_parse so the warning shows after
10855 the class closes */
10856 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10858 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10861 /* Maternal grandfather:
10862 * "[:" ending in ":" but not in ":]" */
10872 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10876 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10878 if (POSIXCC(UCHARAT(RExC_parse))) {
10879 const char *s = RExC_parse;
10880 const char c = *s++;
10882 while (isALNUM(*s))
10884 if (*s && c == *s && s[1] == ']') {
10886 "POSIX syntax [%c %c] belongs inside character classes",
10889 /* [[=foo=]] and [[.foo.]] are still future. */
10890 if (POSIXCC_NOTYET(c)) {
10891 /* adjust RExC_parse so the error shows after
10892 the class closes */
10893 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10895 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10901 /* Generate the code to add a full posix character <class> to the bracketed
10902 * character class given by <node>. (<node> is needed only under locale rules)
10903 * destlist is the inversion list for non-locale rules that this class is
10905 * sourcelist is the ASCII-range inversion list to add under /a rules
10906 * Xsourcelist is the full Unicode range list to use otherwise. */
10907 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10909 SV* scratch_list = NULL; \
10911 /* Set this class in the node for runtime matching */ \
10912 ANYOF_CLASS_SET(node, class); \
10914 /* For above Latin1 code points, we use the full Unicode range */ \
10915 _invlist_intersection(PL_AboveLatin1, \
10918 /* And set the output to it, adding instead if there already is an \
10919 * output. Checking if <destlist> is NULL first saves an extra \
10920 * clone. Its reference count will be decremented at the next \
10921 * union, etc, or if this is the only instance, at the end of the \
10923 if (! destlist) { \
10924 destlist = scratch_list; \
10927 _invlist_union(destlist, scratch_list, &destlist); \
10928 SvREFCNT_dec(scratch_list); \
10932 /* For non-locale, just add it to any existing list */ \
10933 _invlist_union(destlist, \
10934 (AT_LEAST_ASCII_RESTRICTED) \
10940 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10942 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10944 SV* scratch_list = NULL; \
10945 ANYOF_CLASS_SET(node, class); \
10946 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10947 if (! destlist) { \
10948 destlist = scratch_list; \
10951 _invlist_union(destlist, scratch_list, &destlist); \
10952 SvREFCNT_dec(scratch_list); \
10956 _invlist_union_complement_2nd(destlist, \
10957 (AT_LEAST_ASCII_RESTRICTED) \
10961 /* Under /d, everything in the upper half of the Latin1 range \
10962 * matches this complement */ \
10963 if (DEPENDS_SEMANTICS) { \
10964 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10968 /* Generate the code to add a posix character <class> to the bracketed
10969 * character class given by <node>. (<node> is needed only under locale rules)
10970 * destlist is the inversion list for non-locale rules that this class is
10972 * sourcelist is the ASCII-range inversion list to add under /a rules
10973 * l1_sourcelist is the Latin1 range list to use otherwise.
10974 * Xpropertyname is the name to add to <run_time_list> of the property to
10975 * specify the code points above Latin1 that will have to be
10976 * determined at run-time
10977 * run_time_list is a SV* that contains text names of properties that are to
10978 * be computed at run time. This concatenates <Xpropertyname>
10979 * to it, apppropriately
10980 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10982 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10983 l1_sourcelist, Xpropertyname, run_time_list) \
10984 /* First, resolve whether to use the ASCII-only list or the L1 \
10986 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10987 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10988 Xpropertyname, run_time_list)
10990 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10991 Xpropertyname, run_time_list) \
10992 /* If not /a matching, there are going to be code points we will have \
10993 * to defer to runtime to look-up */ \
10994 if (! AT_LEAST_ASCII_RESTRICTED) { \
10995 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10998 ANYOF_CLASS_SET(node, class); \
11001 _invlist_union(destlist, sourcelist, &destlist); \
11004 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11005 * this and DO_N_POSIX */
11006 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11007 l1_sourcelist, Xpropertyname, run_time_list) \
11008 if (AT_LEAST_ASCII_RESTRICTED) { \
11009 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11012 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11014 ANYOF_CLASS_SET(node, namedclass); \
11017 SV* scratch_list = NULL; \
11018 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11019 if (! destlist) { \
11020 destlist = scratch_list; \
11023 _invlist_union(destlist, scratch_list, &destlist); \
11024 SvREFCNT_dec(scratch_list); \
11026 if (DEPENDS_SEMANTICS) { \
11027 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11033 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
11036 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
11037 * Locale folding is done at run-time, so this function should not be
11038 * called for nodes that are for locales.
11040 * This function sets the bit corresponding to the fold of the input
11041 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
11044 * It also knows about the characters that are in the bitmap that have
11045 * folds that are matchable only outside it, and sets the appropriate lists
11048 * It returns the number of bits that actually changed from 0 to 1 */
11053 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
11055 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
11058 /* It assumes the bit for 'value' has already been set */
11059 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
11060 ANYOF_BITMAP_SET(node, fold);
11063 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
11064 /* Certain Latin1 characters have matches outside the bitmap. To get
11065 * here, 'value' is one of those characters. None of these matches is
11066 * valid for ASCII characters under /aa, which have been excluded by
11067 * the 'if' above. The matches fall into three categories:
11068 * 1) They are singly folded-to or -from an above 255 character, as
11069 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
11071 * 2) They are part of a multi-char fold with another character in the
11072 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
11073 * 3) They are part of a multi-char fold with a character not in the
11074 * bitmap, such as various ligatures.
11075 * We aren't dealing fully with multi-char folds, except we do deal
11076 * with the pattern containing a character that has a multi-char fold
11077 * (not so much the inverse).
11078 * For types 1) and 3), the matches only happen when the target string
11079 * is utf8; that's not true for 2), and we set a flag for it.
11081 * The code below adds to the passed in inversion list the single fold
11082 * closures for 'value'. The values are hard-coded here so that an
11083 * innocent-looking character class, like /[ks]/i won't have to go out
11084 * to disk to find the possible matches. XXX It would be better to
11085 * generate these via regen, in case a new version of the Unicode
11086 * standard adds new mappings, though that is not really likely. */
11091 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
11095 /* LATIN SMALL LETTER LONG S */
11096 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
11099 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11100 GREEK_SMALL_LETTER_MU);
11101 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11102 GREEK_CAPITAL_LETTER_MU);
11104 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
11105 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
11106 /* ANGSTROM SIGN */
11107 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
11108 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
11109 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11110 PL_fold_latin1[value]);
11113 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
11114 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11115 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
11117 case LATIN_SMALL_LETTER_SHARP_S:
11118 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
11119 LATIN_CAPITAL_LETTER_SHARP_S);
11121 /* Under /a, /d, and /u, this can match the two chars "ss" */
11122 if (! MORE_ASCII_RESTRICTED) {
11123 add_alternate(alternate_ptr, (U8 *) "ss", 2);
11125 /* And under /u or /a, it can match even if the target is
11127 if (AT_LEAST_UNI_SEMANTICS) {
11128 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
11132 case 'F': case 'f':
11133 case 'I': case 'i':
11134 case 'L': case 'l':
11135 case 'T': case 't':
11136 case 'A': case 'a':
11137 case 'H': case 'h':
11138 case 'J': case 'j':
11139 case 'N': case 'n':
11140 case 'W': case 'w':
11141 case 'Y': case 'y':
11142 /* These all are targets of multi-character folds from code
11143 * points that require UTF8 to express, so they can't match
11144 * unless the target string is in UTF-8, so no action here is
11145 * necessary, as regexec.c properly handles the general case
11146 * for UTF-8 matching */
11149 /* Use deprecated warning to increase the chances of this
11151 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
11155 else if (DEPENDS_SEMANTICS
11156 && ! isASCII(value)
11157 && PL_fold_latin1[value] != value)
11159 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
11160 * folds only when the target string is in UTF-8. We add the fold
11161 * here to the list of things to match outside the bitmap, which
11162 * won't be looked at unless it is UTF8 (or else if something else
11163 * says to look even if not utf8, but those things better not happen
11164 * under DEPENDS semantics. */
11165 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
11172 PERL_STATIC_INLINE U8
11173 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
11175 /* This inline function sets a bit in the bitmap if not already set, and if
11176 * appropriate, its fold, returning the number of bits that actually
11177 * changed from 0 to 1 */
11181 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
11183 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
11187 ANYOF_BITMAP_SET(node, value);
11190 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
11191 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
11198 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
11200 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
11201 * alternate list, pointed to by 'alternate_ptr'. This is an array of
11202 * the multi-character folds of characters in the node */
11205 PERL_ARGS_ASSERT_ADD_ALTERNATE;
11207 if (! *alternate_ptr) {
11208 *alternate_ptr = newAV();
11210 sv = newSVpvn_utf8((char*)string, len, TRUE);
11211 av_push(*alternate_ptr, sv);
11216 parse a class specification and produce either an ANYOF node that
11217 matches the pattern or perhaps will be optimized into an EXACTish node
11218 instead. The node contains a bit map for the first 256 characters, with the
11219 corresponding bit set if that character is in the list. For characters
11220 above 255, a range list is used */
11223 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
11226 register UV nextvalue;
11227 register IV prevvalue = OOB_UNICODE;
11228 register IV range = 0;
11229 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
11230 register regnode *ret;
11233 char *rangebegin = NULL;
11234 bool need_class = 0;
11235 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11237 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11238 than just initialized. */
11239 SV* properties = NULL; /* Code points that match \p{} \P{} */
11240 UV element_count = 0; /* Number of distinct elements in the class.
11241 Optimizations may be possible if this is tiny */
11244 /* Unicode properties are stored in a swash; this holds the current one
11245 * being parsed. If this swash is the only above-latin1 component of the
11246 * character class, an optimization is to pass it directly on to the
11247 * execution engine. Otherwise, it is set to NULL to indicate that there
11248 * are other things in the class that have to be dealt with at execution
11250 SV* swash = NULL; /* Code points that match \p{} \P{} */
11252 /* Set if a component of this character class is user-defined; just passed
11253 * on to the engine */
11254 UV has_user_defined_property = 0;
11256 /* code points this node matches that can't be stored in the bitmap */
11257 SV* nonbitmap = NULL;
11259 /* The items that are to match that aren't stored in the bitmap, but are a
11260 * result of things that are stored there. This is the fold closure of
11261 * such a character, either because it has DEPENDS semantics and shouldn't
11262 * be matched unless the target string is utf8, or is a code point that is
11263 * too large for the bit map, as for example, the fold of the MICRO SIGN is
11264 * above 255. This all is solely for performance reasons. By having this
11265 * code know the outside-the-bitmap folds that the bitmapped characters are
11266 * involved with, we don't have to go out to disk to find the list of
11267 * matches, unless the character class includes code points that aren't
11268 * storable in the bit map. That means that a character class with an 's'
11269 * in it, for example, doesn't need to go out to disk to find everything
11270 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
11271 * empty unless there is something whose fold we don't know about, and will
11272 * have to go out to the disk to find. */
11273 SV* l1_fold_invlist = NULL;
11275 /* List of multi-character folds that are matched by this node */
11276 AV* unicode_alternate = NULL;
11278 UV literal_endpoint = 0;
11280 UV stored = 0; /* how many chars stored in the bitmap */
11282 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11283 case we need to change the emitted regop to an EXACT. */
11284 const char * orig_parse = RExC_parse;
11285 GET_RE_DEBUG_FLAGS_DECL;
11287 PERL_ARGS_ASSERT_REGCLASS;
11289 PERL_UNUSED_ARG(depth);
11292 DEBUG_PARSE("clas");
11294 /* Assume we are going to generate an ANYOF node. */
11295 ret = reganode(pRExC_state, ANYOF, 0);
11299 ANYOF_FLAGS(ret) = 0;
11302 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11306 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11308 /* We have decided to not allow multi-char folds in inverted character
11309 * classes, due to the confusion that can happen, especially with
11310 * classes that are designed for a non-Unicode world: You have the
11311 * peculiar case that:
11312 "s s" =~ /^[^\xDF]+$/i => Y
11313 "ss" =~ /^[^\xDF]+$/i => N
11315 * See [perl #89750] */
11316 allow_full_fold = FALSE;
11320 RExC_size += ANYOF_SKIP;
11321 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11324 RExC_emit += ANYOF_SKIP;
11326 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11328 ANYOF_BITMAP_ZERO(ret);
11329 listsv = newSVpvs("# comment\n");
11330 initial_listsv_len = SvCUR(listsv);
11333 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11335 if (!SIZE_ONLY && POSIXCC(nextvalue))
11336 checkposixcc(pRExC_state);
11338 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11339 if (UCHARAT(RExC_parse) == ']')
11340 goto charclassloop;
11343 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11347 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11350 rangebegin = RExC_parse;
11354 value = utf8n_to_uvchr((U8*)RExC_parse,
11355 RExC_end - RExC_parse,
11356 &numlen, UTF8_ALLOW_DEFAULT);
11357 RExC_parse += numlen;
11360 value = UCHARAT(RExC_parse++);
11362 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11363 if (value == '[' && POSIXCC(nextvalue))
11364 namedclass = regpposixcc(pRExC_state, value);
11365 else if (value == '\\') {
11367 value = utf8n_to_uvchr((U8*)RExC_parse,
11368 RExC_end - RExC_parse,
11369 &numlen, UTF8_ALLOW_DEFAULT);
11370 RExC_parse += numlen;
11373 value = UCHARAT(RExC_parse++);
11374 /* Some compilers cannot handle switching on 64-bit integer
11375 * values, therefore value cannot be an UV. Yes, this will
11376 * be a problem later if we want switch on Unicode.
11377 * A similar issue a little bit later when switching on
11378 * namedclass. --jhi */
11379 switch ((I32)value) {
11380 case 'w': namedclass = ANYOF_ALNUM; break;
11381 case 'W': namedclass = ANYOF_NALNUM; break;
11382 case 's': namedclass = ANYOF_SPACE; break;
11383 case 'S': namedclass = ANYOF_NSPACE; break;
11384 case 'd': namedclass = ANYOF_DIGIT; break;
11385 case 'D': namedclass = ANYOF_NDIGIT; break;
11386 case 'v': namedclass = ANYOF_VERTWS; break;
11387 case 'V': namedclass = ANYOF_NVERTWS; break;
11388 case 'h': namedclass = ANYOF_HORIZWS; break;
11389 case 'H': namedclass = ANYOF_NHORIZWS; break;
11390 case 'N': /* Handle \N{NAME} in class */
11392 /* We only pay attention to the first char of
11393 multichar strings being returned. I kinda wonder
11394 if this makes sense as it does change the behaviour
11395 from earlier versions, OTOH that behaviour was broken
11397 UV v; /* value is register so we cant & it /grrr */
11398 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11408 if (RExC_parse >= RExC_end)
11409 vFAIL2("Empty \\%c{}", (U8)value);
11410 if (*RExC_parse == '{') {
11411 const U8 c = (U8)value;
11412 e = strchr(RExC_parse++, '}');
11414 vFAIL2("Missing right brace on \\%c{}", c);
11415 while (isSPACE(UCHARAT(RExC_parse)))
11417 if (e == RExC_parse)
11418 vFAIL2("Empty \\%c{}", c);
11419 n = e - RExC_parse;
11420 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11431 if (UCHARAT(RExC_parse) == '^') {
11434 value = value == 'p' ? 'P' : 'p'; /* toggle */
11435 while (isSPACE(UCHARAT(RExC_parse))) {
11440 /* Try to get the definition of the property into
11441 * <invlist>. If /i is in effect, the effective property
11442 * will have its name be <__NAME_i>. The design is
11443 * discussed in commit
11444 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11445 Newx(name, n + sizeof("_i__\n"), char);
11447 sprintf(name, "%s%.*s%s\n",
11448 (FOLD) ? "__" : "",
11454 /* Look up the property name, and get its swash and
11455 * inversion list, if the property is found */
11457 SvREFCNT_dec(swash);
11459 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11462 TRUE, /* this routine will handle
11463 undefined properties */
11464 NULL, FALSE /* No inversion list */
11468 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11470 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11472 || ! (invlist = *invlistsvp))
11475 SvREFCNT_dec(swash);
11479 /* Here didn't find it. It could be a user-defined
11480 * property that will be available at run-time. Add it
11481 * to the list to look up then */
11482 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11483 (value == 'p' ? '+' : '!'),
11485 has_user_defined_property = 1;
11487 /* We don't know yet, so have to assume that the
11488 * property could match something in the Latin1 range,
11489 * hence something that isn't utf8 */
11490 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11494 /* Here, did get the swash and its inversion list. If
11495 * the swash is from a user-defined property, then this
11496 * whole character class should be regarded as such */
11497 SV** user_defined_svp =
11498 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11499 "USER_DEFINED", FALSE);
11500 if (user_defined_svp) {
11501 has_user_defined_property
11502 |= SvUV(*user_defined_svp);
11505 /* Invert if asking for the complement */
11506 if (value == 'P') {
11507 _invlist_union_complement_2nd(properties, invlist, &properties);
11509 /* The swash can't be used as-is, because we've
11510 * inverted things; delay removing it to here after
11511 * have copied its invlist above */
11512 SvREFCNT_dec(swash);
11516 _invlist_union(properties, invlist, &properties);
11521 RExC_parse = e + 1;
11522 namedclass = ANYOF_MAX; /* no official name, but it's named */
11524 /* \p means they want Unicode semantics */
11525 RExC_uni_semantics = 1;
11528 case 'n': value = '\n'; break;
11529 case 'r': value = '\r'; break;
11530 case 't': value = '\t'; break;
11531 case 'f': value = '\f'; break;
11532 case 'b': value = '\b'; break;
11533 case 'e': value = ASCII_TO_NATIVE('\033');break;
11534 case 'a': value = ASCII_TO_NATIVE('\007');break;
11536 RExC_parse--; /* function expects to be pointed at the 'o' */
11538 const char* error_msg;
11539 bool valid = grok_bslash_o(RExC_parse,
11544 RExC_parse += numlen;
11549 if (PL_encoding && value < 0x100) {
11550 goto recode_encoding;
11554 RExC_parse--; /* function expects to be pointed at the 'x' */
11556 const char* error_msg;
11557 bool valid = grok_bslash_x(RExC_parse,
11562 RExC_parse += numlen;
11567 if (PL_encoding && value < 0x100)
11568 goto recode_encoding;
11571 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11573 case '0': case '1': case '2': case '3': case '4':
11574 case '5': case '6': case '7':
11576 /* Take 1-3 octal digits */
11577 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11579 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11580 RExC_parse += numlen;
11581 if (PL_encoding && value < 0x100)
11582 goto recode_encoding;
11586 if (! RExC_override_recoding) {
11587 SV* enc = PL_encoding;
11588 value = reg_recode((const char)(U8)value, &enc);
11589 if (!enc && SIZE_ONLY)
11590 ckWARNreg(RExC_parse,
11591 "Invalid escape in the specified encoding");
11595 /* Allow \_ to not give an error */
11596 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11597 ckWARN2reg(RExC_parse,
11598 "Unrecognized escape \\%c in character class passed through",
11603 } /* end of \blah */
11606 literal_endpoint++;
11609 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11611 /* What matches in a locale is not known until runtime, so need to
11612 * (one time per class) allocate extra space to pass to regexec.
11613 * The space will contain a bit for each named class that is to be
11614 * matched against. This isn't needed for \p{} and pseudo-classes,
11615 * as they are not affected by locale, and hence are dealt with
11617 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11620 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11623 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11624 ANYOF_CLASS_ZERO(ret);
11626 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11629 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11630 * literal, as is the character that began the false range, i.e.
11631 * the 'a' in the examples */
11635 RExC_parse >= rangebegin ?
11636 RExC_parse - rangebegin : 0;
11637 ckWARN4reg(RExC_parse,
11638 "False [] range \"%*.*s\"",
11642 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11643 if (prevvalue < 256) {
11645 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11648 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11652 range = 0; /* this was not a true range */
11657 /* Possible truncation here but in some 64-bit environments
11658 * the compiler gets heartburn about switch on 64-bit values.
11659 * A similar issue a little earlier when switching on value.
11661 switch ((I32)namedclass) {
11663 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11664 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11665 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11667 case ANYOF_NALNUMC:
11668 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11669 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11672 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11673 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11676 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11677 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11681 ANYOF_CLASS_SET(ret, namedclass);
11684 _invlist_union(properties, PL_ASCII, &properties);
11689 ANYOF_CLASS_SET(ret, namedclass);
11692 _invlist_union_complement_2nd(properties,
11693 PL_ASCII, &properties);
11694 if (DEPENDS_SEMANTICS) {
11695 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11700 DO_POSIX(ret, namedclass, properties,
11701 PL_PosixBlank, PL_XPosixBlank);
11704 DO_N_POSIX(ret, namedclass, properties,
11705 PL_PosixBlank, PL_XPosixBlank);
11708 DO_POSIX(ret, namedclass, properties,
11709 PL_PosixCntrl, PL_XPosixCntrl);
11712 DO_N_POSIX(ret, namedclass, properties,
11713 PL_PosixCntrl, PL_XPosixCntrl);
11716 /* There are no digits in the Latin1 range outside of
11717 * ASCII, so call the macro that doesn't have to resolve
11719 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11720 PL_PosixDigit, "XPosixDigit", listsv);
11723 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11724 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11727 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11728 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11731 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11732 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11734 case ANYOF_HORIZWS:
11735 /* For these, we use the nonbitmap, as /d doesn't make a
11736 * difference in what these match. There would be problems
11737 * if these characters had folds other than themselves, as
11738 * nonbitmap is subject to folding. It turns out that \h
11739 * is just a synonym for XPosixBlank */
11740 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11742 case ANYOF_NHORIZWS:
11743 _invlist_union_complement_2nd(nonbitmap,
11744 PL_XPosixBlank, &nonbitmap);
11748 { /* These require special handling, as they differ under
11749 folding, matching Cased there (which in the ASCII range
11750 is the same as Alpha */
11756 if (FOLD && ! LOC) {
11757 ascii_source = PL_PosixAlpha;
11758 l1_source = PL_L1Cased;
11762 ascii_source = PL_PosixLower;
11763 l1_source = PL_L1PosixLower;
11764 Xname = "XPosixLower";
11766 if (namedclass == ANYOF_LOWER) {
11767 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11768 ascii_source, l1_source, Xname, listsv);
11771 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11772 properties, ascii_source, l1_source, Xname, listsv);
11777 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11778 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11781 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11782 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11785 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11786 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11789 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11790 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11793 DO_POSIX(ret, namedclass, properties,
11794 PL_PosixSpace, PL_XPosixSpace);
11796 case ANYOF_NPSXSPC:
11797 DO_N_POSIX(ret, namedclass, properties,
11798 PL_PosixSpace, PL_XPosixSpace);
11801 DO_POSIX(ret, namedclass, properties,
11802 PL_PerlSpace, PL_XPerlSpace);
11805 DO_N_POSIX(ret, namedclass, properties,
11806 PL_PerlSpace, PL_XPerlSpace);
11808 case ANYOF_UPPER: /* Same as LOWER, above */
11815 if (FOLD && ! LOC) {
11816 ascii_source = PL_PosixAlpha;
11817 l1_source = PL_L1Cased;
11821 ascii_source = PL_PosixUpper;
11822 l1_source = PL_L1PosixUpper;
11823 Xname = "XPosixUpper";
11825 if (namedclass == ANYOF_UPPER) {
11826 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11827 ascii_source, l1_source, Xname, listsv);
11830 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11831 properties, ascii_source, l1_source, Xname, listsv);
11835 case ANYOF_ALNUM: /* Really is 'Word' */
11836 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11837 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11840 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11841 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11844 /* For these, we use the nonbitmap, as /d doesn't make a
11845 * difference in what these match. There would be problems
11846 * if these characters had folds other than themselves, as
11847 * nonbitmap is subject to folding */
11848 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11850 case ANYOF_NVERTWS:
11851 _invlist_union_complement_2nd(nonbitmap,
11852 PL_VertSpace, &nonbitmap);
11855 DO_POSIX(ret, namedclass, properties,
11856 PL_PosixXDigit, PL_XPosixXDigit);
11858 case ANYOF_NXDIGIT:
11859 DO_N_POSIX(ret, namedclass, properties,
11860 PL_PosixXDigit, PL_XPosixXDigit);
11863 /* this is to handle \p and \P */
11866 vFAIL("Invalid [::] class");
11872 } /* end of namedclass \blah */
11875 if (prevvalue > (IV)value) /* b-a */ {
11876 const int w = RExC_parse - rangebegin;
11877 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11878 range = 0; /* not a valid range */
11882 prevvalue = value; /* save the beginning of the range */
11883 if (RExC_parse+1 < RExC_end
11884 && *RExC_parse == '-'
11885 && RExC_parse[1] != ']')
11889 /* a bad range like \w-, [:word:]- ? */
11890 if (namedclass > OOB_NAMEDCLASS) {
11891 if (ckWARN(WARN_REGEXP)) {
11893 RExC_parse >= rangebegin ?
11894 RExC_parse - rangebegin : 0;
11896 "False [] range \"%*.*s\"",
11901 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11903 range = 1; /* yeah, it's a range! */
11904 continue; /* but do it the next time */
11908 /* non-Latin1 code point implies unicode semantics. Must be set in
11909 * pass1 so is there for the whole of pass 2 */
11911 RExC_uni_semantics = 1;
11914 /* now is the next time */
11916 if (prevvalue < 256) {
11917 const IV ceilvalue = value < 256 ? value : 255;
11920 /* In EBCDIC [\x89-\x91] should include
11921 * the \x8e but [i-j] should not. */
11922 if (literal_endpoint == 2 &&
11923 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11924 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11926 if (isLOWER(prevvalue)) {
11927 for (i = prevvalue; i <= ceilvalue; i++)
11928 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11930 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11933 for (i = prevvalue; i <= ceilvalue; i++)
11934 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11936 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11942 for (i = prevvalue; i <= ceilvalue; i++) {
11943 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11947 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11948 const UV natvalue = NATIVE_TO_UNI(value);
11949 nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11952 literal_endpoint = 0;
11956 range = 0; /* this range (if it was one) is done now */
11963 /****** !SIZE_ONLY AFTER HERE *********/
11965 /* If folding and there are code points above 255, we calculate all
11966 * characters that could fold to or from the ones already on the list */
11967 if (FOLD && nonbitmap) {
11968 UV start, end; /* End points of code point ranges */
11970 SV* fold_intersection = NULL;
11972 /* This is a list of all the characters that participate in folds
11973 * (except marks, etc in multi-char folds */
11974 if (! PL_utf8_foldable) {
11975 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11976 PL_utf8_foldable = _swash_to_invlist(swash);
11977 SvREFCNT_dec(swash);
11980 /* This is a hash that for a particular fold gives all characters
11981 * that are involved in it */
11982 if (! PL_utf8_foldclosures) {
11984 /* If we were unable to find any folds, then we likely won't be
11985 * able to find the closures. So just create an empty list.
11986 * Folding will effectively be restricted to the non-Unicode rules
11987 * hard-coded into Perl. (This case happens legitimately during
11988 * compilation of Perl itself before the Unicode tables are
11990 if (invlist_len(PL_utf8_foldable) == 0) {
11991 PL_utf8_foldclosures = newHV();
11993 /* If the folds haven't been read in, call a fold function
11995 if (! PL_utf8_tofold) {
11996 U8 dummy[UTF8_MAXBYTES+1];
11999 /* This particular string is above \xff in both UTF-8 and
12001 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
12002 assert(PL_utf8_tofold); /* Verify that worked */
12004 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
12008 /* Only the characters in this class that participate in folds need be
12009 * checked. Get the intersection of this class and all the possible
12010 * characters that are foldable. This can quickly narrow down a large
12012 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
12014 /* Now look at the foldable characters in this class individually */
12015 invlist_iterinit(fold_intersection);
12016 while (invlist_iternext(fold_intersection, &start, &end)) {
12019 /* Look at every character in the range */
12020 for (j = start; j <= end; j++) {
12023 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12026 _to_uni_fold_flags(j, foldbuf, &foldlen,
12027 (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
12029 if (foldlen > (STRLEN)UNISKIP(f)) {
12031 /* Any multicharacter foldings (disallowed in lookbehind
12032 * patterns) require the following transform: [ABCDEF] ->
12033 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
12034 * folds into "rst", all other characters fold to single
12035 * characters. We save away these multicharacter foldings,
12036 * to be later saved as part of the additional "s" data. */
12037 if (! RExC_in_lookbehind) {
12039 U8* e = foldbuf + foldlen;
12041 /* If any of the folded characters of this are in the
12042 * Latin1 range, tell the regex engine that this can
12043 * match a non-utf8 target string. The only multi-byte
12044 * fold whose source is in the Latin1 range (U+00DF)
12045 * applies only when the target string is utf8, or
12046 * under unicode rules */
12047 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
12050 /* Can't mix ascii with non- under /aa */
12051 if (MORE_ASCII_RESTRICTED
12052 && (isASCII(*loc) != isASCII(j)))
12054 goto end_multi_fold;
12056 if (UTF8_IS_INVARIANT(*loc)
12057 || UTF8_IS_DOWNGRADEABLE_START(*loc))
12059 /* Can't mix above and below 256 under LOC
12062 goto end_multi_fold;
12065 |= ANYOF_NONBITMAP_NON_UTF8;
12068 loc += UTF8SKIP(loc);
12072 add_alternate(&unicode_alternate, foldbuf, foldlen);
12076 /* This is special-cased, as it is the only letter which
12077 * has both a multi-fold and single-fold in Latin1. All
12078 * the other chars that have single and multi-folds are
12079 * always in utf8, and the utf8 folding algorithm catches
12081 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
12082 stored += set_regclass_bit(pRExC_state,
12084 LATIN_SMALL_LETTER_SHARP_S,
12085 &l1_fold_invlist, &unicode_alternate);
12089 /* Single character fold. Add everything in its fold
12090 * closure to the list that this node should match */
12093 /* The fold closures data structure is a hash with the keys
12094 * being every character that is folded to, like 'k', and
12095 * the values each an array of everything that folds to its
12096 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
12097 if ((listp = hv_fetch(PL_utf8_foldclosures,
12098 (char *) foldbuf, foldlen, FALSE)))
12100 AV* list = (AV*) *listp;
12102 for (k = 0; k <= av_len(list); k++) {
12103 SV** c_p = av_fetch(list, k, FALSE);
12106 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12110 /* /aa doesn't allow folds between ASCII and non-;
12111 * /l doesn't allow them between above and below
12113 if ((MORE_ASCII_RESTRICTED
12114 && (isASCII(c) != isASCII(j)))
12115 || (LOC && ((c < 256) != (j < 256))))
12120 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
12121 stored += set_regclass_bit(pRExC_state,
12124 &l1_fold_invlist, &unicode_alternate);
12126 /* It may be that the code point is already in
12127 * this range or already in the bitmap, in
12128 * which case we need do nothing */
12129 else if ((c < start || c > end)
12131 || ! ANYOF_BITMAP_TEST(ret, c)))
12133 nonbitmap = add_cp_to_invlist(nonbitmap, c);
12140 SvREFCNT_dec(fold_intersection);
12143 /* Combine the two lists into one. */
12144 if (l1_fold_invlist) {
12146 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
12147 SvREFCNT_dec(l1_fold_invlist);
12150 nonbitmap = l1_fold_invlist;
12154 /* And combine the result (if any) with any inversion list from properties.
12155 * The lists are kept separate up to now because we don't want to fold the
12159 _invlist_union(nonbitmap, properties, &nonbitmap);
12160 SvREFCNT_dec(properties);
12163 nonbitmap = properties;
12167 /* Here, <nonbitmap> contains all the code points we can determine at
12168 * compile time that we haven't put into the bitmap. Go through it, and
12169 * for things that belong in the bitmap, put them there, and delete from
12173 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
12174 * possibly only should match when the target string is UTF-8 */
12175 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
12177 /* This gets set if we actually need to modify things */
12178 bool change_invlist = FALSE;
12182 /* Start looking through <nonbitmap> */
12183 invlist_iterinit(nonbitmap);
12184 while (invlist_iternext(nonbitmap, &start, &end)) {
12188 /* Quit if are above what we should change */
12189 if (start > max_cp_to_set) {
12193 change_invlist = TRUE;
12195 /* Set all the bits in the range, up to the max that we are doing */
12196 high = (end < max_cp_to_set) ? end : max_cp_to_set;
12197 for (i = start; i <= (int) high; i++) {
12198 if (! ANYOF_BITMAP_TEST(ret, i)) {
12199 ANYOF_BITMAP_SET(ret, i);
12207 /* Done with loop; remove any code points that are in the bitmap from
12209 if (change_invlist) {
12210 _invlist_subtract(nonbitmap,
12211 (DEPENDS_SEMANTICS)
12217 /* If have completely emptied it, remove it completely */
12218 if (invlist_len(nonbitmap) == 0) {
12219 SvREFCNT_dec(nonbitmap);
12224 /* Here, we have calculated what code points should be in the character
12225 * class. <nonbitmap> does not overlap the bitmap except possibly in the
12226 * case of DEPENDS rules.
12228 * Now we can see about various optimizations. Fold calculation (which we
12229 * did above) needs to take place before inversion. Otherwise /[^k]/i
12230 * would invert to include K, which under /i would match k, which it
12233 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
12234 * set the FOLD flag yet, so this does optimize those. It doesn't
12235 * optimize locale. Doing so perhaps could be done as long as there is
12236 * nothing like \w in it; some thought also would have to be given to the
12237 * interaction with above 0x100 chars */
12238 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
12240 && ! unicode_alternate
12241 /* In case of /d, there are some things that should match only when in
12242 * not in the bitmap, i.e., they require UTF8 to match. These are
12243 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
12244 * case, they don't require UTF8, so can invert here */
12246 || ! DEPENDS_SEMANTICS
12247 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12248 && SvCUR(listsv) == initial_listsv_len)
12252 for (i = 0; i < 256; ++i) {
12253 if (ANYOF_BITMAP_TEST(ret, i)) {
12254 ANYOF_BITMAP_CLEAR(ret, i);
12257 ANYOF_BITMAP_SET(ret, i);
12262 /* The inversion means that everything above 255 is matched */
12263 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12266 /* Here, also has things outside the bitmap that may overlap with
12267 * the bitmap. We have to sync them up, so that they get inverted
12268 * in both places. Earlier, we removed all overlaps except in the
12269 * case of /d rules, so no syncing is needed except for this case
12271 SV *remove_list = NULL;
12273 if (DEPENDS_SEMANTICS) {
12276 /* Set the bits that correspond to the ones that aren't in the
12277 * bitmap. Otherwise, when we invert, we'll miss these.
12278 * Earlier, we removed from the nonbitmap all code points
12279 * < 128, so there is no extra work here */
12280 invlist_iterinit(nonbitmap);
12281 while (invlist_iternext(nonbitmap, &start, &end)) {
12282 if (start > 255) { /* The bit map goes to 255 */
12288 for (i = start; i <= (int) end; ++i) {
12289 ANYOF_BITMAP_SET(ret, i);
12296 /* Now invert both the bitmap and the nonbitmap. Anything in the
12297 * bitmap has to also be removed from the non-bitmap, but again,
12298 * there should not be overlap unless is /d rules. */
12299 _invlist_invert(nonbitmap);
12301 /* Any swash can't be used as-is, because we've inverted things */
12303 SvREFCNT_dec(swash);
12307 for (i = 0; i < 256; ++i) {
12308 if (ANYOF_BITMAP_TEST(ret, i)) {
12309 ANYOF_BITMAP_CLEAR(ret, i);
12310 if (DEPENDS_SEMANTICS) {
12311 if (! remove_list) {
12312 remove_list = _new_invlist(2);
12314 remove_list = add_cp_to_invlist(remove_list, i);
12318 ANYOF_BITMAP_SET(ret, i);
12324 /* And do the removal */
12325 if (DEPENDS_SEMANTICS) {
12327 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12328 SvREFCNT_dec(remove_list);
12332 /* There is no overlap for non-/d, so just delete anything
12334 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12338 stored = 256 - stored;
12340 /* Clear the invert flag since have just done it here */
12341 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12344 /* Folding in the bitmap is taken care of above, but not for locale (for
12345 * which we have to wait to see what folding is in effect at runtime), and
12346 * for some things not in the bitmap (only the upper latin folds in this
12347 * case, as all other single-char folding has been set above). Set
12348 * run-time fold flag for these */
12350 || (DEPENDS_SEMANTICS
12352 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12353 || unicode_alternate))
12355 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12358 /* A single character class can be "optimized" into an EXACTish node.
12359 * Note that since we don't currently count how many characters there are
12360 * outside the bitmap, we are XXX missing optimization possibilities for
12361 * them. This optimization can't happen unless this is a truly single
12362 * character class, which means that it can't be an inversion into a
12363 * many-character class, and there must be no possibility of there being
12364 * things outside the bitmap. 'stored' (only) for locales doesn't include
12365 * \w, etc, so have to make a special test that they aren't present
12367 * Similarly A 2-character class of the very special form like [bB] can be
12368 * optimized into an EXACTFish node, but only for non-locales, and for
12369 * characters which only have the two folds; so things like 'fF' and 'Ii'
12370 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12373 && ! unicode_alternate
12374 && SvCUR(listsv) == initial_listsv_len
12375 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12376 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12377 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12378 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12379 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12380 /* If the latest code point has a fold whose
12381 * bit is set, it must be the only other one */
12382 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12383 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12385 /* Note that the information needed to decide to do this optimization
12386 * is not currently available until the 2nd pass, and that the actually
12387 * used EXACTish node takes less space than the calculated ANYOF node,
12388 * and hence the amount of space calculated in the first pass is larger
12389 * than actually used, so this optimization doesn't gain us any space.
12390 * But an EXACT node is faster than an ANYOF node, and can be combined
12391 * with any adjacent EXACT nodes later by the optimizer for further
12392 * gains. The speed of executing an EXACTF is similar to an ANYOF
12393 * node, so the optimization advantage comes from the ability to join
12394 * it to adjacent EXACT nodes */
12396 const char * cur_parse= RExC_parse;
12398 RExC_emit = (regnode *)orig_emit;
12399 RExC_parse = (char *)orig_parse;
12403 /* A locale node with one point can be folded; all the other cases
12404 * with folding will have two points, since we calculate them above
12406 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12413 else { /* else 2 chars in the bit map: the folds of each other */
12415 /* Use the folded value, which for the cases where we get here,
12416 * is just the lower case of the current one (which may resolve to
12417 * itself, or to the other one */
12418 value = toLOWER_LATIN1(value);
12420 /* To join adjacent nodes, they must be the exact EXACTish type.
12421 * Try to use the most likely type, by using EXACTFA if possible,
12422 * then EXACTFU if the regex calls for it, or is required because
12423 * the character is non-ASCII. (If <value> is ASCII, its fold is
12424 * also ASCII for the cases where we get here.) */
12425 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12428 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12431 else { /* Otherwise, more likely to be EXACTF type */
12436 ret = reg_node(pRExC_state, op);
12437 RExC_parse = (char *)cur_parse;
12438 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12439 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12440 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12442 RExC_emit += STR_SZ(2);
12445 *STRING(ret)= (char)value;
12447 RExC_emit += STR_SZ(1);
12449 SvREFCNT_dec(listsv);
12453 /* If there is a swash and more than one element, we can't use the swash in
12454 * the optimization below. */
12455 if (swash && element_count > 1) {
12456 SvREFCNT_dec(swash);
12460 && SvCUR(listsv) == initial_listsv_len
12461 && ! unicode_alternate)
12463 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12464 SvREFCNT_dec(listsv);
12465 SvREFCNT_dec(unicode_alternate);
12468 /* av[0] stores the character class description in its textual form:
12469 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12470 * appropriate swash, and is also useful for dumping the regnode.
12471 * av[1] if NULL, is a placeholder to later contain the swash computed
12472 * from av[0]. But if no further computation need be done, the
12473 * swash is stored there now.
12474 * av[2] stores the multicharacter foldings, used later in
12475 * regexec.c:S_reginclass().
12476 * av[3] stores the nonbitmap inversion list for use in addition or
12477 * instead of av[0]; not used if av[1] isn't NULL
12478 * av[4] is set if any component of the class is from a user-defined
12479 * property; not used if av[1] isn't NULL */
12480 AV * const av = newAV();
12483 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12487 av_store(av, 1, swash);
12488 SvREFCNT_dec(nonbitmap);
12491 av_store(av, 1, NULL);
12493 av_store(av, 3, nonbitmap);
12494 av_store(av, 4, newSVuv(has_user_defined_property));
12498 /* Store any computed multi-char folds only if we are allowing
12500 if (allow_full_fold) {
12501 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12502 if (unicode_alternate) { /* This node is variable length */
12507 av_store(av, 2, NULL);
12509 rv = newRV_noinc(MUTABLE_SV(av));
12510 n = add_data(pRExC_state, 1, "s");
12511 RExC_rxi->data->data[n] = (void*)rv;
12518 /* reg_skipcomment()
12520 Absorbs an /x style # comments from the input stream.
12521 Returns true if there is more text remaining in the stream.
12522 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12523 terminates the pattern without including a newline.
12525 Note its the callers responsibility to ensure that we are
12526 actually in /x mode
12531 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12535 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12537 while (RExC_parse < RExC_end)
12538 if (*RExC_parse++ == '\n') {
12543 /* we ran off the end of the pattern without ending
12544 the comment, so we have to add an \n when wrapping */
12545 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12553 Advances the parse position, and optionally absorbs
12554 "whitespace" from the inputstream.
12556 Without /x "whitespace" means (?#...) style comments only,
12557 with /x this means (?#...) and # comments and whitespace proper.
12559 Returns the RExC_parse point from BEFORE the scan occurs.
12561 This is the /x friendly way of saying RExC_parse++.
12565 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12567 char* const retval = RExC_parse++;
12569 PERL_ARGS_ASSERT_NEXTCHAR;
12572 if (RExC_end - RExC_parse >= 3
12573 && *RExC_parse == '('
12574 && RExC_parse[1] == '?'
12575 && RExC_parse[2] == '#')
12577 while (*RExC_parse != ')') {
12578 if (RExC_parse == RExC_end)
12579 FAIL("Sequence (?#... not terminated");
12585 if (RExC_flags & RXf_PMf_EXTENDED) {
12586 if (isSPACE(*RExC_parse)) {
12590 else if (*RExC_parse == '#') {
12591 if ( reg_skipcomment( pRExC_state ) )
12600 - reg_node - emit a node
12602 STATIC regnode * /* Location. */
12603 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12606 register regnode *ptr;
12607 regnode * const ret = RExC_emit;
12608 GET_RE_DEBUG_FLAGS_DECL;
12610 PERL_ARGS_ASSERT_REG_NODE;
12613 SIZE_ALIGN(RExC_size);
12617 if (RExC_emit >= RExC_emit_bound)
12618 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12619 op, RExC_emit, RExC_emit_bound);
12621 NODE_ALIGN_FILL(ret);
12623 FILL_ADVANCE_NODE(ptr, op);
12624 #ifdef RE_TRACK_PATTERN_OFFSETS
12625 if (RExC_offsets) { /* MJD */
12626 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12627 "reg_node", __LINE__,
12629 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12630 ? "Overwriting end of array!\n" : "OK",
12631 (UV)(RExC_emit - RExC_emit_start),
12632 (UV)(RExC_parse - RExC_start),
12633 (UV)RExC_offsets[0]));
12634 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12642 - reganode - emit a node with an argument
12644 STATIC regnode * /* Location. */
12645 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12648 register regnode *ptr;
12649 regnode * const ret = RExC_emit;
12650 GET_RE_DEBUG_FLAGS_DECL;
12652 PERL_ARGS_ASSERT_REGANODE;
12655 SIZE_ALIGN(RExC_size);
12660 assert(2==regarglen[op]+1);
12662 Anything larger than this has to allocate the extra amount.
12663 If we changed this to be:
12665 RExC_size += (1 + regarglen[op]);
12667 then it wouldn't matter. Its not clear what side effect
12668 might come from that so its not done so far.
12673 if (RExC_emit >= RExC_emit_bound)
12674 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12675 op, RExC_emit, RExC_emit_bound);
12677 NODE_ALIGN_FILL(ret);
12679 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12680 #ifdef RE_TRACK_PATTERN_OFFSETS
12681 if (RExC_offsets) { /* MJD */
12682 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12686 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12687 "Overwriting end of array!\n" : "OK",
12688 (UV)(RExC_emit - RExC_emit_start),
12689 (UV)(RExC_parse - RExC_start),
12690 (UV)RExC_offsets[0]));
12691 Set_Cur_Node_Offset;
12699 - reguni - emit (if appropriate) a Unicode character
12702 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12706 PERL_ARGS_ASSERT_REGUNI;
12708 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12712 - reginsert - insert an operator in front of already-emitted operand
12714 * Means relocating the operand.
12717 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12720 register regnode *src;
12721 register regnode *dst;
12722 register regnode *place;
12723 const int offset = regarglen[(U8)op];
12724 const int size = NODE_STEP_REGNODE + offset;
12725 GET_RE_DEBUG_FLAGS_DECL;
12727 PERL_ARGS_ASSERT_REGINSERT;
12728 PERL_UNUSED_ARG(depth);
12729 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12730 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12739 if (RExC_open_parens) {
12741 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12742 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12743 if ( RExC_open_parens[paren] >= opnd ) {
12744 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12745 RExC_open_parens[paren] += size;
12747 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12749 if ( RExC_close_parens[paren] >= opnd ) {
12750 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12751 RExC_close_parens[paren] += size;
12753 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12758 while (src > opnd) {
12759 StructCopy(--src, --dst, regnode);
12760 #ifdef RE_TRACK_PATTERN_OFFSETS
12761 if (RExC_offsets) { /* MJD 20010112 */
12762 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12766 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12767 ? "Overwriting end of array!\n" : "OK",
12768 (UV)(src - RExC_emit_start),
12769 (UV)(dst - RExC_emit_start),
12770 (UV)RExC_offsets[0]));
12771 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12772 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12778 place = opnd; /* Op node, where operand used to be. */
12779 #ifdef RE_TRACK_PATTERN_OFFSETS
12780 if (RExC_offsets) { /* MJD */
12781 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12785 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12786 ? "Overwriting end of array!\n" : "OK",
12787 (UV)(place - RExC_emit_start),
12788 (UV)(RExC_parse - RExC_start),
12789 (UV)RExC_offsets[0]));
12790 Set_Node_Offset(place, RExC_parse);
12791 Set_Node_Length(place, 1);
12794 src = NEXTOPER(place);
12795 FILL_ADVANCE_NODE(place, op);
12796 Zero(src, offset, regnode);
12800 - regtail - set the next-pointer at the end of a node chain of p to val.
12801 - SEE ALSO: regtail_study
12803 /* TODO: All three parms should be const */
12805 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12808 register regnode *scan;
12809 GET_RE_DEBUG_FLAGS_DECL;
12811 PERL_ARGS_ASSERT_REGTAIL;
12813 PERL_UNUSED_ARG(depth);
12819 /* Find last node. */
12822 regnode * const temp = regnext(scan);
12824 SV * const mysv=sv_newmortal();
12825 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12826 regprop(RExC_rx, mysv, scan);
12827 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12828 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12829 (temp == NULL ? "->" : ""),
12830 (temp == NULL ? PL_reg_name[OP(val)] : "")
12838 if (reg_off_by_arg[OP(scan)]) {
12839 ARG_SET(scan, val - scan);
12842 NEXT_OFF(scan) = val - scan;
12848 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12849 - Look for optimizable sequences at the same time.
12850 - currently only looks for EXACT chains.
12852 This is experimental code. The idea is to use this routine to perform
12853 in place optimizations on branches and groups as they are constructed,
12854 with the long term intention of removing optimization from study_chunk so
12855 that it is purely analytical.
12857 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12858 to control which is which.
12861 /* TODO: All four parms should be const */
12864 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12867 register regnode *scan;
12869 #ifdef EXPERIMENTAL_INPLACESCAN
12872 GET_RE_DEBUG_FLAGS_DECL;
12874 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12880 /* Find last node. */
12884 regnode * const temp = regnext(scan);
12885 #ifdef EXPERIMENTAL_INPLACESCAN
12886 if (PL_regkind[OP(scan)] == EXACT) {
12887 bool has_exactf_sharp_s; /* Unexamined in this routine */
12888 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12893 switch (OP(scan)) {
12899 case EXACTFU_TRICKYFOLD:
12901 if( exact == PSEUDO )
12903 else if ( exact != OP(scan) )
12912 SV * const mysv=sv_newmortal();
12913 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12914 regprop(RExC_rx, mysv, scan);
12915 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12916 SvPV_nolen_const(mysv),
12917 REG_NODE_NUM(scan),
12918 PL_reg_name[exact]);
12925 SV * const mysv_val=sv_newmortal();
12926 DEBUG_PARSE_MSG("");
12927 regprop(RExC_rx, mysv_val, val);
12928 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12929 SvPV_nolen_const(mysv_val),
12930 (IV)REG_NODE_NUM(val),
12934 if (reg_off_by_arg[OP(scan)]) {
12935 ARG_SET(scan, val - scan);
12938 NEXT_OFF(scan) = val - scan;
12946 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12950 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12956 for (bit=0; bit<32; bit++) {
12957 if (flags & (1<<bit)) {
12958 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12961 if (!set++ && lead)
12962 PerlIO_printf(Perl_debug_log, "%s",lead);
12963 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12966 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12967 if (!set++ && lead) {
12968 PerlIO_printf(Perl_debug_log, "%s",lead);
12971 case REGEX_UNICODE_CHARSET:
12972 PerlIO_printf(Perl_debug_log, "UNICODE");
12974 case REGEX_LOCALE_CHARSET:
12975 PerlIO_printf(Perl_debug_log, "LOCALE");
12977 case REGEX_ASCII_RESTRICTED_CHARSET:
12978 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12980 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12981 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12984 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12990 PerlIO_printf(Perl_debug_log, "\n");
12992 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12998 Perl_regdump(pTHX_ const regexp *r)
13002 SV * const sv = sv_newmortal();
13003 SV *dsv= sv_newmortal();
13004 RXi_GET_DECL(r,ri);
13005 GET_RE_DEBUG_FLAGS_DECL;
13007 PERL_ARGS_ASSERT_REGDUMP;
13009 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13011 /* Header fields of interest. */
13012 if (r->anchored_substr) {
13013 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13014 RE_SV_DUMPLEN(r->anchored_substr), 30);
13015 PerlIO_printf(Perl_debug_log,
13016 "anchored %s%s at %"IVdf" ",
13017 s, RE_SV_TAIL(r->anchored_substr),
13018 (IV)r->anchored_offset);
13019 } else if (r->anchored_utf8) {
13020 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13021 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13022 PerlIO_printf(Perl_debug_log,
13023 "anchored utf8 %s%s at %"IVdf" ",
13024 s, RE_SV_TAIL(r->anchored_utf8),
13025 (IV)r->anchored_offset);
13027 if (r->float_substr) {
13028 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13029 RE_SV_DUMPLEN(r->float_substr), 30);
13030 PerlIO_printf(Perl_debug_log,
13031 "floating %s%s at %"IVdf"..%"UVuf" ",
13032 s, RE_SV_TAIL(r->float_substr),
13033 (IV)r->float_min_offset, (UV)r->float_max_offset);
13034 } else if (r->float_utf8) {
13035 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13036 RE_SV_DUMPLEN(r->float_utf8), 30);
13037 PerlIO_printf(Perl_debug_log,
13038 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13039 s, RE_SV_TAIL(r->float_utf8),
13040 (IV)r->float_min_offset, (UV)r->float_max_offset);
13042 if (r->check_substr || r->check_utf8)
13043 PerlIO_printf(Perl_debug_log,
13045 (r->check_substr == r->float_substr
13046 && r->check_utf8 == r->float_utf8
13047 ? "(checking floating" : "(checking anchored"));
13048 if (r->extflags & RXf_NOSCAN)
13049 PerlIO_printf(Perl_debug_log, " noscan");
13050 if (r->extflags & RXf_CHECK_ALL)
13051 PerlIO_printf(Perl_debug_log, " isall");
13052 if (r->check_substr || r->check_utf8)
13053 PerlIO_printf(Perl_debug_log, ") ");
13055 if (ri->regstclass) {
13056 regprop(r, sv, ri->regstclass);
13057 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13059 if (r->extflags & RXf_ANCH) {
13060 PerlIO_printf(Perl_debug_log, "anchored");
13061 if (r->extflags & RXf_ANCH_BOL)
13062 PerlIO_printf(Perl_debug_log, "(BOL)");
13063 if (r->extflags & RXf_ANCH_MBOL)
13064 PerlIO_printf(Perl_debug_log, "(MBOL)");
13065 if (r->extflags & RXf_ANCH_SBOL)
13066 PerlIO_printf(Perl_debug_log, "(SBOL)");
13067 if (r->extflags & RXf_ANCH_GPOS)
13068 PerlIO_printf(Perl_debug_log, "(GPOS)");
13069 PerlIO_putc(Perl_debug_log, ' ');
13071 if (r->extflags & RXf_GPOS_SEEN)
13072 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13073 if (r->intflags & PREGf_SKIP)
13074 PerlIO_printf(Perl_debug_log, "plus ");
13075 if (r->intflags & PREGf_IMPLICIT)
13076 PerlIO_printf(Perl_debug_log, "implicit ");
13077 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13078 if (r->extflags & RXf_EVAL_SEEN)
13079 PerlIO_printf(Perl_debug_log, "with eval ");
13080 PerlIO_printf(Perl_debug_log, "\n");
13081 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13083 PERL_ARGS_ASSERT_REGDUMP;
13084 PERL_UNUSED_CONTEXT;
13085 PERL_UNUSED_ARG(r);
13086 #endif /* DEBUGGING */
13090 - regprop - printable representation of opcode
13092 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13095 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13096 if (flags & ANYOF_INVERT) \
13097 /*make sure the invert info is in each */ \
13098 sv_catpvs(sv, "^"); \
13104 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13109 RXi_GET_DECL(prog,progi);
13110 GET_RE_DEBUG_FLAGS_DECL;
13112 PERL_ARGS_ASSERT_REGPROP;
13116 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13117 /* It would be nice to FAIL() here, but this may be called from
13118 regexec.c, and it would be hard to supply pRExC_state. */
13119 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13120 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13122 k = PL_regkind[OP(o)];
13125 sv_catpvs(sv, " ");
13126 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13127 * is a crude hack but it may be the best for now since
13128 * we have no flag "this EXACTish node was UTF-8"
13130 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13131 PERL_PV_ESCAPE_UNI_DETECT |
13132 PERL_PV_ESCAPE_NONASCII |
13133 PERL_PV_PRETTY_ELLIPSES |
13134 PERL_PV_PRETTY_LTGT |
13135 PERL_PV_PRETTY_NOCLEAR
13137 } else if (k == TRIE) {
13138 /* print the details of the trie in dumpuntil instead, as
13139 * progi->data isn't available here */
13140 const char op = OP(o);
13141 const U32 n = ARG(o);
13142 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13143 (reg_ac_data *)progi->data->data[n] :
13145 const reg_trie_data * const trie
13146 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13148 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13149 DEBUG_TRIE_COMPILE_r(
13150 Perl_sv_catpvf(aTHX_ sv,
13151 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13152 (UV)trie->startstate,
13153 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13154 (UV)trie->wordcount,
13157 (UV)TRIE_CHARCOUNT(trie),
13158 (UV)trie->uniquecharcount
13161 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13163 int rangestart = -1;
13164 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13165 sv_catpvs(sv, "[");
13166 for (i = 0; i <= 256; i++) {
13167 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13168 if (rangestart == -1)
13170 } else if (rangestart != -1) {
13171 if (i <= rangestart + 3)
13172 for (; rangestart < i; rangestart++)
13173 put_byte(sv, rangestart);
13175 put_byte(sv, rangestart);
13176 sv_catpvs(sv, "-");
13177 put_byte(sv, i - 1);
13182 sv_catpvs(sv, "]");
13185 } else if (k == CURLY) {
13186 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13187 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13188 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13190 else if (k == WHILEM && o->flags) /* Ordinal/of */
13191 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13192 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13193 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13194 if ( RXp_PAREN_NAMES(prog) ) {
13195 if ( k != REF || (OP(o) < NREF)) {
13196 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13197 SV **name= av_fetch(list, ARG(o), 0 );
13199 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13202 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13203 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13204 I32 *nums=(I32*)SvPVX(sv_dat);
13205 SV **name= av_fetch(list, nums[0], 0 );
13208 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13209 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13210 (n ? "," : ""), (IV)nums[n]);
13212 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13216 } else if (k == GOSUB)
13217 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13218 else if (k == VERB) {
13220 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13221 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13222 } else if (k == LOGICAL)
13223 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13224 else if (k == ANYOF) {
13225 int i, rangestart = -1;
13226 const U8 flags = ANYOF_FLAGS(o);
13229 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13230 static const char * const anyofs[] = {
13263 if (flags & ANYOF_LOCALE)
13264 sv_catpvs(sv, "{loc}");
13265 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13266 sv_catpvs(sv, "{i}");
13267 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13268 if (flags & ANYOF_INVERT)
13269 sv_catpvs(sv, "^");
13271 /* output what the standard cp 0-255 bitmap matches */
13272 for (i = 0; i <= 256; i++) {
13273 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13274 if (rangestart == -1)
13276 } else if (rangestart != -1) {
13277 if (i <= rangestart + 3)
13278 for (; rangestart < i; rangestart++)
13279 put_byte(sv, rangestart);
13281 put_byte(sv, rangestart);
13282 sv_catpvs(sv, "-");
13283 put_byte(sv, i - 1);
13290 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13291 /* output any special charclass tests (used entirely under use locale) */
13292 if (ANYOF_CLASS_TEST_ANY_SET(o))
13293 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13294 if (ANYOF_CLASS_TEST(o,i)) {
13295 sv_catpv(sv, anyofs[i]);
13299 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13301 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13302 sv_catpvs(sv, "{non-utf8-latin1-all}");
13305 /* output information about the unicode matching */
13306 if (flags & ANYOF_UNICODE_ALL)
13307 sv_catpvs(sv, "{unicode_all}");
13308 else if (ANYOF_NONBITMAP(o))
13309 sv_catpvs(sv, "{unicode}");
13310 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13311 sv_catpvs(sv, "{outside bitmap}");
13313 if (ANYOF_NONBITMAP(o)) {
13314 SV *lv; /* Set if there is something outside the bit map */
13315 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13316 bool byte_output = FALSE; /* If something in the bitmap has been
13319 if (lv && lv != &PL_sv_undef) {
13321 U8 s[UTF8_MAXBYTES_CASE+1];
13323 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13324 uvchr_to_utf8(s, i);
13327 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13331 && swash_fetch(sw, s, TRUE))
13333 if (rangestart == -1)
13335 } else if (rangestart != -1) {
13336 byte_output = TRUE;
13337 if (i <= rangestart + 3)
13338 for (; rangestart < i; rangestart++) {
13339 put_byte(sv, rangestart);
13342 put_byte(sv, rangestart);
13343 sv_catpvs(sv, "-");
13352 char *s = savesvpv(lv);
13353 char * const origs = s;
13355 while (*s && *s != '\n')
13359 const char * const t = ++s;
13362 sv_catpvs(sv, " ");
13368 /* Truncate very long output */
13369 if (s - origs > 256) {
13370 Perl_sv_catpvf(aTHX_ sv,
13372 (int) (s - origs - 1),
13378 else if (*s == '\t') {
13397 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13399 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13400 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13402 PERL_UNUSED_CONTEXT;
13403 PERL_UNUSED_ARG(sv);
13404 PERL_UNUSED_ARG(o);
13405 PERL_UNUSED_ARG(prog);
13406 #endif /* DEBUGGING */
13410 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13411 { /* Assume that RE_INTUIT is set */
13413 struct regexp *const prog = (struct regexp *)SvANY(r);
13414 GET_RE_DEBUG_FLAGS_DECL;
13416 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13417 PERL_UNUSED_CONTEXT;
13421 const char * const s = SvPV_nolen_const(prog->check_substr
13422 ? prog->check_substr : prog->check_utf8);
13424 if (!PL_colorset) reginitcolors();
13425 PerlIO_printf(Perl_debug_log,
13426 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13428 prog->check_substr ? "" : "utf8 ",
13429 PL_colors[5],PL_colors[0],
13432 (strlen(s) > 60 ? "..." : ""));
13435 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13441 handles refcounting and freeing the perl core regexp structure. When
13442 it is necessary to actually free the structure the first thing it
13443 does is call the 'free' method of the regexp_engine associated to
13444 the regexp, allowing the handling of the void *pprivate; member
13445 first. (This routine is not overridable by extensions, which is why
13446 the extensions free is called first.)
13448 See regdupe and regdupe_internal if you change anything here.
13450 #ifndef PERL_IN_XSUB_RE
13452 Perl_pregfree(pTHX_ REGEXP *r)
13458 Perl_pregfree2(pTHX_ REGEXP *rx)
13461 struct regexp *const r = (struct regexp *)SvANY(rx);
13462 GET_RE_DEBUG_FLAGS_DECL;
13464 PERL_ARGS_ASSERT_PREGFREE2;
13466 if (r->mother_re) {
13467 ReREFCNT_dec(r->mother_re);
13469 CALLREGFREE_PVT(rx); /* free the private data */
13470 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13473 SvREFCNT_dec(r->anchored_substr);
13474 SvREFCNT_dec(r->anchored_utf8);
13475 SvREFCNT_dec(r->float_substr);
13476 SvREFCNT_dec(r->float_utf8);
13477 Safefree(r->substrs);
13479 RX_MATCH_COPY_FREE(rx);
13480 #ifdef PERL_OLD_COPY_ON_WRITE
13481 SvREFCNT_dec(r->saved_copy);
13484 SvREFCNT_dec(r->qr_anoncv);
13489 This is a hacky workaround to the structural issue of match results
13490 being stored in the regexp structure which is in turn stored in
13491 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13492 could be PL_curpm in multiple contexts, and could require multiple
13493 result sets being associated with the pattern simultaneously, such
13494 as when doing a recursive match with (??{$qr})
13496 The solution is to make a lightweight copy of the regexp structure
13497 when a qr// is returned from the code executed by (??{$qr}) this
13498 lightweight copy doesn't actually own any of its data except for
13499 the starp/end and the actual regexp structure itself.
13505 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13507 struct regexp *ret;
13508 struct regexp *const r = (struct regexp *)SvANY(rx);
13510 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13513 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13514 ret = (struct regexp *)SvANY(ret_x);
13516 (void)ReREFCNT_inc(rx);
13517 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13518 by pointing directly at the buffer, but flagging that the allocated
13519 space in the copy is zero. As we've just done a struct copy, it's now
13520 a case of zero-ing that, rather than copying the current length. */
13521 SvPV_set(ret_x, RX_WRAPPED(rx));
13522 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13523 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13524 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13525 SvLEN_set(ret_x, 0);
13526 SvSTASH_set(ret_x, NULL);
13527 SvMAGIC_set(ret_x, NULL);
13529 const I32 npar = r->nparens+1;
13530 Newx(ret->offs, npar, regexp_paren_pair);
13531 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13534 Newx(ret->substrs, 1, struct reg_substr_data);
13535 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13537 SvREFCNT_inc_void(ret->anchored_substr);
13538 SvREFCNT_inc_void(ret->anchored_utf8);
13539 SvREFCNT_inc_void(ret->float_substr);
13540 SvREFCNT_inc_void(ret->float_utf8);
13542 /* check_substr and check_utf8, if non-NULL, point to either their
13543 anchored or float namesakes, and don't hold a second reference. */
13545 RX_MATCH_COPIED_off(ret_x);
13546 #ifdef PERL_OLD_COPY_ON_WRITE
13547 ret->saved_copy = NULL;
13549 ret->mother_re = rx;
13550 SvREFCNT_inc_void(ret->qr_anoncv);
13556 /* regfree_internal()
13558 Free the private data in a regexp. This is overloadable by
13559 extensions. Perl takes care of the regexp structure in pregfree(),
13560 this covers the *pprivate pointer which technically perl doesn't
13561 know about, however of course we have to handle the
13562 regexp_internal structure when no extension is in use.
13564 Note this is called before freeing anything in the regexp
13569 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13572 struct regexp *const r = (struct regexp *)SvANY(rx);
13573 RXi_GET_DECL(r,ri);
13574 GET_RE_DEBUG_FLAGS_DECL;
13576 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13582 SV *dsv= sv_newmortal();
13583 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13584 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13585 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13586 PL_colors[4],PL_colors[5],s);
13589 #ifdef RE_TRACK_PATTERN_OFFSETS
13591 Safefree(ri->u.offsets); /* 20010421 MJD */
13593 if (ri->code_blocks) {
13595 for (n = 0; n < ri->num_code_blocks; n++)
13596 SvREFCNT_dec(ri->code_blocks[n].src_regex);
13597 Safefree(ri->code_blocks);
13601 int n = ri->data->count;
13604 /* If you add a ->what type here, update the comment in regcomp.h */
13605 switch (ri->data->what[n]) {
13611 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13614 Safefree(ri->data->data[n]);
13620 { /* Aho Corasick add-on structure for a trie node.
13621 Used in stclass optimization only */
13623 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13625 refcount = --aho->refcount;
13628 PerlMemShared_free(aho->states);
13629 PerlMemShared_free(aho->fail);
13630 /* do this last!!!! */
13631 PerlMemShared_free(ri->data->data[n]);
13632 PerlMemShared_free(ri->regstclass);
13638 /* trie structure. */
13640 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13642 refcount = --trie->refcount;
13645 PerlMemShared_free(trie->charmap);
13646 PerlMemShared_free(trie->states);
13647 PerlMemShared_free(trie->trans);
13649 PerlMemShared_free(trie->bitmap);
13651 PerlMemShared_free(trie->jump);
13652 PerlMemShared_free(trie->wordinfo);
13653 /* do this last!!!! */
13654 PerlMemShared_free(ri->data->data[n]);
13659 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13662 Safefree(ri->data->what);
13663 Safefree(ri->data);
13669 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13670 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13671 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13674 re_dup - duplicate a regexp.
13676 This routine is expected to clone a given regexp structure. It is only
13677 compiled under USE_ITHREADS.
13679 After all of the core data stored in struct regexp is duplicated
13680 the regexp_engine.dupe method is used to copy any private data
13681 stored in the *pprivate pointer. This allows extensions to handle
13682 any duplication it needs to do.
13684 See pregfree() and regfree_internal() if you change anything here.
13686 #if defined(USE_ITHREADS)
13687 #ifndef PERL_IN_XSUB_RE
13689 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13693 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13694 struct regexp *ret = (struct regexp *)SvANY(dstr);
13696 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13698 npar = r->nparens+1;
13699 Newx(ret->offs, npar, regexp_paren_pair);
13700 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13702 /* no need to copy these */
13703 Newx(ret->swap, npar, regexp_paren_pair);
13706 if (ret->substrs) {
13707 /* Do it this way to avoid reading from *r after the StructCopy().
13708 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13709 cache, it doesn't matter. */
13710 const bool anchored = r->check_substr
13711 ? r->check_substr == r->anchored_substr
13712 : r->check_utf8 == r->anchored_utf8;
13713 Newx(ret->substrs, 1, struct reg_substr_data);
13714 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13716 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13717 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13718 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13719 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13721 /* check_substr and check_utf8, if non-NULL, point to either their
13722 anchored or float namesakes, and don't hold a second reference. */
13724 if (ret->check_substr) {
13726 assert(r->check_utf8 == r->anchored_utf8);
13727 ret->check_substr = ret->anchored_substr;
13728 ret->check_utf8 = ret->anchored_utf8;
13730 assert(r->check_substr == r->float_substr);
13731 assert(r->check_utf8 == r->float_utf8);
13732 ret->check_substr = ret->float_substr;
13733 ret->check_utf8 = ret->float_utf8;
13735 } else if (ret->check_utf8) {
13737 ret->check_utf8 = ret->anchored_utf8;
13739 ret->check_utf8 = ret->float_utf8;
13744 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13745 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13748 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13750 if (RX_MATCH_COPIED(dstr))
13751 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
13753 ret->subbeg = NULL;
13754 #ifdef PERL_OLD_COPY_ON_WRITE
13755 ret->saved_copy = NULL;
13758 if (ret->mother_re) {
13759 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13760 /* Our storage points directly to our mother regexp, but that's
13761 1: a buffer in a different thread
13762 2: something we no longer hold a reference on
13763 so we need to copy it locally. */
13764 /* Note we need to use SvCUR(), rather than
13765 SvLEN(), on our mother_re, because it, in
13766 turn, may well be pointing to its own mother_re. */
13767 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13768 SvCUR(ret->mother_re)+1));
13769 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13771 ret->mother_re = NULL;
13775 #endif /* PERL_IN_XSUB_RE */
13780 This is the internal complement to regdupe() which is used to copy
13781 the structure pointed to by the *pprivate pointer in the regexp.
13782 This is the core version of the extension overridable cloning hook.
13783 The regexp structure being duplicated will be copied by perl prior
13784 to this and will be provided as the regexp *r argument, however
13785 with the /old/ structures pprivate pointer value. Thus this routine
13786 may override any copying normally done by perl.
13788 It returns a pointer to the new regexp_internal structure.
13792 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13795 struct regexp *const r = (struct regexp *)SvANY(rx);
13796 regexp_internal *reti;
13798 RXi_GET_DECL(r,ri);
13800 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13804 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13805 Copy(ri->program, reti->program, len+1, regnode);
13807 reti->num_code_blocks = ri->num_code_blocks;
13808 if (ri->code_blocks) {
13810 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13811 struct reg_code_block);
13812 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13813 struct reg_code_block);
13814 for (n = 0; n < ri->num_code_blocks; n++)
13815 reti->code_blocks[n].src_regex = (REGEXP*)
13816 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13819 reti->code_blocks = NULL;
13821 reti->regstclass = NULL;
13824 struct reg_data *d;
13825 const int count = ri->data->count;
13828 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13829 char, struct reg_data);
13830 Newx(d->what, count, U8);
13833 for (i = 0; i < count; i++) {
13834 d->what[i] = ri->data->what[i];
13835 switch (d->what[i]) {
13836 /* see also regcomp.h and regfree_internal() */
13837 case 'a': /* actually an AV, but the dup function is identical. */
13841 case 'u': /* actually an HV, but the dup function is identical. */
13842 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13845 /* This is cheating. */
13846 Newx(d->data[i], 1, struct regnode_charclass_class);
13847 StructCopy(ri->data->data[i], d->data[i],
13848 struct regnode_charclass_class);
13849 reti->regstclass = (regnode*)d->data[i];
13852 /* Trie stclasses are readonly and can thus be shared
13853 * without duplication. We free the stclass in pregfree
13854 * when the corresponding reg_ac_data struct is freed.
13856 reti->regstclass= ri->regstclass;
13860 ((reg_trie_data*)ri->data->data[i])->refcount++;
13865 d->data[i] = ri->data->data[i];
13868 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13877 reti->name_list_idx = ri->name_list_idx;
13879 #ifdef RE_TRACK_PATTERN_OFFSETS
13880 if (ri->u.offsets) {
13881 Newx(reti->u.offsets, 2*len+1, U32);
13882 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13885 SetProgLen(reti,len);
13888 return (void*)reti;
13891 #endif /* USE_ITHREADS */
13893 #ifndef PERL_IN_XSUB_RE
13896 - regnext - dig the "next" pointer out of a node
13899 Perl_regnext(pTHX_ register regnode *p)
13902 register I32 offset;
13907 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13908 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13911 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13920 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13923 STRLEN l1 = strlen(pat1);
13924 STRLEN l2 = strlen(pat2);
13927 const char *message;
13929 PERL_ARGS_ASSERT_RE_CROAK2;
13935 Copy(pat1, buf, l1 , char);
13936 Copy(pat2, buf + l1, l2 , char);
13937 buf[l1 + l2] = '\n';
13938 buf[l1 + l2 + 1] = '\0';
13940 /* ANSI variant takes additional second argument */
13941 va_start(args, pat2);
13945 msv = vmess(buf, &args);
13947 message = SvPV_const(msv,l1);
13950 Copy(message, buf, l1 , char);
13951 buf[l1-1] = '\0'; /* Overwrite \n */
13952 Perl_croak(aTHX_ "%s", buf);
13955 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13957 #ifndef PERL_IN_XSUB_RE
13959 Perl_save_re_context(pTHX)
13963 struct re_save_state *state;
13965 SAVEVPTR(PL_curcop);
13966 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13968 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13969 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13970 SSPUSHUV(SAVEt_RE_STATE);
13972 Copy(&PL_reg_state, state, 1, struct re_save_state);
13974 PL_reg_oldsaved = NULL;
13975 PL_reg_oldsavedlen = 0;
13976 PL_reg_maxiter = 0;
13977 PL_reg_leftiter = 0;
13978 PL_reg_poscache = NULL;
13979 PL_reg_poscache_size = 0;
13980 #ifdef PERL_OLD_COPY_ON_WRITE
13984 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13986 const REGEXP * const rx = PM_GETRE(PL_curpm);
13989 for (i = 1; i <= RX_NPARENS(rx); i++) {
13990 char digits[TYPE_CHARS(long)];
13991 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13992 GV *const *const gvp
13993 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13996 GV * const gv = *gvp;
13997 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14007 clear_re(pTHX_ void *r)
14010 ReREFCNT_dec((REGEXP *)r);
14016 S_put_byte(pTHX_ SV *sv, int c)
14018 PERL_ARGS_ASSERT_PUT_BYTE;
14020 /* Our definition of isPRINT() ignores locales, so only bytes that are
14021 not part of UTF-8 are considered printable. I assume that the same
14022 holds for UTF-EBCDIC.
14023 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14024 which Wikipedia says:
14026 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14027 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14028 identical, to the ASCII delete (DEL) or rubout control character.
14029 ) So the old condition can be simplified to !isPRINT(c) */
14032 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14035 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14039 const char string = c;
14040 if (c == '-' || c == ']' || c == '\\' || c == '^')
14041 sv_catpvs(sv, "\\");
14042 sv_catpvn(sv, &string, 1);
14047 #define CLEAR_OPTSTART \
14048 if (optstart) STMT_START { \
14049 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14053 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14055 STATIC const regnode *
14056 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14057 const regnode *last, const regnode *plast,
14058 SV* sv, I32 indent, U32 depth)
14061 register U8 op = PSEUDO; /* Arbitrary non-END op. */
14062 register const regnode *next;
14063 const regnode *optstart= NULL;
14065 RXi_GET_DECL(r,ri);
14066 GET_RE_DEBUG_FLAGS_DECL;
14068 PERL_ARGS_ASSERT_DUMPUNTIL;
14070 #ifdef DEBUG_DUMPUNTIL
14071 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14072 last ? last-start : 0,plast ? plast-start : 0);
14075 if (plast && plast < last)
14078 while (PL_regkind[op] != END && (!last || node < last)) {
14079 /* While that wasn't END last time... */
14082 if (op == CLOSE || op == WHILEM)
14084 next = regnext((regnode *)node);
14087 if (OP(node) == OPTIMIZED) {
14088 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14095 regprop(r, sv, node);
14096 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14097 (int)(2*indent + 1), "", SvPVX_const(sv));
14099 if (OP(node) != OPTIMIZED) {
14100 if (next == NULL) /* Next ptr. */
14101 PerlIO_printf(Perl_debug_log, " (0)");
14102 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14103 PerlIO_printf(Perl_debug_log, " (FAIL)");
14105 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14106 (void)PerlIO_putc(Perl_debug_log, '\n');
14110 if (PL_regkind[(U8)op] == BRANCHJ) {
14113 register const regnode *nnode = (OP(next) == LONGJMP
14114 ? regnext((regnode *)next)
14116 if (last && nnode > last)
14118 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14121 else if (PL_regkind[(U8)op] == BRANCH) {
14123 DUMPUNTIL(NEXTOPER(node), next);
14125 else if ( PL_regkind[(U8)op] == TRIE ) {
14126 const regnode *this_trie = node;
14127 const char op = OP(node);
14128 const U32 n = ARG(node);
14129 const reg_ac_data * const ac = op>=AHOCORASICK ?
14130 (reg_ac_data *)ri->data->data[n] :
14132 const reg_trie_data * const trie =
14133 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14135 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14137 const regnode *nextbranch= NULL;
14140 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14141 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14143 PerlIO_printf(Perl_debug_log, "%*s%s ",
14144 (int)(2*(indent+3)), "",
14145 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14146 PL_colors[0], PL_colors[1],
14147 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14148 PERL_PV_PRETTY_ELLIPSES |
14149 PERL_PV_PRETTY_LTGT
14154 U16 dist= trie->jump[word_idx+1];
14155 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14156 (UV)((dist ? this_trie + dist : next) - start));
14159 nextbranch= this_trie + trie->jump[0];
14160 DUMPUNTIL(this_trie + dist, nextbranch);
14162 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14163 nextbranch= regnext((regnode *)nextbranch);
14165 PerlIO_printf(Perl_debug_log, "\n");
14168 if (last && next > last)
14173 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14174 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14175 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14177 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14179 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14181 else if ( op == PLUS || op == STAR) {
14182 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14184 else if (PL_regkind[(U8)op] == ANYOF) {
14185 /* arglen 1 + class block */
14186 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14187 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14188 node = NEXTOPER(node);
14190 else if (PL_regkind[(U8)op] == EXACT) {
14191 /* Literal string, where present. */
14192 node += NODE_SZ_STR(node) - 1;
14193 node = NEXTOPER(node);
14196 node = NEXTOPER(node);
14197 node += regarglen[(U8)op];
14199 if (op == CURLYX || op == OPEN)
14203 #ifdef DEBUG_DUMPUNTIL
14204 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14209 #endif /* DEBUGGING */
14213 * c-indentation-style: bsd
14214 * c-basic-offset: 4
14215 * indent-tabs-mode: nil
14218 * ex: set ts=8 sts=4 sw=4 et: