5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
146 char *starttry; /* -Dr: where regtry was called. */
147 #define RExC_starttry (pRExC_state->starttry)
150 const char *lastparse;
152 AV *paren_name_list; /* idx -> name */
153 #define RExC_lastparse (pRExC_state->lastparse)
154 #define RExC_lastnum (pRExC_state->lastnum)
155 #define RExC_paren_name_list (pRExC_state->paren_name_list)
159 #define RExC_flags (pRExC_state->flags)
160 #define RExC_precomp (pRExC_state->precomp)
161 #define RExC_rx_sv (pRExC_state->rx_sv)
162 #define RExC_rx (pRExC_state->rx)
163 #define RExC_rxi (pRExC_state->rxi)
164 #define RExC_start (pRExC_state->start)
165 #define RExC_end (pRExC_state->end)
166 #define RExC_parse (pRExC_state->parse)
167 #define RExC_whilem_seen (pRExC_state->whilem_seen)
168 #ifdef RE_TRACK_PATTERN_OFFSETS
169 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
171 #define RExC_emit (pRExC_state->emit)
172 #define RExC_emit_start (pRExC_state->emit_start)
173 #define RExC_emit_bound (pRExC_state->emit_bound)
174 #define RExC_naughty (pRExC_state->naughty)
175 #define RExC_sawback (pRExC_state->sawback)
176 #define RExC_seen (pRExC_state->seen)
177 #define RExC_size (pRExC_state->size)
178 #define RExC_npar (pRExC_state->npar)
179 #define RExC_nestroot (pRExC_state->nestroot)
180 #define RExC_extralen (pRExC_state->extralen)
181 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182 #define RExC_seen_evals (pRExC_state->seen_evals)
183 #define RExC_utf8 (pRExC_state->utf8)
184 #define RExC_uni_semantics (pRExC_state->uni_semantics)
185 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
186 #define RExC_open_parens (pRExC_state->open_parens)
187 #define RExC_close_parens (pRExC_state->close_parens)
188 #define RExC_opend (pRExC_state->opend)
189 #define RExC_paren_names (pRExC_state->paren_names)
190 #define RExC_recurse (pRExC_state->recurse)
191 #define RExC_recurse_count (pRExC_state->recurse_count)
192 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
195 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
200 #undef SPSTART /* dratted cpp namespace... */
203 * Flags to be passed up and down.
205 #define WORST 0 /* Worst case. */
206 #define HASWIDTH 0x01 /* Known to match non-null strings. */
208 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
211 #define SPSTART 0x04 /* Starts with * or +. */
212 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
213 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
215 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
217 /* whether trie related optimizations are enabled */
218 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219 #define TRIE_STUDY_OPT
220 #define FULL_TRIE_STUDY
226 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227 #define PBITVAL(paren) (1 << ((paren) & 7))
228 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
232 /* If not already in utf8, do a longjmp back to the beginning */
233 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234 #define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
238 /* About scan_data_t.
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
244 string that must appear at a fixed location, and we look for the
245 longest string that may appear at a floating location. So for instance
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
254 The strings can be composites, for instance
258 will result in a composite fixed substring 'foo'.
260 For each string some basic information is maintained:
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
267 tells us how many characters must appear after the string we have
271 Only used for floating strings. This is the rightmost point that
272 the string can appear at. If set to I32 max it indicates that the
273 string can occur infinitely far to the right.
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
289 pattern at the time it was committed with a scan_commit. Note that
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
313 typedef struct scan_data_t {
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
319 I32 last_end; /* min value, <0 unless valid. */
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
331 I32 lookbehind_float; /* is the position of the string modified by LB */
335 struct regnode_charclass_class *start_class;
339 * Forward declarations for pregcomp()'s friends.
342 static const scan_data_t zero_scan_data =
343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
345 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
346 #define SF_BEFORE_SEOL 0x0001
347 #define SF_BEFORE_MEOL 0x0002
348 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
352 # define SF_FIX_SHIFT_EOL (0+2)
353 # define SF_FL_SHIFT_EOL (0+4)
355 # define SF_FIX_SHIFT_EOL (+2)
356 # define SF_FL_SHIFT_EOL (+4)
359 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
362 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
364 #define SF_IS_INF 0x0040
365 #define SF_HAS_PAR 0x0080
366 #define SF_IN_PAR 0x0100
367 #define SF_HAS_EVAL 0x0200
368 #define SCF_DO_SUBSTR 0x0400
369 #define SCF_DO_STCLASS_AND 0x0800
370 #define SCF_DO_STCLASS_OR 0x1000
371 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
372 #define SCF_WHILEM_VISITED_POS 0x2000
374 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
375 #define SCF_SEEN_ACCEPT 0x8000
377 #define UTF cBOOL(RExC_utf8)
378 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
380 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
384 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
386 #define OOB_UNICODE 12345678
387 #define OOB_NAMEDCLASS -1
389 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
390 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
393 /* length of regex to show in messages that don't mark a position within */
394 #define RegexLengthToShowInErrorMessages 127
397 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
398 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
399 * op/pragma/warn/regcomp.
401 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
402 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
404 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
407 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
408 * arg. Show regex, up to a maximum length. If it's too long, chop and add
411 #define _FAIL(code) STMT_START { \
412 const char *ellipses = ""; \
413 IV len = RExC_end - RExC_precomp; \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
417 if (len > RegexLengthToShowInErrorMessages) { \
418 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
419 len = RegexLengthToShowInErrorMessages - 10; \
425 #define FAIL(msg) _FAIL( \
426 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
427 msg, (int)len, RExC_precomp, ellipses))
429 #define FAIL2(msg,arg) _FAIL( \
430 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
431 arg, (int)len, RExC_precomp, ellipses))
434 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
436 #define Simple_vFAIL(m) STMT_START { \
437 const IV offset = RExC_parse - RExC_precomp; \
438 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
439 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
443 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
445 #define vFAIL(m) STMT_START { \
447 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
452 * Like Simple_vFAIL(), but accepts two arguments.
454 #define Simple_vFAIL2(m,a1) STMT_START { \
455 const IV offset = RExC_parse - RExC_precomp; \
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
463 #define vFAIL2(m,a1) STMT_START { \
465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
466 Simple_vFAIL2(m, a1); \
471 * Like Simple_vFAIL(), but accepts three arguments.
473 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
474 const IV offset = RExC_parse - RExC_precomp; \
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
480 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
482 #define vFAIL3(m,a1,a2) STMT_START { \
484 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
485 Simple_vFAIL3(m, a1, a2); \
489 * Like Simple_vFAIL(), but accepts four arguments.
491 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
492 const IV offset = RExC_parse - RExC_precomp; \
493 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
494 (int)offset, RExC_precomp, RExC_precomp + offset); \
497 #define ckWARNreg(loc,m) STMT_START { \
498 const IV offset = loc - RExC_precomp; \
499 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
500 (int)offset, RExC_precomp, RExC_precomp + offset); \
503 #define ckWARNregdep(loc,m) STMT_START { \
504 const IV offset = loc - RExC_precomp; \
505 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
507 (int)offset, RExC_precomp, RExC_precomp + offset); \
510 #define ckWARN2reg(loc, m, a1) STMT_START { \
511 const IV offset = loc - RExC_precomp; \
512 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
513 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
516 #define vWARN3(loc, m, a1, a2) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
522 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
525 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
529 const IV offset = loc - RExC_precomp; \
530 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
534 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
540 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
541 const IV offset = loc - RExC_precomp; \
542 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
543 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
547 /* Allow for side effects in s */
548 #define REGC(c,s) STMT_START { \
549 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
552 /* Macros for recording node offsets. 20001227 mjd@plover.com
553 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
554 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
555 * Element 0 holds the number n.
556 * Position is 1 indexed.
558 #ifndef RE_TRACK_PATTERN_OFFSETS
559 #define Set_Node_Offset_To_R(node,byte)
560 #define Set_Node_Offset(node,byte)
561 #define Set_Cur_Node_Offset
562 #define Set_Node_Length_To_R(node,len)
563 #define Set_Node_Length(node,len)
564 #define Set_Node_Cur_Length(node)
565 #define Node_Offset(n)
566 #define Node_Length(n)
567 #define Set_Node_Offset_Length(node,offset,len)
568 #define ProgLen(ri) ri->u.proglen
569 #define SetProgLen(ri,x) ri->u.proglen = x
571 #define ProgLen(ri) ri->u.offsets[0]
572 #define SetProgLen(ri,x) ri->u.offsets[0] = x
573 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
575 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
576 __LINE__, (int)(node), (int)(byte))); \
578 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
580 RExC_offsets[2*(node)-1] = (byte); \
585 #define Set_Node_Offset(node,byte) \
586 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
587 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
589 #define Set_Node_Length_To_R(node,len) STMT_START { \
591 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
592 __LINE__, (int)(node), (int)(len))); \
594 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
596 RExC_offsets[2*(node)] = (len); \
601 #define Set_Node_Length(node,len) \
602 Set_Node_Length_To_R((node)-RExC_emit_start, len)
603 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
604 #define Set_Node_Cur_Length(node) \
605 Set_Node_Length(node, RExC_parse - parse_start)
607 /* Get offsets and lengths */
608 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
609 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
611 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
612 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
613 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
617 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
618 #define EXPERIMENTAL_INPLACESCAN
619 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
621 #define DEBUG_STUDYDATA(str,data,depth) \
622 DEBUG_OPTIMISE_MORE_r(if(data){ \
623 PerlIO_printf(Perl_debug_log, \
624 "%*s" str "Pos:%"IVdf"/%"IVdf \
625 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
626 (int)(depth)*2, "", \
627 (IV)((data)->pos_min), \
628 (IV)((data)->pos_delta), \
629 (UV)((data)->flags), \
630 (IV)((data)->whilem_c), \
631 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
632 is_inf ? "INF " : "" \
634 if ((data)->last_found) \
635 PerlIO_printf(Perl_debug_log, \
636 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
637 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
638 SvPVX_const((data)->last_found), \
639 (IV)((data)->last_end), \
640 (IV)((data)->last_start_min), \
641 (IV)((data)->last_start_max), \
642 ((data)->longest && \
643 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
644 SvPVX_const((data)->longest_fixed), \
645 (IV)((data)->offset_fixed), \
646 ((data)->longest && \
647 (data)->longest==&((data)->longest_float)) ? "*" : "", \
648 SvPVX_const((data)->longest_float), \
649 (IV)((data)->offset_float_min), \
650 (IV)((data)->offset_float_max) \
652 PerlIO_printf(Perl_debug_log,"\n"); \
655 static void clear_re(pTHX_ void *r);
657 /* Mark that we cannot extend a found fixed substring at this point.
658 Update the longest found anchored substring and the longest found
659 floating substrings if needed. */
662 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
664 const STRLEN l = CHR_SVLEN(data->last_found);
665 const STRLEN old_l = CHR_SVLEN(*data->longest);
666 GET_RE_DEBUG_FLAGS_DECL;
668 PERL_ARGS_ASSERT_SCAN_COMMIT;
670 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
671 SvSetMagicSV(*data->longest, data->last_found);
672 if (*data->longest == data->longest_fixed) {
673 data->offset_fixed = l ? data->last_start_min : data->pos_min;
674 if (data->flags & SF_BEFORE_EOL)
676 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
678 data->flags &= ~SF_FIX_BEFORE_EOL;
679 data->minlen_fixed=minlenp;
680 data->lookbehind_fixed=0;
682 else { /* *data->longest == data->longest_float */
683 data->offset_float_min = l ? data->last_start_min : data->pos_min;
684 data->offset_float_max = (l
685 ? data->last_start_max
686 : data->pos_min + data->pos_delta);
687 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
688 data->offset_float_max = I32_MAX;
689 if (data->flags & SF_BEFORE_EOL)
691 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
693 data->flags &= ~SF_FL_BEFORE_EOL;
694 data->minlen_float=minlenp;
695 data->lookbehind_float=0;
698 SvCUR_set(data->last_found, 0);
700 SV * const sv = data->last_found;
701 if (SvUTF8(sv) && SvMAGICAL(sv)) {
702 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
708 data->flags &= ~SF_BEFORE_EOL;
709 DEBUG_STUDYDATA("commit: ",data,0);
712 /* Can match anything (initialization) */
714 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
716 PERL_ARGS_ASSERT_CL_ANYTHING;
718 ANYOF_CLASS_ZERO(cl);
719 ANYOF_BITMAP_SETALL(cl);
720 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
722 cl->flags |= ANYOF_LOCALE;
725 /* Can match anything (initialization) */
727 S_cl_is_anything(const struct regnode_charclass_class *cl)
731 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
733 for (value = 0; value <= ANYOF_MAX; value += 2)
734 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
736 if (!(cl->flags & ANYOF_UNICODE_ALL))
738 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
743 /* Can match anything (initialization) */
745 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
747 PERL_ARGS_ASSERT_CL_INIT;
749 Zero(cl, 1, struct regnode_charclass_class);
751 cl_anything(pRExC_state, cl);
755 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
757 PERL_ARGS_ASSERT_CL_INIT_ZERO;
759 Zero(cl, 1, struct regnode_charclass_class);
761 cl_anything(pRExC_state, cl);
763 cl->flags |= ANYOF_LOCALE;
766 /* 'And' a given class with another one. Can create false positives */
767 /* We assume that cl is not inverted */
769 S_cl_and(struct regnode_charclass_class *cl,
770 const struct regnode_charclass_class *and_with)
772 PERL_ARGS_ASSERT_CL_AND;
774 assert(and_with->type == ANYOF);
776 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
777 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
778 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
779 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
780 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
783 if (and_with->flags & ANYOF_INVERT)
784 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
785 cl->bitmap[i] &= ~and_with->bitmap[i];
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] &= and_with->bitmap[i];
789 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
790 if (!(and_with->flags & ANYOF_EOS))
791 cl->flags &= ~ANYOF_EOS;
793 if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
794 cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
795 if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
796 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
798 if (cl->flags & ANYOF_UNICODE_ALL
799 && and_with->flags & ANYOF_NONBITMAP
800 && !(and_with->flags & ANYOF_INVERT))
802 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
803 cl->flags &= ~ANYOF_UNICODE_ALL;
805 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
808 ARG_SET(cl, ARG(and_with));
810 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
811 !(and_with->flags & ANYOF_INVERT))
812 cl->flags &= ~ANYOF_UNICODE_ALL;
813 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
814 !(and_with->flags & ANYOF_INVERT))
815 cl->flags &= ~ANYOF_NONBITMAP;
818 /* 'OR' a given class with another one. Can create false positives */
819 /* We assume that cl is not inverted */
821 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
823 PERL_ARGS_ASSERT_CL_OR;
825 if (or_with->flags & ANYOF_INVERT) {
827 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
828 * <= (B1 | !B2) | (CL1 | !CL2)
829 * which is wasteful if CL2 is small, but we ignore CL2:
830 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
831 * XXXX Can we handle case-fold? Unclear:
832 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
833 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
835 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
836 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
837 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
840 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
841 cl->bitmap[i] |= ~or_with->bitmap[i];
842 } /* XXXX: logic is complicated otherwise */
844 cl_anything(pRExC_state, cl);
847 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
848 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
849 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
850 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
853 /* OR char bitmap and class bitmap separately */
854 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
855 cl->bitmap[i] |= or_with->bitmap[i];
856 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
857 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
858 cl->classflags[i] |= or_with->classflags[i];
859 cl->flags |= ANYOF_CLASS;
862 else { /* XXXX: logic is complicated, leave it along for a moment. */
863 cl_anything(pRExC_state, cl);
866 if (or_with->flags & ANYOF_EOS)
867 cl->flags |= ANYOF_EOS;
868 if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
869 cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
871 if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
872 cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
874 /* If both nodes match something outside the bitmap, but what they match
875 * outside is not the same pointer, and hence not easily compared, give up
876 * and allow the start class to match everything outside the bitmap */
877 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
878 ARG(cl) != ARG(or_with)) {
879 cl->flags |= ANYOF_UNICODE_ALL;
882 if (or_with->flags & ANYOF_UNICODE_ALL) {
883 cl->flags |= ANYOF_UNICODE_ALL;
887 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
888 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
889 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
890 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
895 dump_trie(trie,widecharmap,revcharmap)
896 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
897 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
899 These routines dump out a trie in a somewhat readable format.
900 The _interim_ variants are used for debugging the interim
901 tables that are used to generate the final compressed
902 representation which is what dump_trie expects.
904 Part of the reason for their existence is to provide a form
905 of documentation as to how the different representations function.
910 Dumps the final compressed table form of the trie to Perl_debug_log.
911 Used for debugging make_trie().
915 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
916 AV *revcharmap, U32 depth)
919 SV *sv=sv_newmortal();
920 int colwidth= widecharmap ? 6 : 4;
922 GET_RE_DEBUG_FLAGS_DECL;
924 PERL_ARGS_ASSERT_DUMP_TRIE;
926 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
927 (int)depth * 2 + 2,"",
928 "Match","Base","Ofs" );
930 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
931 SV ** const tmp = av_fetch( revcharmap, state, 0);
933 PerlIO_printf( Perl_debug_log, "%*s",
935 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
936 PL_colors[0], PL_colors[1],
937 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
938 PERL_PV_ESCAPE_FIRSTCHAR
943 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
944 (int)depth * 2 + 2,"");
946 for( state = 0 ; state < trie->uniquecharcount ; state++ )
947 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
948 PerlIO_printf( Perl_debug_log, "\n");
950 for( state = 1 ; state < trie->statecount ; state++ ) {
951 const U32 base = trie->states[ state ].trans.base;
953 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
955 if ( trie->states[ state ].wordnum ) {
956 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
958 PerlIO_printf( Perl_debug_log, "%6s", "" );
961 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
966 while( ( base + ofs < trie->uniquecharcount ) ||
967 ( base + ofs - trie->uniquecharcount < trie->lasttrans
968 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
971 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
973 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
974 if ( ( base + ofs >= trie->uniquecharcount ) &&
975 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
976 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
978 PerlIO_printf( Perl_debug_log, "%*"UVXf,
980 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
982 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
986 PerlIO_printf( Perl_debug_log, "]");
989 PerlIO_printf( Perl_debug_log, "\n" );
991 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
992 for (word=1; word <= trie->wordcount; word++) {
993 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
994 (int)word, (int)(trie->wordinfo[word].prev),
995 (int)(trie->wordinfo[word].len));
997 PerlIO_printf(Perl_debug_log, "\n" );
1000 Dumps a fully constructed but uncompressed trie in list form.
1001 List tries normally only are used for construction when the number of
1002 possible chars (trie->uniquecharcount) is very high.
1003 Used for debugging make_trie().
1006 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1007 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1011 SV *sv=sv_newmortal();
1012 int colwidth= widecharmap ? 6 : 4;
1013 GET_RE_DEBUG_FLAGS_DECL;
1015 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1017 /* print out the table precompression. */
1018 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1019 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1020 "------:-----+-----------------\n" );
1022 for( state=1 ; state < next_alloc ; state ++ ) {
1025 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1026 (int)depth * 2 + 2,"", (UV)state );
1027 if ( ! trie->states[ state ].wordnum ) {
1028 PerlIO_printf( Perl_debug_log, "%5s| ","");
1030 PerlIO_printf( Perl_debug_log, "W%4x| ",
1031 trie->states[ state ].wordnum
1034 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1035 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1037 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1044 TRIE_LIST_ITEM(state,charid).forid,
1045 (UV)TRIE_LIST_ITEM(state,charid).newstate
1048 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1049 (int)((depth * 2) + 14), "");
1052 PerlIO_printf( Perl_debug_log, "\n");
1057 Dumps a fully constructed but uncompressed trie in table form.
1058 This is the normal DFA style state transition table, with a few
1059 twists to facilitate compression later.
1060 Used for debugging make_trie().
1063 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1064 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1069 SV *sv=sv_newmortal();
1070 int colwidth= widecharmap ? 6 : 4;
1071 GET_RE_DEBUG_FLAGS_DECL;
1073 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1076 print out the table precompression so that we can do a visual check
1077 that they are identical.
1080 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1082 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1083 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1085 PerlIO_printf( Perl_debug_log, "%*s",
1087 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1088 PL_colors[0], PL_colors[1],
1089 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1090 PERL_PV_ESCAPE_FIRSTCHAR
1096 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1098 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1099 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1102 PerlIO_printf( Perl_debug_log, "\n" );
1104 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1106 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1107 (int)depth * 2 + 2,"",
1108 (UV)TRIE_NODENUM( state ) );
1110 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1111 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1113 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1115 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1117 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1118 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1120 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1121 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1129 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1130 startbranch: the first branch in the whole branch sequence
1131 first : start branch of sequence of branch-exact nodes.
1132 May be the same as startbranch
1133 last : Thing following the last branch.
1134 May be the same as tail.
1135 tail : item following the branch sequence
1136 count : words in the sequence
1137 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1138 depth : indent depth
1140 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1142 A trie is an N'ary tree where the branches are determined by digital
1143 decomposition of the key. IE, at the root node you look up the 1st character and
1144 follow that branch repeat until you find the end of the branches. Nodes can be
1145 marked as "accepting" meaning they represent a complete word. Eg:
1149 would convert into the following structure. Numbers represent states, letters
1150 following numbers represent valid transitions on the letter from that state, if
1151 the number is in square brackets it represents an accepting state, otherwise it
1152 will be in parenthesis.
1154 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1158 (1) +-i->(6)-+-s->[7]
1160 +-s->(3)-+-h->(4)-+-e->[5]
1162 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1164 This shows that when matching against the string 'hers' we will begin at state 1
1165 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1166 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1167 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1168 single traverse. We store a mapping from accepting to state to which word was
1169 matched, and then when we have multiple possibilities we try to complete the
1170 rest of the regex in the order in which they occured in the alternation.
1172 The only prior NFA like behaviour that would be changed by the TRIE support is
1173 the silent ignoring of duplicate alternations which are of the form:
1175 / (DUPE|DUPE) X? (?{ ... }) Y /x
1177 Thus EVAL blocks following a trie may be called a different number of times with
1178 and without the optimisation. With the optimisations dupes will be silently
1179 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1180 the following demonstrates:
1182 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1184 which prints out 'word' three times, but
1186 'words'=~/(word|word|word)(?{ print $1 })S/
1188 which doesnt print it out at all. This is due to other optimisations kicking in.
1190 Example of what happens on a structural level:
1192 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1194 1: CURLYM[1] {1,32767}(18)
1205 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1206 and should turn into:
1208 1: CURLYM[1] {1,32767}(18)
1210 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1218 Cases where tail != last would be like /(?foo|bar)baz/:
1228 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1229 and would end up looking like:
1232 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1239 d = uvuni_to_utf8_flags(d, uv, 0);
1241 is the recommended Unicode-aware way of saying
1246 #define TRIE_STORE_REVCHAR \
1249 SV *zlopp = newSV(2); \
1250 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1251 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1252 SvCUR_set(zlopp, kapow - flrbbbbb); \
1255 av_push(revcharmap, zlopp); \
1257 char ooooff = (char)uvc; \
1258 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1262 #define TRIE_READ_CHAR STMT_START { \
1266 if ( foldlen > 0 ) { \
1267 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1272 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1273 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1274 foldlen -= UNISKIP( uvc ); \
1275 scan = foldbuf + UNISKIP( uvc ); \
1278 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1288 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1289 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1290 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1291 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1293 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1294 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1295 TRIE_LIST_CUR( state )++; \
1298 #define TRIE_LIST_NEW(state) STMT_START { \
1299 Newxz( trie->states[ state ].trans.list, \
1300 4, reg_trie_trans_le ); \
1301 TRIE_LIST_CUR( state ) = 1; \
1302 TRIE_LIST_LEN( state ) = 4; \
1305 #define TRIE_HANDLE_WORD(state) STMT_START { \
1306 U16 dupe= trie->states[ state ].wordnum; \
1307 regnode * const noper_next = regnext( noper ); \
1310 /* store the word for dumping */ \
1312 if (OP(noper) != NOTHING) \
1313 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1315 tmp = newSVpvn_utf8( "", 0, UTF ); \
1316 av_push( trie_words, tmp ); \
1320 trie->wordinfo[curword].prev = 0; \
1321 trie->wordinfo[curword].len = wordlen; \
1322 trie->wordinfo[curword].accept = state; \
1324 if ( noper_next < tail ) { \
1326 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1327 trie->jump[curword] = (U16)(noper_next - convert); \
1329 jumper = noper_next; \
1331 nextbranch= regnext(cur); \
1335 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1336 /* chain, so that when the bits of chain are later */\
1337 /* linked together, the dups appear in the chain */\
1338 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1339 trie->wordinfo[dupe].prev = curword; \
1341 /* we haven't inserted this word yet. */ \
1342 trie->states[ state ].wordnum = curword; \
1347 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1348 ( ( base + charid >= ucharcount \
1349 && base + charid < ubound \
1350 && state == trie->trans[ base - ucharcount + charid ].check \
1351 && trie->trans[ base - ucharcount + charid ].next ) \
1352 ? trie->trans[ base - ucharcount + charid ].next \
1353 : ( state==1 ? special : 0 ) \
1357 #define MADE_JUMP_TRIE 2
1358 #define MADE_EXACT_TRIE 4
1361 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1364 /* first pass, loop through and scan words */
1365 reg_trie_data *trie;
1366 HV *widecharmap = NULL;
1367 AV *revcharmap = newAV();
1369 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1374 regnode *jumper = NULL;
1375 regnode *nextbranch = NULL;
1376 regnode *convert = NULL;
1377 U32 *prev_states; /* temp array mapping each state to previous one */
1378 /* we just use folder as a flag in utf8 */
1379 const U8 * folder = NULL;
1382 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1383 AV *trie_words = NULL;
1384 /* along with revcharmap, this only used during construction but both are
1385 * useful during debugging so we store them in the struct when debugging.
1388 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1389 STRLEN trie_charcount=0;
1391 SV *re_trie_maxbuff;
1392 GET_RE_DEBUG_FLAGS_DECL;
1394 PERL_ARGS_ASSERT_MAKE_TRIE;
1396 PERL_UNUSED_ARG(depth);
1400 case EXACTFU: folder = PL_fold_latin1; break;
1401 case EXACTF: folder = PL_fold; break;
1402 case EXACTFL: folder = PL_fold_locale; break;
1405 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1407 trie->startstate = 1;
1408 trie->wordcount = word_count;
1409 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1410 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1411 if (!(UTF && folder))
1412 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1413 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1414 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1417 trie_words = newAV();
1420 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1421 if (!SvIOK(re_trie_maxbuff)) {
1422 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1425 PerlIO_printf( Perl_debug_log,
1426 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1427 (int)depth * 2 + 2, "",
1428 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1429 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1433 /* Find the node we are going to overwrite */
1434 if ( first == startbranch && OP( last ) != BRANCH ) {
1435 /* whole branch chain */
1438 /* branch sub-chain */
1439 convert = NEXTOPER( first );
1442 /* -- First loop and Setup --
1444 We first traverse the branches and scan each word to determine if it
1445 contains widechars, and how many unique chars there are, this is
1446 important as we have to build a table with at least as many columns as we
1449 We use an array of integers to represent the character codes 0..255
1450 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1451 native representation of the character value as the key and IV's for the
1454 *TODO* If we keep track of how many times each character is used we can
1455 remap the columns so that the table compression later on is more
1456 efficient in terms of memory by ensuring the most common value is in the
1457 middle and the least common are on the outside. IMO this would be better
1458 than a most to least common mapping as theres a decent chance the most
1459 common letter will share a node with the least common, meaning the node
1460 will not be compressible. With a middle is most common approach the worst
1461 case is when we have the least common nodes twice.
1465 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1466 regnode * const noper = NEXTOPER( cur );
1467 const U8 *uc = (U8*)STRING( noper );
1468 const U8 * const e = uc + STR_LEN( noper );
1470 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1471 const U8 *scan = (U8*)NULL;
1472 U32 wordlen = 0; /* required init */
1474 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1476 if (OP(noper) == NOTHING) {
1480 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1481 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1482 regardless of encoding */
1484 for ( ; uc < e ; uc += len ) {
1485 TRIE_CHARCOUNT(trie)++;
1489 if ( !trie->charmap[ uvc ] ) {
1490 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1492 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1496 /* store the codepoint in the bitmap, and its folded
1498 TRIE_BITMAP_SET(trie,uvc);
1500 /* store the folded codepoint */
1501 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1504 /* store first byte of utf8 representation of
1505 variant codepoints */
1506 if (! UNI_IS_INVARIANT(uvc)) {
1507 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1510 set_bit = 0; /* We've done our bit :-) */
1515 widecharmap = newHV();
1517 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1520 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1522 if ( !SvTRUE( *svpp ) ) {
1523 sv_setiv( *svpp, ++trie->uniquecharcount );
1528 if( cur == first ) {
1531 } else if (chars < trie->minlen) {
1533 } else if (chars > trie->maxlen) {
1537 } /* end first pass */
1538 DEBUG_TRIE_COMPILE_r(
1539 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1540 (int)depth * 2 + 2,"",
1541 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1542 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1543 (int)trie->minlen, (int)trie->maxlen )
1547 We now know what we are dealing with in terms of unique chars and
1548 string sizes so we can calculate how much memory a naive
1549 representation using a flat table will take. If it's over a reasonable
1550 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1551 conservative but potentially much slower representation using an array
1554 At the end we convert both representations into the same compressed
1555 form that will be used in regexec.c for matching with. The latter
1556 is a form that cannot be used to construct with but has memory
1557 properties similar to the list form and access properties similar
1558 to the table form making it both suitable for fast searches and
1559 small enough that its feasable to store for the duration of a program.
1561 See the comment in the code where the compressed table is produced
1562 inplace from the flat tabe representation for an explanation of how
1563 the compression works.
1568 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1571 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1573 Second Pass -- Array Of Lists Representation
1575 Each state will be represented by a list of charid:state records
1576 (reg_trie_trans_le) the first such element holds the CUR and LEN
1577 points of the allocated array. (See defines above).
1579 We build the initial structure using the lists, and then convert
1580 it into the compressed table form which allows faster lookups
1581 (but cant be modified once converted).
1584 STRLEN transcount = 1;
1586 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1587 "%*sCompiling trie using list compiler\n",
1588 (int)depth * 2 + 2, ""));
1590 trie->states = (reg_trie_state *)
1591 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1592 sizeof(reg_trie_state) );
1596 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1598 regnode * const noper = NEXTOPER( cur );
1599 U8 *uc = (U8*)STRING( noper );
1600 const U8 * const e = uc + STR_LEN( noper );
1601 U32 state = 1; /* required init */
1602 U16 charid = 0; /* sanity init */
1603 U8 *scan = (U8*)NULL; /* sanity init */
1604 STRLEN foldlen = 0; /* required init */
1605 U32 wordlen = 0; /* required init */
1606 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1608 if (OP(noper) != NOTHING) {
1609 for ( ; uc < e ; uc += len ) {
1614 charid = trie->charmap[ uvc ];
1616 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1620 charid=(U16)SvIV( *svpp );
1623 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1630 if ( !trie->states[ state ].trans.list ) {
1631 TRIE_LIST_NEW( state );
1633 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1634 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1635 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1640 newstate = next_alloc++;
1641 prev_states[newstate] = state;
1642 TRIE_LIST_PUSH( state, charid, newstate );
1647 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1651 TRIE_HANDLE_WORD(state);
1653 } /* end second pass */
1655 /* next alloc is the NEXT state to be allocated */
1656 trie->statecount = next_alloc;
1657 trie->states = (reg_trie_state *)
1658 PerlMemShared_realloc( trie->states,
1660 * sizeof(reg_trie_state) );
1662 /* and now dump it out before we compress it */
1663 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1664 revcharmap, next_alloc,
1668 trie->trans = (reg_trie_trans *)
1669 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1676 for( state=1 ; state < next_alloc ; state ++ ) {
1680 DEBUG_TRIE_COMPILE_MORE_r(
1681 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1685 if (trie->states[state].trans.list) {
1686 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1690 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1691 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1692 if ( forid < minid ) {
1694 } else if ( forid > maxid ) {
1698 if ( transcount < tp + maxid - minid + 1) {
1700 trie->trans = (reg_trie_trans *)
1701 PerlMemShared_realloc( trie->trans,
1703 * sizeof(reg_trie_trans) );
1704 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1706 base = trie->uniquecharcount + tp - minid;
1707 if ( maxid == minid ) {
1709 for ( ; zp < tp ; zp++ ) {
1710 if ( ! trie->trans[ zp ].next ) {
1711 base = trie->uniquecharcount + zp - minid;
1712 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1713 trie->trans[ zp ].check = state;
1719 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1720 trie->trans[ tp ].check = state;
1725 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1726 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1727 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1728 trie->trans[ tid ].check = state;
1730 tp += ( maxid - minid + 1 );
1732 Safefree(trie->states[ state ].trans.list);
1735 DEBUG_TRIE_COMPILE_MORE_r(
1736 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1739 trie->states[ state ].trans.base=base;
1741 trie->lasttrans = tp + 1;
1745 Second Pass -- Flat Table Representation.
1747 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1748 We know that we will need Charcount+1 trans at most to store the data
1749 (one row per char at worst case) So we preallocate both structures
1750 assuming worst case.
1752 We then construct the trie using only the .next slots of the entry
1755 We use the .check field of the first entry of the node temporarily to
1756 make compression both faster and easier by keeping track of how many non
1757 zero fields are in the node.
1759 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1762 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1763 number representing the first entry of the node, and state as a
1764 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1765 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1766 are 2 entrys per node. eg:
1774 The table is internally in the right hand, idx form. However as we also
1775 have to deal with the states array which is indexed by nodenum we have to
1776 use TRIE_NODENUM() to convert.
1779 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1780 "%*sCompiling trie using table compiler\n",
1781 (int)depth * 2 + 2, ""));
1783 trie->trans = (reg_trie_trans *)
1784 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1785 * trie->uniquecharcount + 1,
1786 sizeof(reg_trie_trans) );
1787 trie->states = (reg_trie_state *)
1788 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1789 sizeof(reg_trie_state) );
1790 next_alloc = trie->uniquecharcount + 1;
1793 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1795 regnode * const noper = NEXTOPER( cur );
1796 const U8 *uc = (U8*)STRING( noper );
1797 const U8 * const e = uc + STR_LEN( noper );
1799 U32 state = 1; /* required init */
1801 U16 charid = 0; /* sanity init */
1802 U32 accept_state = 0; /* sanity init */
1803 U8 *scan = (U8*)NULL; /* sanity init */
1805 STRLEN foldlen = 0; /* required init */
1806 U32 wordlen = 0; /* required init */
1807 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1809 if ( OP(noper) != NOTHING ) {
1810 for ( ; uc < e ; uc += len ) {
1815 charid = trie->charmap[ uvc ];
1817 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1818 charid = svpp ? (U16)SvIV(*svpp) : 0;
1822 if ( !trie->trans[ state + charid ].next ) {
1823 trie->trans[ state + charid ].next = next_alloc;
1824 trie->trans[ state ].check++;
1825 prev_states[TRIE_NODENUM(next_alloc)]
1826 = TRIE_NODENUM(state);
1827 next_alloc += trie->uniquecharcount;
1829 state = trie->trans[ state + charid ].next;
1831 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1833 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1836 accept_state = TRIE_NODENUM( state );
1837 TRIE_HANDLE_WORD(accept_state);
1839 } /* end second pass */
1841 /* and now dump it out before we compress it */
1842 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1844 next_alloc, depth+1));
1848 * Inplace compress the table.*
1850 For sparse data sets the table constructed by the trie algorithm will
1851 be mostly 0/FAIL transitions or to put it another way mostly empty.
1852 (Note that leaf nodes will not contain any transitions.)
1854 This algorithm compresses the tables by eliminating most such
1855 transitions, at the cost of a modest bit of extra work during lookup:
1857 - Each states[] entry contains a .base field which indicates the
1858 index in the state[] array wheres its transition data is stored.
1860 - If .base is 0 there are no valid transitions from that node.
1862 - If .base is nonzero then charid is added to it to find an entry in
1865 -If trans[states[state].base+charid].check!=state then the
1866 transition is taken to be a 0/Fail transition. Thus if there are fail
1867 transitions at the front of the node then the .base offset will point
1868 somewhere inside the previous nodes data (or maybe even into a node
1869 even earlier), but the .check field determines if the transition is
1873 The following process inplace converts the table to the compressed
1874 table: We first do not compress the root node 1,and mark all its
1875 .check pointers as 1 and set its .base pointer as 1 as well. This
1876 allows us to do a DFA construction from the compressed table later,
1877 and ensures that any .base pointers we calculate later are greater
1880 - We set 'pos' to indicate the first entry of the second node.
1882 - We then iterate over the columns of the node, finding the first and
1883 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1884 and set the .check pointers accordingly, and advance pos
1885 appropriately and repreat for the next node. Note that when we copy
1886 the next pointers we have to convert them from the original
1887 NODEIDX form to NODENUM form as the former is not valid post
1890 - If a node has no transitions used we mark its base as 0 and do not
1891 advance the pos pointer.
1893 - If a node only has one transition we use a second pointer into the
1894 structure to fill in allocated fail transitions from other states.
1895 This pointer is independent of the main pointer and scans forward
1896 looking for null transitions that are allocated to a state. When it
1897 finds one it writes the single transition into the "hole". If the
1898 pointer doesnt find one the single transition is appended as normal.
1900 - Once compressed we can Renew/realloc the structures to release the
1903 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1904 specifically Fig 3.47 and the associated pseudocode.
1908 const U32 laststate = TRIE_NODENUM( next_alloc );
1911 trie->statecount = laststate;
1913 for ( state = 1 ; state < laststate ; state++ ) {
1915 const U32 stateidx = TRIE_NODEIDX( state );
1916 const U32 o_used = trie->trans[ stateidx ].check;
1917 U32 used = trie->trans[ stateidx ].check;
1918 trie->trans[ stateidx ].check = 0;
1920 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1921 if ( flag || trie->trans[ stateidx + charid ].next ) {
1922 if ( trie->trans[ stateidx + charid ].next ) {
1924 for ( ; zp < pos ; zp++ ) {
1925 if ( ! trie->trans[ zp ].next ) {
1929 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1930 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1931 trie->trans[ zp ].check = state;
1932 if ( ++zp > pos ) pos = zp;
1939 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1941 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1942 trie->trans[ pos ].check = state;
1947 trie->lasttrans = pos + 1;
1948 trie->states = (reg_trie_state *)
1949 PerlMemShared_realloc( trie->states, laststate
1950 * sizeof(reg_trie_state) );
1951 DEBUG_TRIE_COMPILE_MORE_r(
1952 PerlIO_printf( Perl_debug_log,
1953 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1954 (int)depth * 2 + 2,"",
1955 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1958 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1961 } /* end table compress */
1963 DEBUG_TRIE_COMPILE_MORE_r(
1964 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1965 (int)depth * 2 + 2, "",
1966 (UV)trie->statecount,
1967 (UV)trie->lasttrans)
1969 /* resize the trans array to remove unused space */
1970 trie->trans = (reg_trie_trans *)
1971 PerlMemShared_realloc( trie->trans, trie->lasttrans
1972 * sizeof(reg_trie_trans) );
1974 { /* Modify the program and insert the new TRIE node */
1975 U8 nodetype =(U8)(flags & 0xFF);
1979 regnode *optimize = NULL;
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1983 U32 mjd_nodelen = 0;
1984 #endif /* RE_TRACK_PATTERN_OFFSETS */
1985 #endif /* DEBUGGING */
1987 This means we convert either the first branch or the first Exact,
1988 depending on whether the thing following (in 'last') is a branch
1989 or not and whther first is the startbranch (ie is it a sub part of
1990 the alternation or is it the whole thing.)
1991 Assuming its a sub part we convert the EXACT otherwise we convert
1992 the whole branch sequence, including the first.
1994 /* Find the node we are going to overwrite */
1995 if ( first != startbranch || OP( last ) == BRANCH ) {
1996 /* branch sub-chain */
1997 NEXT_OFF( first ) = (U16)(last - first);
1998 #ifdef RE_TRACK_PATTERN_OFFSETS
2000 mjd_offset= Node_Offset((convert));
2001 mjd_nodelen= Node_Length((convert));
2004 /* whole branch chain */
2006 #ifdef RE_TRACK_PATTERN_OFFSETS
2009 const regnode *nop = NEXTOPER( convert );
2010 mjd_offset= Node_Offset((nop));
2011 mjd_nodelen= Node_Length((nop));
2015 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2016 (int)depth * 2 + 2, "",
2017 (UV)mjd_offset, (UV)mjd_nodelen)
2020 /* But first we check to see if there is a common prefix we can
2021 split out as an EXACT and put in front of the TRIE node. */
2022 trie->startstate= 1;
2023 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2025 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2029 const U32 base = trie->states[ state ].trans.base;
2031 if ( trie->states[state].wordnum )
2034 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2035 if ( ( base + ofs >= trie->uniquecharcount ) &&
2036 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2037 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2039 if ( ++count > 1 ) {
2040 SV **tmp = av_fetch( revcharmap, ofs, 0);
2041 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2042 if ( state == 1 ) break;
2044 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2046 PerlIO_printf(Perl_debug_log,
2047 "%*sNew Start State=%"UVuf" Class: [",
2048 (int)depth * 2 + 2, "",
2051 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2052 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2054 TRIE_BITMAP_SET(trie,*ch);
2056 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2058 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2062 TRIE_BITMAP_SET(trie,*ch);
2064 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2065 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2071 SV **tmp = av_fetch( revcharmap, idx, 0);
2073 char *ch = SvPV( *tmp, len );
2075 SV *sv=sv_newmortal();
2076 PerlIO_printf( Perl_debug_log,
2077 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2078 (int)depth * 2 + 2, "",
2080 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2081 PL_colors[0], PL_colors[1],
2082 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2083 PERL_PV_ESCAPE_FIRSTCHAR
2088 OP( convert ) = nodetype;
2089 str=STRING(convert);
2092 STR_LEN(convert) += len;
2098 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2103 trie->prefixlen = (state-1);
2105 regnode *n = convert+NODE_SZ_STR(convert);
2106 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2107 trie->startstate = state;
2108 trie->minlen -= (state - 1);
2109 trie->maxlen -= (state - 1);
2111 /* At least the UNICOS C compiler choked on this
2112 * being argument to DEBUG_r(), so let's just have
2115 #ifdef PERL_EXT_RE_BUILD
2121 regnode *fix = convert;
2122 U32 word = trie->wordcount;
2124 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2125 while( ++fix < n ) {
2126 Set_Node_Offset_Length(fix, 0, 0);
2129 SV ** const tmp = av_fetch( trie_words, word, 0 );
2131 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2132 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2134 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2142 NEXT_OFF(convert) = (U16)(tail - convert);
2143 DEBUG_r(optimize= n);
2149 if ( trie->maxlen ) {
2150 NEXT_OFF( convert ) = (U16)(tail - convert);
2151 ARG_SET( convert, data_slot );
2152 /* Store the offset to the first unabsorbed branch in
2153 jump[0], which is otherwise unused by the jump logic.
2154 We use this when dumping a trie and during optimisation. */
2156 trie->jump[0] = (U16)(nextbranch - convert);
2158 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2159 * and there is a bitmap
2160 * and the first "jump target" node we found leaves enough room
2161 * then convert the TRIE node into a TRIEC node, with the bitmap
2162 * embedded inline in the opcode - this is hypothetically faster.
2164 if ( !trie->states[trie->startstate].wordnum
2166 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2168 OP( convert ) = TRIEC;
2169 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2170 PerlMemShared_free(trie->bitmap);
2173 OP( convert ) = TRIE;
2175 /* store the type in the flags */
2176 convert->flags = nodetype;
2180 + regarglen[ OP( convert ) ];
2182 /* XXX We really should free up the resource in trie now,
2183 as we won't use them - (which resources?) dmq */
2185 /* needed for dumping*/
2186 DEBUG_r(if (optimize) {
2187 regnode *opt = convert;
2189 while ( ++opt < optimize) {
2190 Set_Node_Offset_Length(opt,0,0);
2193 Try to clean up some of the debris left after the
2196 while( optimize < jumper ) {
2197 mjd_nodelen += Node_Length((optimize));
2198 OP( optimize ) = OPTIMIZED;
2199 Set_Node_Offset_Length(optimize,0,0);
2202 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2204 } /* end node insert */
2206 /* Finish populating the prev field of the wordinfo array. Walk back
2207 * from each accept state until we find another accept state, and if
2208 * so, point the first word's .prev field at the second word. If the
2209 * second already has a .prev field set, stop now. This will be the
2210 * case either if we've already processed that word's accept state,
2211 * or that state had multiple words, and the overspill words were
2212 * already linked up earlier.
2219 for (word=1; word <= trie->wordcount; word++) {
2221 if (trie->wordinfo[word].prev)
2223 state = trie->wordinfo[word].accept;
2225 state = prev_states[state];
2228 prev = trie->states[state].wordnum;
2232 trie->wordinfo[word].prev = prev;
2234 Safefree(prev_states);
2238 /* and now dump out the compressed format */
2239 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2241 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2243 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2244 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2246 SvREFCNT_dec(revcharmap);
2250 : trie->startstate>1
2256 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2258 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2260 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2261 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2264 We find the fail state for each state in the trie, this state is the longest proper
2265 suffix of the current state's 'word' that is also a proper prefix of another word in our
2266 trie. State 1 represents the word '' and is thus the default fail state. This allows
2267 the DFA not to have to restart after its tried and failed a word at a given point, it
2268 simply continues as though it had been matching the other word in the first place.
2270 'abcdgu'=~/abcdefg|cdgu/
2271 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2272 fail, which would bring us to the state representing 'd' in the second word where we would
2273 try 'g' and succeed, proceeding to match 'cdgu'.
2275 /* add a fail transition */
2276 const U32 trie_offset = ARG(source);
2277 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2279 const U32 ucharcount = trie->uniquecharcount;
2280 const U32 numstates = trie->statecount;
2281 const U32 ubound = trie->lasttrans + ucharcount;
2285 U32 base = trie->states[ 1 ].trans.base;
2288 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2289 GET_RE_DEBUG_FLAGS_DECL;
2291 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2293 PERL_UNUSED_ARG(depth);
2297 ARG_SET( stclass, data_slot );
2298 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2299 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2300 aho->trie=trie_offset;
2301 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2302 Copy( trie->states, aho->states, numstates, reg_trie_state );
2303 Newxz( q, numstates, U32);
2304 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2307 /* initialize fail[0..1] to be 1 so that we always have
2308 a valid final fail state */
2309 fail[ 0 ] = fail[ 1 ] = 1;
2311 for ( charid = 0; charid < ucharcount ; charid++ ) {
2312 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2314 q[ q_write ] = newstate;
2315 /* set to point at the root */
2316 fail[ q[ q_write++ ] ]=1;
2319 while ( q_read < q_write) {
2320 const U32 cur = q[ q_read++ % numstates ];
2321 base = trie->states[ cur ].trans.base;
2323 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2324 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2326 U32 fail_state = cur;
2329 fail_state = fail[ fail_state ];
2330 fail_base = aho->states[ fail_state ].trans.base;
2331 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2333 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2334 fail[ ch_state ] = fail_state;
2335 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2337 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2339 q[ q_write++ % numstates] = ch_state;
2343 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2344 when we fail in state 1, this allows us to use the
2345 charclass scan to find a valid start char. This is based on the principle
2346 that theres a good chance the string being searched contains lots of stuff
2347 that cant be a start char.
2349 fail[ 0 ] = fail[ 1 ] = 0;
2350 DEBUG_TRIE_COMPILE_r({
2351 PerlIO_printf(Perl_debug_log,
2352 "%*sStclass Failtable (%"UVuf" states): 0",
2353 (int)(depth * 2), "", (UV)numstates
2355 for( q_read=1; q_read<numstates; q_read++ ) {
2356 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2358 PerlIO_printf(Perl_debug_log, "\n");
2361 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2366 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2367 * These need to be revisited when a newer toolchain becomes available.
2369 #if defined(__sparc64__) && defined(__GNUC__)
2370 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2371 # undef SPARC64_GCC_WORKAROUND
2372 # define SPARC64_GCC_WORKAROUND 1
2376 #define DEBUG_PEEP(str,scan,depth) \
2377 DEBUG_OPTIMISE_r({if (scan){ \
2378 SV * const mysv=sv_newmortal(); \
2379 regnode *Next = regnext(scan); \
2380 regprop(RExC_rx, mysv, scan); \
2381 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2382 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2383 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2390 #define JOIN_EXACT(scan,min,flags) \
2391 if (PL_regkind[OP(scan)] == EXACT) \
2392 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2395 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2396 /* Merge several consecutive EXACTish nodes into one. */
2397 regnode *n = regnext(scan);
2399 regnode *next = scan + NODE_SZ_STR(scan);
2403 regnode *stop = scan;
2404 GET_RE_DEBUG_FLAGS_DECL;
2406 PERL_UNUSED_ARG(depth);
2409 PERL_ARGS_ASSERT_JOIN_EXACT;
2410 #ifndef EXPERIMENTAL_INPLACESCAN
2411 PERL_UNUSED_ARG(flags);
2412 PERL_UNUSED_ARG(val);
2414 DEBUG_PEEP("join",scan,depth);
2416 /* Skip NOTHING, merge EXACT*. */
2418 ( PL_regkind[OP(n)] == NOTHING ||
2419 (stringok && (OP(n) == OP(scan))))
2421 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2423 if (OP(n) == TAIL || n > next)
2425 if (PL_regkind[OP(n)] == NOTHING) {
2426 DEBUG_PEEP("skip:",n,depth);
2427 NEXT_OFF(scan) += NEXT_OFF(n);
2428 next = n + NODE_STEP_REGNODE;
2435 else if (stringok) {
2436 const unsigned int oldl = STR_LEN(scan);
2437 regnode * const nnext = regnext(n);
2439 DEBUG_PEEP("merg",n,depth);
2442 if (oldl + STR_LEN(n) > U8_MAX)
2444 NEXT_OFF(scan) += NEXT_OFF(n);
2445 STR_LEN(scan) += STR_LEN(n);
2446 next = n + NODE_SZ_STR(n);
2447 /* Now we can overwrite *n : */
2448 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2456 #ifdef EXPERIMENTAL_INPLACESCAN
2457 if (flags && !NEXT_OFF(n)) {
2458 DEBUG_PEEP("atch", val, depth);
2459 if (reg_off_by_arg[OP(n)]) {
2460 ARG_SET(n, val - n);
2463 NEXT_OFF(n) = val - n;
2469 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2470 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2471 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2472 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2475 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2476 && ( STR_LEN(scan) >= 6 ) )
2479 Two problematic code points in Unicode casefolding of EXACT nodes:
2481 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2482 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2488 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2489 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2491 This means that in case-insensitive matching (or "loose matching",
2492 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2493 length of the above casefolded versions) can match a target string
2494 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2495 This would rather mess up the minimum length computation.
2497 What we'll do is to look for the tail four bytes, and then peek
2498 at the preceding two bytes to see whether we need to decrease
2499 the minimum length by four (six minus two).
2501 Thanks to the design of UTF-8, there cannot be false matches:
2502 A sequence of valid UTF-8 bytes cannot be a subsequence of
2503 another valid sequence of UTF-8 bytes.
2506 char * const s0 = STRING(scan), *s, *t;
2507 char * const s1 = s0 + STR_LEN(scan) - 1;
2508 char * const s2 = s1 - 4;
2509 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2510 const char t0[] = "\xaf\x49\xaf\x42";
2512 const char t0[] = "\xcc\x88\xcc\x81";
2514 const char * const t1 = t0 + 3;
2517 s < s2 && (t = ninstr(s, s1, t0, t1));
2520 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2521 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2523 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2524 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2532 n = scan + NODE_SZ_STR(scan);
2534 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2541 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2545 /* REx optimizer. Converts nodes into quicker variants "in place".
2546 Finds fixed substrings. */
2548 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2549 to the position after last scanned or to NULL. */
2551 #define INIT_AND_WITHP \
2552 assert(!and_withp); \
2553 Newx(and_withp,1,struct regnode_charclass_class); \
2554 SAVEFREEPV(and_withp)
2556 /* this is a chain of data about sub patterns we are processing that
2557 need to be handled separately/specially in study_chunk. Its so
2558 we can simulate recursion without losing state. */
2560 typedef struct scan_frame {
2561 regnode *last; /* last node to process in this frame */
2562 regnode *next; /* next node to process when last is reached */
2563 struct scan_frame *prev; /*previous frame*/
2564 I32 stop; /* what stopparen do we use */
2568 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2570 #define CASE_SYNST_FNC(nAmE) \
2572 if (flags & SCF_DO_STCLASS_AND) { \
2573 for (value = 0; value < 256; value++) \
2574 if (!is_ ## nAmE ## _cp(value)) \
2575 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2578 for (value = 0; value < 256; value++) \
2579 if (is_ ## nAmE ## _cp(value)) \
2580 ANYOF_BITMAP_SET(data->start_class, value); \
2584 if (flags & SCF_DO_STCLASS_AND) { \
2585 for (value = 0; value < 256; value++) \
2586 if (is_ ## nAmE ## _cp(value)) \
2587 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2590 for (value = 0; value < 256; value++) \
2591 if (!is_ ## nAmE ## _cp(value)) \
2592 ANYOF_BITMAP_SET(data->start_class, value); \
2599 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2600 I32 *minlenp, I32 *deltap,
2605 struct regnode_charclass_class *and_withp,
2606 U32 flags, U32 depth)
2607 /* scanp: Start here (read-write). */
2608 /* deltap: Write maxlen-minlen here. */
2609 /* last: Stop before this one. */
2610 /* data: string data about the pattern */
2611 /* stopparen: treat close N as END */
2612 /* recursed: which subroutines have we recursed into */
2613 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2616 I32 min = 0, pars = 0, code;
2617 regnode *scan = *scanp, *next;
2619 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2620 int is_inf_internal = 0; /* The studied chunk is infinite */
2621 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2622 scan_data_t data_fake;
2623 SV *re_trie_maxbuff = NULL;
2624 regnode *first_non_open = scan;
2625 I32 stopmin = I32_MAX;
2626 scan_frame *frame = NULL;
2627 GET_RE_DEBUG_FLAGS_DECL;
2629 PERL_ARGS_ASSERT_STUDY_CHUNK;
2632 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2636 while (first_non_open && OP(first_non_open) == OPEN)
2637 first_non_open=regnext(first_non_open);
2642 while ( scan && OP(scan) != END && scan < last ){
2643 /* Peephole optimizer: */
2644 DEBUG_STUDYDATA("Peep:", data,depth);
2645 DEBUG_PEEP("Peep",scan,depth);
2646 JOIN_EXACT(scan,&min,0);
2648 /* Follow the next-chain of the current node and optimize
2649 away all the NOTHINGs from it. */
2650 if (OP(scan) != CURLYX) {
2651 const int max = (reg_off_by_arg[OP(scan)]
2653 /* I32 may be smaller than U16 on CRAYs! */
2654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2659 /* Skip NOTHING and LONGJMP. */
2660 while ((n = regnext(n))
2661 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2663 && off + noff < max)
2665 if (reg_off_by_arg[OP(scan)])
2668 NEXT_OFF(scan) = off;
2673 /* The principal pseudo-switch. Cannot be a switch, since we
2674 look into several different things. */
2675 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2676 || OP(scan) == IFTHEN) {
2677 next = regnext(scan);
2679 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2681 if (OP(next) == code || code == IFTHEN) {
2682 /* NOTE - There is similar code to this block below for handling
2683 TRIE nodes on a re-study. If you change stuff here check there
2685 I32 max1 = 0, min1 = I32_MAX, num = 0;
2686 struct regnode_charclass_class accum;
2687 regnode * const startbranch=scan;
2689 if (flags & SCF_DO_SUBSTR)
2690 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2691 if (flags & SCF_DO_STCLASS)
2692 cl_init_zero(pRExC_state, &accum);
2694 while (OP(scan) == code) {
2695 I32 deltanext, minnext, f = 0, fake;
2696 struct regnode_charclass_class this_class;
2699 data_fake.flags = 0;
2701 data_fake.whilem_c = data->whilem_c;
2702 data_fake.last_closep = data->last_closep;
2705 data_fake.last_closep = &fake;
2707 data_fake.pos_delta = delta;
2708 next = regnext(scan);
2709 scan = NEXTOPER(scan);
2711 scan = NEXTOPER(scan);
2712 if (flags & SCF_DO_STCLASS) {
2713 cl_init(pRExC_state, &this_class);
2714 data_fake.start_class = &this_class;
2715 f = SCF_DO_STCLASS_AND;
2717 if (flags & SCF_WHILEM_VISITED_POS)
2718 f |= SCF_WHILEM_VISITED_POS;
2720 /* we suppose the run is continuous, last=next...*/
2721 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2723 stopparen, recursed, NULL, f,depth+1);
2726 if (max1 < minnext + deltanext)
2727 max1 = minnext + deltanext;
2728 if (deltanext == I32_MAX)
2729 is_inf = is_inf_internal = 1;
2731 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2733 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2734 if ( stopmin > minnext)
2735 stopmin = min + min1;
2736 flags &= ~SCF_DO_SUBSTR;
2738 data->flags |= SCF_SEEN_ACCEPT;
2741 if (data_fake.flags & SF_HAS_EVAL)
2742 data->flags |= SF_HAS_EVAL;
2743 data->whilem_c = data_fake.whilem_c;
2745 if (flags & SCF_DO_STCLASS)
2746 cl_or(pRExC_state, &accum, &this_class);
2748 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2750 if (flags & SCF_DO_SUBSTR) {
2751 data->pos_min += min1;
2752 data->pos_delta += max1 - min1;
2753 if (max1 != min1 || is_inf)
2754 data->longest = &(data->longest_float);
2757 delta += max1 - min1;
2758 if (flags & SCF_DO_STCLASS_OR) {
2759 cl_or(pRExC_state, data->start_class, &accum);
2761 cl_and(data->start_class, and_withp);
2762 flags &= ~SCF_DO_STCLASS;
2765 else if (flags & SCF_DO_STCLASS_AND) {
2767 cl_and(data->start_class, &accum);
2768 flags &= ~SCF_DO_STCLASS;
2771 /* Switch to OR mode: cache the old value of
2772 * data->start_class */
2774 StructCopy(data->start_class, and_withp,
2775 struct regnode_charclass_class);
2776 flags &= ~SCF_DO_STCLASS_AND;
2777 StructCopy(&accum, data->start_class,
2778 struct regnode_charclass_class);
2779 flags |= SCF_DO_STCLASS_OR;
2780 data->start_class->flags |= ANYOF_EOS;
2784 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2787 Assuming this was/is a branch we are dealing with: 'scan' now
2788 points at the item that follows the branch sequence, whatever
2789 it is. We now start at the beginning of the sequence and look
2796 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2798 If we can find such a subsequence we need to turn the first
2799 element into a trie and then add the subsequent branch exact
2800 strings to the trie.
2804 1. patterns where the whole set of branches can be converted.
2806 2. patterns where only a subset can be converted.
2808 In case 1 we can replace the whole set with a single regop
2809 for the trie. In case 2 we need to keep the start and end
2812 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2813 becomes BRANCH TRIE; BRANCH X;
2815 There is an additional case, that being where there is a
2816 common prefix, which gets split out into an EXACT like node
2817 preceding the TRIE node.
2819 If x(1..n)==tail then we can do a simple trie, if not we make
2820 a "jump" trie, such that when we match the appropriate word
2821 we "jump" to the appropriate tail node. Essentially we turn
2822 a nested if into a case structure of sorts.
2827 if (!re_trie_maxbuff) {
2828 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2829 if (!SvIOK(re_trie_maxbuff))
2830 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2832 if ( SvIV(re_trie_maxbuff)>=0 ) {
2834 regnode *first = (regnode *)NULL;
2835 regnode *last = (regnode *)NULL;
2836 regnode *tail = scan;
2841 SV * const mysv = sv_newmortal(); /* for dumping */
2843 /* var tail is used because there may be a TAIL
2844 regop in the way. Ie, the exacts will point to the
2845 thing following the TAIL, but the last branch will
2846 point at the TAIL. So we advance tail. If we
2847 have nested (?:) we may have to move through several
2851 while ( OP( tail ) == TAIL ) {
2852 /* this is the TAIL generated by (?:) */
2853 tail = regnext( tail );
2858 regprop(RExC_rx, mysv, tail );
2859 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2860 (int)depth * 2 + 2, "",
2861 "Looking for TRIE'able sequences. Tail node is: ",
2862 SvPV_nolen_const( mysv )
2868 step through the branches, cur represents each
2869 branch, noper is the first thing to be matched
2870 as part of that branch and noper_next is the
2871 regnext() of that node. if noper is an EXACT
2872 and noper_next is the same as scan (our current
2873 position in the regex) then the EXACT branch is
2874 a possible optimization target. Once we have
2875 two or more consecutive such branches we can
2876 create a trie of the EXACT's contents and stich
2877 it in place. If the sequence represents all of
2878 the branches we eliminate the whole thing and
2879 replace it with a single TRIE. If it is a
2880 subsequence then we need to stitch it in. This
2881 means the first branch has to remain, and needs
2882 to be repointed at the item on the branch chain
2883 following the last branch optimized. This could
2884 be either a BRANCH, in which case the
2885 subsequence is internal, or it could be the
2886 item following the branch sequence in which
2887 case the subsequence is at the end.
2891 /* dont use tail as the end marker for this traverse */
2892 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2893 regnode * const noper = NEXTOPER( cur );
2894 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2895 regnode * const noper_next = regnext( noper );
2899 regprop(RExC_rx, mysv, cur);
2900 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2901 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2903 regprop(RExC_rx, mysv, noper);
2904 PerlIO_printf( Perl_debug_log, " -> %s",
2905 SvPV_nolen_const(mysv));
2908 regprop(RExC_rx, mysv, noper_next );
2909 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2910 SvPV_nolen_const(mysv));
2912 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2913 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2915 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2916 : PL_regkind[ OP( noper ) ] == EXACT )
2917 || OP(noper) == NOTHING )
2919 && noper_next == tail
2924 if ( !first || optype == NOTHING ) {
2925 if (!first) first = cur;
2926 optype = OP( noper );
2932 Currently we do not believe that the trie logic can
2933 handle case insensitive matching properly when the
2934 pattern is not unicode (thus forcing unicode semantics).
2936 If/when this is fixed the following define can be swapped
2937 in below to fully enable trie logic.
2939 #define TRIE_TYPE_IS_SAFE 1
2942 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2944 if ( last && TRIE_TYPE_IS_SAFE ) {
2945 make_trie( pRExC_state,
2946 startbranch, first, cur, tail, count,
2949 if ( PL_regkind[ OP( noper ) ] == EXACT
2951 && noper_next == tail
2956 optype = OP( noper );
2966 regprop(RExC_rx, mysv, cur);
2967 PerlIO_printf( Perl_debug_log,
2968 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2969 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2973 if ( last && TRIE_TYPE_IS_SAFE ) {
2974 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2975 #ifdef TRIE_STUDY_OPT
2976 if ( ((made == MADE_EXACT_TRIE &&
2977 startbranch == first)
2978 || ( first_non_open == first )) &&
2980 flags |= SCF_TRIE_RESTUDY;
2981 if ( startbranch == first
2984 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2995 scan = NEXTOPER(NEXTOPER(scan));
2996 } else /* single branch is optimized. */
2997 scan = NEXTOPER(scan);
2999 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3000 scan_frame *newframe = NULL;
3005 if (OP(scan) != SUSPEND) {
3006 /* set the pointer */
3007 if (OP(scan) == GOSUB) {
3009 RExC_recurse[ARG2L(scan)] = scan;
3010 start = RExC_open_parens[paren-1];
3011 end = RExC_close_parens[paren-1];
3014 start = RExC_rxi->program + 1;
3018 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3019 SAVEFREEPV(recursed);
3021 if (!PAREN_TEST(recursed,paren+1)) {
3022 PAREN_SET(recursed,paren+1);
3023 Newx(newframe,1,scan_frame);
3025 if (flags & SCF_DO_SUBSTR) {
3026 SCAN_COMMIT(pRExC_state,data,minlenp);
3027 data->longest = &(data->longest_float);
3029 is_inf = is_inf_internal = 1;
3030 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3031 cl_anything(pRExC_state, data->start_class);
3032 flags &= ~SCF_DO_STCLASS;
3035 Newx(newframe,1,scan_frame);
3038 end = regnext(scan);
3043 SAVEFREEPV(newframe);
3044 newframe->next = regnext(scan);
3045 newframe->last = last;
3046 newframe->stop = stopparen;
3047 newframe->prev = frame;
3057 else if (OP(scan) == EXACT) {
3058 I32 l = STR_LEN(scan);
3061 const U8 * const s = (U8*)STRING(scan);
3062 l = utf8_length(s, s + l);
3063 uc = utf8_to_uvchr(s, NULL);
3065 uc = *((U8*)STRING(scan));
3068 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3069 /* The code below prefers earlier match for fixed
3070 offset, later match for variable offset. */
3071 if (data->last_end == -1) { /* Update the start info. */
3072 data->last_start_min = data->pos_min;
3073 data->last_start_max = is_inf
3074 ? I32_MAX : data->pos_min + data->pos_delta;
3076 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3078 SvUTF8_on(data->last_found);
3080 SV * const sv = data->last_found;
3081 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3082 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3083 if (mg && mg->mg_len >= 0)
3084 mg->mg_len += utf8_length((U8*)STRING(scan),
3085 (U8*)STRING(scan)+STR_LEN(scan));
3087 data->last_end = data->pos_min + l;
3088 data->pos_min += l; /* As in the first entry. */
3089 data->flags &= ~SF_BEFORE_EOL;
3091 if (flags & SCF_DO_STCLASS_AND) {
3092 /* Check whether it is compatible with what we know already! */
3096 /* If compatible, we or it in below. It is compatible if is
3097 * in the bitmp and either 1) its bit or its fold is set, or 2)
3098 * it's for a locale. Even if there isn't unicode semantics
3099 * here, at runtime there may be because of matching against a
3100 * utf8 string, so accept a possible false positive for
3101 * latin1-range folds */
3103 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3104 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3105 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3106 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3109 ANYOF_CLASS_ZERO(data->start_class);
3110 ANYOF_BITMAP_ZERO(data->start_class);
3112 ANYOF_BITMAP_SET(data->start_class, uc);
3113 data->start_class->flags &= ~ANYOF_EOS;
3115 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3117 else if (flags & SCF_DO_STCLASS_OR) {
3118 /* false positive possible if the class is case-folded */
3120 ANYOF_BITMAP_SET(data->start_class, uc);
3122 data->start_class->flags |= ANYOF_UNICODE_ALL;
3123 data->start_class->flags &= ~ANYOF_EOS;
3124 cl_and(data->start_class, and_withp);
3126 flags &= ~SCF_DO_STCLASS;
3128 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3129 I32 l = STR_LEN(scan);
3130 UV uc = *((U8*)STRING(scan));
3132 /* Search for fixed substrings supports EXACT only. */
3133 if (flags & SCF_DO_SUBSTR) {
3135 SCAN_COMMIT(pRExC_state, data, minlenp);
3138 const U8 * const s = (U8 *)STRING(scan);
3139 l = utf8_length(s, s + l);
3140 uc = utf8_to_uvchr(s, NULL);
3143 if (flags & SCF_DO_SUBSTR)
3145 if (flags & SCF_DO_STCLASS_AND) {
3146 /* Check whether it is compatible with what we know already! */
3149 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3150 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3151 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3155 ANYOF_CLASS_ZERO(data->start_class);
3156 ANYOF_BITMAP_ZERO(data->start_class);
3158 ANYOF_BITMAP_SET(data->start_class, uc);
3159 data->start_class->flags &= ~ANYOF_EOS;
3160 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3161 if (OP(scan) == EXACTFL) {
3162 data->start_class->flags |= ANYOF_LOCALE;
3166 /* Also set the other member of the fold pair. In case
3167 * that unicode semantics is called for at runtime, use
3168 * the full latin1 fold. (Can't do this for locale,
3169 * because not known until runtime */
3170 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3174 else if (flags & SCF_DO_STCLASS_OR) {
3175 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3176 /* false positive possible if the class is case-folded.
3177 Assume that the locale settings are the same... */
3179 ANYOF_BITMAP_SET(data->start_class, uc);
3180 if (OP(scan) != EXACTFL) {
3182 /* And set the other member of the fold pair, but
3183 * can't do that in locale because not known until
3185 ANYOF_BITMAP_SET(data->start_class,
3186 PL_fold_latin1[uc]);
3189 data->start_class->flags &= ~ANYOF_EOS;
3191 cl_and(data->start_class, and_withp);
3193 flags &= ~SCF_DO_STCLASS;
3195 else if (REGNODE_VARIES(OP(scan))) {
3196 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3197 I32 f = flags, pos_before = 0;
3198 regnode * const oscan = scan;
3199 struct regnode_charclass_class this_class;
3200 struct regnode_charclass_class *oclass = NULL;
3201 I32 next_is_eval = 0;
3203 switch (PL_regkind[OP(scan)]) {
3204 case WHILEM: /* End of (?:...)* . */
3205 scan = NEXTOPER(scan);
3208 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3209 next = NEXTOPER(scan);
3210 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3212 maxcount = REG_INFTY;
3213 next = regnext(scan);
3214 scan = NEXTOPER(scan);
3218 if (flags & SCF_DO_SUBSTR)
3223 if (flags & SCF_DO_STCLASS) {
3225 maxcount = REG_INFTY;
3226 next = regnext(scan);
3227 scan = NEXTOPER(scan);
3230 is_inf = is_inf_internal = 1;
3231 scan = regnext(scan);
3232 if (flags & SCF_DO_SUBSTR) {
3233 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3234 data->longest = &(data->longest_float);
3236 goto optimize_curly_tail;
3238 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3239 && (scan->flags == stopparen))
3244 mincount = ARG1(scan);
3245 maxcount = ARG2(scan);
3247 next = regnext(scan);
3248 if (OP(scan) == CURLYX) {
3249 I32 lp = (data ? *(data->last_closep) : 0);
3250 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3252 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3253 next_is_eval = (OP(scan) == EVAL);
3255 if (flags & SCF_DO_SUBSTR) {
3256 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3257 pos_before = data->pos_min;
3261 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3263 data->flags |= SF_IS_INF;
3265 if (flags & SCF_DO_STCLASS) {
3266 cl_init(pRExC_state, &this_class);
3267 oclass = data->start_class;
3268 data->start_class = &this_class;
3269 f |= SCF_DO_STCLASS_AND;
3270 f &= ~SCF_DO_STCLASS_OR;
3272 /* Exclude from super-linear cache processing any {n,m}
3273 regops for which the combination of input pos and regex
3274 pos is not enough information to determine if a match
3277 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3278 regex pos at the \s*, the prospects for a match depend not
3279 only on the input position but also on how many (bar\s*)
3280 repeats into the {4,8} we are. */
3281 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3282 f &= ~SCF_WHILEM_VISITED_POS;
3284 /* This will finish on WHILEM, setting scan, or on NULL: */
3285 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3286 last, data, stopparen, recursed, NULL,
3288 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3290 if (flags & SCF_DO_STCLASS)
3291 data->start_class = oclass;
3292 if (mincount == 0 || minnext == 0) {
3293 if (flags & SCF_DO_STCLASS_OR) {
3294 cl_or(pRExC_state, data->start_class, &this_class);
3296 else if (flags & SCF_DO_STCLASS_AND) {
3297 /* Switch to OR mode: cache the old value of
3298 * data->start_class */
3300 StructCopy(data->start_class, and_withp,
3301 struct regnode_charclass_class);
3302 flags &= ~SCF_DO_STCLASS_AND;
3303 StructCopy(&this_class, data->start_class,
3304 struct regnode_charclass_class);
3305 flags |= SCF_DO_STCLASS_OR;
3306 data->start_class->flags |= ANYOF_EOS;
3308 } else { /* Non-zero len */
3309 if (flags & SCF_DO_STCLASS_OR) {
3310 cl_or(pRExC_state, data->start_class, &this_class);
3311 cl_and(data->start_class, and_withp);
3313 else if (flags & SCF_DO_STCLASS_AND)
3314 cl_and(data->start_class, &this_class);
3315 flags &= ~SCF_DO_STCLASS;
3317 if (!scan) /* It was not CURLYX, but CURLY. */
3319 if ( /* ? quantifier ok, except for (?{ ... }) */
3320 (next_is_eval || !(mincount == 0 && maxcount == 1))
3321 && (minnext == 0) && (deltanext == 0)
3322 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3323 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3325 ckWARNreg(RExC_parse,
3326 "Quantifier unexpected on zero-length expression");
3329 min += minnext * mincount;
3330 is_inf_internal |= ((maxcount == REG_INFTY
3331 && (minnext + deltanext) > 0)
3332 || deltanext == I32_MAX);
3333 is_inf |= is_inf_internal;
3334 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3336 /* Try powerful optimization CURLYX => CURLYN. */
3337 if ( OP(oscan) == CURLYX && data
3338 && data->flags & SF_IN_PAR
3339 && !(data->flags & SF_HAS_EVAL)
3340 && !deltanext && minnext == 1 ) {
3341 /* Try to optimize to CURLYN. */
3342 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3343 regnode * const nxt1 = nxt;
3350 if (!REGNODE_SIMPLE(OP(nxt))
3351 && !(PL_regkind[OP(nxt)] == EXACT
3352 && STR_LEN(nxt) == 1))
3358 if (OP(nxt) != CLOSE)
3360 if (RExC_open_parens) {
3361 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3362 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3364 /* Now we know that nxt2 is the only contents: */
3365 oscan->flags = (U8)ARG(nxt);
3367 OP(nxt1) = NOTHING; /* was OPEN. */
3370 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3371 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3372 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3373 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3374 OP(nxt + 1) = OPTIMIZED; /* was count. */
3375 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3380 /* Try optimization CURLYX => CURLYM. */
3381 if ( OP(oscan) == CURLYX && data
3382 && !(data->flags & SF_HAS_PAR)
3383 && !(data->flags & SF_HAS_EVAL)
3384 && !deltanext /* atom is fixed width */
3385 && minnext != 0 /* CURLYM can't handle zero width */
3387 /* XXXX How to optimize if data == 0? */
3388 /* Optimize to a simpler form. */
3389 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3393 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3394 && (OP(nxt2) != WHILEM))
3396 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3397 /* Need to optimize away parenths. */
3398 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3399 /* Set the parenth number. */
3400 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3402 oscan->flags = (U8)ARG(nxt);
3403 if (RExC_open_parens) {
3404 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3405 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3407 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3408 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3411 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3412 OP(nxt + 1) = OPTIMIZED; /* was count. */
3413 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3414 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3417 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3418 regnode *nnxt = regnext(nxt1);
3420 if (reg_off_by_arg[OP(nxt1)])
3421 ARG_SET(nxt1, nxt2 - nxt1);
3422 else if (nxt2 - nxt1 < U16_MAX)
3423 NEXT_OFF(nxt1) = nxt2 - nxt1;
3425 OP(nxt) = NOTHING; /* Cannot beautify */
3430 /* Optimize again: */
3431 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3432 NULL, stopparen, recursed, NULL, 0,depth+1);
3437 else if ((OP(oscan) == CURLYX)
3438 && (flags & SCF_WHILEM_VISITED_POS)
3439 /* See the comment on a similar expression above.
3440 However, this time it's not a subexpression
3441 we care about, but the expression itself. */
3442 && (maxcount == REG_INFTY)
3443 && data && ++data->whilem_c < 16) {
3444 /* This stays as CURLYX, we can put the count/of pair. */
3445 /* Find WHILEM (as in regexec.c) */
3446 regnode *nxt = oscan + NEXT_OFF(oscan);
3448 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3450 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3451 | (RExC_whilem_seen << 4)); /* On WHILEM */
3453 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3455 if (flags & SCF_DO_SUBSTR) {
3456 SV *last_str = NULL;
3457 int counted = mincount != 0;
3459 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3460 #if defined(SPARC64_GCC_WORKAROUND)
3463 const char *s = NULL;
3466 if (pos_before >= data->last_start_min)
3469 b = data->last_start_min;
3472 s = SvPV_const(data->last_found, l);
3473 old = b - data->last_start_min;
3476 I32 b = pos_before >= data->last_start_min
3477 ? pos_before : data->last_start_min;
3479 const char * const s = SvPV_const(data->last_found, l);
3480 I32 old = b - data->last_start_min;
3484 old = utf8_hop((U8*)s, old) - (U8*)s;
3486 /* Get the added string: */
3487 last_str = newSVpvn_utf8(s + old, l, UTF);
3488 if (deltanext == 0 && pos_before == b) {
3489 /* What was added is a constant string */
3491 SvGROW(last_str, (mincount * l) + 1);
3492 repeatcpy(SvPVX(last_str) + l,
3493 SvPVX_const(last_str), l, mincount - 1);
3494 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3495 /* Add additional parts. */
3496 SvCUR_set(data->last_found,
3497 SvCUR(data->last_found) - l);
3498 sv_catsv(data->last_found, last_str);
3500 SV * sv = data->last_found;
3502 SvUTF8(sv) && SvMAGICAL(sv) ?
3503 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3504 if (mg && mg->mg_len >= 0)
3505 mg->mg_len += CHR_SVLEN(last_str) - l;
3507 data->last_end += l * (mincount - 1);
3510 /* start offset must point into the last copy */
3511 data->last_start_min += minnext * (mincount - 1);
3512 data->last_start_max += is_inf ? I32_MAX
3513 : (maxcount - 1) * (minnext + data->pos_delta);
3516 /* It is counted once already... */
3517 data->pos_min += minnext * (mincount - counted);
3518 data->pos_delta += - counted * deltanext +
3519 (minnext + deltanext) * maxcount - minnext * mincount;
3520 if (mincount != maxcount) {
3521 /* Cannot extend fixed substrings found inside
3523 SCAN_COMMIT(pRExC_state,data,minlenp);
3524 if (mincount && last_str) {
3525 SV * const sv = data->last_found;
3526 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3527 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3531 sv_setsv(sv, last_str);
3532 data->last_end = data->pos_min;
3533 data->last_start_min =
3534 data->pos_min - CHR_SVLEN(last_str);
3535 data->last_start_max = is_inf
3537 : data->pos_min + data->pos_delta
3538 - CHR_SVLEN(last_str);
3540 data->longest = &(data->longest_float);
3542 SvREFCNT_dec(last_str);
3544 if (data && (fl & SF_HAS_EVAL))
3545 data->flags |= SF_HAS_EVAL;
3546 optimize_curly_tail:
3547 if (OP(oscan) != CURLYX) {
3548 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3550 NEXT_OFF(oscan) += NEXT_OFF(next);
3553 default: /* REF, ANYOFV, and CLUMP only? */
3554 if (flags & SCF_DO_SUBSTR) {
3555 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3556 data->longest = &(data->longest_float);
3558 is_inf = is_inf_internal = 1;
3559 if (flags & SCF_DO_STCLASS_OR)
3560 cl_anything(pRExC_state, data->start_class);
3561 flags &= ~SCF_DO_STCLASS;
3565 else if (OP(scan) == LNBREAK) {
3566 if (flags & SCF_DO_STCLASS) {
3568 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3569 if (flags & SCF_DO_STCLASS_AND) {
3570 for (value = 0; value < 256; value++)
3571 if (!is_VERTWS_cp(value))
3572 ANYOF_BITMAP_CLEAR(data->start_class, value);
3575 for (value = 0; value < 256; value++)
3576 if (is_VERTWS_cp(value))
3577 ANYOF_BITMAP_SET(data->start_class, value);
3579 if (flags & SCF_DO_STCLASS_OR)
3580 cl_and(data->start_class, and_withp);
3581 flags &= ~SCF_DO_STCLASS;
3585 if (flags & SCF_DO_SUBSTR) {
3586 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3588 data->pos_delta += 1;
3589 data->longest = &(data->longest_float);
3592 else if (OP(scan) == FOLDCHAR) {
3593 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3594 flags &= ~SCF_DO_STCLASS;
3597 if (flags & SCF_DO_SUBSTR) {
3598 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3600 data->pos_delta += d;
3601 data->longest = &(data->longest_float);
3604 else if (REGNODE_SIMPLE(OP(scan))) {
3607 if (flags & SCF_DO_SUBSTR) {
3608 SCAN_COMMIT(pRExC_state,data,minlenp);
3612 if (flags & SCF_DO_STCLASS) {
3613 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3615 /* Some of the logic below assumes that switching
3616 locale on will only add false positives. */
3617 switch (PL_regkind[OP(scan)]) {
3621 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3622 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3623 cl_anything(pRExC_state, data->start_class);
3626 if (OP(scan) == SANY)
3628 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3629 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3630 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3631 cl_anything(pRExC_state, data->start_class);
3633 if (flags & SCF_DO_STCLASS_AND || !value)
3634 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3637 if (flags & SCF_DO_STCLASS_AND)
3638 cl_and(data->start_class,
3639 (struct regnode_charclass_class*)scan);
3641 cl_or(pRExC_state, data->start_class,
3642 (struct regnode_charclass_class*)scan);
3645 if (flags & SCF_DO_STCLASS_AND) {
3646 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3647 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3648 if (OP(scan) == ALNUMU) {
3649 for (value = 0; value < 256; value++) {
3650 if (!isWORDCHAR_L1(value)) {
3651 ANYOF_BITMAP_CLEAR(data->start_class, value);
3655 for (value = 0; value < 256; value++) {
3656 if (!isALNUM(value)) {
3657 ANYOF_BITMAP_CLEAR(data->start_class, value);
3664 if (data->start_class->flags & ANYOF_LOCALE)
3665 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3666 else if (OP(scan) == ALNUMU) {
3667 for (value = 0; value < 256; value++) {
3668 if (isWORDCHAR_L1(value)) {
3669 ANYOF_BITMAP_SET(data->start_class, value);
3673 for (value = 0; value < 256; value++) {
3674 if (isALNUM(value)) {
3675 ANYOF_BITMAP_SET(data->start_class, value);
3682 if (flags & SCF_DO_STCLASS_AND) {
3683 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3684 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3685 if (OP(scan) == NALNUMU) {
3686 for (value = 0; value < 256; value++) {
3687 if (isWORDCHAR_L1(value)) {
3688 ANYOF_BITMAP_CLEAR(data->start_class, value);
3692 for (value = 0; value < 256; value++) {
3693 if (isALNUM(value)) {
3694 ANYOF_BITMAP_CLEAR(data->start_class, value);
3701 if (data->start_class->flags & ANYOF_LOCALE)
3702 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3704 if (OP(scan) == NALNUMU) {
3705 for (value = 0; value < 256; value++) {
3706 if (! isWORDCHAR_L1(value)) {
3707 ANYOF_BITMAP_SET(data->start_class, value);
3711 for (value = 0; value < 256; value++) {
3712 if (! isALNUM(value)) {
3713 ANYOF_BITMAP_SET(data->start_class, value);
3721 if (flags & SCF_DO_STCLASS_AND) {
3722 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3723 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3724 if (OP(scan) == SPACEU) {
3725 for (value = 0; value < 256; value++) {
3726 if (!isSPACE_L1(value)) {
3727 ANYOF_BITMAP_CLEAR(data->start_class, value);
3731 for (value = 0; value < 256; value++) {
3732 if (!isSPACE(value)) {
3733 ANYOF_BITMAP_CLEAR(data->start_class, value);
3740 if (data->start_class->flags & ANYOF_LOCALE) {
3741 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3743 else if (OP(scan) == SPACEU) {
3744 for (value = 0; value < 256; value++) {
3745 if (isSPACE_L1(value)) {
3746 ANYOF_BITMAP_SET(data->start_class, value);
3750 for (value = 0; value < 256; value++) {
3751 if (isSPACE(value)) {
3752 ANYOF_BITMAP_SET(data->start_class, value);
3759 if (flags & SCF_DO_STCLASS_AND) {
3760 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3761 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3762 if (OP(scan) == NSPACEU) {
3763 for (value = 0; value < 256; value++) {
3764 if (isSPACE_L1(value)) {
3765 ANYOF_BITMAP_CLEAR(data->start_class, value);
3769 for (value = 0; value < 256; value++) {
3770 if (isSPACE(value)) {
3771 ANYOF_BITMAP_CLEAR(data->start_class, value);
3778 if (data->start_class->flags & ANYOF_LOCALE)
3779 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3780 else if (OP(scan) == NSPACEU) {
3781 for (value = 0; value < 256; value++) {
3782 if (!isSPACE_L1(value)) {
3783 ANYOF_BITMAP_SET(data->start_class, value);
3788 for (value = 0; value < 256; value++) {
3789 if (!isSPACE(value)) {
3790 ANYOF_BITMAP_SET(data->start_class, value);
3797 if (flags & SCF_DO_STCLASS_AND) {
3798 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3799 for (value = 0; value < 256; value++)
3800 if (!isDIGIT(value))
3801 ANYOF_BITMAP_CLEAR(data->start_class, value);
3804 if (data->start_class->flags & ANYOF_LOCALE)
3805 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3807 for (value = 0; value < 256; value++)
3809 ANYOF_BITMAP_SET(data->start_class, value);
3814 if (flags & SCF_DO_STCLASS_AND) {
3815 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3816 for (value = 0; value < 256; value++)
3818 ANYOF_BITMAP_CLEAR(data->start_class, value);
3821 if (data->start_class->flags & ANYOF_LOCALE)
3822 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3824 for (value = 0; value < 256; value++)
3825 if (!isDIGIT(value))
3826 ANYOF_BITMAP_SET(data->start_class, value);
3830 CASE_SYNST_FNC(VERTWS);
3831 CASE_SYNST_FNC(HORIZWS);
3834 if (flags & SCF_DO_STCLASS_OR)
3835 cl_and(data->start_class, and_withp);
3836 flags &= ~SCF_DO_STCLASS;
3839 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3840 data->flags |= (OP(scan) == MEOL
3844 else if ( PL_regkind[OP(scan)] == BRANCHJ
3845 /* Lookbehind, or need to calculate parens/evals/stclass: */
3846 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3847 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3848 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3849 || OP(scan) == UNLESSM )
3851 /* Negative Lookahead/lookbehind
3852 In this case we can't do fixed string optimisation.
3855 I32 deltanext, minnext, fake = 0;
3857 struct regnode_charclass_class intrnl;
3860 data_fake.flags = 0;
3862 data_fake.whilem_c = data->whilem_c;
3863 data_fake.last_closep = data->last_closep;
3866 data_fake.last_closep = &fake;
3867 data_fake.pos_delta = delta;
3868 if ( flags & SCF_DO_STCLASS && !scan->flags
3869 && OP(scan) == IFMATCH ) { /* Lookahead */
3870 cl_init(pRExC_state, &intrnl);
3871 data_fake.start_class = &intrnl;
3872 f |= SCF_DO_STCLASS_AND;
3874 if (flags & SCF_WHILEM_VISITED_POS)
3875 f |= SCF_WHILEM_VISITED_POS;
3876 next = regnext(scan);
3877 nscan = NEXTOPER(NEXTOPER(scan));
3878 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3879 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3882 FAIL("Variable length lookbehind not implemented");
3884 else if (minnext > (I32)U8_MAX) {
3885 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3887 scan->flags = (U8)minnext;
3890 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3892 if (data_fake.flags & SF_HAS_EVAL)
3893 data->flags |= SF_HAS_EVAL;
3894 data->whilem_c = data_fake.whilem_c;
3896 if (f & SCF_DO_STCLASS_AND) {
3897 if (flags & SCF_DO_STCLASS_OR) {
3898 /* OR before, AND after: ideally we would recurse with
3899 * data_fake to get the AND applied by study of the
3900 * remainder of the pattern, and then derecurse;
3901 * *** HACK *** for now just treat as "no information".
3902 * See [perl #56690].
3904 cl_init(pRExC_state, data->start_class);
3906 /* AND before and after: combine and continue */
3907 const int was = (data->start_class->flags & ANYOF_EOS);
3909 cl_and(data->start_class, &intrnl);
3911 data->start_class->flags |= ANYOF_EOS;
3915 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3917 /* Positive Lookahead/lookbehind
3918 In this case we can do fixed string optimisation,
3919 but we must be careful about it. Note in the case of
3920 lookbehind the positions will be offset by the minimum
3921 length of the pattern, something we won't know about
3922 until after the recurse.
3924 I32 deltanext, fake = 0;
3926 struct regnode_charclass_class intrnl;
3928 /* We use SAVEFREEPV so that when the full compile
3929 is finished perl will clean up the allocated
3930 minlens when it's all done. This way we don't
3931 have to worry about freeing them when we know
3932 they wont be used, which would be a pain.
3935 Newx( minnextp, 1, I32 );
3936 SAVEFREEPV(minnextp);
3939 StructCopy(data, &data_fake, scan_data_t);
3940 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3943 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3944 data_fake.last_found=newSVsv(data->last_found);
3948 data_fake.last_closep = &fake;
3949 data_fake.flags = 0;
3950 data_fake.pos_delta = delta;
3952 data_fake.flags |= SF_IS_INF;
3953 if ( flags & SCF_DO_STCLASS && !scan->flags
3954 && OP(scan) == IFMATCH ) { /* Lookahead */
3955 cl_init(pRExC_state, &intrnl);
3956 data_fake.start_class = &intrnl;
3957 f |= SCF_DO_STCLASS_AND;
3959 if (flags & SCF_WHILEM_VISITED_POS)
3960 f |= SCF_WHILEM_VISITED_POS;
3961 next = regnext(scan);
3962 nscan = NEXTOPER(NEXTOPER(scan));
3964 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3965 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3968 FAIL("Variable length lookbehind not implemented");
3970 else if (*minnextp > (I32)U8_MAX) {
3971 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3973 scan->flags = (U8)*minnextp;
3978 if (f & SCF_DO_STCLASS_AND) {
3979 const int was = (data->start_class->flags & ANYOF_EOS);
3981 cl_and(data->start_class, &intrnl);
3983 data->start_class->flags |= ANYOF_EOS;
3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3988 if (data_fake.flags & SF_HAS_EVAL)
3989 data->flags |= SF_HAS_EVAL;
3990 data->whilem_c = data_fake.whilem_c;
3991 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3992 if (RExC_rx->minlen<*minnextp)
3993 RExC_rx->minlen=*minnextp;
3994 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
3995 SvREFCNT_dec(data_fake.last_found);
3997 if ( data_fake.minlen_fixed != minlenp )
3999 data->offset_fixed= data_fake.offset_fixed;
4000 data->minlen_fixed= data_fake.minlen_fixed;
4001 data->lookbehind_fixed+= scan->flags;
4003 if ( data_fake.minlen_float != minlenp )
4005 data->minlen_float= data_fake.minlen_float;
4006 data->offset_float_min=data_fake.offset_float_min;
4007 data->offset_float_max=data_fake.offset_float_max;
4008 data->lookbehind_float+= scan->flags;
4017 else if (OP(scan) == OPEN) {
4018 if (stopparen != (I32)ARG(scan))
4021 else if (OP(scan) == CLOSE) {
4022 if (stopparen == (I32)ARG(scan)) {
4025 if ((I32)ARG(scan) == is_par) {
4026 next = regnext(scan);
4028 if ( next && (OP(next) != WHILEM) && next < last)
4029 is_par = 0; /* Disable optimization */
4032 *(data->last_closep) = ARG(scan);
4034 else if (OP(scan) == EVAL) {
4036 data->flags |= SF_HAS_EVAL;
4038 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4039 if (flags & SCF_DO_SUBSTR) {
4040 SCAN_COMMIT(pRExC_state,data,minlenp);
4041 flags &= ~SCF_DO_SUBSTR;
4043 if (data && OP(scan)==ACCEPT) {
4044 data->flags |= SCF_SEEN_ACCEPT;
4049 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4051 if (flags & SCF_DO_SUBSTR) {
4052 SCAN_COMMIT(pRExC_state,data,minlenp);
4053 data->longest = &(data->longest_float);
4055 is_inf = is_inf_internal = 1;
4056 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4057 cl_anything(pRExC_state, data->start_class);
4058 flags &= ~SCF_DO_STCLASS;
4060 else if (OP(scan) == GPOS) {
4061 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4062 !(delta || is_inf || (data && data->pos_delta)))
4064 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4065 RExC_rx->extflags |= RXf_ANCH_GPOS;
4066 if (RExC_rx->gofs < (U32)min)
4067 RExC_rx->gofs = min;
4069 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4073 #ifdef TRIE_STUDY_OPT
4074 #ifdef FULL_TRIE_STUDY
4075 else if (PL_regkind[OP(scan)] == TRIE) {
4076 /* NOTE - There is similar code to this block above for handling
4077 BRANCH nodes on the initial study. If you change stuff here
4079 regnode *trie_node= scan;
4080 regnode *tail= regnext(scan);
4081 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4082 I32 max1 = 0, min1 = I32_MAX;
4083 struct regnode_charclass_class accum;
4085 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4086 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4087 if (flags & SCF_DO_STCLASS)
4088 cl_init_zero(pRExC_state, &accum);
4094 const regnode *nextbranch= NULL;
4097 for ( word=1 ; word <= trie->wordcount ; word++)
4099 I32 deltanext=0, minnext=0, f = 0, fake;
4100 struct regnode_charclass_class this_class;
4102 data_fake.flags = 0;
4104 data_fake.whilem_c = data->whilem_c;
4105 data_fake.last_closep = data->last_closep;
4108 data_fake.last_closep = &fake;
4109 data_fake.pos_delta = delta;
4110 if (flags & SCF_DO_STCLASS) {
4111 cl_init(pRExC_state, &this_class);
4112 data_fake.start_class = &this_class;
4113 f = SCF_DO_STCLASS_AND;
4115 if (flags & SCF_WHILEM_VISITED_POS)
4116 f |= SCF_WHILEM_VISITED_POS;
4118 if (trie->jump[word]) {
4120 nextbranch = trie_node + trie->jump[0];
4121 scan= trie_node + trie->jump[word];
4122 /* We go from the jump point to the branch that follows
4123 it. Note this means we need the vestigal unused branches
4124 even though they arent otherwise used.
4126 minnext = study_chunk(pRExC_state, &scan, minlenp,
4127 &deltanext, (regnode *)nextbranch, &data_fake,
4128 stopparen, recursed, NULL, f,depth+1);
4130 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4131 nextbranch= regnext((regnode*)nextbranch);
4133 if (min1 > (I32)(minnext + trie->minlen))
4134 min1 = minnext + trie->minlen;
4135 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4136 max1 = minnext + deltanext + trie->maxlen;
4137 if (deltanext == I32_MAX)
4138 is_inf = is_inf_internal = 1;
4140 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4142 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4143 if ( stopmin > min + min1)
4144 stopmin = min + min1;
4145 flags &= ~SCF_DO_SUBSTR;
4147 data->flags |= SCF_SEEN_ACCEPT;
4150 if (data_fake.flags & SF_HAS_EVAL)
4151 data->flags |= SF_HAS_EVAL;
4152 data->whilem_c = data_fake.whilem_c;
4154 if (flags & SCF_DO_STCLASS)
4155 cl_or(pRExC_state, &accum, &this_class);
4158 if (flags & SCF_DO_SUBSTR) {
4159 data->pos_min += min1;
4160 data->pos_delta += max1 - min1;
4161 if (max1 != min1 || is_inf)
4162 data->longest = &(data->longest_float);
4165 delta += max1 - min1;
4166 if (flags & SCF_DO_STCLASS_OR) {
4167 cl_or(pRExC_state, data->start_class, &accum);
4169 cl_and(data->start_class, and_withp);
4170 flags &= ~SCF_DO_STCLASS;
4173 else if (flags & SCF_DO_STCLASS_AND) {
4175 cl_and(data->start_class, &accum);
4176 flags &= ~SCF_DO_STCLASS;
4179 /* Switch to OR mode: cache the old value of
4180 * data->start_class */
4182 StructCopy(data->start_class, and_withp,
4183 struct regnode_charclass_class);
4184 flags &= ~SCF_DO_STCLASS_AND;
4185 StructCopy(&accum, data->start_class,
4186 struct regnode_charclass_class);
4187 flags |= SCF_DO_STCLASS_OR;
4188 data->start_class->flags |= ANYOF_EOS;
4195 else if (PL_regkind[OP(scan)] == TRIE) {
4196 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4199 min += trie->minlen;
4200 delta += (trie->maxlen - trie->minlen);
4201 flags &= ~SCF_DO_STCLASS; /* xxx */
4202 if (flags & SCF_DO_SUBSTR) {
4203 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4204 data->pos_min += trie->minlen;
4205 data->pos_delta += (trie->maxlen - trie->minlen);
4206 if (trie->maxlen != trie->minlen)
4207 data->longest = &(data->longest_float);
4209 if (trie->jump) /* no more substrings -- for now /grr*/
4210 flags &= ~SCF_DO_SUBSTR;
4212 #endif /* old or new */
4213 #endif /* TRIE_STUDY_OPT */
4215 /* Else: zero-length, ignore. */
4216 scan = regnext(scan);
4221 stopparen = frame->stop;
4222 frame = frame->prev;
4223 goto fake_study_recurse;
4228 DEBUG_STUDYDATA("pre-fin:",data,depth);
4231 *deltap = is_inf_internal ? I32_MAX : delta;
4232 if (flags & SCF_DO_SUBSTR && is_inf)
4233 data->pos_delta = I32_MAX - data->pos_min;
4234 if (is_par > (I32)U8_MAX)
4236 if (is_par && pars==1 && data) {
4237 data->flags |= SF_IN_PAR;
4238 data->flags &= ~SF_HAS_PAR;
4240 else if (pars && data) {
4241 data->flags |= SF_HAS_PAR;
4242 data->flags &= ~SF_IN_PAR;
4244 if (flags & SCF_DO_STCLASS_OR)
4245 cl_and(data->start_class, and_withp);
4246 if (flags & SCF_TRIE_RESTUDY)
4247 data->flags |= SCF_TRIE_RESTUDY;
4249 DEBUG_STUDYDATA("post-fin:",data,depth);
4251 return min < stopmin ? min : stopmin;
4255 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4257 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4259 PERL_ARGS_ASSERT_ADD_DATA;
4261 Renewc(RExC_rxi->data,
4262 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4263 char, struct reg_data);
4265 Renew(RExC_rxi->data->what, count + n, U8);
4267 Newx(RExC_rxi->data->what, n, U8);
4268 RExC_rxi->data->count = count + n;
4269 Copy(s, RExC_rxi->data->what + count, n, U8);
4273 /*XXX: todo make this not included in a non debugging perl */
4274 #ifndef PERL_IN_XSUB_RE
4276 Perl_reginitcolors(pTHX)
4279 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4281 char *t = savepv(s);
4285 t = strchr(t, '\t');
4291 PL_colors[i] = t = (char *)"";
4296 PL_colors[i++] = (char *)"";
4303 #ifdef TRIE_STUDY_OPT
4304 #define CHECK_RESTUDY_GOTO \
4306 (data.flags & SCF_TRIE_RESTUDY) \
4310 #define CHECK_RESTUDY_GOTO
4314 - pregcomp - compile a regular expression into internal code
4316 * We can't allocate space until we know how big the compiled form will be,
4317 * but we can't compile it (and thus know how big it is) until we've got a
4318 * place to put the code. So we cheat: we compile it twice, once with code
4319 * generation turned off and size counting turned on, and once "for real".
4320 * This also means that we don't allocate space until we are sure that the
4321 * thing really will compile successfully, and we never have to move the
4322 * code and thus invalidate pointers into it. (Note that it has to be in
4323 * one piece because free() must be able to free it all.) [NB: not true in perl]
4325 * Beware that the optimization-preparation code in here knows about some
4326 * of the structure of the compiled regexp. [I'll say.]
4331 #ifndef PERL_IN_XSUB_RE
4332 #define RE_ENGINE_PTR &PL_core_reg_engine
4334 extern const struct regexp_engine my_reg_engine;
4335 #define RE_ENGINE_PTR &my_reg_engine
4338 #ifndef PERL_IN_XSUB_RE
4340 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4343 HV * const table = GvHV(PL_hintgv);
4345 PERL_ARGS_ASSERT_PREGCOMP;
4347 /* Dispatch a request to compile a regexp to correct
4350 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4351 GET_RE_DEBUG_FLAGS_DECL;
4352 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4353 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4355 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4358 return CALLREGCOMP_ENG(eng, pattern, flags);
4361 return Perl_re_compile(aTHX_ pattern, flags);
4366 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4371 register regexp_internal *ri;
4380 /* these are all flags - maybe they should be turned
4381 * into a single int with different bit masks */
4382 I32 sawlookahead = 0;
4385 bool used_setjump = FALSE;
4390 RExC_state_t RExC_state;
4391 RExC_state_t * const pRExC_state = &RExC_state;
4392 #ifdef TRIE_STUDY_OPT
4394 RExC_state_t copyRExC_state;
4396 GET_RE_DEBUG_FLAGS_DECL;
4398 PERL_ARGS_ASSERT_RE_COMPILE;
4400 DEBUG_r(if (!PL_colorset) reginitcolors());
4402 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4403 RExC_uni_semantics = 0;
4405 /****************** LONG JUMP TARGET HERE***********************/
4406 /* Longjmp back to here if have to switch in midstream to utf8 */
4407 if (! RExC_orig_utf8) {
4408 JMPENV_PUSH(jump_ret);
4409 used_setjump = TRUE;
4412 if (jump_ret == 0) { /* First time through */
4413 exp = SvPV(pattern, plen);
4415 /* ignore the utf8ness if the pattern is 0 length */
4417 RExC_utf8 = RExC_orig_utf8 = 0;
4421 SV *dsv= sv_newmortal();
4422 RE_PV_QUOTED_DECL(s, RExC_utf8,
4423 dsv, exp, plen, 60);
4424 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4425 PL_colors[4],PL_colors[5],s);
4428 else { /* longjumped back */
4431 /* If the cause for the longjmp was other than changing to utf8, pop
4432 * our own setjmp, and longjmp to the correct handler */
4433 if (jump_ret != UTF8_LONGJMP) {
4435 JMPENV_JUMP(jump_ret);
4440 /* It's possible to write a regexp in ascii that represents Unicode
4441 codepoints outside of the byte range, such as via \x{100}. If we
4442 detect such a sequence we have to convert the entire pattern to utf8
4443 and then recompile, as our sizing calculation will have been based
4444 on 1 byte == 1 character, but we will need to use utf8 to encode
4445 at least some part of the pattern, and therefore must convert the whole
4448 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4449 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4450 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4452 RExC_orig_utf8 = RExC_utf8 = 1;
4456 #ifdef TRIE_STUDY_OPT
4460 /* Set to use unicode semantics if the pattern is in utf8 and has the
4461 * 'depends' charset specified, as it means unicode when utf8 */
4462 pm_flags = orig_pm_flags;
4464 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4465 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4469 RExC_flags = pm_flags;
4473 RExC_in_lookbehind = 0;
4474 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4475 RExC_seen_evals = 0;
4478 /* First pass: determine size, legality. */
4486 RExC_emit = &PL_regdummy;
4487 RExC_whilem_seen = 0;
4488 RExC_open_parens = NULL;
4489 RExC_close_parens = NULL;
4491 RExC_paren_names = NULL;
4493 RExC_paren_name_list = NULL;
4495 RExC_recurse = NULL;
4496 RExC_recurse_count = 0;
4498 #if 0 /* REGC() is (currently) a NOP at the first pass.
4499 * Clever compilers notice this and complain. --jhi */
4500 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4502 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4503 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4504 RExC_precomp = NULL;
4508 /* Here, finished first pass. Get rid of any added setjmp */
4514 PerlIO_printf(Perl_debug_log,
4515 "Required size %"IVdf" nodes\n"
4516 "Starting second pass (creation)\n",
4519 RExC_lastparse=NULL;
4522 /* The first pass could have found things that force Unicode semantics */
4523 if ((RExC_utf8 || RExC_uni_semantics)
4524 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4526 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4529 /* Small enough for pointer-storage convention?
4530 If extralen==0, this means that we will not need long jumps. */
4531 if (RExC_size >= 0x10000L && RExC_extralen)
4532 RExC_size += RExC_extralen;
4535 if (RExC_whilem_seen > 15)
4536 RExC_whilem_seen = 15;
4538 /* Allocate space and zero-initialize. Note, the two step process
4539 of zeroing when in debug mode, thus anything assigned has to
4540 happen after that */
4541 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4542 r = (struct regexp*)SvANY(rx);
4543 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4544 char, regexp_internal);
4545 if ( r == NULL || ri == NULL )
4546 FAIL("Regexp out of space");
4548 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4549 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4551 /* bulk initialize base fields with 0. */
4552 Zero(ri, sizeof(regexp_internal), char);
4555 /* non-zero initialization begins here */
4557 r->engine= RE_ENGINE_PTR;
4558 r->extflags = pm_flags;
4560 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4561 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4563 /* The caret is output if there are any defaults: if not all the STD
4564 * flags are set, or if no character set specifier is needed */
4566 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4568 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4569 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4570 >> RXf_PMf_STD_PMMOD_SHIFT);
4571 const char *fptr = STD_PAT_MODS; /*"msix"*/
4573 /* Allocate for the worst case, which is all the std flags are turned
4574 * on. If more precision is desired, we could do a population count of
4575 * the flags set. This could be done with a small lookup table, or by
4576 * shifting, masking and adding, or even, when available, assembly
4577 * language for a machine-language population count.
4578 * We never output a minus, as all those are defaults, so are
4579 * covered by the caret */
4580 const STRLEN wraplen = plen + has_p + has_runon
4581 + has_default /* If needs a caret */
4583 /* If needs a character set specifier */
4584 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4585 + (sizeof(STD_PAT_MODS) - 1)
4586 + (sizeof("(?:)") - 1);
4588 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4590 SvFLAGS(rx) |= SvUTF8(pattern);
4593 /* If a default, cover it using the caret */
4595 *p++= DEFAULT_PAT_MOD;
4599 const char* const name = get_regex_charset_name(r->extflags, &len);
4600 Copy(name, p, len, char);
4604 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4607 while((ch = *fptr++)) {
4615 Copy(RExC_precomp, p, plen, char);
4616 assert ((RX_WRAPPED(rx) - p) < 16);
4617 r->pre_prefix = p - RX_WRAPPED(rx);
4623 SvCUR_set(rx, p - SvPVX_const(rx));
4627 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4629 if (RExC_seen & REG_SEEN_RECURSE) {
4630 Newxz(RExC_open_parens, RExC_npar,regnode *);
4631 SAVEFREEPV(RExC_open_parens);
4632 Newxz(RExC_close_parens,RExC_npar,regnode *);
4633 SAVEFREEPV(RExC_close_parens);
4636 /* Useful during FAIL. */
4637 #ifdef RE_TRACK_PATTERN_OFFSETS
4638 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4639 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4640 "%s %"UVuf" bytes for offset annotations.\n",
4641 ri->u.offsets ? "Got" : "Couldn't get",
4642 (UV)((2*RExC_size+1) * sizeof(U32))));
4644 SetProgLen(ri,RExC_size);
4649 /* Second pass: emit code. */
4650 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4655 RExC_emit_start = ri->program;
4656 RExC_emit = ri->program;
4657 RExC_emit_bound = ri->program + RExC_size + 1;
4659 /* Store the count of eval-groups for security checks: */
4660 RExC_rx->seen_evals = RExC_seen_evals;
4661 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4662 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4666 /* XXXX To minimize changes to RE engine we always allocate
4667 3-units-long substrs field. */
4668 Newx(r->substrs, 1, struct reg_substr_data);
4669 if (RExC_recurse_count) {
4670 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4671 SAVEFREEPV(RExC_recurse);
4675 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4676 Zero(r->substrs, 1, struct reg_substr_data);
4678 #ifdef TRIE_STUDY_OPT
4680 StructCopy(&zero_scan_data, &data, scan_data_t);
4681 copyRExC_state = RExC_state;
4684 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4686 RExC_state = copyRExC_state;
4687 if (seen & REG_TOP_LEVEL_BRANCHES)
4688 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4690 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4691 if (data.last_found) {
4692 SvREFCNT_dec(data.longest_fixed);
4693 SvREFCNT_dec(data.longest_float);
4694 SvREFCNT_dec(data.last_found);
4696 StructCopy(&zero_scan_data, &data, scan_data_t);
4699 StructCopy(&zero_scan_data, &data, scan_data_t);
4702 /* Dig out information for optimizations. */
4703 r->extflags = RExC_flags; /* was pm_op */
4704 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4707 SvUTF8_on(rx); /* Unicode in it? */
4708 ri->regstclass = NULL;
4709 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4710 r->intflags |= PREGf_NAUGHTY;
4711 scan = ri->program + 1; /* First BRANCH. */
4713 /* testing for BRANCH here tells us whether there is "must appear"
4714 data in the pattern. If there is then we can use it for optimisations */
4715 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4717 STRLEN longest_float_length, longest_fixed_length;
4718 struct regnode_charclass_class ch_class; /* pointed to by data */
4720 I32 last_close = 0; /* pointed to by data */
4721 regnode *first= scan;
4722 regnode *first_next= regnext(first);
4724 * Skip introductions and multiplicators >= 1
4725 * so that we can extract the 'meat' of the pattern that must
4726 * match in the large if() sequence following.
4727 * NOTE that EXACT is NOT covered here, as it is normally
4728 * picked up by the optimiser separately.
4730 * This is unfortunate as the optimiser isnt handling lookahead
4731 * properly currently.
4734 while ((OP(first) == OPEN && (sawopen = 1)) ||
4735 /* An OR of *one* alternative - should not happen now. */
4736 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4737 /* for now we can't handle lookbehind IFMATCH*/
4738 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4739 (OP(first) == PLUS) ||
4740 (OP(first) == MINMOD) ||
4741 /* An {n,m} with n>0 */
4742 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4743 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4746 * the only op that could be a regnode is PLUS, all the rest
4747 * will be regnode_1 or regnode_2.
4750 if (OP(first) == PLUS)
4753 first += regarglen[OP(first)];
4755 first = NEXTOPER(first);
4756 first_next= regnext(first);
4759 /* Starting-point info. */
4761 DEBUG_PEEP("first:",first,0);
4762 /* Ignore EXACT as we deal with it later. */
4763 if (PL_regkind[OP(first)] == EXACT) {
4764 if (OP(first) == EXACT)
4765 NOOP; /* Empty, get anchored substr later. */
4767 ri->regstclass = first;
4770 else if (PL_regkind[OP(first)] == TRIE &&
4771 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4774 /* this can happen only on restudy */
4775 if ( OP(first) == TRIE ) {
4776 struct regnode_1 *trieop = (struct regnode_1 *)
4777 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4778 StructCopy(first,trieop,struct regnode_1);
4779 trie_op=(regnode *)trieop;
4781 struct regnode_charclass *trieop = (struct regnode_charclass *)
4782 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4783 StructCopy(first,trieop,struct regnode_charclass);
4784 trie_op=(regnode *)trieop;
4787 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4788 ri->regstclass = trie_op;
4791 else if (REGNODE_SIMPLE(OP(first)))
4792 ri->regstclass = first;
4793 else if (PL_regkind[OP(first)] == BOUND ||
4794 PL_regkind[OP(first)] == NBOUND)
4795 ri->regstclass = first;
4796 else if (PL_regkind[OP(first)] == BOL) {
4797 r->extflags |= (OP(first) == MBOL
4799 : (OP(first) == SBOL
4802 first = NEXTOPER(first);
4805 else if (OP(first) == GPOS) {
4806 r->extflags |= RXf_ANCH_GPOS;
4807 first = NEXTOPER(first);
4810 else if ((!sawopen || !RExC_sawback) &&
4811 (OP(first) == STAR &&
4812 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4813 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4815 /* turn .* into ^.* with an implied $*=1 */
4817 (OP(NEXTOPER(first)) == REG_ANY)
4820 r->extflags |= type;
4821 r->intflags |= PREGf_IMPLICIT;
4822 first = NEXTOPER(first);
4825 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4826 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4827 /* x+ must match at the 1st pos of run of x's */
4828 r->intflags |= PREGf_SKIP;
4830 /* Scan is after the zeroth branch, first is atomic matcher. */
4831 #ifdef TRIE_STUDY_OPT
4834 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4835 (IV)(first - scan + 1))
4839 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4840 (IV)(first - scan + 1))
4846 * If there's something expensive in the r.e., find the
4847 * longest literal string that must appear and make it the
4848 * regmust. Resolve ties in favor of later strings, since
4849 * the regstart check works with the beginning of the r.e.
4850 * and avoiding duplication strengthens checking. Not a
4851 * strong reason, but sufficient in the absence of others.
4852 * [Now we resolve ties in favor of the earlier string if
4853 * it happens that c_offset_min has been invalidated, since the
4854 * earlier string may buy us something the later one won't.]
4857 data.longest_fixed = newSVpvs("");
4858 data.longest_float = newSVpvs("");
4859 data.last_found = newSVpvs("");
4860 data.longest = &(data.longest_fixed);
4862 if (!ri->regstclass) {
4863 cl_init(pRExC_state, &ch_class);
4864 data.start_class = &ch_class;
4865 stclass_flag = SCF_DO_STCLASS_AND;
4866 } else /* XXXX Check for BOUND? */
4868 data.last_closep = &last_close;
4870 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4871 &data, -1, NULL, NULL,
4872 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4878 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4879 && data.last_start_min == 0 && data.last_end > 0
4880 && !RExC_seen_zerolen
4881 && !(RExC_seen & REG_SEEN_VERBARG)
4882 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4883 r->extflags |= RXf_CHECK_ALL;
4884 scan_commit(pRExC_state, &data,&minlen,0);
4885 SvREFCNT_dec(data.last_found);
4887 /* Note that code very similar to this but for anchored string
4888 follows immediately below, changes may need to be made to both.
4891 longest_float_length = CHR_SVLEN(data.longest_float);
4892 if (longest_float_length
4893 || (data.flags & SF_FL_BEFORE_EOL
4894 && (!(data.flags & SF_FL_BEFORE_MEOL)
4895 || (RExC_flags & RXf_PMf_MULTILINE))))
4899 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4900 && data.offset_fixed == data.offset_float_min
4901 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4902 goto remove_float; /* As in (a)+. */
4904 /* copy the information about the longest float from the reg_scan_data
4905 over to the program. */
4906 if (SvUTF8(data.longest_float)) {
4907 r->float_utf8 = data.longest_float;
4908 r->float_substr = NULL;
4910 r->float_substr = data.longest_float;
4911 r->float_utf8 = NULL;
4913 /* float_end_shift is how many chars that must be matched that
4914 follow this item. We calculate it ahead of time as once the
4915 lookbehind offset is added in we lose the ability to correctly
4917 ml = data.minlen_float ? *(data.minlen_float)
4918 : (I32)longest_float_length;
4919 r->float_end_shift = ml - data.offset_float_min
4920 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4921 + data.lookbehind_float;
4922 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4923 r->float_max_offset = data.offset_float_max;
4924 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4925 r->float_max_offset -= data.lookbehind_float;
4927 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4928 && (!(data.flags & SF_FL_BEFORE_MEOL)
4929 || (RExC_flags & RXf_PMf_MULTILINE)));
4930 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4934 r->float_substr = r->float_utf8 = NULL;
4935 SvREFCNT_dec(data.longest_float);
4936 longest_float_length = 0;
4939 /* Note that code very similar to this but for floating string
4940 is immediately above, changes may need to be made to both.
4943 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4944 if (longest_fixed_length
4945 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4946 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4947 || (RExC_flags & RXf_PMf_MULTILINE))))
4951 /* copy the information about the longest fixed
4952 from the reg_scan_data over to the program. */
4953 if (SvUTF8(data.longest_fixed)) {
4954 r->anchored_utf8 = data.longest_fixed;
4955 r->anchored_substr = NULL;
4957 r->anchored_substr = data.longest_fixed;
4958 r->anchored_utf8 = NULL;
4960 /* fixed_end_shift is how many chars that must be matched that
4961 follow this item. We calculate it ahead of time as once the
4962 lookbehind offset is added in we lose the ability to correctly
4964 ml = data.minlen_fixed ? *(data.minlen_fixed)
4965 : (I32)longest_fixed_length;
4966 r->anchored_end_shift = ml - data.offset_fixed
4967 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4968 + data.lookbehind_fixed;
4969 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4971 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4972 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4973 || (RExC_flags & RXf_PMf_MULTILINE)));
4974 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4977 r->anchored_substr = r->anchored_utf8 = NULL;
4978 SvREFCNT_dec(data.longest_fixed);
4979 longest_fixed_length = 0;
4982 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4983 ri->regstclass = NULL;
4985 /* If the synthetic start class were to ever be used when EOS is set,
4986 * that bit would have to be cleared, as it is shared with another */
4987 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4989 && !(data.start_class->flags & ANYOF_EOS)
4990 && !cl_is_anything(data.start_class))
4992 const U32 n = add_data(pRExC_state, 1, "f");
4994 Newx(RExC_rxi->data->data[n], 1,
4995 struct regnode_charclass_class);
4996 StructCopy(data.start_class,
4997 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4998 struct regnode_charclass_class);
4999 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5000 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5001 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5002 regprop(r, sv, (regnode*)data.start_class);
5003 PerlIO_printf(Perl_debug_log,
5004 "synthetic stclass \"%s\".\n",
5005 SvPVX_const(sv));});
5008 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5009 if (longest_fixed_length > longest_float_length) {
5010 r->check_end_shift = r->anchored_end_shift;
5011 r->check_substr = r->anchored_substr;
5012 r->check_utf8 = r->anchored_utf8;
5013 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5014 if (r->extflags & RXf_ANCH_SINGLE)
5015 r->extflags |= RXf_NOSCAN;
5018 r->check_end_shift = r->float_end_shift;
5019 r->check_substr = r->float_substr;
5020 r->check_utf8 = r->float_utf8;
5021 r->check_offset_min = r->float_min_offset;
5022 r->check_offset_max = r->float_max_offset;
5024 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5025 This should be changed ASAP! */
5026 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5027 r->extflags |= RXf_USE_INTUIT;
5028 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5029 r->extflags |= RXf_INTUIT_TAIL;
5031 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5032 if ( (STRLEN)minlen < longest_float_length )
5033 minlen= longest_float_length;
5034 if ( (STRLEN)minlen < longest_fixed_length )
5035 minlen= longest_fixed_length;
5039 /* Several toplevels. Best we can is to set minlen. */
5041 struct regnode_charclass_class ch_class;
5044 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5046 scan = ri->program + 1;
5047 cl_init(pRExC_state, &ch_class);
5048 data.start_class = &ch_class;
5049 data.last_closep = &last_close;
5052 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5053 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5058 = r->float_substr = r->float_utf8 = NULL;
5060 /* If the synthetic start class were to ever be used when EOS is set,
5061 * that bit would have to be cleared, as it is shared with another */
5062 if (!(data.start_class->flags & ANYOF_EOS)
5063 && !cl_is_anything(data.start_class))
5065 const U32 n = add_data(pRExC_state, 1, "f");
5067 Newx(RExC_rxi->data->data[n], 1,
5068 struct regnode_charclass_class);
5069 StructCopy(data.start_class,
5070 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5071 struct regnode_charclass_class);
5072 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5073 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5074 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5075 regprop(r, sv, (regnode*)data.start_class);
5076 PerlIO_printf(Perl_debug_log,
5077 "synthetic stclass \"%s\".\n",
5078 SvPVX_const(sv));});
5082 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5083 the "real" pattern. */
5085 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5086 (IV)minlen, (IV)r->minlen);
5088 r->minlenret = minlen;
5089 if (r->minlen < minlen)
5092 if (RExC_seen & REG_SEEN_GPOS)
5093 r->extflags |= RXf_GPOS_SEEN;
5094 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5095 r->extflags |= RXf_LOOKBEHIND_SEEN;
5096 if (RExC_seen & REG_SEEN_EVAL)
5097 r->extflags |= RXf_EVAL_SEEN;
5098 if (RExC_seen & REG_SEEN_CANY)
5099 r->extflags |= RXf_CANY_SEEN;
5100 if (RExC_seen & REG_SEEN_VERBARG)
5101 r->intflags |= PREGf_VERBARG_SEEN;
5102 if (RExC_seen & REG_SEEN_CUTGROUP)
5103 r->intflags |= PREGf_CUTGROUP_SEEN;
5104 if (RExC_paren_names)
5105 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5107 RXp_PAREN_NAMES(r) = NULL;
5109 #ifdef STUPID_PATTERN_CHECKS
5110 if (RX_PRELEN(rx) == 0)
5111 r->extflags |= RXf_NULL;
5112 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5113 /* XXX: this should happen BEFORE we compile */
5114 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5115 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5116 r->extflags |= RXf_WHITE;
5117 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5118 r->extflags |= RXf_START_ONLY;
5120 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5121 /* XXX: this should happen BEFORE we compile */
5122 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5124 regnode *first = ri->program + 1;
5127 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5128 r->extflags |= RXf_NULL;
5129 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5130 r->extflags |= RXf_START_ONLY;
5131 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5132 && OP(regnext(first)) == END)
5133 r->extflags |= RXf_WHITE;
5137 if (RExC_paren_names) {
5138 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5139 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5142 ri->name_list_idx = 0;
5144 if (RExC_recurse_count) {
5145 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5146 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5147 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5150 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5151 /* assume we don't need to swap parens around before we match */
5154 PerlIO_printf(Perl_debug_log,"Final program:\n");
5157 #ifdef RE_TRACK_PATTERN_OFFSETS
5158 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5159 const U32 len = ri->u.offsets[0];
5161 GET_RE_DEBUG_FLAGS_DECL;
5162 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5163 for (i = 1; i <= len; i++) {
5164 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5165 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5166 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5168 PerlIO_printf(Perl_debug_log, "\n");
5174 #undef RE_ENGINE_PTR
5178 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5181 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5183 PERL_UNUSED_ARG(value);
5185 if (flags & RXapif_FETCH) {
5186 return reg_named_buff_fetch(rx, key, flags);
5187 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5188 Perl_croak_no_modify(aTHX);
5190 } else if (flags & RXapif_EXISTS) {
5191 return reg_named_buff_exists(rx, key, flags)
5194 } else if (flags & RXapif_REGNAMES) {
5195 return reg_named_buff_all(rx, flags);
5196 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5197 return reg_named_buff_scalar(rx, flags);
5199 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5205 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5208 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5209 PERL_UNUSED_ARG(lastkey);
5211 if (flags & RXapif_FIRSTKEY)
5212 return reg_named_buff_firstkey(rx, flags);
5213 else if (flags & RXapif_NEXTKEY)
5214 return reg_named_buff_nextkey(rx, flags);
5216 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5222 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5225 AV *retarray = NULL;
5227 struct regexp *const rx = (struct regexp *)SvANY(r);
5229 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5231 if (flags & RXapif_ALL)
5234 if (rx && RXp_PAREN_NAMES(rx)) {
5235 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5238 SV* sv_dat=HeVAL(he_str);
5239 I32 *nums=(I32*)SvPVX(sv_dat);
5240 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5241 if ((I32)(rx->nparens) >= nums[i]
5242 && rx->offs[nums[i]].start != -1
5243 && rx->offs[nums[i]].end != -1)
5246 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5250 ret = newSVsv(&PL_sv_undef);
5253 av_push(retarray, ret);
5256 return newRV_noinc(MUTABLE_SV(retarray));
5263 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5266 struct regexp *const rx = (struct regexp *)SvANY(r);
5268 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5270 if (rx && RXp_PAREN_NAMES(rx)) {
5271 if (flags & RXapif_ALL) {
5272 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5274 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5288 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5290 struct regexp *const rx = (struct regexp *)SvANY(r);
5292 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5294 if ( rx && RXp_PAREN_NAMES(rx) ) {
5295 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5297 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5304 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5306 struct regexp *const rx = (struct regexp *)SvANY(r);
5307 GET_RE_DEBUG_FLAGS_DECL;
5309 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5311 if (rx && RXp_PAREN_NAMES(rx)) {
5312 HV *hv = RXp_PAREN_NAMES(rx);
5314 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5317 SV* sv_dat = HeVAL(temphe);
5318 I32 *nums = (I32*)SvPVX(sv_dat);
5319 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5320 if ((I32)(rx->lastparen) >= nums[i] &&
5321 rx->offs[nums[i]].start != -1 &&
5322 rx->offs[nums[i]].end != -1)
5328 if (parno || flags & RXapif_ALL) {
5329 return newSVhek(HeKEY_hek(temphe));
5337 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5342 struct regexp *const rx = (struct regexp *)SvANY(r);
5344 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5346 if (rx && RXp_PAREN_NAMES(rx)) {
5347 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5348 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5349 } else if (flags & RXapif_ONE) {
5350 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5351 av = MUTABLE_AV(SvRV(ret));
5352 length = av_len(av);
5354 return newSViv(length + 1);
5356 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5360 return &PL_sv_undef;
5364 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5366 struct regexp *const rx = (struct regexp *)SvANY(r);
5369 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5371 if (rx && RXp_PAREN_NAMES(rx)) {
5372 HV *hv= RXp_PAREN_NAMES(rx);
5374 (void)hv_iterinit(hv);
5375 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5378 SV* sv_dat = HeVAL(temphe);
5379 I32 *nums = (I32*)SvPVX(sv_dat);
5380 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5381 if ((I32)(rx->lastparen) >= nums[i] &&
5382 rx->offs[nums[i]].start != -1 &&
5383 rx->offs[nums[i]].end != -1)
5389 if (parno || flags & RXapif_ALL) {
5390 av_push(av, newSVhek(HeKEY_hek(temphe)));
5395 return newRV_noinc(MUTABLE_SV(av));
5399 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5402 struct regexp *const rx = (struct regexp *)SvANY(r);
5407 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5410 sv_setsv(sv,&PL_sv_undef);
5414 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5416 i = rx->offs[0].start;
5420 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5422 s = rx->subbeg + rx->offs[0].end;
5423 i = rx->sublen - rx->offs[0].end;
5426 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5427 (s1 = rx->offs[paren].start) != -1 &&
5428 (t1 = rx->offs[paren].end) != -1)
5432 s = rx->subbeg + s1;
5434 sv_setsv(sv,&PL_sv_undef);
5437 assert(rx->sublen >= (s - rx->subbeg) + i );
5439 const int oldtainted = PL_tainted;
5441 sv_setpvn(sv, s, i);
5442 PL_tainted = oldtainted;
5443 if ( (rx->extflags & RXf_CANY_SEEN)
5444 ? (RXp_MATCH_UTF8(rx)
5445 && (!i || is_utf8_string((U8*)s, i)))
5446 : (RXp_MATCH_UTF8(rx)) )
5453 if (RXp_MATCH_TAINTED(rx)) {
5454 if (SvTYPE(sv) >= SVt_PVMG) {
5455 MAGIC* const mg = SvMAGIC(sv);
5458 SvMAGIC_set(sv, mg->mg_moremagic);
5460 if ((mgt = SvMAGIC(sv))) {
5461 mg->mg_moremagic = mgt;
5462 SvMAGIC_set(sv, mg);
5472 sv_setsv(sv,&PL_sv_undef);
5478 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5479 SV const * const value)
5481 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5483 PERL_UNUSED_ARG(rx);
5484 PERL_UNUSED_ARG(paren);
5485 PERL_UNUSED_ARG(value);
5488 Perl_croak_no_modify(aTHX);
5492 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5495 struct regexp *const rx = (struct regexp *)SvANY(r);
5499 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5501 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5503 /* $` / ${^PREMATCH} */
5504 case RX_BUFF_IDX_PREMATCH:
5505 if (rx->offs[0].start != -1) {
5506 i = rx->offs[0].start;
5514 /* $' / ${^POSTMATCH} */
5515 case RX_BUFF_IDX_POSTMATCH:
5516 if (rx->offs[0].end != -1) {
5517 i = rx->sublen - rx->offs[0].end;
5519 s1 = rx->offs[0].end;
5525 /* $& / ${^MATCH}, $1, $2, ... */
5527 if (paren <= (I32)rx->nparens &&
5528 (s1 = rx->offs[paren].start) != -1 &&
5529 (t1 = rx->offs[paren].end) != -1)
5534 if (ckWARN(WARN_UNINITIALIZED))
5535 report_uninit((const SV *)sv);
5540 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5541 const char * const s = rx->subbeg + s1;
5546 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5553 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5555 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5556 PERL_UNUSED_ARG(rx);
5560 return newSVpvs("Regexp");
5563 /* Scans the name of a named buffer from the pattern.
5564 * If flags is REG_RSN_RETURN_NULL returns null.
5565 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5566 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5567 * to the parsed name as looked up in the RExC_paren_names hash.
5568 * If there is an error throws a vFAIL().. type exception.
5571 #define REG_RSN_RETURN_NULL 0
5572 #define REG_RSN_RETURN_NAME 1
5573 #define REG_RSN_RETURN_DATA 2
5576 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5578 char *name_start = RExC_parse;
5580 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5582 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5583 /* skip IDFIRST by using do...while */
5586 RExC_parse += UTF8SKIP(RExC_parse);
5587 } while (isALNUM_utf8((U8*)RExC_parse));
5591 } while (isALNUM(*RExC_parse));
5596 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5597 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5598 if ( flags == REG_RSN_RETURN_NAME)
5600 else if (flags==REG_RSN_RETURN_DATA) {
5603 if ( ! sv_name ) /* should not happen*/
5604 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5605 if (RExC_paren_names)
5606 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5608 sv_dat = HeVAL(he_str);
5610 vFAIL("Reference to nonexistent named group");
5614 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5621 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5622 int rem=(int)(RExC_end - RExC_parse); \
5631 if (RExC_lastparse!=RExC_parse) \
5632 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5635 iscut ? "..." : "<" \
5638 PerlIO_printf(Perl_debug_log,"%16s",""); \
5641 num = RExC_size + 1; \
5643 num=REG_NODE_NUM(RExC_emit); \
5644 if (RExC_lastnum!=num) \
5645 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5647 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5648 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5649 (int)((depth*2)), "", \
5653 RExC_lastparse=RExC_parse; \
5658 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5659 DEBUG_PARSE_MSG((funcname)); \
5660 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5662 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5663 DEBUG_PARSE_MSG((funcname)); \
5664 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5667 /* This section of code defines the inversion list object and its methods. The
5668 * interfaces are highly subject to change, so as much as possible is static to
5669 * this file. An inversion list is here implemented as a malloc'd C array with
5670 * some added info. More will be coming when functionality is added later.
5672 * Some of the methods should always be private to the implementation, and some
5673 * should eventually be made public */
5675 #define INVLIST_INITIAL_LEN 10
5676 #define INVLIST_ARRAY_KEY "array"
5677 #define INVLIST_MAX_KEY "max"
5678 #define INVLIST_LEN_KEY "len"
5680 PERL_STATIC_INLINE UV*
5681 S_invlist_array(pTHX_ HV* const invlist)
5683 /* Returns the pointer to the inversion list's array. Every time the
5684 * length changes, this needs to be called in case malloc or realloc moved
5687 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5689 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5691 if (list_ptr == NULL) {
5692 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5696 return INT2PTR(UV *, SvUV(*list_ptr));
5699 PERL_STATIC_INLINE void
5700 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5702 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5704 /* Sets the array stored in the inversion list to the memory beginning with
5707 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5708 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5713 PERL_STATIC_INLINE UV
5714 S_invlist_len(pTHX_ HV* const invlist)
5716 /* Returns the current number of elements in the inversion list's array */
5718 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5720 PERL_ARGS_ASSERT_INVLIST_LEN;
5722 if (len_ptr == NULL) {
5723 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5727 return SvUV(*len_ptr);
5730 PERL_STATIC_INLINE UV
5731 S_invlist_max(pTHX_ HV* const invlist)
5733 /* Returns the maximum number of elements storable in the inversion list's
5734 * array, without having to realloc() */
5736 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5738 PERL_ARGS_ASSERT_INVLIST_MAX;
5740 if (max_ptr == NULL) {
5741 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5745 return SvUV(*max_ptr);
5748 PERL_STATIC_INLINE void
5749 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5751 /* Sets the current number of elements stored in the inversion list */
5753 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5755 if (len != 0 && len > invlist_max(invlist)) {
5756 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5759 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5760 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5765 PERL_STATIC_INLINE void
5766 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5769 /* Sets the maximum number of elements storable in the inversion list
5770 * without having to realloc() */
5772 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5774 if (max < invlist_len(invlist)) {
5775 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5778 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5779 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5784 #ifndef PERL_IN_XSUB_RE
5786 Perl__new_invlist(pTHX_ IV initial_size)
5789 /* Return a pointer to a newly constructed inversion list, with enough
5790 * space to store 'initial_size' elements. If that number is negative, a
5791 * system default is used instead */
5793 HV* invlist = newHV();
5796 if (initial_size < 0) {
5797 initial_size = INVLIST_INITIAL_LEN;
5800 /* Allocate the initial space */
5801 Newx(list, initial_size, UV);
5802 invlist_set_array(invlist, list);
5804 /* set_len has to come before set_max, as the latter inspects the len */
5805 invlist_set_len(invlist, 0);
5806 invlist_set_max(invlist, initial_size);
5812 PERL_STATIC_INLINE void
5813 S_invlist_destroy(pTHX_ HV* const invlist)
5815 /* Inversion list destructor */
5817 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5819 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5821 if (list_ptr != NULL) {
5822 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5828 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5830 /* Change the maximum size of an inversion list (up or down) */
5834 const UV old_max = invlist_max(invlist);
5836 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5838 if (old_max == new_max) { /* If a no-op */
5842 array = orig_array = invlist_array(invlist);
5843 Renew(array, new_max, UV);
5845 /* If the size change moved the list in memory, set the new one */
5846 if (array != orig_array) {
5847 invlist_set_array(invlist, array);
5850 invlist_set_max(invlist, new_max);
5854 PERL_STATIC_INLINE void
5855 S_invlist_trim(pTHX_ HV* const invlist)
5857 PERL_ARGS_ASSERT_INVLIST_TRIM;
5859 /* Change the length of the inversion list to how many entries it currently
5862 invlist_extend(invlist, invlist_len(invlist));
5865 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5868 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5870 #ifndef PERL_IN_XSUB_RE
5872 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5874 /* Subject to change or removal. Append the range from 'start' to 'end' at
5875 * the end of the inversion list. The range must be above any existing
5878 UV* array = invlist_array(invlist);
5879 UV max = invlist_max(invlist);
5880 UV len = invlist_len(invlist);
5882 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5886 /* Here, the existing list is non-empty. The current max entry in the
5887 * list is generally the first value not in the set, except when the
5888 * set extends to the end of permissible values, in which case it is
5889 * the first entry in that final set, and so this call is an attempt to
5890 * append out-of-order */
5892 UV final_element = len - 1;
5893 if (array[final_element] > start
5894 || ELEMENT_IN_INVLIST_SET(final_element))
5896 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5899 /* Here, it is a legal append. If the new range begins with the first
5900 * value not in the set, it is extending the set, so the new first
5901 * value not in the set is one greater than the newly extended range.
5903 if (array[final_element] == start) {
5904 if (end != UV_MAX) {
5905 array[final_element] = end + 1;
5908 /* But if the end is the maximum representable on the machine,
5909 * just let the range that this would extend have no end */
5910 invlist_set_len(invlist, len - 1);
5916 /* Here the new range doesn't extend any existing set. Add it */
5918 len += 2; /* Includes an element each for the start and end of range */
5920 /* If overflows the existing space, extend, which may cause the array to be
5923 invlist_extend(invlist, len);
5924 array = invlist_array(invlist);
5927 invlist_set_len(invlist, len);
5929 /* The next item on the list starts the range, the one after that is
5930 * one past the new range. */
5931 array[len - 2] = start;
5932 if (end != UV_MAX) {
5933 array[len - 1] = end + 1;
5936 /* But if the end is the maximum representable on the machine, just let
5937 * the range have no end */
5938 invlist_set_len(invlist, len - 1);
5943 PERL_STATIC_INLINE HV*
5944 S_invlist_union(pTHX_ HV* const a, HV* const b)
5946 /* Return a new inversion list which is the union of two inversion lists.
5947 * The basis for this comes from "Unicode Demystified" Chapter 13 by
5948 * Richard Gillam, published by Addison-Wesley, and explained at some
5949 * length there. The preface says to incorporate its examples into your
5950 * code at your own risk.
5952 * The algorithm is like a merge sort.
5954 * XXX A potential performance improvement is to keep track as we go along
5955 * if only one of the inputs contributes to the result, meaning the other
5956 * is a subset of that one. In that case, we can skip the final copy and
5957 * return the larger of the input lists */
5959 UV* array_a = invlist_array(a); /* a's array */
5960 UV* array_b = invlist_array(b);
5961 UV len_a = invlist_len(a); /* length of a's array */
5962 UV len_b = invlist_len(b);
5964 HV* u; /* the resulting union */
5968 UV i_a = 0; /* current index into a's array */
5972 /* running count, as explained in the algorithm source book; items are
5973 * stopped accumulating and are output when the count changes to/from 0.
5974 * The count is incremented when we start a range that's in the set, and
5975 * decremented when we start a range that's not in the set. So its range
5976 * is 0 to 2. Only when the count is zero is something not in the set.
5980 PERL_ARGS_ASSERT_INVLIST_UNION;
5982 /* Size the union for the worst case: that the sets are completely
5984 u = _new_invlist(len_a + len_b);
5985 array_u = invlist_array(u);
5987 /* Go through each list item by item, stopping when exhausted one of
5989 while (i_a < len_a && i_b < len_b) {
5990 UV cp; /* The element to potentially add to the union's array */
5991 bool cp_in_set; /* is it in the the input list's set or not */
5993 /* We need to take one or the other of the two inputs for the union.
5994 * Since we are merging two sorted lists, we take the smaller of the
5995 * next items. In case of a tie, we take the one that is in its set
5996 * first. If we took one not in the set first, it would decrement the
5997 * count, possibly to 0 which would cause it to be output as ending the
5998 * range, and the next time through we would take the same number, and
5999 * output it again as beginning the next range. By doing it the
6000 * opposite way, there is no possibility that the count will be
6001 * momentarily decremented to 0, and thus the two adjoining ranges will
6002 * be seamlessly merged. (In a tie and both are in the set or both not
6003 * in the set, it doesn't matter which we take first.) */
6004 if (array_a[i_a] < array_b[i_b]
6005 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6007 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6011 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6015 /* Here, have chosen which of the two inputs to look at. Only output
6016 * if the running count changes to/from 0, which marks the
6017 * beginning/end of a range in that's in the set */
6020 array_u[i_u++] = cp;
6027 array_u[i_u++] = cp;
6032 /* Here, we are finished going through at least one of the lists, which
6033 * means there is something remaining in at most one. We check if the list
6034 * that hasn't been exhausted is positioned such that we are in the middle
6035 * of a range in its set or not. (We are in the set if the next item in
6036 * the array marks the beginning of something not in the set) If in the
6037 * set, we decrement 'count'; if 0, there is potentially more to output.
6038 * There are four cases:
6039 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6040 * in the union is entirely from the non-exhausted set.
6041 * 2) Both were in their sets, count is 2. Nothing further should
6042 * be output, as everything that remains will be in the exhausted
6043 * list's set, hence in the union; decrementing to 1 but not 0 insures
6045 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6046 * Nothing further should be output because the union includes
6047 * everything from the exhausted set. Not decrementing insures that.
6048 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6049 * decrementing to 0 insures that we look at the remainder of the
6050 * non-exhausted set */
6051 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6052 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6057 /* The final length is what we've output so far, plus what else is about to
6058 * be output. (If 'count' is non-zero, then the input list we exhausted
6059 * has everything remaining up to the machine's limit in its set, and hence
6060 * in the union, so there will be no further output. */
6063 /* At most one of the subexpressions will be non-zero */
6064 len_u += (len_a - i_a) + (len_b - i_b);
6067 /* Set result to final length, which can change the pointer to array_u, so
6069 if (len_u != invlist_len(u)) {
6070 invlist_set_len(u, len_u);
6072 array_u = invlist_array(u);
6075 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6076 * the other) ended with everything above it not in its set. That means
6077 * that the remaining part of the union is precisely the same as the
6078 * non-exhausted list, so can just copy it unchanged. (If both list were
6079 * exhausted at the same time, then the operations below will be both 0.)
6082 IV copy_count; /* At most one will have a non-zero copy count */
6083 if ((copy_count = len_a - i_a) > 0) {
6084 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6086 else if ((copy_count = len_b - i_b) > 0) {
6087 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6094 PERL_STATIC_INLINE HV*
6095 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6097 /* Return the intersection of two inversion lists. The basis for this
6098 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6099 * by Addison-Wesley, and explained at some length there. The preface says
6100 * to incorporate its examples into your code at your own risk.
6102 * The algorithm is like a merge sort, and is essentially the same as the
6106 UV* array_a = invlist_array(a); /* a's array */
6107 UV* array_b = invlist_array(b);
6108 UV len_a = invlist_len(a); /* length of a's array */
6109 UV len_b = invlist_len(b);
6111 HV* r; /* the resulting intersection */
6115 UV i_a = 0; /* current index into a's array */
6119 /* running count, as explained in the algorithm source book; items are
6120 * stopped accumulating and are output when the count changes to/from 2.
6121 * The count is incremented when we start a range that's in the set, and
6122 * decremented when we start a range that's not in the set. So its range
6123 * is 0 to 2. Only when the count is 2 is something in the intersection.
6127 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6129 /* Size the intersection for the worst case: that the intersection ends up
6130 * fragmenting everything to be completely disjoint */
6131 r= _new_invlist(len_a + len_b);
6132 array_r = invlist_array(r);
6134 /* Go through each list item by item, stopping when exhausted one of
6136 while (i_a < len_a && i_b < len_b) {
6137 UV cp; /* The element to potentially add to the intersection's
6139 bool cp_in_set; /* Is it in the input list's set or not */
6141 /* We need to take one or the other of the two inputs for the union.
6142 * Since we are merging two sorted lists, we take the smaller of the
6143 * next items. In case of a tie, we take the one that is not in its
6144 * set first (a difference from the union algorithm). If we took one
6145 * in the set first, it would increment the count, possibly to 2 which
6146 * would cause it to be output as starting a range in the intersection,
6147 * and the next time through we would take that same number, and output
6148 * it again as ending the set. By doing it the opposite of this, we
6149 * there is no possibility that the count will be momentarily
6150 * incremented to 2. (In a tie and both are in the set or both not in
6151 * the set, it doesn't matter which we take first.) */
6152 if (array_a[i_a] < array_b[i_b]
6153 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6155 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6159 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6163 /* Here, have chosen which of the two inputs to look at. Only output
6164 * if the running count changes to/from 2, which marks the
6165 * beginning/end of a range that's in the intersection */
6169 array_r[i_r++] = cp;
6174 array_r[i_r++] = cp;
6180 /* Here, we are finished going through at least one of the sets, which
6181 * means there is something remaining in at most one. See the comments in
6183 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6184 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6189 /* The final length is what we've output so far plus what else is in the
6190 * intersection. Only one of the subexpressions below will be non-zero */
6193 len_r += (len_a - i_a) + (len_b - i_b);
6196 /* Set result to final length, which can change the pointer to array_r, so
6198 if (len_r != invlist_len(r)) {
6199 invlist_set_len(r, len_r);
6201 array_r = invlist_array(r);
6204 /* Finish outputting any remaining */
6205 if (count == 2) { /* Only one of will have a non-zero copy count */
6207 if ((copy_count = len_a - i_a) > 0) {
6208 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6210 else if ((copy_count = len_b - i_b) > 0) {
6211 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6219 S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
6221 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6222 * set. A pointer to the inversion list is returned. This may actually be
6223 * a new list, in which case the passed in one has been destroyed */
6228 UV len = invlist_len(invlist);
6230 PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
6232 /* If comes after the final entry, can just append it to the end */
6234 || start >= invlist_array(invlist)
6235 [invlist_len(invlist) - 1])
6237 _append_range_to_invlist(invlist, start, end);
6241 /* Here, can't just append things, create and return a new inversion list
6242 * which is the union of this range and the existing inversion list */
6243 range_invlist = _new_invlist(2);
6244 _append_range_to_invlist(range_invlist, start, end);
6246 added_invlist = invlist_union(invlist, range_invlist);
6248 /* The passed in list can be freed, as well as our temporary */
6249 invlist_destroy(range_invlist);
6250 if (invlist != added_invlist) {
6251 invlist_destroy(invlist);
6254 return added_invlist;
6257 /* End of inversion list object */
6260 - reg - regular expression, i.e. main body or parenthesized thing
6262 * Caller must absorb opening parenthesis.
6264 * Combining parenthesis handling with the base level of regular expression
6265 * is a trifle forced, but the need to tie the tails of the branches to what
6266 * follows makes it hard to avoid.
6268 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6270 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6272 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6276 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6277 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6280 register regnode *ret; /* Will be the head of the group. */
6281 register regnode *br;
6282 register regnode *lastbr;
6283 register regnode *ender = NULL;
6284 register I32 parno = 0;
6286 U32 oregflags = RExC_flags;
6287 bool have_branch = 0;
6289 I32 freeze_paren = 0;
6290 I32 after_freeze = 0;
6292 /* for (?g), (?gc), and (?o) warnings; warning
6293 about (?c) will warn about (?g) -- japhy */
6295 #define WASTED_O 0x01
6296 #define WASTED_G 0x02
6297 #define WASTED_C 0x04
6298 #define WASTED_GC (0x02|0x04)
6299 I32 wastedflags = 0x00;
6301 char * parse_start = RExC_parse; /* MJD */
6302 char * const oregcomp_parse = RExC_parse;
6304 GET_RE_DEBUG_FLAGS_DECL;
6306 PERL_ARGS_ASSERT_REG;
6307 DEBUG_PARSE("reg ");
6309 *flagp = 0; /* Tentatively. */
6312 /* Make an OPEN node, if parenthesized. */
6314 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6315 char *start_verb = RExC_parse;
6316 STRLEN verb_len = 0;
6317 char *start_arg = NULL;
6318 unsigned char op = 0;
6320 int internal_argval = 0; /* internal_argval is only useful if !argok */
6321 while ( *RExC_parse && *RExC_parse != ')' ) {
6322 if ( *RExC_parse == ':' ) {
6323 start_arg = RExC_parse + 1;
6329 verb_len = RExC_parse - start_verb;
6332 while ( *RExC_parse && *RExC_parse != ')' )
6334 if ( *RExC_parse != ')' )
6335 vFAIL("Unterminated verb pattern argument");
6336 if ( RExC_parse == start_arg )
6339 if ( *RExC_parse != ')' )
6340 vFAIL("Unterminated verb pattern");
6343 switch ( *start_verb ) {
6344 case 'A': /* (*ACCEPT) */
6345 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6347 internal_argval = RExC_nestroot;
6350 case 'C': /* (*COMMIT) */
6351 if ( memEQs(start_verb,verb_len,"COMMIT") )
6354 case 'F': /* (*FAIL) */
6355 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6360 case ':': /* (*:NAME) */
6361 case 'M': /* (*MARK:NAME) */
6362 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6367 case 'P': /* (*PRUNE) */
6368 if ( memEQs(start_verb,verb_len,"PRUNE") )
6371 case 'S': /* (*SKIP) */
6372 if ( memEQs(start_verb,verb_len,"SKIP") )
6375 case 'T': /* (*THEN) */
6376 /* [19:06] <TimToady> :: is then */
6377 if ( memEQs(start_verb,verb_len,"THEN") ) {
6379 RExC_seen |= REG_SEEN_CUTGROUP;
6385 vFAIL3("Unknown verb pattern '%.*s'",
6386 verb_len, start_verb);
6389 if ( start_arg && internal_argval ) {
6390 vFAIL3("Verb pattern '%.*s' may not have an argument",
6391 verb_len, start_verb);
6392 } else if ( argok < 0 && !start_arg ) {
6393 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6394 verb_len, start_verb);
6396 ret = reganode(pRExC_state, op, internal_argval);
6397 if ( ! internal_argval && ! SIZE_ONLY ) {
6399 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6400 ARG(ret) = add_data( pRExC_state, 1, "S" );
6401 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6408 if (!internal_argval)
6409 RExC_seen |= REG_SEEN_VERBARG;
6410 } else if ( start_arg ) {
6411 vFAIL3("Verb pattern '%.*s' may not have an argument",
6412 verb_len, start_verb);
6414 ret = reg_node(pRExC_state, op);
6416 nextchar(pRExC_state);
6419 if (*RExC_parse == '?') { /* (?...) */
6420 bool is_logical = 0;
6421 const char * const seqstart = RExC_parse;
6422 bool has_use_defaults = FALSE;
6425 paren = *RExC_parse++;
6426 ret = NULL; /* For look-ahead/behind. */
6429 case 'P': /* (?P...) variants for those used to PCRE/Python */
6430 paren = *RExC_parse++;
6431 if ( paren == '<') /* (?P<...>) named capture */
6433 else if (paren == '>') { /* (?P>name) named recursion */
6434 goto named_recursion;
6436 else if (paren == '=') { /* (?P=...) named backref */
6437 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6438 you change this make sure you change that */
6439 char* name_start = RExC_parse;
6441 SV *sv_dat = reg_scan_name(pRExC_state,
6442 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6443 if (RExC_parse == name_start || *RExC_parse != ')')
6444 vFAIL2("Sequence %.3s... not terminated",parse_start);
6447 num = add_data( pRExC_state, 1, "S" );
6448 RExC_rxi->data->data[num]=(void*)sv_dat;
6449 SvREFCNT_inc_simple_void(sv_dat);
6452 ret = reganode(pRExC_state,
6463 Set_Node_Offset(ret, parse_start+1);
6464 Set_Node_Cur_Length(ret); /* MJD */
6466 nextchar(pRExC_state);
6470 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6472 case '<': /* (?<...) */
6473 if (*RExC_parse == '!')
6475 else if (*RExC_parse != '=')
6481 case '\'': /* (?'...') */
6482 name_start= RExC_parse;
6483 svname = reg_scan_name(pRExC_state,
6484 SIZE_ONLY ? /* reverse test from the others */
6485 REG_RSN_RETURN_NAME :
6486 REG_RSN_RETURN_NULL);
6487 if (RExC_parse == name_start) {
6489 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6492 if (*RExC_parse != paren)
6493 vFAIL2("Sequence (?%c... not terminated",
6494 paren=='>' ? '<' : paren);
6498 if (!svname) /* shouldn't happen */
6500 "panic: reg_scan_name returned NULL");
6501 if (!RExC_paren_names) {
6502 RExC_paren_names= newHV();
6503 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6505 RExC_paren_name_list= newAV();
6506 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6509 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6511 sv_dat = HeVAL(he_str);
6513 /* croak baby croak */
6515 "panic: paren_name hash element allocation failed");
6516 } else if ( SvPOK(sv_dat) ) {
6517 /* (?|...) can mean we have dupes so scan to check
6518 its already been stored. Maybe a flag indicating
6519 we are inside such a construct would be useful,
6520 but the arrays are likely to be quite small, so
6521 for now we punt -- dmq */
6522 IV count = SvIV(sv_dat);
6523 I32 *pv = (I32*)SvPVX(sv_dat);
6525 for ( i = 0 ; i < count ; i++ ) {
6526 if ( pv[i] == RExC_npar ) {
6532 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6533 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6534 pv[count] = RExC_npar;
6535 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6538 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6539 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6541 SvIV_set(sv_dat, 1);
6544 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6545 SvREFCNT_dec(svname);
6548 /*sv_dump(sv_dat);*/
6550 nextchar(pRExC_state);
6552 goto capturing_parens;
6554 RExC_seen |= REG_SEEN_LOOKBEHIND;
6555 RExC_in_lookbehind++;
6557 case '=': /* (?=...) */
6558 RExC_seen_zerolen++;
6560 case '!': /* (?!...) */
6561 RExC_seen_zerolen++;
6562 if (*RExC_parse == ')') {
6563 ret=reg_node(pRExC_state, OPFAIL);
6564 nextchar(pRExC_state);
6568 case '|': /* (?|...) */
6569 /* branch reset, behave like a (?:...) except that
6570 buffers in alternations share the same numbers */
6572 after_freeze = freeze_paren = RExC_npar;
6574 case ':': /* (?:...) */
6575 case '>': /* (?>...) */
6577 case '$': /* (?$...) */
6578 case '@': /* (?@...) */
6579 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6581 case '#': /* (?#...) */
6582 while (*RExC_parse && *RExC_parse != ')')
6584 if (*RExC_parse != ')')
6585 FAIL("Sequence (?#... not terminated");
6586 nextchar(pRExC_state);
6589 case '0' : /* (?0) */
6590 case 'R' : /* (?R) */
6591 if (*RExC_parse != ')')
6592 FAIL("Sequence (?R) not terminated");
6593 ret = reg_node(pRExC_state, GOSTART);
6594 *flagp |= POSTPONED;
6595 nextchar(pRExC_state);
6598 { /* named and numeric backreferences */
6600 case '&': /* (?&NAME) */
6601 parse_start = RExC_parse - 1;
6604 SV *sv_dat = reg_scan_name(pRExC_state,
6605 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6606 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6608 goto gen_recurse_regop;
6611 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6613 vFAIL("Illegal pattern");
6615 goto parse_recursion;
6617 case '-': /* (?-1) */
6618 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6619 RExC_parse--; /* rewind to let it be handled later */
6623 case '1': case '2': case '3': case '4': /* (?1) */
6624 case '5': case '6': case '7': case '8': case '9':
6627 num = atoi(RExC_parse);
6628 parse_start = RExC_parse - 1; /* MJD */
6629 if (*RExC_parse == '-')
6631 while (isDIGIT(*RExC_parse))
6633 if (*RExC_parse!=')')
6634 vFAIL("Expecting close bracket");
6637 if ( paren == '-' ) {
6639 Diagram of capture buffer numbering.
6640 Top line is the normal capture buffer numbers
6641 Bottom line is the negative indexing as from
6645 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6649 num = RExC_npar + num;
6652 vFAIL("Reference to nonexistent group");
6654 } else if ( paren == '+' ) {
6655 num = RExC_npar + num - 1;
6658 ret = reganode(pRExC_state, GOSUB, num);
6660 if (num > (I32)RExC_rx->nparens) {
6662 vFAIL("Reference to nonexistent group");
6664 ARG2L_SET( ret, RExC_recurse_count++);
6666 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6667 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6671 RExC_seen |= REG_SEEN_RECURSE;
6672 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6673 Set_Node_Offset(ret, parse_start); /* MJD */
6675 *flagp |= POSTPONED;
6676 nextchar(pRExC_state);
6678 } /* named and numeric backreferences */
6681 case '?': /* (??...) */
6683 if (*RExC_parse != '{') {
6685 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6688 *flagp |= POSTPONED;
6689 paren = *RExC_parse++;
6691 case '{': /* (?{...}) */
6696 char *s = RExC_parse;
6698 RExC_seen_zerolen++;
6699 RExC_seen |= REG_SEEN_EVAL;
6700 while (count && (c = *RExC_parse)) {
6711 if (*RExC_parse != ')') {
6713 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6717 OP_4tree *sop, *rop;
6718 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6721 Perl_save_re_context(aTHX);
6722 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6723 sop->op_private |= OPpREFCOUNTED;
6724 /* re_dup will OpREFCNT_inc */
6725 OpREFCNT_set(sop, 1);
6728 n = add_data(pRExC_state, 3, "nop");
6729 RExC_rxi->data->data[n] = (void*)rop;
6730 RExC_rxi->data->data[n+1] = (void*)sop;
6731 RExC_rxi->data->data[n+2] = (void*)pad;
6734 else { /* First pass */
6735 if (PL_reginterp_cnt < ++RExC_seen_evals
6737 /* No compiled RE interpolated, has runtime
6738 components ===> unsafe. */
6739 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6740 if (PL_tainting && PL_tainted)
6741 FAIL("Eval-group in insecure regular expression");
6742 #if PERL_VERSION > 8
6743 if (IN_PERL_COMPILETIME)
6748 nextchar(pRExC_state);
6750 ret = reg_node(pRExC_state, LOGICAL);
6753 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6754 /* deal with the length of this later - MJD */
6757 ret = reganode(pRExC_state, EVAL, n);
6758 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6759 Set_Node_Offset(ret, parse_start);
6762 case '(': /* (?(?{...})...) and (?(?=...)...) */
6765 if (RExC_parse[0] == '?') { /* (?(?...)) */
6766 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6767 || RExC_parse[1] == '<'
6768 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6771 ret = reg_node(pRExC_state, LOGICAL);
6774 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6778 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6779 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6781 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6782 char *name_start= RExC_parse++;
6784 SV *sv_dat=reg_scan_name(pRExC_state,
6785 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6786 if (RExC_parse == name_start || *RExC_parse != ch)
6787 vFAIL2("Sequence (?(%c... not terminated",
6788 (ch == '>' ? '<' : ch));
6791 num = add_data( pRExC_state, 1, "S" );
6792 RExC_rxi->data->data[num]=(void*)sv_dat;
6793 SvREFCNT_inc_simple_void(sv_dat);
6795 ret = reganode(pRExC_state,NGROUPP,num);
6796 goto insert_if_check_paren;
6798 else if (RExC_parse[0] == 'D' &&
6799 RExC_parse[1] == 'E' &&
6800 RExC_parse[2] == 'F' &&
6801 RExC_parse[3] == 'I' &&
6802 RExC_parse[4] == 'N' &&
6803 RExC_parse[5] == 'E')
6805 ret = reganode(pRExC_state,DEFINEP,0);
6808 goto insert_if_check_paren;
6810 else if (RExC_parse[0] == 'R') {
6813 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6814 parno = atoi(RExC_parse++);
6815 while (isDIGIT(*RExC_parse))
6817 } else if (RExC_parse[0] == '&') {
6820 sv_dat = reg_scan_name(pRExC_state,
6821 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6822 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6824 ret = reganode(pRExC_state,INSUBP,parno);
6825 goto insert_if_check_paren;
6827 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6830 parno = atoi(RExC_parse++);
6832 while (isDIGIT(*RExC_parse))
6834 ret = reganode(pRExC_state, GROUPP, parno);
6836 insert_if_check_paren:
6837 if ((c = *nextchar(pRExC_state)) != ')')
6838 vFAIL("Switch condition not recognized");
6840 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6841 br = regbranch(pRExC_state, &flags, 1,depth+1);
6843 br = reganode(pRExC_state, LONGJMP, 0);
6845 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6846 c = *nextchar(pRExC_state);
6851 vFAIL("(?(DEFINE)....) does not allow branches");
6852 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6853 regbranch(pRExC_state, &flags, 1,depth+1);
6854 REGTAIL(pRExC_state, ret, lastbr);
6857 c = *nextchar(pRExC_state);
6862 vFAIL("Switch (?(condition)... contains too many branches");
6863 ender = reg_node(pRExC_state, TAIL);
6864 REGTAIL(pRExC_state, br, ender);
6866 REGTAIL(pRExC_state, lastbr, ender);
6867 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6870 REGTAIL(pRExC_state, ret, ender);
6871 RExC_size++; /* XXX WHY do we need this?!!
6872 For large programs it seems to be required
6873 but I can't figure out why. -- dmq*/
6877 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6881 RExC_parse--; /* for vFAIL to print correctly */
6882 vFAIL("Sequence (? incomplete");
6884 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6886 has_use_defaults = TRUE;
6887 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6888 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6889 ? REGEX_UNICODE_CHARSET
6890 : REGEX_DEPENDS_CHARSET);
6894 parse_flags: /* (?i) */
6896 U32 posflags = 0, negflags = 0;
6897 U32 *flagsp = &posflags;
6898 bool has_charset_modifier = 0;
6899 regex_charset cs = REGEX_DEPENDS_CHARSET;
6901 while (*RExC_parse) {
6902 /* && strchr("iogcmsx", *RExC_parse) */
6903 /* (?g), (?gc) and (?o) are useless here
6904 and must be globally applied -- japhy */
6905 switch (*RExC_parse) {
6906 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6907 case LOCALE_PAT_MOD:
6908 if (has_charset_modifier || flagsp == &negflags) {
6909 goto fail_modifiers;
6911 cs = REGEX_LOCALE_CHARSET;
6912 has_charset_modifier = 1;
6914 case UNICODE_PAT_MOD:
6915 if (has_charset_modifier || flagsp == &negflags) {
6916 goto fail_modifiers;
6918 cs = REGEX_UNICODE_CHARSET;
6919 has_charset_modifier = 1;
6921 case ASCII_RESTRICT_PAT_MOD:
6922 if (has_charset_modifier || flagsp == &negflags) {
6923 goto fail_modifiers;
6925 cs = REGEX_ASCII_RESTRICTED_CHARSET;
6926 has_charset_modifier = 1;
6928 case DEPENDS_PAT_MOD:
6929 if (has_use_defaults
6930 || has_charset_modifier
6931 || flagsp == &negflags)
6933 goto fail_modifiers;
6936 /* The dual charset means unicode semantics if the
6937 * pattern (or target, not known until runtime) are
6938 * utf8, or something in the pattern indicates unicode
6940 cs = (RExC_utf8 || RExC_uni_semantics)
6941 ? REGEX_UNICODE_CHARSET
6942 : REGEX_DEPENDS_CHARSET;
6943 has_charset_modifier = 1;
6945 case ONCE_PAT_MOD: /* 'o' */
6946 case GLOBAL_PAT_MOD: /* 'g' */
6947 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6948 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6949 if (! (wastedflags & wflagbit) ) {
6950 wastedflags |= wflagbit;
6953 "Useless (%s%c) - %suse /%c modifier",
6954 flagsp == &negflags ? "?-" : "?",
6956 flagsp == &negflags ? "don't " : "",
6963 case CONTINUE_PAT_MOD: /* 'c' */
6964 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6965 if (! (wastedflags & WASTED_C) ) {
6966 wastedflags |= WASTED_GC;
6969 "Useless (%sc) - %suse /gc modifier",
6970 flagsp == &negflags ? "?-" : "?",
6971 flagsp == &negflags ? "don't " : ""
6976 case KEEPCOPY_PAT_MOD: /* 'p' */
6977 if (flagsp == &negflags) {
6979 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6981 *flagsp |= RXf_PMf_KEEPCOPY;
6985 /* A flag is a default iff it is following a minus, so
6986 * if there is a minus, it means will be trying to
6987 * re-specify a default which is an error */
6988 if (has_use_defaults || flagsp == &negflags) {
6991 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6995 wastedflags = 0; /* reset so (?g-c) warns twice */
7001 RExC_flags |= posflags;
7002 RExC_flags &= ~negflags;
7003 set_regex_charset(&RExC_flags, cs);
7005 oregflags |= posflags;
7006 oregflags &= ~negflags;
7007 set_regex_charset(&oregflags, cs);
7009 nextchar(pRExC_state);
7020 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7025 }} /* one for the default block, one for the switch */
7032 ret = reganode(pRExC_state, OPEN, parno);
7035 RExC_nestroot = parno;
7036 if (RExC_seen & REG_SEEN_RECURSE
7037 && !RExC_open_parens[parno-1])
7039 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7040 "Setting open paren #%"IVdf" to %d\n",
7041 (IV)parno, REG_NODE_NUM(ret)));
7042 RExC_open_parens[parno-1]= ret;
7045 Set_Node_Length(ret, 1); /* MJD */
7046 Set_Node_Offset(ret, RExC_parse); /* MJD */
7054 /* Pick up the branches, linking them together. */
7055 parse_start = RExC_parse; /* MJD */
7056 br = regbranch(pRExC_state, &flags, 1,depth+1);
7059 if (RExC_npar > after_freeze)
7060 after_freeze = RExC_npar;
7061 RExC_npar = freeze_paren;
7064 /* branch_len = (paren != 0); */
7068 if (*RExC_parse == '|') {
7069 if (!SIZE_ONLY && RExC_extralen) {
7070 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7073 reginsert(pRExC_state, BRANCH, br, depth+1);
7074 Set_Node_Length(br, paren != 0);
7075 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7079 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7081 else if (paren == ':') {
7082 *flagp |= flags&SIMPLE;
7084 if (is_open) { /* Starts with OPEN. */
7085 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7087 else if (paren != '?') /* Not Conditional */
7089 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7091 while (*RExC_parse == '|') {
7092 if (!SIZE_ONLY && RExC_extralen) {
7093 ender = reganode(pRExC_state, LONGJMP,0);
7094 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7097 RExC_extralen += 2; /* Account for LONGJMP. */
7098 nextchar(pRExC_state);
7100 if (RExC_npar > after_freeze)
7101 after_freeze = RExC_npar;
7102 RExC_npar = freeze_paren;
7104 br = regbranch(pRExC_state, &flags, 0, depth+1);
7108 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7110 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7113 if (have_branch || paren != ':') {
7114 /* Make a closing node, and hook it on the end. */
7117 ender = reg_node(pRExC_state, TAIL);
7120 ender = reganode(pRExC_state, CLOSE, parno);
7121 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7122 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7123 "Setting close paren #%"IVdf" to %d\n",
7124 (IV)parno, REG_NODE_NUM(ender)));
7125 RExC_close_parens[parno-1]= ender;
7126 if (RExC_nestroot == parno)
7129 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7130 Set_Node_Length(ender,1); /* MJD */
7136 *flagp &= ~HASWIDTH;
7139 ender = reg_node(pRExC_state, SUCCEED);
7142 ender = reg_node(pRExC_state, END);
7144 assert(!RExC_opend); /* there can only be one! */
7149 REGTAIL(pRExC_state, lastbr, ender);
7151 if (have_branch && !SIZE_ONLY) {
7153 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7155 /* Hook the tails of the branches to the closing node. */
7156 for (br = ret; br; br = regnext(br)) {
7157 const U8 op = PL_regkind[OP(br)];
7159 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7161 else if (op == BRANCHJ) {
7162 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7170 static const char parens[] = "=!<,>";
7172 if (paren && (p = strchr(parens, paren))) {
7173 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7174 int flag = (p - parens) > 1;
7177 node = SUSPEND, flag = 0;
7178 reginsert(pRExC_state, node,ret, depth+1);
7179 Set_Node_Cur_Length(ret);
7180 Set_Node_Offset(ret, parse_start + 1);
7182 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7186 /* Check for proper termination. */
7188 RExC_flags = oregflags;
7189 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7190 RExC_parse = oregcomp_parse;
7191 vFAIL("Unmatched (");
7194 else if (!paren && RExC_parse < RExC_end) {
7195 if (*RExC_parse == ')') {
7197 vFAIL("Unmatched )");
7200 FAIL("Junk on end of regexp"); /* "Can't happen". */
7204 if (RExC_in_lookbehind) {
7205 RExC_in_lookbehind--;
7208 RExC_npar = after_freeze;
7213 - regbranch - one alternative of an | operator
7215 * Implements the concatenation operator.
7218 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7221 register regnode *ret;
7222 register regnode *chain = NULL;
7223 register regnode *latest;
7224 I32 flags = 0, c = 0;
7225 GET_RE_DEBUG_FLAGS_DECL;
7227 PERL_ARGS_ASSERT_REGBRANCH;
7229 DEBUG_PARSE("brnc");
7234 if (!SIZE_ONLY && RExC_extralen)
7235 ret = reganode(pRExC_state, BRANCHJ,0);
7237 ret = reg_node(pRExC_state, BRANCH);
7238 Set_Node_Length(ret, 1);
7242 if (!first && SIZE_ONLY)
7243 RExC_extralen += 1; /* BRANCHJ */
7245 *flagp = WORST; /* Tentatively. */
7248 nextchar(pRExC_state);
7249 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7251 latest = regpiece(pRExC_state, &flags,depth+1);
7252 if (latest == NULL) {
7253 if (flags & TRYAGAIN)
7257 else if (ret == NULL)
7259 *flagp |= flags&(HASWIDTH|POSTPONED);
7260 if (chain == NULL) /* First piece. */
7261 *flagp |= flags&SPSTART;
7264 REGTAIL(pRExC_state, chain, latest);
7269 if (chain == NULL) { /* Loop ran zero times. */
7270 chain = reg_node(pRExC_state, NOTHING);
7275 *flagp |= flags&SIMPLE;
7282 - regpiece - something followed by possible [*+?]
7284 * Note that the branching code sequences used for ? and the general cases
7285 * of * and + are somewhat optimized: they use the same NOTHING node as
7286 * both the endmarker for their branch list and the body of the last branch.
7287 * It might seem that this node could be dispensed with entirely, but the
7288 * endmarker role is not redundant.
7291 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7294 register regnode *ret;
7296 register char *next;
7298 const char * const origparse = RExC_parse;
7300 I32 max = REG_INFTY;
7302 const char *maxpos = NULL;
7303 GET_RE_DEBUG_FLAGS_DECL;
7305 PERL_ARGS_ASSERT_REGPIECE;
7307 DEBUG_PARSE("piec");
7309 ret = regatom(pRExC_state, &flags,depth+1);
7311 if (flags & TRYAGAIN)
7318 if (op == '{' && regcurly(RExC_parse)) {
7320 parse_start = RExC_parse; /* MJD */
7321 next = RExC_parse + 1;
7322 while (isDIGIT(*next) || *next == ',') {
7331 if (*next == '}') { /* got one */
7335 min = atoi(RExC_parse);
7339 maxpos = RExC_parse;
7341 if (!max && *maxpos != '0')
7342 max = REG_INFTY; /* meaning "infinity" */
7343 else if (max >= REG_INFTY)
7344 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7346 nextchar(pRExC_state);
7349 if ((flags&SIMPLE)) {
7350 RExC_naughty += 2 + RExC_naughty / 2;
7351 reginsert(pRExC_state, CURLY, ret, depth+1);
7352 Set_Node_Offset(ret, parse_start+1); /* MJD */
7353 Set_Node_Cur_Length(ret);
7356 regnode * const w = reg_node(pRExC_state, WHILEM);
7359 REGTAIL(pRExC_state, ret, w);
7360 if (!SIZE_ONLY && RExC_extralen) {
7361 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7362 reginsert(pRExC_state, NOTHING,ret, depth+1);
7363 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7365 reginsert(pRExC_state, CURLYX,ret, depth+1);
7367 Set_Node_Offset(ret, parse_start+1);
7368 Set_Node_Length(ret,
7369 op == '{' ? (RExC_parse - parse_start) : 1);
7371 if (!SIZE_ONLY && RExC_extralen)
7372 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7373 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7375 RExC_whilem_seen++, RExC_extralen += 3;
7376 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7385 vFAIL("Can't do {n,m} with n > m");
7387 ARG1_SET(ret, (U16)min);
7388 ARG2_SET(ret, (U16)max);
7400 #if 0 /* Now runtime fix should be reliable. */
7402 /* if this is reinstated, don't forget to put this back into perldiag:
7404 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7406 (F) The part of the regexp subject to either the * or + quantifier
7407 could match an empty string. The {#} shows in the regular
7408 expression about where the problem was discovered.
7412 if (!(flags&HASWIDTH) && op != '?')
7413 vFAIL("Regexp *+ operand could be empty");
7416 parse_start = RExC_parse;
7417 nextchar(pRExC_state);
7419 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7421 if (op == '*' && (flags&SIMPLE)) {
7422 reginsert(pRExC_state, STAR, ret, depth+1);
7426 else if (op == '*') {
7430 else if (op == '+' && (flags&SIMPLE)) {
7431 reginsert(pRExC_state, PLUS, ret, depth+1);
7435 else if (op == '+') {
7439 else if (op == '?') {
7444 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7445 ckWARN3reg(RExC_parse,
7446 "%.*s matches null string many times",
7447 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7451 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7452 nextchar(pRExC_state);
7453 reginsert(pRExC_state, MINMOD, ret, depth+1);
7454 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7456 #ifndef REG_ALLOW_MINMOD_SUSPEND
7459 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7461 nextchar(pRExC_state);
7462 ender = reg_node(pRExC_state, SUCCEED);
7463 REGTAIL(pRExC_state, ret, ender);
7464 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7466 ender = reg_node(pRExC_state, TAIL);
7467 REGTAIL(pRExC_state, ret, ender);
7471 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7473 vFAIL("Nested quantifiers");
7480 /* reg_namedseq(pRExC_state,UVp)
7482 This is expected to be called by a parser routine that has
7483 recognized '\N' and needs to handle the rest. RExC_parse is
7484 expected to point at the first char following the N at the time
7487 The \N may be inside (indicated by valuep not being NULL) or outside a
7490 \N may begin either a named sequence, or if outside a character class, mean
7491 to match a non-newline. For non single-quoted regexes, the tokenizer has
7492 attempted to decide which, and in the case of a named sequence converted it
7493 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7494 where c1... are the characters in the sequence. For single-quoted regexes,
7495 the tokenizer passes the \N sequence through unchanged; this code will not
7496 attempt to determine this nor expand those. The net effect is that if the
7497 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7498 signals that this \N occurrence means to match a non-newline.
7500 Only the \N{U+...} form should occur in a character class, for the same
7501 reason that '.' inside a character class means to just match a period: it
7502 just doesn't make sense.
7504 If valuep is non-null then it is assumed that we are parsing inside
7505 of a charclass definition and the first codepoint in the resolved
7506 string is returned via *valuep and the routine will return NULL.
7507 In this mode if a multichar string is returned from the charnames
7508 handler, a warning will be issued, and only the first char in the
7509 sequence will be examined. If the string returned is zero length
7510 then the value of *valuep is undefined and NON-NULL will
7511 be returned to indicate failure. (This will NOT be a valid pointer
7514 If valuep is null then it is assumed that we are parsing normal text and a
7515 new EXACT node is inserted into the program containing the resolved string,
7516 and a pointer to the new node is returned. But if the string is zero length
7517 a NOTHING node is emitted instead.
7519 On success RExC_parse is set to the char following the endbrace.
7520 Parsing failures will generate a fatal error via vFAIL(...)
7523 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7525 char * endbrace; /* '}' following the name */
7526 regnode *ret = NULL;
7528 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7532 GET_RE_DEBUG_FLAGS_DECL;
7534 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7538 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7539 * modifier. The other meaning does not */
7540 p = (RExC_flags & RXf_PMf_EXTENDED)
7541 ? regwhite( pRExC_state, RExC_parse )
7544 /* Disambiguate between \N meaning a named character versus \N meaning
7545 * [^\n]. The former is assumed when it can't be the latter. */
7546 if (*p != '{' || regcurly(p)) {
7549 /* no bare \N in a charclass */
7550 vFAIL("\\N in a character class must be a named character: \\N{...}");
7552 nextchar(pRExC_state);
7553 ret = reg_node(pRExC_state, REG_ANY);
7554 *flagp |= HASWIDTH|SIMPLE;
7557 Set_Node_Length(ret, 1); /* MJD */
7561 /* Here, we have decided it should be a named sequence */
7563 /* The test above made sure that the next real character is a '{', but
7564 * under the /x modifier, it could be separated by space (or a comment and
7565 * \n) and this is not allowed (for consistency with \x{...} and the
7566 * tokenizer handling of \N{NAME}). */
7567 if (*RExC_parse != '{') {
7568 vFAIL("Missing braces on \\N{}");
7571 RExC_parse++; /* Skip past the '{' */
7573 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7574 || ! (endbrace == RExC_parse /* nothing between the {} */
7575 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7576 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7578 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7579 vFAIL("\\N{NAME} must be resolved by the lexer");
7582 if (endbrace == RExC_parse) { /* empty: \N{} */
7584 RExC_parse = endbrace + 1;
7585 return reg_node(pRExC_state,NOTHING);
7589 ckWARNreg(RExC_parse,
7590 "Ignoring zero length \\N{} in character class"
7592 RExC_parse = endbrace + 1;
7595 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7598 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7599 RExC_parse += 2; /* Skip past the 'U+' */
7601 if (valuep) { /* In a bracketed char class */
7602 /* We only pay attention to the first char of
7603 multichar strings being returned. I kinda wonder
7604 if this makes sense as it does change the behaviour
7605 from earlier versions, OTOH that behaviour was broken
7606 as well. XXX Solution is to recharacterize as
7607 [rest-of-class]|multi1|multi2... */
7609 STRLEN length_of_hex;
7610 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7611 | PERL_SCAN_DISALLOW_PREFIX
7612 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7614 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7615 if (endchar < endbrace) {
7616 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7619 length_of_hex = (STRLEN)(endchar - RExC_parse);
7620 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7622 /* The tokenizer should have guaranteed validity, but it's possible to
7623 * bypass it by using single quoting, so check */
7624 if (length_of_hex == 0
7625 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7627 RExC_parse += length_of_hex; /* Includes all the valid */
7628 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7629 ? UTF8SKIP(RExC_parse)
7631 /* Guard against malformed utf8 */
7632 if (RExC_parse >= endchar) RExC_parse = endchar;
7633 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7636 RExC_parse = endbrace + 1;
7637 if (endchar == endbrace) return NULL;
7639 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7641 else { /* Not a char class */
7642 char *s; /* String to put in generated EXACT node */
7643 STRLEN len = 0; /* Its current byte length */
7644 char *endchar; /* Points to '.' or '}' ending cur char in the input
7647 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7655 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7656 * the input which is of the form now 'c1.c2.c3...}' until find the
7657 * ending brace or exceed length 255. The characters that exceed this
7658 * limit are dropped. The limit could be relaxed should it become
7659 * desirable by reparsing this as (?:\N{NAME}), so could generate
7660 * multiple EXACT nodes, as is done for just regular input. But this
7661 * is primarily a named character, and not intended to be a huge long
7662 * string, so 255 bytes should be good enough */
7664 STRLEN length_of_hex;
7665 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7666 | PERL_SCAN_DISALLOW_PREFIX
7667 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7668 UV cp; /* Ord of current character */
7670 /* Code points are separated by dots. If none, there is only one
7671 * code point, and is terminated by the brace */
7672 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7674 /* The values are Unicode even on EBCDIC machines */
7675 length_of_hex = (STRLEN)(endchar - RExC_parse);
7676 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7677 if ( length_of_hex == 0
7678 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7680 RExC_parse += length_of_hex; /* Includes all the valid */
7681 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7682 ? UTF8SKIP(RExC_parse)
7684 /* Guard against malformed utf8 */
7685 if (RExC_parse >= endchar) RExC_parse = endchar;
7686 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7689 if (! FOLD) { /* Not folding, just append to the string */
7692 /* Quit before adding this character if would exceed limit */
7693 if (len + UNISKIP(cp) > U8_MAX) break;
7695 unilen = reguni(pRExC_state, cp, s);
7700 } else { /* Folding, output the folded equivalent */
7701 STRLEN foldlen,numlen;
7702 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7703 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7705 /* Quit before exceeding size limit */
7706 if (len + foldlen > U8_MAX) break;
7708 for (foldbuf = tmpbuf;
7712 cp = utf8_to_uvchr(foldbuf, &numlen);
7714 const STRLEN unilen = reguni(pRExC_state, cp, s);
7717 /* In EBCDIC the numlen and unilen can differ. */
7719 if (numlen >= foldlen)
7723 break; /* "Can't happen." */
7727 /* Point to the beginning of the next character in the sequence. */
7728 RExC_parse = endchar + 1;
7730 /* Quit if no more characters */
7731 if (RExC_parse >= endbrace) break;
7736 if (RExC_parse < endbrace) {
7737 ckWARNreg(RExC_parse - 1,
7738 "Using just the first characters returned by \\N{}");
7741 RExC_size += STR_SZ(len);
7744 RExC_emit += STR_SZ(len);
7747 RExC_parse = endbrace + 1;
7749 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7750 with malformed in t/re/pat_advanced.t */
7752 Set_Node_Cur_Length(ret); /* MJD */
7753 nextchar(pRExC_state);
7763 * It returns the code point in utf8 for the value in *encp.
7764 * value: a code value in the source encoding
7765 * encp: a pointer to an Encode object
7767 * If the result from Encode is not a single character,
7768 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7771 S_reg_recode(pTHX_ const char value, SV **encp)
7774 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7775 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7776 const STRLEN newlen = SvCUR(sv);
7777 UV uv = UNICODE_REPLACEMENT;
7779 PERL_ARGS_ASSERT_REG_RECODE;
7783 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7786 if (!newlen || numlen != newlen) {
7787 uv = UNICODE_REPLACEMENT;
7795 - regatom - the lowest level
7797 Try to identify anything special at the start of the pattern. If there
7798 is, then handle it as required. This may involve generating a single regop,
7799 such as for an assertion; or it may involve recursing, such as to
7800 handle a () structure.
7802 If the string doesn't start with something special then we gobble up
7803 as much literal text as we can.
7805 Once we have been able to handle whatever type of thing started the
7806 sequence, we return.
7808 Note: we have to be careful with escapes, as they can be both literal
7809 and special, and in the case of \10 and friends can either, depending
7810 on context. Specifically there are two separate switches for handling
7811 escape sequences, with the one for handling literal escapes requiring
7812 a dummy entry for all of the special escapes that are actually handled
7817 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7820 register regnode *ret = NULL;
7822 char *parse_start = RExC_parse;
7824 GET_RE_DEBUG_FLAGS_DECL;
7825 DEBUG_PARSE("atom");
7826 *flagp = WORST; /* Tentatively. */
7828 PERL_ARGS_ASSERT_REGATOM;
7831 switch ((U8)*RExC_parse) {
7833 RExC_seen_zerolen++;
7834 nextchar(pRExC_state);
7835 if (RExC_flags & RXf_PMf_MULTILINE)
7836 ret = reg_node(pRExC_state, MBOL);
7837 else if (RExC_flags & RXf_PMf_SINGLELINE)
7838 ret = reg_node(pRExC_state, SBOL);
7840 ret = reg_node(pRExC_state, BOL);
7841 Set_Node_Length(ret, 1); /* MJD */
7844 nextchar(pRExC_state);
7846 RExC_seen_zerolen++;
7847 if (RExC_flags & RXf_PMf_MULTILINE)
7848 ret = reg_node(pRExC_state, MEOL);
7849 else if (RExC_flags & RXf_PMf_SINGLELINE)
7850 ret = reg_node(pRExC_state, SEOL);
7852 ret = reg_node(pRExC_state, EOL);
7853 Set_Node_Length(ret, 1); /* MJD */
7856 nextchar(pRExC_state);
7857 if (RExC_flags & RXf_PMf_SINGLELINE)
7858 ret = reg_node(pRExC_state, SANY);
7860 ret = reg_node(pRExC_state, REG_ANY);
7861 *flagp |= HASWIDTH|SIMPLE;
7863 Set_Node_Length(ret, 1); /* MJD */
7867 char * const oregcomp_parse = ++RExC_parse;
7868 ret = regclass(pRExC_state,depth+1);
7869 if (*RExC_parse != ']') {
7870 RExC_parse = oregcomp_parse;
7871 vFAIL("Unmatched [");
7873 nextchar(pRExC_state);
7874 *flagp |= HASWIDTH|SIMPLE;
7875 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7879 nextchar(pRExC_state);
7880 ret = reg(pRExC_state, 1, &flags,depth+1);
7882 if (flags & TRYAGAIN) {
7883 if (RExC_parse == RExC_end) {
7884 /* Make parent create an empty node if needed. */
7892 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7896 if (flags & TRYAGAIN) {
7900 vFAIL("Internal urp");
7901 /* Supposed to be caught earlier. */
7904 if (!regcurly(RExC_parse)) {
7913 vFAIL("Quantifier follows nothing");
7915 case LATIN_SMALL_LETTER_SHARP_S:
7916 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7917 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7918 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7919 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7920 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7925 len=0; /* silence a spurious compiler warning */
7926 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7927 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7928 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7929 ret = reganode(pRExC_state, FOLDCHAR, cp);
7930 Set_Node_Length(ret, 1); /* MJD */
7931 nextchar(pRExC_state); /* kill whitespace under /x */
7939 This switch handles escape sequences that resolve to some kind
7940 of special regop and not to literal text. Escape sequnces that
7941 resolve to literal text are handled below in the switch marked
7944 Every entry in this switch *must* have a corresponding entry
7945 in the literal escape switch. However, the opposite is not
7946 required, as the default for this switch is to jump to the
7947 literal text handling code.
7949 switch ((U8)*++RExC_parse) {
7950 case LATIN_SMALL_LETTER_SHARP_S:
7951 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7952 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7954 /* Special Escapes */
7956 RExC_seen_zerolen++;
7957 ret = reg_node(pRExC_state, SBOL);
7959 goto finish_meta_pat;
7961 ret = reg_node(pRExC_state, GPOS);
7962 RExC_seen |= REG_SEEN_GPOS;
7964 goto finish_meta_pat;
7966 RExC_seen_zerolen++;
7967 ret = reg_node(pRExC_state, KEEPS);
7969 /* XXX:dmq : disabling in-place substitution seems to
7970 * be necessary here to avoid cases of memory corruption, as
7971 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7973 RExC_seen |= REG_SEEN_LOOKBEHIND;
7974 goto finish_meta_pat;
7976 ret = reg_node(pRExC_state, SEOL);
7978 RExC_seen_zerolen++; /* Do not optimize RE away */
7979 goto finish_meta_pat;
7981 ret = reg_node(pRExC_state, EOS);
7983 RExC_seen_zerolen++; /* Do not optimize RE away */
7984 goto finish_meta_pat;
7986 ret = reg_node(pRExC_state, CANY);
7987 RExC_seen |= REG_SEEN_CANY;
7988 *flagp |= HASWIDTH|SIMPLE;
7989 goto finish_meta_pat;
7991 ret = reg_node(pRExC_state, CLUMP);
7993 goto finish_meta_pat;
7995 switch (get_regex_charset(RExC_flags)) {
7996 case REGEX_LOCALE_CHARSET:
7999 case REGEX_UNICODE_CHARSET:
8002 case REGEX_ASCII_RESTRICTED_CHARSET:
8005 case REGEX_DEPENDS_CHARSET:
8011 ret = reg_node(pRExC_state, op);
8012 *flagp |= HASWIDTH|SIMPLE;
8013 goto finish_meta_pat;
8015 switch (get_regex_charset(RExC_flags)) {
8016 case REGEX_LOCALE_CHARSET:
8019 case REGEX_UNICODE_CHARSET:
8022 case REGEX_ASCII_RESTRICTED_CHARSET:
8025 case REGEX_DEPENDS_CHARSET:
8031 ret = reg_node(pRExC_state, op);
8032 *flagp |= HASWIDTH|SIMPLE;
8033 goto finish_meta_pat;
8035 RExC_seen_zerolen++;
8036 RExC_seen |= REG_SEEN_LOOKBEHIND;
8037 switch (get_regex_charset(RExC_flags)) {
8038 case REGEX_LOCALE_CHARSET:
8041 case REGEX_UNICODE_CHARSET:
8044 case REGEX_ASCII_RESTRICTED_CHARSET:
8047 case REGEX_DEPENDS_CHARSET:
8053 ret = reg_node(pRExC_state, op);
8054 FLAGS(ret) = get_regex_charset(RExC_flags);
8056 goto finish_meta_pat;
8058 RExC_seen_zerolen++;
8059 RExC_seen |= REG_SEEN_LOOKBEHIND;
8060 switch (get_regex_charset(RExC_flags)) {
8061 case REGEX_LOCALE_CHARSET:
8064 case REGEX_UNICODE_CHARSET:
8067 case REGEX_ASCII_RESTRICTED_CHARSET:
8070 case REGEX_DEPENDS_CHARSET:
8076 ret = reg_node(pRExC_state, op);
8077 FLAGS(ret) = get_regex_charset(RExC_flags);
8079 goto finish_meta_pat;
8081 switch (get_regex_charset(RExC_flags)) {
8082 case REGEX_LOCALE_CHARSET:
8085 case REGEX_UNICODE_CHARSET:
8088 case REGEX_ASCII_RESTRICTED_CHARSET:
8091 case REGEX_DEPENDS_CHARSET:
8097 ret = reg_node(pRExC_state, op);
8098 *flagp |= HASWIDTH|SIMPLE;
8099 goto finish_meta_pat;
8101 switch (get_regex_charset(RExC_flags)) {
8102 case REGEX_LOCALE_CHARSET:
8105 case REGEX_UNICODE_CHARSET:
8108 case REGEX_ASCII_RESTRICTED_CHARSET:
8111 case REGEX_DEPENDS_CHARSET:
8117 ret = reg_node(pRExC_state, op);
8118 *flagp |= HASWIDTH|SIMPLE;
8119 goto finish_meta_pat;
8121 switch (get_regex_charset(RExC_flags)) {
8122 case REGEX_LOCALE_CHARSET:
8125 case REGEX_ASCII_RESTRICTED_CHARSET:
8128 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8129 case REGEX_UNICODE_CHARSET:
8135 ret = reg_node(pRExC_state, op);
8136 *flagp |= HASWIDTH|SIMPLE;
8137 goto finish_meta_pat;
8139 switch (get_regex_charset(RExC_flags)) {
8140 case REGEX_LOCALE_CHARSET:
8143 case REGEX_ASCII_RESTRICTED_CHARSET:
8146 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8147 case REGEX_UNICODE_CHARSET:
8153 ret = reg_node(pRExC_state, op);
8154 *flagp |= HASWIDTH|SIMPLE;
8155 goto finish_meta_pat;
8157 ret = reg_node(pRExC_state, LNBREAK);
8158 *flagp |= HASWIDTH|SIMPLE;
8159 goto finish_meta_pat;
8161 ret = reg_node(pRExC_state, HORIZWS);
8162 *flagp |= HASWIDTH|SIMPLE;
8163 goto finish_meta_pat;
8165 ret = reg_node(pRExC_state, NHORIZWS);
8166 *flagp |= HASWIDTH|SIMPLE;
8167 goto finish_meta_pat;
8169 ret = reg_node(pRExC_state, VERTWS);
8170 *flagp |= HASWIDTH|SIMPLE;
8171 goto finish_meta_pat;
8173 ret = reg_node(pRExC_state, NVERTWS);
8174 *flagp |= HASWIDTH|SIMPLE;
8176 nextchar(pRExC_state);
8177 Set_Node_Length(ret, 2); /* MJD */
8182 char* const oldregxend = RExC_end;
8184 char* parse_start = RExC_parse - 2;
8187 if (RExC_parse[1] == '{') {
8188 /* a lovely hack--pretend we saw [\pX] instead */
8189 RExC_end = strchr(RExC_parse, '}');
8191 const U8 c = (U8)*RExC_parse;
8193 RExC_end = oldregxend;
8194 vFAIL2("Missing right brace on \\%c{}", c);
8199 RExC_end = RExC_parse + 2;
8200 if (RExC_end > oldregxend)
8201 RExC_end = oldregxend;
8205 ret = regclass(pRExC_state,depth+1);
8207 RExC_end = oldregxend;
8210 Set_Node_Offset(ret, parse_start + 2);
8211 Set_Node_Cur_Length(ret);
8212 nextchar(pRExC_state);
8213 *flagp |= HASWIDTH|SIMPLE;
8217 /* Handle \N and \N{NAME} here and not below because it can be
8218 multicharacter. join_exact() will join them up later on.
8219 Also this makes sure that things like /\N{BLAH}+/ and
8220 \N{BLAH} being multi char Just Happen. dmq*/
8222 ret= reg_namedseq(pRExC_state, NULL, flagp);
8224 case 'k': /* Handle \k<NAME> and \k'NAME' */
8227 char ch= RExC_parse[1];
8228 if (ch != '<' && ch != '\'' && ch != '{') {
8230 vFAIL2("Sequence %.2s... not terminated",parse_start);
8232 /* this pretty much dupes the code for (?P=...) in reg(), if
8233 you change this make sure you change that */
8234 char* name_start = (RExC_parse += 2);
8236 SV *sv_dat = reg_scan_name(pRExC_state,
8237 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8238 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8239 if (RExC_parse == name_start || *RExC_parse != ch)
8240 vFAIL2("Sequence %.3s... not terminated",parse_start);
8243 num = add_data( pRExC_state, 1, "S" );
8244 RExC_rxi->data->data[num]=(void*)sv_dat;
8245 SvREFCNT_inc_simple_void(sv_dat);
8249 ret = reganode(pRExC_state,
8252 : (AT_LEAST_UNI_SEMANTICS)
8260 /* override incorrect value set in reganode MJD */
8261 Set_Node_Offset(ret, parse_start+1);
8262 Set_Node_Cur_Length(ret); /* MJD */
8263 nextchar(pRExC_state);
8269 case '1': case '2': case '3': case '4':
8270 case '5': case '6': case '7': case '8': case '9':
8273 bool isg = *RExC_parse == 'g';
8278 if (*RExC_parse == '{') {
8282 if (*RExC_parse == '-') {
8286 if (hasbrace && !isDIGIT(*RExC_parse)) {
8287 if (isrel) RExC_parse--;
8289 goto parse_named_seq;
8291 num = atoi(RExC_parse);
8292 if (isg && num == 0)
8293 vFAIL("Reference to invalid group 0");
8295 num = RExC_npar - num;
8297 vFAIL("Reference to nonexistent or unclosed group");
8299 if (!isg && num > 9 && num >= RExC_npar)
8302 char * const parse_start = RExC_parse - 1; /* MJD */
8303 while (isDIGIT(*RExC_parse))
8305 if (parse_start == RExC_parse - 1)
8306 vFAIL("Unterminated \\g... pattern");
8308 if (*RExC_parse != '}')
8309 vFAIL("Unterminated \\g{...} pattern");
8313 if (num > (I32)RExC_rx->nparens)
8314 vFAIL("Reference to nonexistent group");
8317 ret = reganode(pRExC_state,
8320 : (AT_LEAST_UNI_SEMANTICS)
8328 /* override incorrect value set in reganode MJD */
8329 Set_Node_Offset(ret, parse_start+1);
8330 Set_Node_Cur_Length(ret); /* MJD */
8332 nextchar(pRExC_state);
8337 if (RExC_parse >= RExC_end)
8338 FAIL("Trailing \\");
8341 /* Do not generate "unrecognized" warnings here, we fall
8342 back into the quick-grab loop below */
8349 if (RExC_flags & RXf_PMf_EXTENDED) {
8350 if ( reg_skipcomment( pRExC_state ) )
8357 register STRLEN len;
8362 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8364 parse_start = RExC_parse - 1;
8370 ret = reg_node(pRExC_state,
8371 (U8) ((! FOLD) ? EXACT
8374 : (AT_LEAST_UNI_SEMANTICS)
8379 for (len = 0, p = RExC_parse - 1;
8380 len < 127 && p < RExC_end;
8383 char * const oldp = p;
8385 if (RExC_flags & RXf_PMf_EXTENDED)
8386 p = regwhite( pRExC_state, p );
8388 case LATIN_SMALL_LETTER_SHARP_S:
8389 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8390 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8391 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8392 goto normal_default;
8402 /* Literal Escapes Switch
8404 This switch is meant to handle escape sequences that
8405 resolve to a literal character.
8407 Every escape sequence that represents something
8408 else, like an assertion or a char class, is handled
8409 in the switch marked 'Special Escapes' above in this
8410 routine, but also has an entry here as anything that
8411 isn't explicitly mentioned here will be treated as
8412 an unescaped equivalent literal.
8416 /* These are all the special escapes. */
8417 case LATIN_SMALL_LETTER_SHARP_S:
8418 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8419 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8420 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8421 goto normal_default;
8422 case 'A': /* Start assertion */
8423 case 'b': case 'B': /* Word-boundary assertion*/
8424 case 'C': /* Single char !DANGEROUS! */
8425 case 'd': case 'D': /* digit class */
8426 case 'g': case 'G': /* generic-backref, pos assertion */
8427 case 'h': case 'H': /* HORIZWS */
8428 case 'k': case 'K': /* named backref, keep marker */
8429 case 'N': /* named char sequence */
8430 case 'p': case 'P': /* Unicode property */
8431 case 'R': /* LNBREAK */
8432 case 's': case 'S': /* space class */
8433 case 'v': case 'V': /* VERTWS */
8434 case 'w': case 'W': /* word class */
8435 case 'X': /* eXtended Unicode "combining character sequence" */
8436 case 'z': case 'Z': /* End of line/string assertion */
8440 /* Anything after here is an escape that resolves to a
8441 literal. (Except digits, which may or may not)
8460 ender = ASCII_TO_NATIVE('\033');
8464 ender = ASCII_TO_NATIVE('\007');
8469 STRLEN brace_len = len;
8471 const char* error_msg;
8473 bool valid = grok_bslash_o(p,
8480 RExC_parse = p; /* going to die anyway; point
8481 to exact spot of failure */
8488 if (PL_encoding && ender < 0x100) {
8489 goto recode_encoding;
8498 char* const e = strchr(p, '}');
8502 vFAIL("Missing right brace on \\x{}");
8505 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8506 | PERL_SCAN_DISALLOW_PREFIX;
8507 STRLEN numlen = e - p - 1;
8508 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8515 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8517 ender = grok_hex(p, &numlen, &flags, NULL);
8520 if (PL_encoding && ender < 0x100)
8521 goto recode_encoding;
8525 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8527 case '0': case '1': case '2': case '3':case '4':
8528 case '5': case '6': case '7': case '8':case '9':
8530 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8532 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8534 ender = grok_oct(p, &numlen, &flags, NULL);
8544 if (PL_encoding && ender < 0x100)
8545 goto recode_encoding;
8549 SV* enc = PL_encoding;
8550 ender = reg_recode((const char)(U8)ender, &enc);
8551 if (!enc && SIZE_ONLY)
8552 ckWARNreg(p, "Invalid escape in the specified encoding");
8558 FAIL("Trailing \\");
8561 if (!SIZE_ONLY&& isALPHA(*p))
8562 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
8563 goto normal_default;
8568 if (UTF8_IS_START(*p) && UTF) {
8570 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8571 &numlen, UTF8_ALLOW_DEFAULT);
8578 if ( RExC_flags & RXf_PMf_EXTENDED)
8579 p = regwhite( pRExC_state, p );
8581 /* Prime the casefolded buffer. */
8582 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8584 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8589 /* Emit all the Unicode characters. */
8591 for (foldbuf = tmpbuf;
8593 foldlen -= numlen) {
8594 ender = utf8_to_uvchr(foldbuf, &numlen);
8596 const STRLEN unilen = reguni(pRExC_state, ender, s);
8599 /* In EBCDIC the numlen
8600 * and unilen can differ. */
8602 if (numlen >= foldlen)
8606 break; /* "Can't happen." */
8610 const STRLEN unilen = reguni(pRExC_state, ender, s);
8619 REGC((char)ender, s++);
8625 /* Emit all the Unicode characters. */
8627 for (foldbuf = tmpbuf;
8629 foldlen -= numlen) {
8630 ender = utf8_to_uvchr(foldbuf, &numlen);
8632 const STRLEN unilen = reguni(pRExC_state, ender, s);
8635 /* In EBCDIC the numlen
8636 * and unilen can differ. */
8638 if (numlen >= foldlen)
8646 const STRLEN unilen = reguni(pRExC_state, ender, s);
8655 REGC((char)ender, s++);
8659 Set_Node_Cur_Length(ret); /* MJD */
8660 nextchar(pRExC_state);
8662 /* len is STRLEN which is unsigned, need to copy to signed */
8665 vFAIL("Internal disaster");
8669 if (len == 1 && UNI_IS_INVARIANT(ender))
8673 RExC_size += STR_SZ(len);
8676 RExC_emit += STR_SZ(len);
8684 /* Jumped to when an unrecognized character set is encountered */
8686 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8691 S_regwhite( RExC_state_t *pRExC_state, char *p )
8693 const char *e = RExC_end;
8695 PERL_ARGS_ASSERT_REGWHITE;
8700 else if (*p == '#') {
8709 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8717 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8718 Character classes ([:foo:]) can also be negated ([:^foo:]).
8719 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8720 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8721 but trigger failures because they are currently unimplemented. */
8723 #define POSIXCC_DONE(c) ((c) == ':')
8724 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8725 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8728 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8731 I32 namedclass = OOB_NAMEDCLASS;
8733 PERL_ARGS_ASSERT_REGPPOSIXCC;
8735 if (value == '[' && RExC_parse + 1 < RExC_end &&
8736 /* I smell either [: or [= or [. -- POSIX has been here, right? */
8737 POSIXCC(UCHARAT(RExC_parse))) {
8738 const char c = UCHARAT(RExC_parse);
8739 char* const s = RExC_parse++;
8741 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8743 if (RExC_parse == RExC_end)
8744 /* Grandfather lone [:, [=, [. */
8747 const char* const t = RExC_parse++; /* skip over the c */
8750 if (UCHARAT(RExC_parse) == ']') {
8751 const char *posixcc = s + 1;
8752 RExC_parse++; /* skip over the ending ] */
8755 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8756 const I32 skip = t - posixcc;
8758 /* Initially switch on the length of the name. */
8761 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8762 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8765 /* Names all of length 5. */
8766 /* alnum alpha ascii blank cntrl digit graph lower
8767 print punct space upper */
8768 /* Offset 4 gives the best switch position. */
8769 switch (posixcc[4]) {
8771 if (memEQ(posixcc, "alph", 4)) /* alpha */
8772 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8775 if (memEQ(posixcc, "spac", 4)) /* space */
8776 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8779 if (memEQ(posixcc, "grap", 4)) /* graph */
8780 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8783 if (memEQ(posixcc, "asci", 4)) /* ascii */
8784 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8787 if (memEQ(posixcc, "blan", 4)) /* blank */
8788 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8791 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8792 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8795 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8796 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8799 if (memEQ(posixcc, "lowe", 4)) /* lower */
8800 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8801 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8802 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8805 if (memEQ(posixcc, "digi", 4)) /* digit */
8806 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8807 else if (memEQ(posixcc, "prin", 4)) /* print */
8808 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8809 else if (memEQ(posixcc, "punc", 4)) /* punct */
8810 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8815 if (memEQ(posixcc, "xdigit", 6))
8816 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8820 if (namedclass == OOB_NAMEDCLASS)
8821 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8823 assert (posixcc[skip] == ':');
8824 assert (posixcc[skip+1] == ']');
8825 } else if (!SIZE_ONLY) {
8826 /* [[=foo=]] and [[.foo.]] are still future. */
8828 /* adjust RExC_parse so the warning shows after
8830 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8832 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8835 /* Maternal grandfather:
8836 * "[:" ending in ":" but not in ":]" */
8846 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8850 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8852 if (POSIXCC(UCHARAT(RExC_parse))) {
8853 const char *s = RExC_parse;
8854 const char c = *s++;
8858 if (*s && c == *s && s[1] == ']') {
8860 "POSIX syntax [%c %c] belongs inside character classes",
8863 /* [[=foo=]] and [[.foo.]] are still future. */
8864 if (POSIXCC_NOTYET(c)) {
8865 /* adjust RExC_parse so the error shows after
8867 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8869 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8875 /* No locale test, and always Unicode semantics */
8876 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8878 for (value = 0; value < 256; value++) \
8880 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
8884 case ANYOF_N##NAME: \
8885 for (value = 0; value < 256; value++) \
8887 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
8892 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8893 * there are two tests passed in, to use depending on that. There aren't any
8894 * cases where the label is different from the name, so no need for that
8896 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
8898 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8899 else if (UNI_SEMANTICS) { \
8900 for (value = 0; value < 256; value++) { \
8901 if (TEST_8(value)) stored += \
8902 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
8906 for (value = 0; value < 128; value++) { \
8907 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
8908 S_set_regclass_bit(aTHX_ pRExC_state, ret, \
8909 (U8) UNI_TO_NATIVE(value), &nonbitmap); \
8915 case ANYOF_N##NAME: \
8916 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8917 else if (UNI_SEMANTICS) { \
8918 for (value = 0; value < 256; value++) { \
8919 if (! TEST_8(value)) stored += \
8920 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \
8924 for (value = 0; value < 128; value++) { \
8925 if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit( \
8926 aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
8928 if (ASCII_RESTRICTED) { \
8929 for (value = 128; value < 256; value++) { \
8930 stored += S_set_regclass_bit( \
8931 aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
8933 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
8936 /* For a non-ut8 target string with DEPENDS semantics, all above \
8937 * ASCII Latin1 code points match the complement of any of the \
8938 * classes. But in utf8, they have their Unicode semantics, so \
8939 * can't just set them in the bitmap, or else regexec.c will think \
8940 * they matched when they shouldn't. */ \
8941 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
8949 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8950 so that it is possible to override the option here without having to
8951 rebuild the entire core. as we are required to do if we change regcomp.h
8952 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8954 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8955 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8958 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8959 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8961 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8965 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
8968 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8969 * Locale folding is done at run-time, so this function should not be
8970 * called for nodes that are for locales.
8972 * This function simply sets the bit corresponding to the fold of the input
8973 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8976 * It also sets any necessary flags, and returns the number of bits that
8977 * actually changed from 0 to 1 */
8982 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
8985 /* It assumes the bit for 'value' has already been set */
8986 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8987 ANYOF_BITMAP_SET(node, fold);
8990 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8993 && PL_fold_latin1[value] != value))
8994 { /* A character that has a fold outside of Latin1 matches outside the
8995 bitmap, but only when the target string is utf8. Similarly when we
8996 don't have unicode semantics for the above ASCII Latin-1 characters,
8997 and they have a fold, they should match if the target is utf8, and
8999 if (! *nonbitmap_ptr) {
9000 *nonbitmap_ptr = _new_invlist(2);
9002 *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
9003 ANYOF_FLAGS(node) |= ANYOF_UTF8;
9010 PERL_STATIC_INLINE U8
9011 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
9013 /* This inline function sets a bit in the bitmap if not already set, and if
9014 * appropriate, its fold, returning the number of bits that actually
9015 * changed from 0 to 1 */
9019 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9023 ANYOF_BITMAP_SET(node, value);
9026 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9027 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr);
9034 parse a class specification and produce either an ANYOF node that
9035 matches the pattern or if the pattern matches a single char only and
9036 that char is < 256 and we are case insensitive then we produce an
9041 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9044 register UV nextvalue;
9045 register IV prevvalue = OOB_UNICODE;
9046 register IV range = 0;
9047 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9048 register regnode *ret;
9051 char *rangebegin = NULL;
9052 bool need_class = 0;
9055 HV* nonbitmap = NULL;
9056 AV* unicode_alternate = NULL;
9058 UV literal_endpoint = 0;
9060 UV stored = 0; /* how many chars stored in the bitmap */
9062 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9063 case we need to change the emitted regop to an EXACT. */
9064 const char * orig_parse = RExC_parse;
9065 GET_RE_DEBUG_FLAGS_DECL;
9067 PERL_ARGS_ASSERT_REGCLASS;
9069 PERL_UNUSED_ARG(depth);
9072 DEBUG_PARSE("clas");
9074 /* Assume we are going to generate an ANYOF node. */
9075 ret = reganode(pRExC_state, ANYOF, 0);
9079 ANYOF_FLAGS(ret) = 0;
9082 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9086 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9090 RExC_size += ANYOF_SKIP;
9091 #ifdef ANYOF_ADD_LOC_SKIP
9093 RExC_size += ANYOF_ADD_LOC_SKIP;
9096 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9099 RExC_emit += ANYOF_SKIP;
9101 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9102 #ifdef ANYOF_ADD_LOC_SKIP
9103 RExC_emit += ANYOF_ADD_LOC_SKIP;
9106 ANYOF_BITMAP_ZERO(ret);
9107 listsv = newSVpvs("# comment\n");
9110 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9112 if (!SIZE_ONLY && POSIXCC(nextvalue))
9113 checkposixcc(pRExC_state);
9115 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9116 if (UCHARAT(RExC_parse) == ']')
9120 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9124 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9127 rangebegin = RExC_parse;
9129 value = utf8n_to_uvchr((U8*)RExC_parse,
9130 RExC_end - RExC_parse,
9131 &numlen, UTF8_ALLOW_DEFAULT);
9132 RExC_parse += numlen;
9135 value = UCHARAT(RExC_parse++);
9137 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9138 if (value == '[' && POSIXCC(nextvalue))
9139 namedclass = regpposixcc(pRExC_state, value);
9140 else if (value == '\\') {
9142 value = utf8n_to_uvchr((U8*)RExC_parse,
9143 RExC_end - RExC_parse,
9144 &numlen, UTF8_ALLOW_DEFAULT);
9145 RExC_parse += numlen;
9148 value = UCHARAT(RExC_parse++);
9149 /* Some compilers cannot handle switching on 64-bit integer
9150 * values, therefore value cannot be an UV. Yes, this will
9151 * be a problem later if we want switch on Unicode.
9152 * A similar issue a little bit later when switching on
9153 * namedclass. --jhi */
9154 switch ((I32)value) {
9155 case 'w': namedclass = ANYOF_ALNUM; break;
9156 case 'W': namedclass = ANYOF_NALNUM; break;
9157 case 's': namedclass = ANYOF_SPACE; break;
9158 case 'S': namedclass = ANYOF_NSPACE; break;
9159 case 'd': namedclass = ANYOF_DIGIT; break;
9160 case 'D': namedclass = ANYOF_NDIGIT; break;
9161 case 'v': namedclass = ANYOF_VERTWS; break;
9162 case 'V': namedclass = ANYOF_NVERTWS; break;
9163 case 'h': namedclass = ANYOF_HORIZWS; break;
9164 case 'H': namedclass = ANYOF_NHORIZWS; break;
9165 case 'N': /* Handle \N{NAME} in class */
9167 /* We only pay attention to the first char of
9168 multichar strings being returned. I kinda wonder
9169 if this makes sense as it does change the behaviour
9170 from earlier versions, OTOH that behaviour was broken
9172 UV v; /* value is register so we cant & it /grrr */
9173 if (reg_namedseq(pRExC_state, &v, NULL)) {
9183 if (RExC_parse >= RExC_end)
9184 vFAIL2("Empty \\%c{}", (U8)value);
9185 if (*RExC_parse == '{') {
9186 const U8 c = (U8)value;
9187 e = strchr(RExC_parse++, '}');
9189 vFAIL2("Missing right brace on \\%c{}", c);
9190 while (isSPACE(UCHARAT(RExC_parse)))
9192 if (e == RExC_parse)
9193 vFAIL2("Empty \\%c{}", c);
9195 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9204 ckWARN2reg(RExC_parse,
9205 "\\%c uses Unicode rules, not locale rules",
9210 if (UCHARAT(RExC_parse) == '^') {
9213 value = value == 'p' ? 'P' : 'p'; /* toggle */
9214 while (isSPACE(UCHARAT(RExC_parse))) {
9220 /* Add the property name to the list. If /i matching, give
9221 * a different name which consists of the normal name
9222 * sandwiched between two underscores and '_i'. The design
9223 * is discussed in the commit message for this. */
9224 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9225 (value=='p' ? '+' : '!'),
9234 /* The \p could match something in the Latin1 range, hence
9235 * something that isn't utf8 */
9236 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
9237 namedclass = ANYOF_MAX; /* no official name, but it's named */
9239 /* \p means they want Unicode semantics */
9240 RExC_uni_semantics = 1;
9243 case 'n': value = '\n'; break;
9244 case 'r': value = '\r'; break;
9245 case 't': value = '\t'; break;
9246 case 'f': value = '\f'; break;
9247 case 'b': value = '\b'; break;
9248 case 'e': value = ASCII_TO_NATIVE('\033');break;
9249 case 'a': value = ASCII_TO_NATIVE('\007');break;
9251 RExC_parse--; /* function expects to be pointed at the 'o' */
9253 const char* error_msg;
9254 bool valid = grok_bslash_o(RExC_parse,
9259 RExC_parse += numlen;
9264 if (PL_encoding && value < 0x100) {
9265 goto recode_encoding;
9269 if (*RExC_parse == '{') {
9270 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9271 | PERL_SCAN_DISALLOW_PREFIX;
9272 char * const e = strchr(RExC_parse++, '}');
9274 vFAIL("Missing right brace on \\x{}");
9276 numlen = e - RExC_parse;
9277 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9281 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9283 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9284 RExC_parse += numlen;
9286 if (PL_encoding && value < 0x100)
9287 goto recode_encoding;
9290 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9292 case '0': case '1': case '2': case '3': case '4':
9293 case '5': case '6': case '7':
9295 /* Take 1-3 octal digits */
9296 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9298 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9299 RExC_parse += numlen;
9300 if (PL_encoding && value < 0x100)
9301 goto recode_encoding;
9306 SV* enc = PL_encoding;
9307 value = reg_recode((const char)(U8)value, &enc);
9308 if (!enc && SIZE_ONLY)
9309 ckWARNreg(RExC_parse,
9310 "Invalid escape in the specified encoding");
9314 /* Allow \_ to not give an error */
9315 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9316 ckWARN2reg(RExC_parse,
9317 "Unrecognized escape \\%c in character class passed through",
9322 } /* end of \blah */
9328 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9330 /* What matches in a locale is not known until runtime, so need to
9331 * (one time per class) allocate extra space to pass to regexec.
9332 * The space will contain a bit for each named class that is to be
9333 * matched against. This isn't needed for \p{} and pseudo-classes,
9334 * as they are not affected by locale, and hence are dealt with
9336 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9339 #ifdef ANYOF_CLASS_ADD_SKIP
9340 RExC_size += ANYOF_CLASS_ADD_SKIP;
9344 #ifdef ANYOF_CLASS_ADD_SKIP
9345 RExC_emit += ANYOF_CLASS_ADD_SKIP;
9347 ANYOF_CLASS_ZERO(ret);
9349 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9352 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9357 RExC_parse >= rangebegin ?
9358 RExC_parse - rangebegin : 0;
9359 ckWARN4reg(RExC_parse,
9360 "False [] range \"%*.*s\"",
9363 if (prevvalue < 256) {
9365 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap);
9367 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
9370 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9371 Perl_sv_catpvf(aTHX_ listsv,
9372 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
9376 range = 0; /* this was not a true range */
9382 const char *what = NULL;
9385 /* Possible truncation here but in some 64-bit environments
9386 * the compiler gets heartburn about switch on 64-bit values.
9387 * A similar issue a little earlier when switching on value.
9389 switch ((I32)namedclass) {
9391 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9392 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9393 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9394 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9395 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9396 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9397 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9398 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9399 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9400 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9401 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
9402 /* \s, \w match all unicode if utf8. */
9403 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9404 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9406 /* \s, \w match ascii and locale only */
9407 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
9408 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
9410 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9411 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9412 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9415 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9417 for (value = 0; value < 128; value++)
9419 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9422 what = NULL; /* Doesn't match outside ascii, so
9423 don't want to add +utf8:: */
9427 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9429 for (value = 128; value < 256; value++)
9431 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
9433 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9439 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9441 /* consecutive digits assumed */
9442 for (value = '0'; value <= '9'; value++)
9444 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
9447 what = POSIX_CC_UNI_NAME("Digit");
9451 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9453 /* consecutive digits assumed */
9454 for (value = 0; value < '0'; value++)
9456 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
9457 for (value = '9' + 1; value < 256; value++)
9459 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
9462 what = POSIX_CC_UNI_NAME("Digit");
9463 if (ASCII_RESTRICTED ) {
9464 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9468 /* this is to handle \p and \P */
9471 vFAIL("Invalid [::] class");
9474 if (what && ! (ASCII_RESTRICTED)) {
9475 /* Strings such as "+utf8::isWord\n" */
9476 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9477 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9482 } /* end of namedclass \blah */
9485 if (prevvalue > (IV)value) /* b-a */ {
9486 const int w = RExC_parse - rangebegin;
9487 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9488 range = 0; /* not a valid range */
9492 prevvalue = value; /* save the beginning of the range */
9493 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9494 RExC_parse[1] != ']') {
9497 /* a bad range like \w-, [:word:]- ? */
9498 if (namedclass > OOB_NAMEDCLASS) {
9499 if (ckWARN(WARN_REGEXP)) {
9501 RExC_parse >= rangebegin ?
9502 RExC_parse - rangebegin : 0;
9504 "False [] range \"%*.*s\"",
9509 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
9511 range = 1; /* yeah, it's a range! */
9512 continue; /* but do it the next time */
9517 RExC_uni_semantics = 1;
9520 /* now is the next time */
9522 if (prevvalue < 256) {
9523 const IV ceilvalue = value < 256 ? value : 255;
9526 /* In EBCDIC [\x89-\x91] should include
9527 * the \x8e but [i-j] should not. */
9528 if (literal_endpoint == 2 &&
9529 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9530 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9532 if (isLOWER(prevvalue)) {
9533 for (i = prevvalue; i <= ceilvalue; i++)
9534 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9536 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
9539 for (i = prevvalue; i <= ceilvalue; i++)
9540 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9542 S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
9548 for (i = prevvalue; i <= ceilvalue; i++) {
9549 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
9553 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
9554 const UV natvalue = NATIVE_TO_UNI(value);
9556 nonbitmap = _new_invlist(2);
9558 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9559 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
9563 /* If the code point requires utf8 to represent, and we are not
9564 * folding, it can't match unless the target is in utf8. Only
9565 * a few code points above 255 fold to below it, so XXX an
9566 * optimization would be to know which ones and set the flag
9568 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
9571 if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
9573 /* The \t sets the whole range */
9574 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
9575 prevnatvalue, natvalue);
9577 /* Currently, we don't look at every value in the range.
9578 * Therefore we have to assume the worst case: that if
9579 * folding, it will match more than one character. But in
9580 * lookbehind patterns, can only be single character
9581 * length, so disallow those folds */
9582 if (FOLD && ! RExC_in_lookbehind) {
9586 else if (prevnatvalue == natvalue) {
9587 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
9589 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9591 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
9593 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
9594 if (RExC_precomp[0] == ':' &&
9595 RExC_precomp[1] == '[' &&
9596 (f == 0xDF || f == 0x92)) {
9597 f = NATIVE_TO_UNI(f);
9600 /* If folding and foldable and a single
9601 * character, insert also the folded version
9602 * to the charclass. */
9604 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
9605 if ((RExC_precomp[0] == ':' &&
9606 RExC_precomp[1] == '[' &&
9608 (value == 0xFB05 || value == 0xFB06))) ?
9609 foldlen == ((STRLEN)UNISKIP(f) - 1) :
9610 foldlen == (STRLEN)UNISKIP(f) )
9612 if (foldlen == (STRLEN)UNISKIP(f))
9614 Perl_sv_catpvf(aTHX_ listsv,
9616 else if (! RExC_in_lookbehind) {
9617 /* Any multicharacter foldings
9618 * (disallowed in lookbehind patterns)
9619 * require the following transform:
9620 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
9621 * where E folds into "pq" and F folds
9622 * into "rst", all other characters
9623 * fold to single characters. We save
9624 * away these multicharacter foldings,
9625 * to be later saved as part of the
9626 * additional "s" data. */
9629 if (!unicode_alternate)
9630 unicode_alternate = newAV();
9631 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
9633 av_push(unicode_alternate, sv);
9638 /* If folding and the value is one of the Greek
9639 * sigmas insert a few more sigmas to make the
9640 * folding rules of the sigmas to work right.
9641 * Note that not all the possible combinations
9642 * are handled here: some of them are handled
9643 * by the standard folding rules, and some of
9644 * them (literal or EXACTF cases) are handled
9645 * during runtime in regexec.c:S_find_byclass(). */
9646 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
9647 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9648 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
9649 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9650 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9652 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
9653 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
9654 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
9660 literal_endpoint = 0;
9664 range = 0; /* this range (if it was one) is done now */
9671 /****** !SIZE_ONLY AFTER HERE *********/
9673 /* Finish up the non-bitmap entries */
9675 UV* nonbitmap_array;
9678 /* If folding, we add to the list all characters that could fold to or
9679 * from the ones already on the list */
9681 HV* fold_intersection;
9684 /* This is a list of all the characters that participate in folds
9685 * (except marks, etc in multi-char folds */
9686 if (! PL_utf8_foldable) {
9687 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9688 PL_utf8_foldable = _swash_to_invlist(swash);
9691 /* This is a hash that for a particular fold gives all characters
9692 * that are involved in it */
9693 if (! PL_utf8_foldclosures) {
9695 /* If we were unable to find any folds, then we likely won't be
9696 * able to find the closures. So just create an empty list.
9697 * Folding will effectively be restricted to the non-Unicode
9698 * rules hard-coded into Perl. (This case happens legitimately
9699 * during compilation of Perl itself before the Unicode tables
9701 if (invlist_len(PL_utf8_foldable) == 0) {
9702 PL_utf8_foldclosures = _new_invlist(0);
9704 /* If the folds haven't been read in, call a fold function
9706 if (! PL_utf8_tofold) {
9707 U8 dummy[UTF8_MAXBYTES+1];
9709 to_utf8_fold((U8*) "A", dummy, &dummy_len);
9711 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9715 /* Only the characters in this class that participate in folds need
9716 * be checked. Get the intersection of this class and all the
9717 * possible characters that are foldable. This can quickly narrow
9718 * down a large class */
9719 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
9721 /* Now look at the foldable characters in this class individually */
9722 fold_list = invlist_array(fold_intersection);
9723 for (i = 0; i < invlist_len(fold_intersection); i++) {
9726 /* The next entry is the beginning of the range that is in the
9728 UV start = fold_list[i++];
9731 /* The next entry is the beginning of the next range, which
9732 * isn't in the class, so the end of the current range is one
9734 UV end = fold_list[i] - 1;
9736 /* Look at every character in the range */
9737 for (j = start; j <= end; j++) {
9740 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
9742 const UV f = to_uni_fold(j, foldbuf, &foldlen);
9744 if (foldlen > (STRLEN)UNISKIP(f)) {
9746 /* Any multicharacter foldings (disallowed in
9747 * lookbehind patterns) require the following
9748 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
9749 * E folds into "pq" and F folds into "rst", all other
9750 * characters fold to single characters. We save away
9751 * these multicharacter foldings, to be later saved as
9752 * part of the additional "s" data. */
9753 if (! RExC_in_lookbehind) {
9754 /* XXX Discard this fold if any are latin1 and LOC */
9757 if (!unicode_alternate) {
9758 unicode_alternate = newAV();
9760 sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
9761 av_push(unicode_alternate, sv);
9763 /* This node is variable length */
9765 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
9768 else { /* Single character fold */
9771 /* Consider "k" =~ /[K]/i. The line above would have
9772 * just folded the 'k' to itself, and that isn't going
9773 * to match 'K'. So we look through the closure of
9774 * everything that folds to 'k'. That will find the
9775 * 'K'. Initialize the list, if necessary */
9777 /* The data structure is a hash with the keys every
9778 * character that is folded to, like 'k', and the
9779 * values each an array of everything that folds to its
9780 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
9781 if ((listp = hv_fetch(PL_utf8_foldclosures,
9782 (char *) foldbuf, foldlen, FALSE)))
9784 AV* list = (AV*) *listp;
9786 for (k = 0; k <= av_len(list); k++) {
9787 SV** c_p = av_fetch(list, k, FALSE);
9790 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
9794 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
9795 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap);
9797 /* It may be that the code point is already
9798 * in this range or already in the bitmap,
9799 * XXX THink about LOC
9800 * in which case we need do nothing */
9801 else if ((c < start || c > end)
9803 || ! ANYOF_BITMAP_TEST(ret, c)))
9805 nonbitmap = add_range_to_invlist(nonbitmap, c, c);
9812 invlist_destroy(fold_intersection);
9813 } /* End of processing all the folds */
9815 /* Here have the full list of items to match that aren't in the
9816 * bitmap. Convert to the structure that the rest of the code is
9817 * expecting. XXX That rest of the code should convert to this
9819 nonbitmap_array = invlist_array(nonbitmap);
9820 for (i = 0; i < invlist_len(nonbitmap); i++) {
9822 /* The next entry is the beginning of the range that is in the
9824 UV start = nonbitmap_array[i++];
9826 /* The next entry is the beginning of the next range, which isn't
9827 * in the class, so the end of the current range is one less than
9829 UV end = nonbitmap_array[i] - 1;
9832 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
9835 /* The \t sets the whole range */
9836 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
9841 invlist_destroy(nonbitmap);
9844 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
9845 * set the FOLD flag yet, so this this does optimize those. It doesn't
9846 * optimize locale. Doing so perhaps could be done as long as there is
9847 * nothing like \w in it; some thought also would have to be given to the
9848 * interaction with above 0x100 chars */
9849 if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
9850 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
9851 ANYOF_BITMAP(ret)[value] ^= 0xFF;
9852 stored = 256 - stored;
9854 /* The inversion means that everything above 255 is matched; and at the
9855 * same time we clear the invert flag */
9856 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
9862 /* This is the one character in the bitmap that needs special handling
9863 * under non-locale folding, as it folds to two characters 'ss'. This
9864 * happens if it is set and not inverting, or isn't set and are
9865 * inverting (disallowed in lookbehind patterns because they can't be
9866 * variable length) */
9868 && ! RExC_in_lookbehind
9869 && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
9870 ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
9872 OP(ret) = ANYOFV; /* Can match more than a single char */
9874 /* Under Unicode semantics), it can do this when the target string
9876 if (UNI_SEMANTICS) {
9877 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9880 if (!unicode_alternate) {
9881 unicode_alternate = newAV();
9883 sv = newSVpvn_utf8("ss", 2, TRUE);
9884 av_push(unicode_alternate, sv);
9887 /* Folding in the bitmap is taken care of above, but not for locale
9888 * (for which we have to wait to see what folding is in effect at
9889 * runtime), and for things not in the bitmap. Set run-time fold flag
9891 if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
9892 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
9896 /* A single character class can be "optimized" into an EXACTish node.
9897 * Note that since we don't currently count how many characters there are
9898 * outside the bitmap, we are XXX missing optimization possibilities for
9899 * them. This optimization can't happen unless this is a truly single
9900 * character class, which means that it can't be an inversion into a
9901 * many-character class, and there must be no possibility of there being
9902 * things outside the bitmap. 'stored' (only) for locales doesn't include
9903 * \w, etc, so have to make a special test that they aren't present
9905 * Similarly A 2-character class of the very special form like [bB] can be
9906 * optimized into an EXACTFish node, but only for non-locales, and for
9907 * characters which only have the two folds; so things like 'fF' and 'Ii'
9908 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
9910 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
9911 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9912 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
9913 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
9914 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
9915 /* If the latest code point has a fold whose
9916 * bit is set, it must be the only other one */
9917 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
9918 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
9920 /* Note that the information needed to decide to do this optimization
9921 * is not currently available until the 2nd pass, and that the actually
9922 * used EXACTish node takes less space than the calculated ANYOF node,
9923 * and hence the amount of space calculated in the first pass is larger
9924 * than actually used, so this optimization doesn't gain us any space.
9925 * But an EXACT node is faster than an ANYOF node, and can be combined
9926 * with any adjacent EXACT nodes later by the optimizer for further
9927 * gains. The speed of executing an EXACTF is similar to an ANYOF
9928 * node, so the optimization advantage comes from the ability to join
9929 * it to adjacent EXACT nodes */
9931 const char * cur_parse= RExC_parse;
9933 RExC_emit = (regnode *)orig_emit;
9934 RExC_parse = (char *)orig_parse;
9938 /* A locale node with one point can be folded; all the other cases
9939 * with folding will have two points, since we calculate them above
9941 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
9947 } /* else 2 chars in the bit map: the folds of each other */
9948 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
9950 /* To join adjacent nodes, they must be the exact EXACTish type.
9951 * Try to use the most likely type, by using EXACTFU if the regex
9952 * calls for them, or is required because the character is
9956 else { /* Otherwise, more likely to be EXACTF type */
9960 ret = reg_node(pRExC_state, op);
9961 RExC_parse = (char *)cur_parse;
9962 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
9963 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
9964 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
9966 RExC_emit += STR_SZ(2);
9969 *STRING(ret)= (char)value;
9971 RExC_emit += STR_SZ(1);
9973 SvREFCNT_dec(listsv);
9978 AV * const av = newAV();
9980 /* The 0th element stores the character class description
9981 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9982 * to initialize the appropriate swash (which gets stored in
9983 * the 1st element), and also useful for dumping the regnode.
9984 * The 2nd element stores the multicharacter foldings,
9985 * used later (regexec.c:S_reginclass()). */
9986 av_store(av, 0, listsv);
9987 av_store(av, 1, NULL);
9988 av_store(av, 2, MUTABLE_SV(unicode_alternate));
9989 rv = newRV_noinc(MUTABLE_SV(av));
9990 n = add_data(pRExC_state, 1, "s");
9991 RExC_rxi->data->data[n] = (void*)rv;
9999 /* reg_skipcomment()
10001 Absorbs an /x style # comments from the input stream.
10002 Returns true if there is more text remaining in the stream.
10003 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10004 terminates the pattern without including a newline.
10006 Note its the callers responsibility to ensure that we are
10007 actually in /x mode
10012 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10016 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10018 while (RExC_parse < RExC_end)
10019 if (*RExC_parse++ == '\n') {
10024 /* we ran off the end of the pattern without ending
10025 the comment, so we have to add an \n when wrapping */
10026 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10034 Advances the parse position, and optionally absorbs
10035 "whitespace" from the inputstream.
10037 Without /x "whitespace" means (?#...) style comments only,
10038 with /x this means (?#...) and # comments and whitespace proper.
10040 Returns the RExC_parse point from BEFORE the scan occurs.
10042 This is the /x friendly way of saying RExC_parse++.
10046 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10048 char* const retval = RExC_parse++;
10050 PERL_ARGS_ASSERT_NEXTCHAR;
10053 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10054 RExC_parse[2] == '#') {
10055 while (*RExC_parse != ')') {
10056 if (RExC_parse == RExC_end)
10057 FAIL("Sequence (?#... not terminated");
10063 if (RExC_flags & RXf_PMf_EXTENDED) {
10064 if (isSPACE(*RExC_parse)) {
10068 else if (*RExC_parse == '#') {
10069 if ( reg_skipcomment( pRExC_state ) )
10078 - reg_node - emit a node
10080 STATIC regnode * /* Location. */
10081 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10084 register regnode *ptr;
10085 regnode * const ret = RExC_emit;
10086 GET_RE_DEBUG_FLAGS_DECL;
10088 PERL_ARGS_ASSERT_REG_NODE;
10091 SIZE_ALIGN(RExC_size);
10095 if (RExC_emit >= RExC_emit_bound)
10096 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10098 NODE_ALIGN_FILL(ret);
10100 FILL_ADVANCE_NODE(ptr, op);
10101 #ifdef RE_TRACK_PATTERN_OFFSETS
10102 if (RExC_offsets) { /* MJD */
10103 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10104 "reg_node", __LINE__,
10106 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10107 ? "Overwriting end of array!\n" : "OK",
10108 (UV)(RExC_emit - RExC_emit_start),
10109 (UV)(RExC_parse - RExC_start),
10110 (UV)RExC_offsets[0]));
10111 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10119 - reganode - emit a node with an argument
10121 STATIC regnode * /* Location. */
10122 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10125 register regnode *ptr;
10126 regnode * const ret = RExC_emit;
10127 GET_RE_DEBUG_FLAGS_DECL;
10129 PERL_ARGS_ASSERT_REGANODE;
10132 SIZE_ALIGN(RExC_size);
10137 assert(2==regarglen[op]+1);
10139 Anything larger than this has to allocate the extra amount.
10140 If we changed this to be:
10142 RExC_size += (1 + regarglen[op]);
10144 then it wouldn't matter. Its not clear what side effect
10145 might come from that so its not done so far.
10150 if (RExC_emit >= RExC_emit_bound)
10151 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10153 NODE_ALIGN_FILL(ret);
10155 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10156 #ifdef RE_TRACK_PATTERN_OFFSETS
10157 if (RExC_offsets) { /* MJD */
10158 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10162 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10163 "Overwriting end of array!\n" : "OK",
10164 (UV)(RExC_emit - RExC_emit_start),
10165 (UV)(RExC_parse - RExC_start),
10166 (UV)RExC_offsets[0]));
10167 Set_Cur_Node_Offset;
10175 - reguni - emit (if appropriate) a Unicode character
10178 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10182 PERL_ARGS_ASSERT_REGUNI;
10184 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10188 - reginsert - insert an operator in front of already-emitted operand
10190 * Means relocating the operand.
10193 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10196 register regnode *src;
10197 register regnode *dst;
10198 register regnode *place;
10199 const int offset = regarglen[(U8)op];
10200 const int size = NODE_STEP_REGNODE + offset;
10201 GET_RE_DEBUG_FLAGS_DECL;
10203 PERL_ARGS_ASSERT_REGINSERT;
10204 PERL_UNUSED_ARG(depth);
10205 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10206 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10215 if (RExC_open_parens) {
10217 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10218 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10219 if ( RExC_open_parens[paren] >= opnd ) {
10220 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10221 RExC_open_parens[paren] += size;
10223 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10225 if ( RExC_close_parens[paren] >= opnd ) {
10226 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10227 RExC_close_parens[paren] += size;
10229 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10234 while (src > opnd) {
10235 StructCopy(--src, --dst, regnode);
10236 #ifdef RE_TRACK_PATTERN_OFFSETS
10237 if (RExC_offsets) { /* MJD 20010112 */
10238 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10242 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10243 ? "Overwriting end of array!\n" : "OK",
10244 (UV)(src - RExC_emit_start),
10245 (UV)(dst - RExC_emit_start),
10246 (UV)RExC_offsets[0]));
10247 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10248 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10254 place = opnd; /* Op node, where operand used to be. */
10255 #ifdef RE_TRACK_PATTERN_OFFSETS
10256 if (RExC_offsets) { /* MJD */
10257 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10261 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10262 ? "Overwriting end of array!\n" : "OK",
10263 (UV)(place - RExC_emit_start),
10264 (UV)(RExC_parse - RExC_start),
10265 (UV)RExC_offsets[0]));
10266 Set_Node_Offset(place, RExC_parse);
10267 Set_Node_Length(place, 1);
10270 src = NEXTOPER(place);
10271 FILL_ADVANCE_NODE(place, op);
10272 Zero(src, offset, regnode);
10276 - regtail - set the next-pointer at the end of a node chain of p to val.
10277 - SEE ALSO: regtail_study
10279 /* TODO: All three parms should be const */
10281 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10284 register regnode *scan;
10285 GET_RE_DEBUG_FLAGS_DECL;
10287 PERL_ARGS_ASSERT_REGTAIL;
10289 PERL_UNUSED_ARG(depth);
10295 /* Find last node. */
10298 regnode * const temp = regnext(scan);
10300 SV * const mysv=sv_newmortal();
10301 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10302 regprop(RExC_rx, mysv, scan);
10303 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10304 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10305 (temp == NULL ? "->" : ""),
10306 (temp == NULL ? PL_reg_name[OP(val)] : "")
10314 if (reg_off_by_arg[OP(scan)]) {
10315 ARG_SET(scan, val - scan);
10318 NEXT_OFF(scan) = val - scan;
10324 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10325 - Look for optimizable sequences at the same time.
10326 - currently only looks for EXACT chains.
10328 This is experimental code. The idea is to use this routine to perform
10329 in place optimizations on branches and groups as they are constructed,
10330 with the long term intention of removing optimization from study_chunk so
10331 that it is purely analytical.
10333 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10334 to control which is which.
10337 /* TODO: All four parms should be const */
10340 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10343 register regnode *scan;
10345 #ifdef EXPERIMENTAL_INPLACESCAN
10348 GET_RE_DEBUG_FLAGS_DECL;
10350 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10356 /* Find last node. */
10360 regnode * const temp = regnext(scan);
10361 #ifdef EXPERIMENTAL_INPLACESCAN
10362 if (PL_regkind[OP(scan)] == EXACT)
10363 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10367 switch (OP(scan)) {
10372 if( exact == PSEUDO )
10374 else if ( exact != OP(scan) )
10383 SV * const mysv=sv_newmortal();
10384 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10385 regprop(RExC_rx, mysv, scan);
10386 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10387 SvPV_nolen_const(mysv),
10388 REG_NODE_NUM(scan),
10389 PL_reg_name[exact]);
10396 SV * const mysv_val=sv_newmortal();
10397 DEBUG_PARSE_MSG("");
10398 regprop(RExC_rx, mysv_val, val);
10399 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10400 SvPV_nolen_const(mysv_val),
10401 (IV)REG_NODE_NUM(val),
10405 if (reg_off_by_arg[OP(scan)]) {
10406 ARG_SET(scan, val - scan);
10409 NEXT_OFF(scan) = val - scan;
10417 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10421 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10427 for (bit=0; bit<32; bit++) {
10428 if (flags & (1<<bit)) {
10429 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10432 if (!set++ && lead)
10433 PerlIO_printf(Perl_debug_log, "%s",lead);
10434 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10437 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10438 if (!set++ && lead) {
10439 PerlIO_printf(Perl_debug_log, "%s",lead);
10442 case REGEX_UNICODE_CHARSET:
10443 PerlIO_printf(Perl_debug_log, "UNICODE");
10445 case REGEX_LOCALE_CHARSET:
10446 PerlIO_printf(Perl_debug_log, "LOCALE");
10448 case REGEX_ASCII_RESTRICTED_CHARSET:
10449 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10452 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10458 PerlIO_printf(Perl_debug_log, "\n");
10460 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10466 Perl_regdump(pTHX_ const regexp *r)
10470 SV * const sv = sv_newmortal();
10471 SV *dsv= sv_newmortal();
10472 RXi_GET_DECL(r,ri);
10473 GET_RE_DEBUG_FLAGS_DECL;
10475 PERL_ARGS_ASSERT_REGDUMP;
10477 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10479 /* Header fields of interest. */
10480 if (r->anchored_substr) {
10481 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10482 RE_SV_DUMPLEN(r->anchored_substr), 30);
10483 PerlIO_printf(Perl_debug_log,
10484 "anchored %s%s at %"IVdf" ",
10485 s, RE_SV_TAIL(r->anchored_substr),
10486 (IV)r->anchored_offset);
10487 } else if (r->anchored_utf8) {
10488 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10489 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10490 PerlIO_printf(Perl_debug_log,
10491 "anchored utf8 %s%s at %"IVdf" ",
10492 s, RE_SV_TAIL(r->anchored_utf8),
10493 (IV)r->anchored_offset);
10495 if (r->float_substr) {
10496 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10497 RE_SV_DUMPLEN(r->float_substr), 30);
10498 PerlIO_printf(Perl_debug_log,
10499 "floating %s%s at %"IVdf"..%"UVuf" ",
10500 s, RE_SV_TAIL(r->float_substr),
10501 (IV)r->float_min_offset, (UV)r->float_max_offset);
10502 } else if (r->float_utf8) {
10503 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10504 RE_SV_DUMPLEN(r->float_utf8), 30);
10505 PerlIO_printf(Perl_debug_log,
10506 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10507 s, RE_SV_TAIL(r->float_utf8),
10508 (IV)r->float_min_offset, (UV)r->float_max_offset);
10510 if (r->check_substr || r->check_utf8)
10511 PerlIO_printf(Perl_debug_log,
10513 (r->check_substr == r->float_substr
10514 && r->check_utf8 == r->float_utf8
10515 ? "(checking floating" : "(checking anchored"));
10516 if (r->extflags & RXf_NOSCAN)
10517 PerlIO_printf(Perl_debug_log, " noscan");
10518 if (r->extflags & RXf_CHECK_ALL)
10519 PerlIO_printf(Perl_debug_log, " isall");
10520 if (r->check_substr || r->check_utf8)
10521 PerlIO_printf(Perl_debug_log, ") ");
10523 if (ri->regstclass) {
10524 regprop(r, sv, ri->regstclass);
10525 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10527 if (r->extflags & RXf_ANCH) {
10528 PerlIO_printf(Perl_debug_log, "anchored");
10529 if (r->extflags & RXf_ANCH_BOL)
10530 PerlIO_printf(Perl_debug_log, "(BOL)");
10531 if (r->extflags & RXf_ANCH_MBOL)
10532 PerlIO_printf(Perl_debug_log, "(MBOL)");
10533 if (r->extflags & RXf_ANCH_SBOL)
10534 PerlIO_printf(Perl_debug_log, "(SBOL)");
10535 if (r->extflags & RXf_ANCH_GPOS)
10536 PerlIO_printf(Perl_debug_log, "(GPOS)");
10537 PerlIO_putc(Perl_debug_log, ' ');
10539 if (r->extflags & RXf_GPOS_SEEN)
10540 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10541 if (r->intflags & PREGf_SKIP)
10542 PerlIO_printf(Perl_debug_log, "plus ");
10543 if (r->intflags & PREGf_IMPLICIT)
10544 PerlIO_printf(Perl_debug_log, "implicit ");
10545 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10546 if (r->extflags & RXf_EVAL_SEEN)
10547 PerlIO_printf(Perl_debug_log, "with eval ");
10548 PerlIO_printf(Perl_debug_log, "\n");
10549 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10551 PERL_ARGS_ASSERT_REGDUMP;
10552 PERL_UNUSED_CONTEXT;
10553 PERL_UNUSED_ARG(r);
10554 #endif /* DEBUGGING */
10558 - regprop - printable representation of opcode
10560 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10563 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10564 if (flags & ANYOF_INVERT) \
10565 /*make sure the invert info is in each */ \
10566 sv_catpvs(sv, "^"); \
10572 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10577 RXi_GET_DECL(prog,progi);
10578 GET_RE_DEBUG_FLAGS_DECL;
10580 PERL_ARGS_ASSERT_REGPROP;
10584 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10585 /* It would be nice to FAIL() here, but this may be called from
10586 regexec.c, and it would be hard to supply pRExC_state. */
10587 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10588 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10590 k = PL_regkind[OP(o)];
10593 sv_catpvs(sv, " ");
10594 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10595 * is a crude hack but it may be the best for now since
10596 * we have no flag "this EXACTish node was UTF-8"
10598 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10599 PERL_PV_ESCAPE_UNI_DETECT |
10600 PERL_PV_ESCAPE_NONASCII |
10601 PERL_PV_PRETTY_ELLIPSES |
10602 PERL_PV_PRETTY_LTGT |
10603 PERL_PV_PRETTY_NOCLEAR
10605 } else if (k == TRIE) {
10606 /* print the details of the trie in dumpuntil instead, as
10607 * progi->data isn't available here */
10608 const char op = OP(o);
10609 const U32 n = ARG(o);
10610 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
10611 (reg_ac_data *)progi->data->data[n] :
10613 const reg_trie_data * const trie
10614 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
10616 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
10617 DEBUG_TRIE_COMPILE_r(
10618 Perl_sv_catpvf(aTHX_ sv,
10619 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10620 (UV)trie->startstate,
10621 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
10622 (UV)trie->wordcount,
10625 (UV)TRIE_CHARCOUNT(trie),
10626 (UV)trie->uniquecharcount
10629 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10631 int rangestart = -1;
10632 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
10633 sv_catpvs(sv, "[");
10634 for (i = 0; i <= 256; i++) {
10635 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10636 if (rangestart == -1)
10638 } else if (rangestart != -1) {
10639 if (i <= rangestart + 3)
10640 for (; rangestart < i; rangestart++)
10641 put_byte(sv, rangestart);
10643 put_byte(sv, rangestart);
10644 sv_catpvs(sv, "-");
10645 put_byte(sv, i - 1);
10650 sv_catpvs(sv, "]");
10653 } else if (k == CURLY) {
10654 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
10655 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
10656 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
10658 else if (k == WHILEM && o->flags) /* Ordinal/of */
10659 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
10660 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
10661 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
10662 if ( RXp_PAREN_NAMES(prog) ) {
10663 if ( k != REF || (OP(o) < NREF)) {
10664 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
10665 SV **name= av_fetch(list, ARG(o), 0 );
10667 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10670 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
10671 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
10672 I32 *nums=(I32*)SvPVX(sv_dat);
10673 SV **name= av_fetch(list, nums[0], 0 );
10676 for ( n=0; n<SvIVX(sv_dat); n++ ) {
10677 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
10678 (n ? "," : ""), (IV)nums[n]);
10680 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
10684 } else if (k == GOSUB)
10685 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
10686 else if (k == VERB) {
10688 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
10689 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
10690 } else if (k == LOGICAL)
10691 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
10692 else if (k == FOLDCHAR)
10693 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
10694 else if (k == ANYOF) {
10695 int i, rangestart = -1;
10696 const U8 flags = ANYOF_FLAGS(o);
10699 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
10700 static const char * const anyofs[] = {
10733 if (flags & ANYOF_LOCALE)
10734 sv_catpvs(sv, "{loc}");
10735 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
10736 sv_catpvs(sv, "{i}");
10737 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
10738 if (flags & ANYOF_INVERT)
10739 sv_catpvs(sv, "^");
10741 /* output what the standard cp 0-255 bitmap matches */
10742 for (i = 0; i <= 256; i++) {
10743 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
10744 if (rangestart == -1)
10746 } else if (rangestart != -1) {
10747 if (i <= rangestart + 3)
10748 for (; rangestart < i; rangestart++)
10749 put_byte(sv, rangestart);
10751 put_byte(sv, rangestart);
10752 sv_catpvs(sv, "-");
10753 put_byte(sv, i - 1);
10760 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
10761 /* output any special charclass tests (used entirely under use locale) */
10762 if (ANYOF_CLASS_TEST_ANY_SET(o))
10763 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
10764 if (ANYOF_CLASS_TEST(o,i)) {
10765 sv_catpv(sv, anyofs[i]);
10769 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
10771 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
10772 sv_catpvs(sv, "{non-utf8-latin1-all}");
10775 /* output information about the unicode matching */
10776 if (flags & ANYOF_UNICODE_ALL)
10777 sv_catpvs(sv, "{unicode_all}");
10778 else if (flags & ANYOF_UTF8)
10779 sv_catpvs(sv, "{unicode}");
10780 if (flags & ANYOF_NONBITMAP_NON_UTF8)
10781 sv_catpvs(sv, "{outside bitmap}");
10785 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
10789 U8 s[UTF8_MAXBYTES_CASE+1];
10791 for (i = 0; i <= 256; i++) { /* just the first 256 */
10792 uvchr_to_utf8(s, i);
10794 if (i < 256 && swash_fetch(sw, s, TRUE)) {
10795 if (rangestart == -1)
10797 } else if (rangestart != -1) {
10798 if (i <= rangestart + 3)
10799 for (; rangestart < i; rangestart++) {
10800 const U8 * const e = uvchr_to_utf8(s,rangestart);
10802 for(p = s; p < e; p++)
10806 const U8 *e = uvchr_to_utf8(s,rangestart);
10808 for (p = s; p < e; p++)
10810 sv_catpvs(sv, "-");
10811 e = uvchr_to_utf8(s, i-1);
10812 for (p = s; p < e; p++)
10819 sv_catpvs(sv, "..."); /* et cetera */
10823 char *s = savesvpv(lv);
10824 char * const origs = s;
10826 while (*s && *s != '\n')
10830 const char * const t = ++s;
10848 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
10850 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
10851 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
10853 PERL_UNUSED_CONTEXT;
10854 PERL_UNUSED_ARG(sv);
10855 PERL_UNUSED_ARG(o);
10856 PERL_UNUSED_ARG(prog);
10857 #endif /* DEBUGGING */
10861 Perl_re_intuit_string(pTHX_ REGEXP * const r)
10862 { /* Assume that RE_INTUIT is set */
10864 struct regexp *const prog = (struct regexp *)SvANY(r);
10865 GET_RE_DEBUG_FLAGS_DECL;
10867 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
10868 PERL_UNUSED_CONTEXT;
10872 const char * const s = SvPV_nolen_const(prog->check_substr
10873 ? prog->check_substr : prog->check_utf8);
10875 if (!PL_colorset) reginitcolors();
10876 PerlIO_printf(Perl_debug_log,
10877 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
10879 prog->check_substr ? "" : "utf8 ",
10880 PL_colors[5],PL_colors[0],
10883 (strlen(s) > 60 ? "..." : ""));
10886 return prog->check_substr ? prog->check_substr : prog->check_utf8;
10892 handles refcounting and freeing the perl core regexp structure. When
10893 it is necessary to actually free the structure the first thing it
10894 does is call the 'free' method of the regexp_engine associated to
10895 the regexp, allowing the handling of the void *pprivate; member
10896 first. (This routine is not overridable by extensions, which is why
10897 the extensions free is called first.)
10899 See regdupe and regdupe_internal if you change anything here.
10901 #ifndef PERL_IN_XSUB_RE
10903 Perl_pregfree(pTHX_ REGEXP *r)
10909 Perl_pregfree2(pTHX_ REGEXP *rx)
10912 struct regexp *const r = (struct regexp *)SvANY(rx);
10913 GET_RE_DEBUG_FLAGS_DECL;
10915 PERL_ARGS_ASSERT_PREGFREE2;
10917 if (r->mother_re) {
10918 ReREFCNT_dec(r->mother_re);
10920 CALLREGFREE_PVT(rx); /* free the private data */
10921 SvREFCNT_dec(RXp_PAREN_NAMES(r));
10924 SvREFCNT_dec(r->anchored_substr);
10925 SvREFCNT_dec(r->anchored_utf8);
10926 SvREFCNT_dec(r->float_substr);
10927 SvREFCNT_dec(r->float_utf8);
10928 Safefree(r->substrs);
10930 RX_MATCH_COPY_FREE(rx);
10931 #ifdef PERL_OLD_COPY_ON_WRITE
10932 SvREFCNT_dec(r->saved_copy);
10939 This is a hacky workaround to the structural issue of match results
10940 being stored in the regexp structure which is in turn stored in
10941 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
10942 could be PL_curpm in multiple contexts, and could require multiple
10943 result sets being associated with the pattern simultaneously, such
10944 as when doing a recursive match with (??{$qr})
10946 The solution is to make a lightweight copy of the regexp structure
10947 when a qr// is returned from the code executed by (??{$qr}) this
10948 lightweight copy doesn't actually own any of its data except for
10949 the starp/end and the actual regexp structure itself.
10955 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
10957 struct regexp *ret;
10958 struct regexp *const r = (struct regexp *)SvANY(rx);
10959 register const I32 npar = r->nparens+1;
10961 PERL_ARGS_ASSERT_REG_TEMP_COPY;
10964 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
10965 ret = (struct regexp *)SvANY(ret_x);
10967 (void)ReREFCNT_inc(rx);
10968 /* We can take advantage of the existing "copied buffer" mechanism in SVs
10969 by pointing directly at the buffer, but flagging that the allocated
10970 space in the copy is zero. As we've just done a struct copy, it's now
10971 a case of zero-ing that, rather than copying the current length. */
10972 SvPV_set(ret_x, RX_WRAPPED(rx));
10973 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
10974 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
10975 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
10976 SvLEN_set(ret_x, 0);
10977 SvSTASH_set(ret_x, NULL);
10978 SvMAGIC_set(ret_x, NULL);
10979 Newx(ret->offs, npar, regexp_paren_pair);
10980 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10982 Newx(ret->substrs, 1, struct reg_substr_data);
10983 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10985 SvREFCNT_inc_void(ret->anchored_substr);
10986 SvREFCNT_inc_void(ret->anchored_utf8);
10987 SvREFCNT_inc_void(ret->float_substr);
10988 SvREFCNT_inc_void(ret->float_utf8);
10990 /* check_substr and check_utf8, if non-NULL, point to either their
10991 anchored or float namesakes, and don't hold a second reference. */
10993 RX_MATCH_COPIED_off(ret_x);
10994 #ifdef PERL_OLD_COPY_ON_WRITE
10995 ret->saved_copy = NULL;
10997 ret->mother_re = rx;
11003 /* regfree_internal()
11005 Free the private data in a regexp. This is overloadable by
11006 extensions. Perl takes care of the regexp structure in pregfree(),
11007 this covers the *pprivate pointer which technically perl doesn't
11008 know about, however of course we have to handle the
11009 regexp_internal structure when no extension is in use.
11011 Note this is called before freeing anything in the regexp
11016 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11019 struct regexp *const r = (struct regexp *)SvANY(rx);
11020 RXi_GET_DECL(r,ri);
11021 GET_RE_DEBUG_FLAGS_DECL;
11023 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11029 SV *dsv= sv_newmortal();
11030 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11031 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11032 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11033 PL_colors[4],PL_colors[5],s);
11036 #ifdef RE_TRACK_PATTERN_OFFSETS
11038 Safefree(ri->u.offsets); /* 20010421 MJD */
11041 int n = ri->data->count;
11042 PAD* new_comppad = NULL;
11047 /* If you add a ->what type here, update the comment in regcomp.h */
11048 switch (ri->data->what[n]) {
11053 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11056 Safefree(ri->data->data[n]);
11059 new_comppad = MUTABLE_AV(ri->data->data[n]);
11062 if (new_comppad == NULL)
11063 Perl_croak(aTHX_ "panic: pregfree comppad");
11064 PAD_SAVE_LOCAL(old_comppad,
11065 /* Watch out for global destruction's random ordering. */
11066 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11069 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11072 op_free((OP_4tree*)ri->data->data[n]);
11074 PAD_RESTORE_LOCAL(old_comppad);
11075 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11076 new_comppad = NULL;
11081 { /* Aho Corasick add-on structure for a trie node.
11082 Used in stclass optimization only */
11084 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11086 refcount = --aho->refcount;
11089 PerlMemShared_free(aho->states);
11090 PerlMemShared_free(aho->fail);
11091 /* do this last!!!! */
11092 PerlMemShared_free(ri->data->data[n]);
11093 PerlMemShared_free(ri->regstclass);
11099 /* trie structure. */
11101 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11103 refcount = --trie->refcount;
11106 PerlMemShared_free(trie->charmap);
11107 PerlMemShared_free(trie->states);
11108 PerlMemShared_free(trie->trans);
11110 PerlMemShared_free(trie->bitmap);
11112 PerlMemShared_free(trie->jump);
11113 PerlMemShared_free(trie->wordinfo);
11114 /* do this last!!!! */
11115 PerlMemShared_free(ri->data->data[n]);
11120 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11123 Safefree(ri->data->what);
11124 Safefree(ri->data);
11130 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11131 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11132 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11135 re_dup - duplicate a regexp.
11137 This routine is expected to clone a given regexp structure. It is only
11138 compiled under USE_ITHREADS.
11140 After all of the core data stored in struct regexp is duplicated
11141 the regexp_engine.dupe method is used to copy any private data
11142 stored in the *pprivate pointer. This allows extensions to handle
11143 any duplication it needs to do.
11145 See pregfree() and regfree_internal() if you change anything here.
11147 #if defined(USE_ITHREADS)
11148 #ifndef PERL_IN_XSUB_RE
11150 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11154 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11155 struct regexp *ret = (struct regexp *)SvANY(dstr);
11157 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11159 npar = r->nparens+1;
11160 Newx(ret->offs, npar, regexp_paren_pair);
11161 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11163 /* no need to copy these */
11164 Newx(ret->swap, npar, regexp_paren_pair);
11167 if (ret->substrs) {
11168 /* Do it this way to avoid reading from *r after the StructCopy().
11169 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11170 cache, it doesn't matter. */
11171 const bool anchored = r->check_substr
11172 ? r->check_substr == r->anchored_substr
11173 : r->check_utf8 == r->anchored_utf8;
11174 Newx(ret->substrs, 1, struct reg_substr_data);
11175 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11177 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11178 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11179 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11180 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11182 /* check_substr and check_utf8, if non-NULL, point to either their
11183 anchored or float namesakes, and don't hold a second reference. */
11185 if (ret->check_substr) {
11187 assert(r->check_utf8 == r->anchored_utf8);
11188 ret->check_substr = ret->anchored_substr;
11189 ret->check_utf8 = ret->anchored_utf8;
11191 assert(r->check_substr == r->float_substr);
11192 assert(r->check_utf8 == r->float_utf8);
11193 ret->check_substr = ret->float_substr;
11194 ret->check_utf8 = ret->float_utf8;
11196 } else if (ret->check_utf8) {
11198 ret->check_utf8 = ret->anchored_utf8;
11200 ret->check_utf8 = ret->float_utf8;
11205 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11208 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11210 if (RX_MATCH_COPIED(dstr))
11211 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11213 ret->subbeg = NULL;
11214 #ifdef PERL_OLD_COPY_ON_WRITE
11215 ret->saved_copy = NULL;
11218 if (ret->mother_re) {
11219 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11220 /* Our storage points directly to our mother regexp, but that's
11221 1: a buffer in a different thread
11222 2: something we no longer hold a reference on
11223 so we need to copy it locally. */
11224 /* Note we need to sue SvCUR() on our mother_re, because it, in
11225 turn, may well be pointing to its own mother_re. */
11226 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11227 SvCUR(ret->mother_re)+1));
11228 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11230 ret->mother_re = NULL;
11234 #endif /* PERL_IN_XSUB_RE */
11239 This is the internal complement to regdupe() which is used to copy
11240 the structure pointed to by the *pprivate pointer in the regexp.
11241 This is the core version of the extension overridable cloning hook.
11242 The regexp structure being duplicated will be copied by perl prior
11243 to this and will be provided as the regexp *r argument, however
11244 with the /old/ structures pprivate pointer value. Thus this routine
11245 may override any copying normally done by perl.
11247 It returns a pointer to the new regexp_internal structure.
11251 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11254 struct regexp *const r = (struct regexp *)SvANY(rx);
11255 regexp_internal *reti;
11257 RXi_GET_DECL(r,ri);
11259 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11261 npar = r->nparens+1;
11264 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11265 Copy(ri->program, reti->program, len+1, regnode);
11268 reti->regstclass = NULL;
11271 struct reg_data *d;
11272 const int count = ri->data->count;
11275 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11276 char, struct reg_data);
11277 Newx(d->what, count, U8);
11280 for (i = 0; i < count; i++) {
11281 d->what[i] = ri->data->what[i];
11282 switch (d->what[i]) {
11283 /* legal options are one of: sSfpontTua
11284 see also regcomp.h and pregfree() */
11285 case 'a': /* actually an AV, but the dup function is identical. */
11288 case 'p': /* actually an AV, but the dup function is identical. */
11289 case 'u': /* actually an HV, but the dup function is identical. */
11290 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11293 /* This is cheating. */
11294 Newx(d->data[i], 1, struct regnode_charclass_class);
11295 StructCopy(ri->data->data[i], d->data[i],
11296 struct regnode_charclass_class);
11297 reti->regstclass = (regnode*)d->data[i];
11300 /* Compiled op trees are readonly and in shared memory,
11301 and can thus be shared without duplication. */
11303 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11307 /* Trie stclasses are readonly and can thus be shared
11308 * without duplication. We free the stclass in pregfree
11309 * when the corresponding reg_ac_data struct is freed.
11311 reti->regstclass= ri->regstclass;
11315 ((reg_trie_data*)ri->data->data[i])->refcount++;
11319 d->data[i] = ri->data->data[i];
11322 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11331 reti->name_list_idx = ri->name_list_idx;
11333 #ifdef RE_TRACK_PATTERN_OFFSETS
11334 if (ri->u.offsets) {
11335 Newx(reti->u.offsets, 2*len+1, U32);
11336 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11339 SetProgLen(reti,len);
11342 return (void*)reti;
11345 #endif /* USE_ITHREADS */
11347 #ifndef PERL_IN_XSUB_RE
11350 - regnext - dig the "next" pointer out of a node
11353 Perl_regnext(pTHX_ register regnode *p)
11356 register I32 offset;
11361 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11362 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11365 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11374 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11377 STRLEN l1 = strlen(pat1);
11378 STRLEN l2 = strlen(pat2);
11381 const char *message;
11383 PERL_ARGS_ASSERT_RE_CROAK2;
11389 Copy(pat1, buf, l1 , char);
11390 Copy(pat2, buf + l1, l2 , char);
11391 buf[l1 + l2] = '\n';
11392 buf[l1 + l2 + 1] = '\0';
11394 /* ANSI variant takes additional second argument */
11395 va_start(args, pat2);
11399 msv = vmess(buf, &args);
11401 message = SvPV_const(msv,l1);
11404 Copy(message, buf, l1 , char);
11405 buf[l1-1] = '\0'; /* Overwrite \n */
11406 Perl_croak(aTHX_ "%s", buf);
11409 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11411 #ifndef PERL_IN_XSUB_RE
11413 Perl_save_re_context(pTHX)
11417 struct re_save_state *state;
11419 SAVEVPTR(PL_curcop);
11420 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11422 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11423 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11424 SSPUSHUV(SAVEt_RE_STATE);
11426 Copy(&PL_reg_state, state, 1, struct re_save_state);
11428 PL_reg_start_tmp = 0;
11429 PL_reg_start_tmpl = 0;
11430 PL_reg_oldsaved = NULL;
11431 PL_reg_oldsavedlen = 0;
11432 PL_reg_maxiter = 0;
11433 PL_reg_leftiter = 0;
11434 PL_reg_poscache = NULL;
11435 PL_reg_poscache_size = 0;
11436 #ifdef PERL_OLD_COPY_ON_WRITE
11440 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11442 const REGEXP * const rx = PM_GETRE(PL_curpm);
11445 for (i = 1; i <= RX_NPARENS(rx); i++) {
11446 char digits[TYPE_CHARS(long)];
11447 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11448 GV *const *const gvp
11449 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11452 GV * const gv = *gvp;
11453 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11463 clear_re(pTHX_ void *r)
11466 ReREFCNT_dec((REGEXP *)r);
11472 S_put_byte(pTHX_ SV *sv, int c)
11474 PERL_ARGS_ASSERT_PUT_BYTE;
11476 /* Our definition of isPRINT() ignores locales, so only bytes that are
11477 not part of UTF-8 are considered printable. I assume that the same
11478 holds for UTF-EBCDIC.
11479 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11480 which Wikipedia says:
11482 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11483 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11484 identical, to the ASCII delete (DEL) or rubout control character.
11485 ) So the old condition can be simplified to !isPRINT(c) */
11488 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11491 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11495 const char string = c;
11496 if (c == '-' || c == ']' || c == '\\' || c == '^')
11497 sv_catpvs(sv, "\\");
11498 sv_catpvn(sv, &string, 1);
11503 #define CLEAR_OPTSTART \
11504 if (optstart) STMT_START { \
11505 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11509 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11511 STATIC const regnode *
11512 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11513 const regnode *last, const regnode *plast,
11514 SV* sv, I32 indent, U32 depth)
11517 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11518 register const regnode *next;
11519 const regnode *optstart= NULL;
11521 RXi_GET_DECL(r,ri);
11522 GET_RE_DEBUG_FLAGS_DECL;
11524 PERL_ARGS_ASSERT_DUMPUNTIL;
11526 #ifdef DEBUG_DUMPUNTIL
11527 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11528 last ? last-start : 0,plast ? plast-start : 0);
11531 if (plast && plast < last)
11534 while (PL_regkind[op] != END && (!last || node < last)) {
11535 /* While that wasn't END last time... */
11538 if (op == CLOSE || op == WHILEM)
11540 next = regnext((regnode *)node);
11543 if (OP(node) == OPTIMIZED) {
11544 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11551 regprop(r, sv, node);
11552 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11553 (int)(2*indent + 1), "", SvPVX_const(sv));
11555 if (OP(node) != OPTIMIZED) {
11556 if (next == NULL) /* Next ptr. */
11557 PerlIO_printf(Perl_debug_log, " (0)");
11558 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11559 PerlIO_printf(Perl_debug_log, " (FAIL)");
11561 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11562 (void)PerlIO_putc(Perl_debug_log, '\n');
11566 if (PL_regkind[(U8)op] == BRANCHJ) {
11569 register const regnode *nnode = (OP(next) == LONGJMP
11570 ? regnext((regnode *)next)
11572 if (last && nnode > last)
11574 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11577 else if (PL_regkind[(U8)op] == BRANCH) {
11579 DUMPUNTIL(NEXTOPER(node), next);
11581 else if ( PL_regkind[(U8)op] == TRIE ) {
11582 const regnode *this_trie = node;
11583 const char op = OP(node);
11584 const U32 n = ARG(node);
11585 const reg_ac_data * const ac = op>=AHOCORASICK ?
11586 (reg_ac_data *)ri->data->data[n] :
11588 const reg_trie_data * const trie =
11589 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11591 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11593 const regnode *nextbranch= NULL;
11596 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11597 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11599 PerlIO_printf(Perl_debug_log, "%*s%s ",
11600 (int)(2*(indent+3)), "",
11601 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
11602 PL_colors[0], PL_colors[1],
11603 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
11604 PERL_PV_PRETTY_ELLIPSES |
11605 PERL_PV_PRETTY_LTGT
11610 U16 dist= trie->jump[word_idx+1];
11611 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11612 (UV)((dist ? this_trie + dist : next) - start));
11615 nextbranch= this_trie + trie->jump[0];
11616 DUMPUNTIL(this_trie + dist, nextbranch);
11618 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11619 nextbranch= regnext((regnode *)nextbranch);
11621 PerlIO_printf(Perl_debug_log, "\n");
11624 if (last && next > last)
11629 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
11630 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11631 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
11633 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
11635 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
11637 else if ( op == PLUS || op == STAR) {
11638 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
11640 else if (PL_regkind[(U8)op] == ANYOF) {
11641 /* arglen 1 + class block */
11642 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
11643 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11644 node = NEXTOPER(node);
11646 else if (PL_regkind[(U8)op] == EXACT) {
11647 /* Literal string, where present. */
11648 node += NODE_SZ_STR(node) - 1;
11649 node = NEXTOPER(node);
11652 node = NEXTOPER(node);
11653 node += regarglen[(U8)op];
11655 if (op == CURLYX || op == OPEN)
11659 #ifdef DEBUG_DUMPUNTIL
11660 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
11665 #endif /* DEBUGGING */
11669 * c-indentation-style: bsd
11670 * c-basic-offset: 4
11671 * indent-tabs-mode: t
11674 * ex: set ts=8 sts=4 sw=4 noet: