5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
140 HV *paren_names; /* Paren names */
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
146 char *starttry; /* -Dr: where regtry was called. */
147 #define RExC_starttry (pRExC_state->starttry)
150 const char *lastparse;
152 AV *paren_name_list; /* idx -> name */
153 #define RExC_lastparse (pRExC_state->lastparse)
154 #define RExC_lastnum (pRExC_state->lastnum)
155 #define RExC_paren_name_list (pRExC_state->paren_name_list)
159 #define RExC_flags (pRExC_state->flags)
160 #define RExC_precomp (pRExC_state->precomp)
161 #define RExC_rx_sv (pRExC_state->rx_sv)
162 #define RExC_rx (pRExC_state->rx)
163 #define RExC_rxi (pRExC_state->rxi)
164 #define RExC_start (pRExC_state->start)
165 #define RExC_end (pRExC_state->end)
166 #define RExC_parse (pRExC_state->parse)
167 #define RExC_whilem_seen (pRExC_state->whilem_seen)
168 #ifdef RE_TRACK_PATTERN_OFFSETS
169 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
171 #define RExC_emit (pRExC_state->emit)
172 #define RExC_emit_start (pRExC_state->emit_start)
173 #define RExC_emit_bound (pRExC_state->emit_bound)
174 #define RExC_naughty (pRExC_state->naughty)
175 #define RExC_sawback (pRExC_state->sawback)
176 #define RExC_seen (pRExC_state->seen)
177 #define RExC_size (pRExC_state->size)
178 #define RExC_npar (pRExC_state->npar)
179 #define RExC_nestroot (pRExC_state->nestroot)
180 #define RExC_extralen (pRExC_state->extralen)
181 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
182 #define RExC_seen_evals (pRExC_state->seen_evals)
183 #define RExC_utf8 (pRExC_state->utf8)
184 #define RExC_uni_semantics (pRExC_state->uni_semantics)
185 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
186 #define RExC_open_parens (pRExC_state->open_parens)
187 #define RExC_close_parens (pRExC_state->close_parens)
188 #define RExC_opend (pRExC_state->opend)
189 #define RExC_paren_names (pRExC_state->paren_names)
190 #define RExC_recurse (pRExC_state->recurse)
191 #define RExC_recurse_count (pRExC_state->recurse_count)
192 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
195 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
196 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
197 ((*s) == '{' && regcurly(s)))
200 #undef SPSTART /* dratted cpp namespace... */
203 * Flags to be passed up and down.
205 #define WORST 0 /* Worst case. */
206 #define HASWIDTH 0x01 /* Known to match non-null strings. */
208 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
209 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
211 #define SPSTART 0x04 /* Starts with * or +. */
212 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
213 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
215 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
217 /* whether trie related optimizations are enabled */
218 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
219 #define TRIE_STUDY_OPT
220 #define FULL_TRIE_STUDY
226 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
227 #define PBITVAL(paren) (1 << ((paren) & 7))
228 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
229 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
230 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
232 /* If not already in utf8, do a longjmp back to the beginning */
233 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
234 #define REQUIRE_UTF8 STMT_START { \
235 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
238 /* About scan_data_t.
240 During optimisation we recurse through the regexp program performing
241 various inplace (keyhole style) optimisations. In addition study_chunk
242 and scan_commit populate this data structure with information about
243 what strings MUST appear in the pattern. We look for the longest
244 string that must appear at a fixed location, and we look for the
245 longest string that may appear at a floating location. So for instance
250 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
251 strings (because they follow a .* construct). study_chunk will identify
252 both FOO and BAR as being the longest fixed and floating strings respectively.
254 The strings can be composites, for instance
258 will result in a composite fixed substring 'foo'.
260 For each string some basic information is maintained:
262 - offset or min_offset
263 This is the position the string must appear at, or not before.
264 It also implicitly (when combined with minlenp) tells us how many
265 characters must match before the string we are searching for.
266 Likewise when combined with minlenp and the length of the string it
267 tells us how many characters must appear after the string we have
271 Only used for floating strings. This is the rightmost point that
272 the string can appear at. If set to I32 max it indicates that the
273 string can occur infinitely far to the right.
276 A pointer to the minimum length of the pattern that the string
277 was found inside. This is important as in the case of positive
278 lookahead or positive lookbehind we can have multiple patterns
283 The minimum length of the pattern overall is 3, the minimum length
284 of the lookahead part is 3, but the minimum length of the part that
285 will actually match is 1. So 'FOO's minimum length is 3, but the
286 minimum length for the F is 1. This is important as the minimum length
287 is used to determine offsets in front of and behind the string being
288 looked for. Since strings can be composites this is the length of the
289 pattern at the time it was committed with a scan_commit. Note that
290 the length is calculated by study_chunk, so that the minimum lengths
291 are not known until the full pattern has been compiled, thus the
292 pointer to the value.
296 In the case of lookbehind the string being searched for can be
297 offset past the start point of the final matching string.
298 If this value was just blithely removed from the min_offset it would
299 invalidate some of the calculations for how many chars must match
300 before or after (as they are derived from min_offset and minlen and
301 the length of the string being searched for).
302 When the final pattern is compiled and the data is moved from the
303 scan_data_t structure into the regexp structure the information
304 about lookbehind is factored in, with the information that would
305 have been lost precalculated in the end_shift field for the
308 The fields pos_min and pos_delta are used to store the minimum offset
309 and the delta to the maximum offset at the current point in the pattern.
313 typedef struct scan_data_t {
314 /*I32 len_min; unused */
315 /*I32 len_delta; unused */
319 I32 last_end; /* min value, <0 unless valid. */
322 SV **longest; /* Either &l_fixed, or &l_float. */
323 SV *longest_fixed; /* longest fixed string found in pattern */
324 I32 offset_fixed; /* offset where it starts */
325 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
326 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
327 SV *longest_float; /* longest floating string found in pattern */
328 I32 offset_float_min; /* earliest point in string it can appear */
329 I32 offset_float_max; /* latest point in string it can appear */
330 I32 *minlen_float; /* pointer to the minlen relevant to the string */
331 I32 lookbehind_float; /* is the position of the string modified by LB */
335 struct regnode_charclass_class *start_class;
339 * Forward declarations for pregcomp()'s friends.
342 static const scan_data_t zero_scan_data =
343 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
345 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
346 #define SF_BEFORE_SEOL 0x0001
347 #define SF_BEFORE_MEOL 0x0002
348 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
349 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
352 # define SF_FIX_SHIFT_EOL (0+2)
353 # define SF_FL_SHIFT_EOL (0+4)
355 # define SF_FIX_SHIFT_EOL (+2)
356 # define SF_FL_SHIFT_EOL (+4)
359 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
360 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
362 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
363 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
364 #define SF_IS_INF 0x0040
365 #define SF_HAS_PAR 0x0080
366 #define SF_IN_PAR 0x0100
367 #define SF_HAS_EVAL 0x0200
368 #define SCF_DO_SUBSTR 0x0400
369 #define SCF_DO_STCLASS_AND 0x0800
370 #define SCF_DO_STCLASS_OR 0x1000
371 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
372 #define SCF_WHILEM_VISITED_POS 0x2000
374 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
375 #define SCF_SEEN_ACCEPT 0x8000
377 #define UTF cBOOL(RExC_utf8)
378 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
379 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
380 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
381 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
382 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
383 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
384 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
386 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
388 #define OOB_UNICODE 12345678
389 #define OOB_NAMEDCLASS -1
391 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
392 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
395 /* length of regex to show in messages that don't mark a position within */
396 #define RegexLengthToShowInErrorMessages 127
399 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
400 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
401 * op/pragma/warn/regcomp.
403 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
404 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
406 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
409 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
410 * arg. Show regex, up to a maximum length. If it's too long, chop and add
413 #define _FAIL(code) STMT_START { \
414 const char *ellipses = ""; \
415 IV len = RExC_end - RExC_precomp; \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
419 if (len > RegexLengthToShowInErrorMessages) { \
420 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
421 len = RegexLengthToShowInErrorMessages - 10; \
427 #define FAIL(msg) _FAIL( \
428 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
429 msg, (int)len, RExC_precomp, ellipses))
431 #define FAIL2(msg,arg) _FAIL( \
432 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
433 arg, (int)len, RExC_precomp, ellipses))
436 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
438 #define Simple_vFAIL(m) STMT_START { \
439 const IV offset = RExC_parse - RExC_precomp; \
440 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
441 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
445 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
447 #define vFAIL(m) STMT_START { \
449 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
454 * Like Simple_vFAIL(), but accepts two arguments.
456 #define Simple_vFAIL2(m,a1) STMT_START { \
457 const IV offset = RExC_parse - RExC_precomp; \
458 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
459 (int)offset, RExC_precomp, RExC_precomp + offset); \
463 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
465 #define vFAIL2(m,a1) STMT_START { \
467 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
468 Simple_vFAIL2(m, a1); \
473 * Like Simple_vFAIL(), but accepts three arguments.
475 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
476 const IV offset = RExC_parse - RExC_precomp; \
477 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
478 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
484 #define vFAIL3(m,a1,a2) STMT_START { \
486 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
487 Simple_vFAIL3(m, a1, a2); \
491 * Like Simple_vFAIL(), but accepts four arguments.
493 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
494 const IV offset = RExC_parse - RExC_precomp; \
495 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
496 (int)offset, RExC_precomp, RExC_precomp + offset); \
499 #define ckWARNreg(loc,m) STMT_START { \
500 const IV offset = loc - RExC_precomp; \
501 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 (int)offset, RExC_precomp, RExC_precomp + offset); \
505 #define ckWARNregdep(loc,m) STMT_START { \
506 const IV offset = loc - RExC_precomp; \
507 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
509 (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN2regdep(loc,m, a1) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
516 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
519 #define ckWARN2reg(loc, m, a1) STMT_START { \
520 const IV offset = loc - RExC_precomp; \
521 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
525 #define vWARN3(loc, m, a1, a2) STMT_START { \
526 const IV offset = loc - RExC_precomp; \
527 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
528 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
531 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
540 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
543 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
544 const IV offset = loc - RExC_precomp; \
545 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
546 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
549 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
550 const IV offset = loc - RExC_precomp; \
551 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
552 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
556 /* Allow for side effects in s */
557 #define REGC(c,s) STMT_START { \
558 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
561 /* Macros for recording node offsets. 20001227 mjd@plover.com
562 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
563 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
564 * Element 0 holds the number n.
565 * Position is 1 indexed.
567 #ifndef RE_TRACK_PATTERN_OFFSETS
568 #define Set_Node_Offset_To_R(node,byte)
569 #define Set_Node_Offset(node,byte)
570 #define Set_Cur_Node_Offset
571 #define Set_Node_Length_To_R(node,len)
572 #define Set_Node_Length(node,len)
573 #define Set_Node_Cur_Length(node)
574 #define Node_Offset(n)
575 #define Node_Length(n)
576 #define Set_Node_Offset_Length(node,offset,len)
577 #define ProgLen(ri) ri->u.proglen
578 #define SetProgLen(ri,x) ri->u.proglen = x
580 #define ProgLen(ri) ri->u.offsets[0]
581 #define SetProgLen(ri,x) ri->u.offsets[0] = x
582 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
584 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
585 __LINE__, (int)(node), (int)(byte))); \
587 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
589 RExC_offsets[2*(node)-1] = (byte); \
594 #define Set_Node_Offset(node,byte) \
595 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
596 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
598 #define Set_Node_Length_To_R(node,len) STMT_START { \
600 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
601 __LINE__, (int)(node), (int)(len))); \
603 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
605 RExC_offsets[2*(node)] = (len); \
610 #define Set_Node_Length(node,len) \
611 Set_Node_Length_To_R((node)-RExC_emit_start, len)
612 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
613 #define Set_Node_Cur_Length(node) \
614 Set_Node_Length(node, RExC_parse - parse_start)
616 /* Get offsets and lengths */
617 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
618 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
620 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
621 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
622 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
626 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
627 #define EXPERIMENTAL_INPLACESCAN
628 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
630 #define DEBUG_STUDYDATA(str,data,depth) \
631 DEBUG_OPTIMISE_MORE_r(if(data){ \
632 PerlIO_printf(Perl_debug_log, \
633 "%*s" str "Pos:%"IVdf"/%"IVdf \
634 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
635 (int)(depth)*2, "", \
636 (IV)((data)->pos_min), \
637 (IV)((data)->pos_delta), \
638 (UV)((data)->flags), \
639 (IV)((data)->whilem_c), \
640 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
641 is_inf ? "INF " : "" \
643 if ((data)->last_found) \
644 PerlIO_printf(Perl_debug_log, \
645 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
646 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
647 SvPVX_const((data)->last_found), \
648 (IV)((data)->last_end), \
649 (IV)((data)->last_start_min), \
650 (IV)((data)->last_start_max), \
651 ((data)->longest && \
652 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
653 SvPVX_const((data)->longest_fixed), \
654 (IV)((data)->offset_fixed), \
655 ((data)->longest && \
656 (data)->longest==&((data)->longest_float)) ? "*" : "", \
657 SvPVX_const((data)->longest_float), \
658 (IV)((data)->offset_float_min), \
659 (IV)((data)->offset_float_max) \
661 PerlIO_printf(Perl_debug_log,"\n"); \
664 static void clear_re(pTHX_ void *r);
666 /* Mark that we cannot extend a found fixed substring at this point.
667 Update the longest found anchored substring and the longest found
668 floating substrings if needed. */
671 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
673 const STRLEN l = CHR_SVLEN(data->last_found);
674 const STRLEN old_l = CHR_SVLEN(*data->longest);
675 GET_RE_DEBUG_FLAGS_DECL;
677 PERL_ARGS_ASSERT_SCAN_COMMIT;
679 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
680 SvSetMagicSV(*data->longest, data->last_found);
681 if (*data->longest == data->longest_fixed) {
682 data->offset_fixed = l ? data->last_start_min : data->pos_min;
683 if (data->flags & SF_BEFORE_EOL)
685 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
687 data->flags &= ~SF_FIX_BEFORE_EOL;
688 data->minlen_fixed=minlenp;
689 data->lookbehind_fixed=0;
691 else { /* *data->longest == data->longest_float */
692 data->offset_float_min = l ? data->last_start_min : data->pos_min;
693 data->offset_float_max = (l
694 ? data->last_start_max
695 : data->pos_min + data->pos_delta);
696 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
697 data->offset_float_max = I32_MAX;
698 if (data->flags & SF_BEFORE_EOL)
700 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
702 data->flags &= ~SF_FL_BEFORE_EOL;
703 data->minlen_float=minlenp;
704 data->lookbehind_float=0;
707 SvCUR_set(data->last_found, 0);
709 SV * const sv = data->last_found;
710 if (SvUTF8(sv) && SvMAGICAL(sv)) {
711 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
717 data->flags &= ~SF_BEFORE_EOL;
718 DEBUG_STUDYDATA("commit: ",data,0);
721 /* Can match anything (initialization) */
723 S_cl_anything(struct regnode_charclass_class *cl)
725 PERL_ARGS_ASSERT_CL_ANYTHING;
727 ANYOF_CLASS_ZERO(cl);
728 ANYOF_BITMAP_SETALL(cl);
729 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_LOCALE;
732 /* Can match anything (initialization) */
734 S_cl_is_anything(const struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
740 for (value = 0; value <= ANYOF_MAX; value += 2)
741 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
743 if (!(cl->flags & ANYOF_UNICODE_ALL))
745 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
750 /* Can match anything (initialization) */
752 S_cl_init(struct regnode_charclass_class *cl)
754 PERL_ARGS_ASSERT_CL_INIT;
756 Zero(cl, 1, struct regnode_charclass_class);
759 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
762 /* These two functions currently do the exact same thing */
763 #define cl_init_zero S_cl_init
765 /* 'And' a given class with another one. Can create false positives */
766 /* cl should not be inverted */
768 S_cl_and(struct regnode_charclass_class *cl,
769 const struct regnode_charclass_class *and_with)
771 PERL_ARGS_ASSERT_CL_AND;
773 assert(and_with->type == ANYOF);
775 /* I (khw) am not sure all these restrictions are necessary XXX */
776 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
777 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
778 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
779 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
780 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
783 if (and_with->flags & ANYOF_INVERT)
784 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
785 cl->bitmap[i] &= ~and_with->bitmap[i];
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] &= and_with->bitmap[i];
789 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
791 if (and_with->flags & ANYOF_INVERT) {
793 /* Here, the and'ed node is inverted. Get the AND of the flags that
794 * aren't affected by the inversion. Those that are affected are
795 * handled individually below */
796 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
797 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
798 cl->flags |= affected_flags;
800 /* We currently don't know how to deal with things that aren't in the
801 * bitmap, but we know that the intersection is no greater than what
802 * is already in cl, so let there be false positives that get sorted
803 * out after the synthetic start class succeeds, and the node is
804 * matched for real. */
806 /* The inversion of these two flags indicate that the resulting
807 * intersection doesn't have them */
808 if (and_with->flags & ANYOF_UNICODE_ALL) {
809 cl->flags &= ~ANYOF_UNICODE_ALL;
811 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
812 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
815 else { /* and'd node is not inverted */
816 if (! ANYOF_NONBITMAP(and_with)) {
818 /* Here 'and_with' doesn't match anything outside the bitmap
819 * (except possibly ANYOF_UNICODE_ALL), which means the
820 * intersection can't either, except for ANYOF_UNICODE_ALL, in
821 * which case we don't know what the intersection is, but it's no
822 * greater than what cl already has, so can just leave it alone,
823 * with possible false positives */
824 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
825 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
828 else if (! ANYOF_NONBITMAP(cl)) {
830 /* Here, 'and_with' does match something outside the bitmap, and cl
831 * doesn't have a list of things to match outside the bitmap. If
832 * cl can match all code points above 255, the intersection will
833 * be those above-255 code points that 'and_with' matches. There
834 * may be false positives from code points in 'and_with' that are
835 * outside the bitmap but below 256, but those get sorted out
836 * after the synthetic start class succeeds). If cl can't match
837 * all Unicode code points, it means here that it can't match *
838 * anything outside the bitmap, so we leave the bitmap empty */
839 if (cl->flags & ANYOF_UNICODE_ALL) {
840 ARG_SET(cl, ARG(and_with));
844 /* Here, both 'and_with' and cl match something outside the
845 * bitmap. Currently we do not do the intersection, so just match
846 * whatever cl had at the beginning. */
850 /* Take the intersection of the two sets of flags */
851 cl->flags &= and_with->flags;
855 /* 'OR' a given class with another one. Can create false positives */
856 /* cl should not be inverted */
858 S_cl_or(struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
860 PERL_ARGS_ASSERT_CL_OR;
862 if (or_with->flags & ANYOF_INVERT) {
864 /* Here, the or'd node is to be inverted. This means we take the
865 * complement of everything not in the bitmap, but currently we don't
866 * know what that is, so give up and match anything */
867 if (ANYOF_NONBITMAP(or_with)) {
871 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
872 * <= (B1 | !B2) | (CL1 | !CL2)
873 * which is wasteful if CL2 is small, but we ignore CL2:
874 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
875 * XXXX Can we handle case-fold? Unclear:
876 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
877 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
879 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
880 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
881 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
884 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
885 cl->bitmap[i] |= ~or_with->bitmap[i];
886 } /* XXXX: logic is complicated otherwise */
891 /* And, we can just take the union of the flags that aren't affected
892 * by the inversion */
893 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
895 /* For the remaining flags:
896 ANYOF_UNICODE_ALL and inverted means to not match anything above
897 255, which means that the union with cl should just be
898 what cl has in it, so can ignore this flag
899 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
900 is 127-255 to match them, but then invert that, so the
901 union with cl should just be what cl has in it, so can
904 } else { /* 'or_with' is not inverted */
905 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
906 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
907 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
908 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
911 /* OR char bitmap and class bitmap separately */
912 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
913 cl->bitmap[i] |= or_with->bitmap[i];
914 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
915 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
916 cl->classflags[i] |= or_with->classflags[i];
917 cl->flags |= ANYOF_CLASS;
920 else { /* XXXX: logic is complicated, leave it along for a moment. */
925 cl->flags |= or_with->flags;
927 if (ANYOF_NONBITMAP(or_with)) {
929 /* Use the added node's outside-the-bit-map match if there isn't a
930 * conflict. If there is a conflict (both nodes match something
931 * outside the bitmap, but what they match outside is not the same
932 * pointer, and hence not easily compared until XXX we extend
933 * inversion lists this far), give up and allow the start class to
934 * match everything outside the bitmap */
935 if (! ANYOF_NONBITMAP(cl)) {
936 ARG_SET(cl, ARG(or_with));
938 else if (ARG(cl) != ARG(or_with)) {
939 cl->flags |= ANYOF_UNICODE_ALL;
945 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
946 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
947 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
948 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
953 dump_trie(trie,widecharmap,revcharmap)
954 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
955 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
957 These routines dump out a trie in a somewhat readable format.
958 The _interim_ variants are used for debugging the interim
959 tables that are used to generate the final compressed
960 representation which is what dump_trie expects.
962 Part of the reason for their existence is to provide a form
963 of documentation as to how the different representations function.
968 Dumps the final compressed table form of the trie to Perl_debug_log.
969 Used for debugging make_trie().
973 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
974 AV *revcharmap, U32 depth)
977 SV *sv=sv_newmortal();
978 int colwidth= widecharmap ? 6 : 4;
980 GET_RE_DEBUG_FLAGS_DECL;
982 PERL_ARGS_ASSERT_DUMP_TRIE;
984 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
985 (int)depth * 2 + 2,"",
986 "Match","Base","Ofs" );
988 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
989 SV ** const tmp = av_fetch( revcharmap, state, 0);
991 PerlIO_printf( Perl_debug_log, "%*s",
993 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
994 PL_colors[0], PL_colors[1],
995 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
996 PERL_PV_ESCAPE_FIRSTCHAR
1001 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1002 (int)depth * 2 + 2,"");
1004 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1005 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1006 PerlIO_printf( Perl_debug_log, "\n");
1008 for( state = 1 ; state < trie->statecount ; state++ ) {
1009 const U32 base = trie->states[ state ].trans.base;
1011 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1013 if ( trie->states[ state ].wordnum ) {
1014 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1016 PerlIO_printf( Perl_debug_log, "%6s", "" );
1019 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1024 while( ( base + ofs < trie->uniquecharcount ) ||
1025 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1026 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1029 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1031 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1032 if ( ( base + ofs >= trie->uniquecharcount ) &&
1033 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1034 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1036 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1038 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1040 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1044 PerlIO_printf( Perl_debug_log, "]");
1047 PerlIO_printf( Perl_debug_log, "\n" );
1049 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1050 for (word=1; word <= trie->wordcount; word++) {
1051 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1052 (int)word, (int)(trie->wordinfo[word].prev),
1053 (int)(trie->wordinfo[word].len));
1055 PerlIO_printf(Perl_debug_log, "\n" );
1058 Dumps a fully constructed but uncompressed trie in list form.
1059 List tries normally only are used for construction when the number of
1060 possible chars (trie->uniquecharcount) is very high.
1061 Used for debugging make_trie().
1064 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1065 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1069 SV *sv=sv_newmortal();
1070 int colwidth= widecharmap ? 6 : 4;
1071 GET_RE_DEBUG_FLAGS_DECL;
1073 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1075 /* print out the table precompression. */
1076 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1077 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1078 "------:-----+-----------------\n" );
1080 for( state=1 ; state < next_alloc ; state ++ ) {
1083 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1084 (int)depth * 2 + 2,"", (UV)state );
1085 if ( ! trie->states[ state ].wordnum ) {
1086 PerlIO_printf( Perl_debug_log, "%5s| ","");
1088 PerlIO_printf( Perl_debug_log, "W%4x| ",
1089 trie->states[ state ].wordnum
1092 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1093 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1095 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1097 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1098 PL_colors[0], PL_colors[1],
1099 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1100 PERL_PV_ESCAPE_FIRSTCHAR
1102 TRIE_LIST_ITEM(state,charid).forid,
1103 (UV)TRIE_LIST_ITEM(state,charid).newstate
1106 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1107 (int)((depth * 2) + 14), "");
1110 PerlIO_printf( Perl_debug_log, "\n");
1115 Dumps a fully constructed but uncompressed trie in table form.
1116 This is the normal DFA style state transition table, with a few
1117 twists to facilitate compression later.
1118 Used for debugging make_trie().
1121 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1122 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1127 SV *sv=sv_newmortal();
1128 int colwidth= widecharmap ? 6 : 4;
1129 GET_RE_DEBUG_FLAGS_DECL;
1131 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1134 print out the table precompression so that we can do a visual check
1135 that they are identical.
1138 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1140 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1141 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1143 PerlIO_printf( Perl_debug_log, "%*s",
1145 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1146 PL_colors[0], PL_colors[1],
1147 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1148 PERL_PV_ESCAPE_FIRSTCHAR
1154 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1156 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1157 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1160 PerlIO_printf( Perl_debug_log, "\n" );
1162 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1164 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1165 (int)depth * 2 + 2,"",
1166 (UV)TRIE_NODENUM( state ) );
1168 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1169 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1171 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1173 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1175 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1176 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1178 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1179 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1187 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1188 startbranch: the first branch in the whole branch sequence
1189 first : start branch of sequence of branch-exact nodes.
1190 May be the same as startbranch
1191 last : Thing following the last branch.
1192 May be the same as tail.
1193 tail : item following the branch sequence
1194 count : words in the sequence
1195 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1196 depth : indent depth
1198 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1200 A trie is an N'ary tree where the branches are determined by digital
1201 decomposition of the key. IE, at the root node you look up the 1st character and
1202 follow that branch repeat until you find the end of the branches. Nodes can be
1203 marked as "accepting" meaning they represent a complete word. Eg:
1207 would convert into the following structure. Numbers represent states, letters
1208 following numbers represent valid transitions on the letter from that state, if
1209 the number is in square brackets it represents an accepting state, otherwise it
1210 will be in parenthesis.
1212 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1216 (1) +-i->(6)-+-s->[7]
1218 +-s->(3)-+-h->(4)-+-e->[5]
1220 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1222 This shows that when matching against the string 'hers' we will begin at state 1
1223 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1224 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1225 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1226 single traverse. We store a mapping from accepting to state to which word was
1227 matched, and then when we have multiple possibilities we try to complete the
1228 rest of the regex in the order in which they occured in the alternation.
1230 The only prior NFA like behaviour that would be changed by the TRIE support is
1231 the silent ignoring of duplicate alternations which are of the form:
1233 / (DUPE|DUPE) X? (?{ ... }) Y /x
1235 Thus EVAL blocks following a trie may be called a different number of times with
1236 and without the optimisation. With the optimisations dupes will be silently
1237 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1238 the following demonstrates:
1240 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1242 which prints out 'word' three times, but
1244 'words'=~/(word|word|word)(?{ print $1 })S/
1246 which doesnt print it out at all. This is due to other optimisations kicking in.
1248 Example of what happens on a structural level:
1250 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1252 1: CURLYM[1] {1,32767}(18)
1263 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1264 and should turn into:
1266 1: CURLYM[1] {1,32767}(18)
1268 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1276 Cases where tail != last would be like /(?foo|bar)baz/:
1286 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1287 and would end up looking like:
1290 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1297 d = uvuni_to_utf8_flags(d, uv, 0);
1299 is the recommended Unicode-aware way of saying
1304 #define TRIE_STORE_REVCHAR \
1307 SV *zlopp = newSV(2); \
1308 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1309 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1310 SvCUR_set(zlopp, kapow - flrbbbbb); \
1313 av_push(revcharmap, zlopp); \
1315 char ooooff = (char)uvc; \
1316 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1320 #define TRIE_READ_CHAR STMT_START { \
1324 if ( foldlen > 0 ) { \
1325 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1330 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1331 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1332 foldlen -= UNISKIP( uvc ); \
1333 scan = foldbuf + UNISKIP( uvc ); \
1336 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1346 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1347 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1348 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1349 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1351 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1352 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1353 TRIE_LIST_CUR( state )++; \
1356 #define TRIE_LIST_NEW(state) STMT_START { \
1357 Newxz( trie->states[ state ].trans.list, \
1358 4, reg_trie_trans_le ); \
1359 TRIE_LIST_CUR( state ) = 1; \
1360 TRIE_LIST_LEN( state ) = 4; \
1363 #define TRIE_HANDLE_WORD(state) STMT_START { \
1364 U16 dupe= trie->states[ state ].wordnum; \
1365 regnode * const noper_next = regnext( noper ); \
1368 /* store the word for dumping */ \
1370 if (OP(noper) != NOTHING) \
1371 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1373 tmp = newSVpvn_utf8( "", 0, UTF ); \
1374 av_push( trie_words, tmp ); \
1378 trie->wordinfo[curword].prev = 0; \
1379 trie->wordinfo[curword].len = wordlen; \
1380 trie->wordinfo[curword].accept = state; \
1382 if ( noper_next < tail ) { \
1384 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1385 trie->jump[curword] = (U16)(noper_next - convert); \
1387 jumper = noper_next; \
1389 nextbranch= regnext(cur); \
1393 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1394 /* chain, so that when the bits of chain are later */\
1395 /* linked together, the dups appear in the chain */\
1396 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1397 trie->wordinfo[dupe].prev = curword; \
1399 /* we haven't inserted this word yet. */ \
1400 trie->states[ state ].wordnum = curword; \
1405 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1406 ( ( base + charid >= ucharcount \
1407 && base + charid < ubound \
1408 && state == trie->trans[ base - ucharcount + charid ].check \
1409 && trie->trans[ base - ucharcount + charid ].next ) \
1410 ? trie->trans[ base - ucharcount + charid ].next \
1411 : ( state==1 ? special : 0 ) \
1415 #define MADE_JUMP_TRIE 2
1416 #define MADE_EXACT_TRIE 4
1419 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1422 /* first pass, loop through and scan words */
1423 reg_trie_data *trie;
1424 HV *widecharmap = NULL;
1425 AV *revcharmap = newAV();
1427 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1432 regnode *jumper = NULL;
1433 regnode *nextbranch = NULL;
1434 regnode *convert = NULL;
1435 U32 *prev_states; /* temp array mapping each state to previous one */
1436 /* we just use folder as a flag in utf8 */
1437 const U8 * folder = NULL;
1440 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1441 AV *trie_words = NULL;
1442 /* along with revcharmap, this only used during construction but both are
1443 * useful during debugging so we store them in the struct when debugging.
1446 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1447 STRLEN trie_charcount=0;
1449 SV *re_trie_maxbuff;
1450 GET_RE_DEBUG_FLAGS_DECL;
1452 PERL_ARGS_ASSERT_MAKE_TRIE;
1454 PERL_UNUSED_ARG(depth);
1459 case EXACTFU: folder = PL_fold_latin1; break;
1460 case EXACTF: folder = PL_fold; break;
1461 case EXACTFL: folder = PL_fold_locale; break;
1464 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1466 trie->startstate = 1;
1467 trie->wordcount = word_count;
1468 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1469 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1470 if (!(UTF && folder))
1471 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1472 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1473 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1476 trie_words = newAV();
1479 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1480 if (!SvIOK(re_trie_maxbuff)) {
1481 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1484 PerlIO_printf( Perl_debug_log,
1485 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1486 (int)depth * 2 + 2, "",
1487 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1488 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1492 /* Find the node we are going to overwrite */
1493 if ( first == startbranch && OP( last ) != BRANCH ) {
1494 /* whole branch chain */
1497 /* branch sub-chain */
1498 convert = NEXTOPER( first );
1501 /* -- First loop and Setup --
1503 We first traverse the branches and scan each word to determine if it
1504 contains widechars, and how many unique chars there are, this is
1505 important as we have to build a table with at least as many columns as we
1508 We use an array of integers to represent the character codes 0..255
1509 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1510 native representation of the character value as the key and IV's for the
1513 *TODO* If we keep track of how many times each character is used we can
1514 remap the columns so that the table compression later on is more
1515 efficient in terms of memory by ensuring the most common value is in the
1516 middle and the least common are on the outside. IMO this would be better
1517 than a most to least common mapping as theres a decent chance the most
1518 common letter will share a node with the least common, meaning the node
1519 will not be compressible. With a middle is most common approach the worst
1520 case is when we have the least common nodes twice.
1524 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1525 regnode * const noper = NEXTOPER( cur );
1526 const U8 *uc = (U8*)STRING( noper );
1527 const U8 * const e = uc + STR_LEN( noper );
1529 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1530 const U8 *scan = (U8*)NULL;
1531 U32 wordlen = 0; /* required init */
1533 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1535 if (OP(noper) == NOTHING) {
1539 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1540 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1541 regardless of encoding */
1543 for ( ; uc < e ; uc += len ) {
1544 TRIE_CHARCOUNT(trie)++;
1548 if ( !trie->charmap[ uvc ] ) {
1549 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1551 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1555 /* store the codepoint in the bitmap, and its folded
1557 TRIE_BITMAP_SET(trie,uvc);
1559 /* store the folded codepoint */
1560 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1563 /* store first byte of utf8 representation of
1564 variant codepoints */
1565 if (! UNI_IS_INVARIANT(uvc)) {
1566 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1569 set_bit = 0; /* We've done our bit :-) */
1574 widecharmap = newHV();
1576 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1579 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1581 if ( !SvTRUE( *svpp ) ) {
1582 sv_setiv( *svpp, ++trie->uniquecharcount );
1587 if( cur == first ) {
1590 } else if (chars < trie->minlen) {
1592 } else if (chars > trie->maxlen) {
1596 } /* end first pass */
1597 DEBUG_TRIE_COMPILE_r(
1598 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1599 (int)depth * 2 + 2,"",
1600 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1601 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1602 (int)trie->minlen, (int)trie->maxlen )
1606 We now know what we are dealing with in terms of unique chars and
1607 string sizes so we can calculate how much memory a naive
1608 representation using a flat table will take. If it's over a reasonable
1609 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1610 conservative but potentially much slower representation using an array
1613 At the end we convert both representations into the same compressed
1614 form that will be used in regexec.c for matching with. The latter
1615 is a form that cannot be used to construct with but has memory
1616 properties similar to the list form and access properties similar
1617 to the table form making it both suitable for fast searches and
1618 small enough that its feasable to store for the duration of a program.
1620 See the comment in the code where the compressed table is produced
1621 inplace from the flat tabe representation for an explanation of how
1622 the compression works.
1627 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1630 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1632 Second Pass -- Array Of Lists Representation
1634 Each state will be represented by a list of charid:state records
1635 (reg_trie_trans_le) the first such element holds the CUR and LEN
1636 points of the allocated array. (See defines above).
1638 We build the initial structure using the lists, and then convert
1639 it into the compressed table form which allows faster lookups
1640 (but cant be modified once converted).
1643 STRLEN transcount = 1;
1645 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1646 "%*sCompiling trie using list compiler\n",
1647 (int)depth * 2 + 2, ""));
1649 trie->states = (reg_trie_state *)
1650 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1651 sizeof(reg_trie_state) );
1655 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1657 regnode * const noper = NEXTOPER( cur );
1658 U8 *uc = (U8*)STRING( noper );
1659 const U8 * const e = uc + STR_LEN( noper );
1660 U32 state = 1; /* required init */
1661 U16 charid = 0; /* sanity init */
1662 U8 *scan = (U8*)NULL; /* sanity init */
1663 STRLEN foldlen = 0; /* required init */
1664 U32 wordlen = 0; /* required init */
1665 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1667 if (OP(noper) != NOTHING) {
1668 for ( ; uc < e ; uc += len ) {
1673 charid = trie->charmap[ uvc ];
1675 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1679 charid=(U16)SvIV( *svpp );
1682 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1689 if ( !trie->states[ state ].trans.list ) {
1690 TRIE_LIST_NEW( state );
1692 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1693 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1694 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1699 newstate = next_alloc++;
1700 prev_states[newstate] = state;
1701 TRIE_LIST_PUSH( state, charid, newstate );
1706 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1710 TRIE_HANDLE_WORD(state);
1712 } /* end second pass */
1714 /* next alloc is the NEXT state to be allocated */
1715 trie->statecount = next_alloc;
1716 trie->states = (reg_trie_state *)
1717 PerlMemShared_realloc( trie->states,
1719 * sizeof(reg_trie_state) );
1721 /* and now dump it out before we compress it */
1722 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1723 revcharmap, next_alloc,
1727 trie->trans = (reg_trie_trans *)
1728 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1735 for( state=1 ; state < next_alloc ; state ++ ) {
1739 DEBUG_TRIE_COMPILE_MORE_r(
1740 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1744 if (trie->states[state].trans.list) {
1745 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1749 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1750 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1751 if ( forid < minid ) {
1753 } else if ( forid > maxid ) {
1757 if ( transcount < tp + maxid - minid + 1) {
1759 trie->trans = (reg_trie_trans *)
1760 PerlMemShared_realloc( trie->trans,
1762 * sizeof(reg_trie_trans) );
1763 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1765 base = trie->uniquecharcount + tp - minid;
1766 if ( maxid == minid ) {
1768 for ( ; zp < tp ; zp++ ) {
1769 if ( ! trie->trans[ zp ].next ) {
1770 base = trie->uniquecharcount + zp - minid;
1771 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1772 trie->trans[ zp ].check = state;
1778 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1779 trie->trans[ tp ].check = state;
1784 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1785 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1786 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1787 trie->trans[ tid ].check = state;
1789 tp += ( maxid - minid + 1 );
1791 Safefree(trie->states[ state ].trans.list);
1794 DEBUG_TRIE_COMPILE_MORE_r(
1795 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1798 trie->states[ state ].trans.base=base;
1800 trie->lasttrans = tp + 1;
1804 Second Pass -- Flat Table Representation.
1806 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1807 We know that we will need Charcount+1 trans at most to store the data
1808 (one row per char at worst case) So we preallocate both structures
1809 assuming worst case.
1811 We then construct the trie using only the .next slots of the entry
1814 We use the .check field of the first entry of the node temporarily to
1815 make compression both faster and easier by keeping track of how many non
1816 zero fields are in the node.
1818 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1821 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1822 number representing the first entry of the node, and state as a
1823 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1824 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1825 are 2 entrys per node. eg:
1833 The table is internally in the right hand, idx form. However as we also
1834 have to deal with the states array which is indexed by nodenum we have to
1835 use TRIE_NODENUM() to convert.
1838 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1839 "%*sCompiling trie using table compiler\n",
1840 (int)depth * 2 + 2, ""));
1842 trie->trans = (reg_trie_trans *)
1843 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1844 * trie->uniquecharcount + 1,
1845 sizeof(reg_trie_trans) );
1846 trie->states = (reg_trie_state *)
1847 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1848 sizeof(reg_trie_state) );
1849 next_alloc = trie->uniquecharcount + 1;
1852 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1854 regnode * const noper = NEXTOPER( cur );
1855 const U8 *uc = (U8*)STRING( noper );
1856 const U8 * const e = uc + STR_LEN( noper );
1858 U32 state = 1; /* required init */
1860 U16 charid = 0; /* sanity init */
1861 U32 accept_state = 0; /* sanity init */
1862 U8 *scan = (U8*)NULL; /* sanity init */
1864 STRLEN foldlen = 0; /* required init */
1865 U32 wordlen = 0; /* required init */
1866 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1868 if ( OP(noper) != NOTHING ) {
1869 for ( ; uc < e ; uc += len ) {
1874 charid = trie->charmap[ uvc ];
1876 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1877 charid = svpp ? (U16)SvIV(*svpp) : 0;
1881 if ( !trie->trans[ state + charid ].next ) {
1882 trie->trans[ state + charid ].next = next_alloc;
1883 trie->trans[ state ].check++;
1884 prev_states[TRIE_NODENUM(next_alloc)]
1885 = TRIE_NODENUM(state);
1886 next_alloc += trie->uniquecharcount;
1888 state = trie->trans[ state + charid ].next;
1890 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1892 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1895 accept_state = TRIE_NODENUM( state );
1896 TRIE_HANDLE_WORD(accept_state);
1898 } /* end second pass */
1900 /* and now dump it out before we compress it */
1901 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1903 next_alloc, depth+1));
1907 * Inplace compress the table.*
1909 For sparse data sets the table constructed by the trie algorithm will
1910 be mostly 0/FAIL transitions or to put it another way mostly empty.
1911 (Note that leaf nodes will not contain any transitions.)
1913 This algorithm compresses the tables by eliminating most such
1914 transitions, at the cost of a modest bit of extra work during lookup:
1916 - Each states[] entry contains a .base field which indicates the
1917 index in the state[] array wheres its transition data is stored.
1919 - If .base is 0 there are no valid transitions from that node.
1921 - If .base is nonzero then charid is added to it to find an entry in
1924 -If trans[states[state].base+charid].check!=state then the
1925 transition is taken to be a 0/Fail transition. Thus if there are fail
1926 transitions at the front of the node then the .base offset will point
1927 somewhere inside the previous nodes data (or maybe even into a node
1928 even earlier), but the .check field determines if the transition is
1932 The following process inplace converts the table to the compressed
1933 table: We first do not compress the root node 1,and mark all its
1934 .check pointers as 1 and set its .base pointer as 1 as well. This
1935 allows us to do a DFA construction from the compressed table later,
1936 and ensures that any .base pointers we calculate later are greater
1939 - We set 'pos' to indicate the first entry of the second node.
1941 - We then iterate over the columns of the node, finding the first and
1942 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1943 and set the .check pointers accordingly, and advance pos
1944 appropriately and repreat for the next node. Note that when we copy
1945 the next pointers we have to convert them from the original
1946 NODEIDX form to NODENUM form as the former is not valid post
1949 - If a node has no transitions used we mark its base as 0 and do not
1950 advance the pos pointer.
1952 - If a node only has one transition we use a second pointer into the
1953 structure to fill in allocated fail transitions from other states.
1954 This pointer is independent of the main pointer and scans forward
1955 looking for null transitions that are allocated to a state. When it
1956 finds one it writes the single transition into the "hole". If the
1957 pointer doesnt find one the single transition is appended as normal.
1959 - Once compressed we can Renew/realloc the structures to release the
1962 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1963 specifically Fig 3.47 and the associated pseudocode.
1967 const U32 laststate = TRIE_NODENUM( next_alloc );
1970 trie->statecount = laststate;
1972 for ( state = 1 ; state < laststate ; state++ ) {
1974 const U32 stateidx = TRIE_NODEIDX( state );
1975 const U32 o_used = trie->trans[ stateidx ].check;
1976 U32 used = trie->trans[ stateidx ].check;
1977 trie->trans[ stateidx ].check = 0;
1979 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1980 if ( flag || trie->trans[ stateidx + charid ].next ) {
1981 if ( trie->trans[ stateidx + charid ].next ) {
1983 for ( ; zp < pos ; zp++ ) {
1984 if ( ! trie->trans[ zp ].next ) {
1988 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1989 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1990 trie->trans[ zp ].check = state;
1991 if ( ++zp > pos ) pos = zp;
1998 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2000 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2001 trie->trans[ pos ].check = state;
2006 trie->lasttrans = pos + 1;
2007 trie->states = (reg_trie_state *)
2008 PerlMemShared_realloc( trie->states, laststate
2009 * sizeof(reg_trie_state) );
2010 DEBUG_TRIE_COMPILE_MORE_r(
2011 PerlIO_printf( Perl_debug_log,
2012 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2013 (int)depth * 2 + 2,"",
2014 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2017 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2020 } /* end table compress */
2022 DEBUG_TRIE_COMPILE_MORE_r(
2023 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2024 (int)depth * 2 + 2, "",
2025 (UV)trie->statecount,
2026 (UV)trie->lasttrans)
2028 /* resize the trans array to remove unused space */
2029 trie->trans = (reg_trie_trans *)
2030 PerlMemShared_realloc( trie->trans, trie->lasttrans
2031 * sizeof(reg_trie_trans) );
2033 { /* Modify the program and insert the new TRIE node */
2034 U8 nodetype =(U8)(flags & 0xFF);
2038 regnode *optimize = NULL;
2039 #ifdef RE_TRACK_PATTERN_OFFSETS
2042 U32 mjd_nodelen = 0;
2043 #endif /* RE_TRACK_PATTERN_OFFSETS */
2044 #endif /* DEBUGGING */
2046 This means we convert either the first branch or the first Exact,
2047 depending on whether the thing following (in 'last') is a branch
2048 or not and whther first is the startbranch (ie is it a sub part of
2049 the alternation or is it the whole thing.)
2050 Assuming its a sub part we convert the EXACT otherwise we convert
2051 the whole branch sequence, including the first.
2053 /* Find the node we are going to overwrite */
2054 if ( first != startbranch || OP( last ) == BRANCH ) {
2055 /* branch sub-chain */
2056 NEXT_OFF( first ) = (U16)(last - first);
2057 #ifdef RE_TRACK_PATTERN_OFFSETS
2059 mjd_offset= Node_Offset((convert));
2060 mjd_nodelen= Node_Length((convert));
2063 /* whole branch chain */
2065 #ifdef RE_TRACK_PATTERN_OFFSETS
2068 const regnode *nop = NEXTOPER( convert );
2069 mjd_offset= Node_Offset((nop));
2070 mjd_nodelen= Node_Length((nop));
2074 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2075 (int)depth * 2 + 2, "",
2076 (UV)mjd_offset, (UV)mjd_nodelen)
2079 /* But first we check to see if there is a common prefix we can
2080 split out as an EXACT and put in front of the TRIE node. */
2081 trie->startstate= 1;
2082 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2084 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2088 const U32 base = trie->states[ state ].trans.base;
2090 if ( trie->states[state].wordnum )
2093 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2094 if ( ( base + ofs >= trie->uniquecharcount ) &&
2095 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2096 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2098 if ( ++count > 1 ) {
2099 SV **tmp = av_fetch( revcharmap, ofs, 0);
2100 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2101 if ( state == 1 ) break;
2103 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2105 PerlIO_printf(Perl_debug_log,
2106 "%*sNew Start State=%"UVuf" Class: [",
2107 (int)depth * 2 + 2, "",
2110 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2111 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2113 TRIE_BITMAP_SET(trie,*ch);
2115 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2117 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2121 TRIE_BITMAP_SET(trie,*ch);
2123 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2124 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2130 SV **tmp = av_fetch( revcharmap, idx, 0);
2132 char *ch = SvPV( *tmp, len );
2134 SV *sv=sv_newmortal();
2135 PerlIO_printf( Perl_debug_log,
2136 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2137 (int)depth * 2 + 2, "",
2139 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2140 PL_colors[0], PL_colors[1],
2141 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2142 PERL_PV_ESCAPE_FIRSTCHAR
2147 OP( convert ) = nodetype;
2148 str=STRING(convert);
2151 STR_LEN(convert) += len;
2157 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2162 trie->prefixlen = (state-1);
2164 regnode *n = convert+NODE_SZ_STR(convert);
2165 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2166 trie->startstate = state;
2167 trie->minlen -= (state - 1);
2168 trie->maxlen -= (state - 1);
2170 /* At least the UNICOS C compiler choked on this
2171 * being argument to DEBUG_r(), so let's just have
2174 #ifdef PERL_EXT_RE_BUILD
2180 regnode *fix = convert;
2181 U32 word = trie->wordcount;
2183 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2184 while( ++fix < n ) {
2185 Set_Node_Offset_Length(fix, 0, 0);
2188 SV ** const tmp = av_fetch( trie_words, word, 0 );
2190 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2191 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2193 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2201 NEXT_OFF(convert) = (U16)(tail - convert);
2202 DEBUG_r(optimize= n);
2208 if ( trie->maxlen ) {
2209 NEXT_OFF( convert ) = (U16)(tail - convert);
2210 ARG_SET( convert, data_slot );
2211 /* Store the offset to the first unabsorbed branch in
2212 jump[0], which is otherwise unused by the jump logic.
2213 We use this when dumping a trie and during optimisation. */
2215 trie->jump[0] = (U16)(nextbranch - convert);
2217 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2218 * and there is a bitmap
2219 * and the first "jump target" node we found leaves enough room
2220 * then convert the TRIE node into a TRIEC node, with the bitmap
2221 * embedded inline in the opcode - this is hypothetically faster.
2223 if ( !trie->states[trie->startstate].wordnum
2225 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2227 OP( convert ) = TRIEC;
2228 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2229 PerlMemShared_free(trie->bitmap);
2232 OP( convert ) = TRIE;
2234 /* store the type in the flags */
2235 convert->flags = nodetype;
2239 + regarglen[ OP( convert ) ];
2241 /* XXX We really should free up the resource in trie now,
2242 as we won't use them - (which resources?) dmq */
2244 /* needed for dumping*/
2245 DEBUG_r(if (optimize) {
2246 regnode *opt = convert;
2248 while ( ++opt < optimize) {
2249 Set_Node_Offset_Length(opt,0,0);
2252 Try to clean up some of the debris left after the
2255 while( optimize < jumper ) {
2256 mjd_nodelen += Node_Length((optimize));
2257 OP( optimize ) = OPTIMIZED;
2258 Set_Node_Offset_Length(optimize,0,0);
2261 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2263 } /* end node insert */
2265 /* Finish populating the prev field of the wordinfo array. Walk back
2266 * from each accept state until we find another accept state, and if
2267 * so, point the first word's .prev field at the second word. If the
2268 * second already has a .prev field set, stop now. This will be the
2269 * case either if we've already processed that word's accept state,
2270 * or that state had multiple words, and the overspill words were
2271 * already linked up earlier.
2278 for (word=1; word <= trie->wordcount; word++) {
2280 if (trie->wordinfo[word].prev)
2282 state = trie->wordinfo[word].accept;
2284 state = prev_states[state];
2287 prev = trie->states[state].wordnum;
2291 trie->wordinfo[word].prev = prev;
2293 Safefree(prev_states);
2297 /* and now dump out the compressed format */
2298 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2300 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2302 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2303 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2305 SvREFCNT_dec(revcharmap);
2309 : trie->startstate>1
2315 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2317 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2319 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2320 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2323 We find the fail state for each state in the trie, this state is the longest proper
2324 suffix of the current state's 'word' that is also a proper prefix of another word in our
2325 trie. State 1 represents the word '' and is thus the default fail state. This allows
2326 the DFA not to have to restart after its tried and failed a word at a given point, it
2327 simply continues as though it had been matching the other word in the first place.
2329 'abcdgu'=~/abcdefg|cdgu/
2330 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2331 fail, which would bring us to the state representing 'd' in the second word where we would
2332 try 'g' and succeed, proceeding to match 'cdgu'.
2334 /* add a fail transition */
2335 const U32 trie_offset = ARG(source);
2336 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2338 const U32 ucharcount = trie->uniquecharcount;
2339 const U32 numstates = trie->statecount;
2340 const U32 ubound = trie->lasttrans + ucharcount;
2344 U32 base = trie->states[ 1 ].trans.base;
2347 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2348 GET_RE_DEBUG_FLAGS_DECL;
2350 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2352 PERL_UNUSED_ARG(depth);
2356 ARG_SET( stclass, data_slot );
2357 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2358 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2359 aho->trie=trie_offset;
2360 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2361 Copy( trie->states, aho->states, numstates, reg_trie_state );
2362 Newxz( q, numstates, U32);
2363 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2366 /* initialize fail[0..1] to be 1 so that we always have
2367 a valid final fail state */
2368 fail[ 0 ] = fail[ 1 ] = 1;
2370 for ( charid = 0; charid < ucharcount ; charid++ ) {
2371 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2373 q[ q_write ] = newstate;
2374 /* set to point at the root */
2375 fail[ q[ q_write++ ] ]=1;
2378 while ( q_read < q_write) {
2379 const U32 cur = q[ q_read++ % numstates ];
2380 base = trie->states[ cur ].trans.base;
2382 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2383 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2385 U32 fail_state = cur;
2388 fail_state = fail[ fail_state ];
2389 fail_base = aho->states[ fail_state ].trans.base;
2390 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2392 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2393 fail[ ch_state ] = fail_state;
2394 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2396 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2398 q[ q_write++ % numstates] = ch_state;
2402 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2403 when we fail in state 1, this allows us to use the
2404 charclass scan to find a valid start char. This is based on the principle
2405 that theres a good chance the string being searched contains lots of stuff
2406 that cant be a start char.
2408 fail[ 0 ] = fail[ 1 ] = 0;
2409 DEBUG_TRIE_COMPILE_r({
2410 PerlIO_printf(Perl_debug_log,
2411 "%*sStclass Failtable (%"UVuf" states): 0",
2412 (int)(depth * 2), "", (UV)numstates
2414 for( q_read=1; q_read<numstates; q_read++ ) {
2415 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2417 PerlIO_printf(Perl_debug_log, "\n");
2420 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2425 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2426 * These need to be revisited when a newer toolchain becomes available.
2428 #if defined(__sparc64__) && defined(__GNUC__)
2429 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2430 # undef SPARC64_GCC_WORKAROUND
2431 # define SPARC64_GCC_WORKAROUND 1
2435 #define DEBUG_PEEP(str,scan,depth) \
2436 DEBUG_OPTIMISE_r({if (scan){ \
2437 SV * const mysv=sv_newmortal(); \
2438 regnode *Next = regnext(scan); \
2439 regprop(RExC_rx, mysv, scan); \
2440 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2441 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2442 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2449 #define JOIN_EXACT(scan,min,flags) \
2450 if (PL_regkind[OP(scan)] == EXACT) \
2451 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2454 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2455 /* Merge several consecutive EXACTish nodes into one. */
2456 regnode *n = regnext(scan);
2458 regnode *next = scan + NODE_SZ_STR(scan);
2462 regnode *stop = scan;
2463 GET_RE_DEBUG_FLAGS_DECL;
2465 PERL_UNUSED_ARG(depth);
2468 PERL_ARGS_ASSERT_JOIN_EXACT;
2469 #ifndef EXPERIMENTAL_INPLACESCAN
2470 PERL_UNUSED_ARG(flags);
2471 PERL_UNUSED_ARG(val);
2473 DEBUG_PEEP("join",scan,depth);
2475 /* Skip NOTHING, merge EXACT*. */
2477 ( PL_regkind[OP(n)] == NOTHING ||
2478 (stringok && (OP(n) == OP(scan))))
2480 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2482 if (OP(n) == TAIL || n > next)
2484 if (PL_regkind[OP(n)] == NOTHING) {
2485 DEBUG_PEEP("skip:",n,depth);
2486 NEXT_OFF(scan) += NEXT_OFF(n);
2487 next = n + NODE_STEP_REGNODE;
2494 else if (stringok) {
2495 const unsigned int oldl = STR_LEN(scan);
2496 regnode * const nnext = regnext(n);
2498 DEBUG_PEEP("merg",n,depth);
2501 if (oldl + STR_LEN(n) > U8_MAX)
2503 NEXT_OFF(scan) += NEXT_OFF(n);
2504 STR_LEN(scan) += STR_LEN(n);
2505 next = n + NODE_SZ_STR(n);
2506 /* Now we can overwrite *n : */
2507 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2515 #ifdef EXPERIMENTAL_INPLACESCAN
2516 if (flags && !NEXT_OFF(n)) {
2517 DEBUG_PEEP("atch", val, depth);
2518 if (reg_off_by_arg[OP(n)]) {
2519 ARG_SET(n, val - n);
2522 NEXT_OFF(n) = val - n;
2528 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2529 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2530 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2531 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2534 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
2535 && ( STR_LEN(scan) >= 6 ) )
2538 Two problematic code points in Unicode casefolding of EXACT nodes:
2540 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2541 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2547 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2548 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2550 This means that in case-insensitive matching (or "loose matching",
2551 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2552 length of the above casefolded versions) can match a target string
2553 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2554 This would rather mess up the minimum length computation.
2556 What we'll do is to look for the tail four bytes, and then peek
2557 at the preceding two bytes to see whether we need to decrease
2558 the minimum length by four (six minus two).
2560 Thanks to the design of UTF-8, there cannot be false matches:
2561 A sequence of valid UTF-8 bytes cannot be a subsequence of
2562 another valid sequence of UTF-8 bytes.
2565 char * const s0 = STRING(scan), *s, *t;
2566 char * const s1 = s0 + STR_LEN(scan) - 1;
2567 char * const s2 = s1 - 4;
2568 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2569 const char t0[] = "\xaf\x49\xaf\x42";
2571 const char t0[] = "\xcc\x88\xcc\x81";
2573 const char * const t1 = t0 + 3;
2576 s < s2 && (t = ninstr(s, s1, t0, t1));
2579 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2580 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2582 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2583 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2591 n = scan + NODE_SZ_STR(scan);
2593 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2600 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2604 /* REx optimizer. Converts nodes into quicker variants "in place".
2605 Finds fixed substrings. */
2607 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2608 to the position after last scanned or to NULL. */
2610 #define INIT_AND_WITHP \
2611 assert(!and_withp); \
2612 Newx(and_withp,1,struct regnode_charclass_class); \
2613 SAVEFREEPV(and_withp)
2615 /* this is a chain of data about sub patterns we are processing that
2616 need to be handled separately/specially in study_chunk. Its so
2617 we can simulate recursion without losing state. */
2619 typedef struct scan_frame {
2620 regnode *last; /* last node to process in this frame */
2621 regnode *next; /* next node to process when last is reached */
2622 struct scan_frame *prev; /*previous frame*/
2623 I32 stop; /* what stopparen do we use */
2627 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2629 #define CASE_SYNST_FNC(nAmE) \
2631 if (flags & SCF_DO_STCLASS_AND) { \
2632 for (value = 0; value < 256; value++) \
2633 if (!is_ ## nAmE ## _cp(value)) \
2634 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2637 for (value = 0; value < 256; value++) \
2638 if (is_ ## nAmE ## _cp(value)) \
2639 ANYOF_BITMAP_SET(data->start_class, value); \
2643 if (flags & SCF_DO_STCLASS_AND) { \
2644 for (value = 0; value < 256; value++) \
2645 if (is_ ## nAmE ## _cp(value)) \
2646 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2649 for (value = 0; value < 256; value++) \
2650 if (!is_ ## nAmE ## _cp(value)) \
2651 ANYOF_BITMAP_SET(data->start_class, value); \
2658 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2659 I32 *minlenp, I32 *deltap,
2664 struct regnode_charclass_class *and_withp,
2665 U32 flags, U32 depth)
2666 /* scanp: Start here (read-write). */
2667 /* deltap: Write maxlen-minlen here. */
2668 /* last: Stop before this one. */
2669 /* data: string data about the pattern */
2670 /* stopparen: treat close N as END */
2671 /* recursed: which subroutines have we recursed into */
2672 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2675 I32 min = 0, pars = 0, code;
2676 regnode *scan = *scanp, *next;
2678 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2679 int is_inf_internal = 0; /* The studied chunk is infinite */
2680 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2681 scan_data_t data_fake;
2682 SV *re_trie_maxbuff = NULL;
2683 regnode *first_non_open = scan;
2684 I32 stopmin = I32_MAX;
2685 scan_frame *frame = NULL;
2686 GET_RE_DEBUG_FLAGS_DECL;
2688 PERL_ARGS_ASSERT_STUDY_CHUNK;
2691 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2695 while (first_non_open && OP(first_non_open) == OPEN)
2696 first_non_open=regnext(first_non_open);
2701 while ( scan && OP(scan) != END && scan < last ){
2702 /* Peephole optimizer: */
2703 DEBUG_STUDYDATA("Peep:", data,depth);
2704 DEBUG_PEEP("Peep",scan,depth);
2705 JOIN_EXACT(scan,&min,0);
2707 /* Follow the next-chain of the current node and optimize
2708 away all the NOTHINGs from it. */
2709 if (OP(scan) != CURLYX) {
2710 const int max = (reg_off_by_arg[OP(scan)]
2712 /* I32 may be smaller than U16 on CRAYs! */
2713 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2714 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2718 /* Skip NOTHING and LONGJMP. */
2719 while ((n = regnext(n))
2720 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2721 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2722 && off + noff < max)
2724 if (reg_off_by_arg[OP(scan)])
2727 NEXT_OFF(scan) = off;
2732 /* The principal pseudo-switch. Cannot be a switch, since we
2733 look into several different things. */
2734 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2735 || OP(scan) == IFTHEN) {
2736 next = regnext(scan);
2738 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2740 if (OP(next) == code || code == IFTHEN) {
2741 /* NOTE - There is similar code to this block below for handling
2742 TRIE nodes on a re-study. If you change stuff here check there
2744 I32 max1 = 0, min1 = I32_MAX, num = 0;
2745 struct regnode_charclass_class accum;
2746 regnode * const startbranch=scan;
2748 if (flags & SCF_DO_SUBSTR)
2749 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2750 if (flags & SCF_DO_STCLASS)
2751 cl_init_zero(&accum);
2753 while (OP(scan) == code) {
2754 I32 deltanext, minnext, f = 0, fake;
2755 struct regnode_charclass_class this_class;
2758 data_fake.flags = 0;
2760 data_fake.whilem_c = data->whilem_c;
2761 data_fake.last_closep = data->last_closep;
2764 data_fake.last_closep = &fake;
2766 data_fake.pos_delta = delta;
2767 next = regnext(scan);
2768 scan = NEXTOPER(scan);
2770 scan = NEXTOPER(scan);
2771 if (flags & SCF_DO_STCLASS) {
2772 cl_init(&this_class);
2773 data_fake.start_class = &this_class;
2774 f = SCF_DO_STCLASS_AND;
2776 if (flags & SCF_WHILEM_VISITED_POS)
2777 f |= SCF_WHILEM_VISITED_POS;
2779 /* we suppose the run is continuous, last=next...*/
2780 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2782 stopparen, recursed, NULL, f,depth+1);
2785 if (max1 < minnext + deltanext)
2786 max1 = minnext + deltanext;
2787 if (deltanext == I32_MAX)
2788 is_inf = is_inf_internal = 1;
2790 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2792 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2793 if ( stopmin > minnext)
2794 stopmin = min + min1;
2795 flags &= ~SCF_DO_SUBSTR;
2797 data->flags |= SCF_SEEN_ACCEPT;
2800 if (data_fake.flags & SF_HAS_EVAL)
2801 data->flags |= SF_HAS_EVAL;
2802 data->whilem_c = data_fake.whilem_c;
2804 if (flags & SCF_DO_STCLASS)
2805 cl_or(&accum, &this_class);
2807 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2809 if (flags & SCF_DO_SUBSTR) {
2810 data->pos_min += min1;
2811 data->pos_delta += max1 - min1;
2812 if (max1 != min1 || is_inf)
2813 data->longest = &(data->longest_float);
2816 delta += max1 - min1;
2817 if (flags & SCF_DO_STCLASS_OR) {
2818 cl_or(data->start_class, &accum);
2820 cl_and(data->start_class, and_withp);
2821 flags &= ~SCF_DO_STCLASS;
2824 else if (flags & SCF_DO_STCLASS_AND) {
2826 cl_and(data->start_class, &accum);
2827 flags &= ~SCF_DO_STCLASS;
2830 /* Switch to OR mode: cache the old value of
2831 * data->start_class */
2833 StructCopy(data->start_class, and_withp,
2834 struct regnode_charclass_class);
2835 flags &= ~SCF_DO_STCLASS_AND;
2836 StructCopy(&accum, data->start_class,
2837 struct regnode_charclass_class);
2838 flags |= SCF_DO_STCLASS_OR;
2839 data->start_class->flags |= ANYOF_EOS;
2843 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2846 Assuming this was/is a branch we are dealing with: 'scan' now
2847 points at the item that follows the branch sequence, whatever
2848 it is. We now start at the beginning of the sequence and look
2855 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2857 If we can find such a subsequence we need to turn the first
2858 element into a trie and then add the subsequent branch exact
2859 strings to the trie.
2863 1. patterns where the whole set of branches can be converted.
2865 2. patterns where only a subset can be converted.
2867 In case 1 we can replace the whole set with a single regop
2868 for the trie. In case 2 we need to keep the start and end
2871 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2872 becomes BRANCH TRIE; BRANCH X;
2874 There is an additional case, that being where there is a
2875 common prefix, which gets split out into an EXACT like node
2876 preceding the TRIE node.
2878 If x(1..n)==tail then we can do a simple trie, if not we make
2879 a "jump" trie, such that when we match the appropriate word
2880 we "jump" to the appropriate tail node. Essentially we turn
2881 a nested if into a case structure of sorts.
2886 if (!re_trie_maxbuff) {
2887 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2888 if (!SvIOK(re_trie_maxbuff))
2889 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2891 if ( SvIV(re_trie_maxbuff)>=0 ) {
2893 regnode *first = (regnode *)NULL;
2894 regnode *last = (regnode *)NULL;
2895 regnode *tail = scan;
2900 SV * const mysv = sv_newmortal(); /* for dumping */
2902 /* var tail is used because there may be a TAIL
2903 regop in the way. Ie, the exacts will point to the
2904 thing following the TAIL, but the last branch will
2905 point at the TAIL. So we advance tail. If we
2906 have nested (?:) we may have to move through several
2910 while ( OP( tail ) == TAIL ) {
2911 /* this is the TAIL generated by (?:) */
2912 tail = regnext( tail );
2917 regprop(RExC_rx, mysv, tail );
2918 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2919 (int)depth * 2 + 2, "",
2920 "Looking for TRIE'able sequences. Tail node is: ",
2921 SvPV_nolen_const( mysv )
2927 step through the branches, cur represents each
2928 branch, noper is the first thing to be matched
2929 as part of that branch and noper_next is the
2930 regnext() of that node. if noper is an EXACT
2931 and noper_next is the same as scan (our current
2932 position in the regex) then the EXACT branch is
2933 a possible optimization target. Once we have
2934 two or more consecutive such branches we can
2935 create a trie of the EXACT's contents and stich
2936 it in place. If the sequence represents all of
2937 the branches we eliminate the whole thing and
2938 replace it with a single TRIE. If it is a
2939 subsequence then we need to stitch it in. This
2940 means the first branch has to remain, and needs
2941 to be repointed at the item on the branch chain
2942 following the last branch optimized. This could
2943 be either a BRANCH, in which case the
2944 subsequence is internal, or it could be the
2945 item following the branch sequence in which
2946 case the subsequence is at the end.
2950 /* dont use tail as the end marker for this traverse */
2951 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2952 regnode * const noper = NEXTOPER( cur );
2953 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2954 regnode * const noper_next = regnext( noper );
2958 regprop(RExC_rx, mysv, cur);
2959 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2960 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2962 regprop(RExC_rx, mysv, noper);
2963 PerlIO_printf( Perl_debug_log, " -> %s",
2964 SvPV_nolen_const(mysv));
2967 regprop(RExC_rx, mysv, noper_next );
2968 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2969 SvPV_nolen_const(mysv));
2971 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2972 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2974 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2975 : PL_regkind[ OP( noper ) ] == EXACT )
2976 || OP(noper) == NOTHING )
2978 && noper_next == tail
2983 if ( !first || optype == NOTHING ) {
2984 if (!first) first = cur;
2985 optype = OP( noper );
2991 Currently we do not believe that the trie logic can
2992 handle case insensitive matching properly when the
2993 pattern is not unicode (thus forcing unicode semantics).
2995 If/when this is fixed the following define can be swapped
2996 in below to fully enable trie logic.
2998 XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
3001 #define TRIE_TYPE_IS_SAFE 1
3004 #define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
3006 if ( last && TRIE_TYPE_IS_SAFE ) {
3007 make_trie( pRExC_state,
3008 startbranch, first, cur, tail, count,
3011 if ( PL_regkind[ OP( noper ) ] == EXACT
3013 && noper_next == tail
3018 optype = OP( noper );
3028 regprop(RExC_rx, mysv, cur);
3029 PerlIO_printf( Perl_debug_log,
3030 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3031 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3035 if ( last && TRIE_TYPE_IS_SAFE ) {
3036 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3037 #ifdef TRIE_STUDY_OPT
3038 if ( ((made == MADE_EXACT_TRIE &&
3039 startbranch == first)
3040 || ( first_non_open == first )) &&
3042 flags |= SCF_TRIE_RESTUDY;
3043 if ( startbranch == first
3046 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3056 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3057 scan = NEXTOPER(NEXTOPER(scan));
3058 } else /* single branch is optimized. */
3059 scan = NEXTOPER(scan);
3061 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3062 scan_frame *newframe = NULL;
3067 if (OP(scan) != SUSPEND) {
3068 /* set the pointer */
3069 if (OP(scan) == GOSUB) {
3071 RExC_recurse[ARG2L(scan)] = scan;
3072 start = RExC_open_parens[paren-1];
3073 end = RExC_close_parens[paren-1];
3076 start = RExC_rxi->program + 1;
3080 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3081 SAVEFREEPV(recursed);
3083 if (!PAREN_TEST(recursed,paren+1)) {
3084 PAREN_SET(recursed,paren+1);
3085 Newx(newframe,1,scan_frame);
3087 if (flags & SCF_DO_SUBSTR) {
3088 SCAN_COMMIT(pRExC_state,data,minlenp);
3089 data->longest = &(data->longest_float);
3091 is_inf = is_inf_internal = 1;
3092 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3093 cl_anything(data->start_class);
3094 flags &= ~SCF_DO_STCLASS;
3097 Newx(newframe,1,scan_frame);
3100 end = regnext(scan);
3105 SAVEFREEPV(newframe);
3106 newframe->next = regnext(scan);
3107 newframe->last = last;
3108 newframe->stop = stopparen;
3109 newframe->prev = frame;
3119 else if (OP(scan) == EXACT) {
3120 I32 l = STR_LEN(scan);
3123 const U8 * const s = (U8*)STRING(scan);
3124 l = utf8_length(s, s + l);
3125 uc = utf8_to_uvchr(s, NULL);
3127 uc = *((U8*)STRING(scan));
3130 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3131 /* The code below prefers earlier match for fixed
3132 offset, later match for variable offset. */
3133 if (data->last_end == -1) { /* Update the start info. */
3134 data->last_start_min = data->pos_min;
3135 data->last_start_max = is_inf
3136 ? I32_MAX : data->pos_min + data->pos_delta;
3138 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3140 SvUTF8_on(data->last_found);
3142 SV * const sv = data->last_found;
3143 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3144 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3145 if (mg && mg->mg_len >= 0)
3146 mg->mg_len += utf8_length((U8*)STRING(scan),
3147 (U8*)STRING(scan)+STR_LEN(scan));
3149 data->last_end = data->pos_min + l;
3150 data->pos_min += l; /* As in the first entry. */
3151 data->flags &= ~SF_BEFORE_EOL;
3153 if (flags & SCF_DO_STCLASS_AND) {
3154 /* Check whether it is compatible with what we know already! */
3158 /* If compatible, we or it in below. It is compatible if is
3159 * in the bitmp and either 1) its bit or its fold is set, or 2)
3160 * it's for a locale. Even if there isn't unicode semantics
3161 * here, at runtime there may be because of matching against a
3162 * utf8 string, so accept a possible false positive for
3163 * latin1-range folds */
3165 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3166 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3167 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3168 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3173 ANYOF_CLASS_ZERO(data->start_class);
3174 ANYOF_BITMAP_ZERO(data->start_class);
3176 ANYOF_BITMAP_SET(data->start_class, uc);
3177 else if (uc >= 0x100) {
3180 /* Some Unicode code points fold to the Latin1 range; as
3181 * XXX temporary code, instead of figuring out if this is
3182 * one, just assume it is and set all the start class bits
3183 * that could be some such above 255 code point's fold
3184 * which will generate fals positives. As the code
3185 * elsewhere that does compute the fold settles down, it
3186 * can be extracted out and re-used here */
3187 for (i = 0; i < 256; i++){
3188 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3189 ANYOF_BITMAP_SET(data->start_class, i);
3193 data->start_class->flags &= ~ANYOF_EOS;
3195 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3197 else if (flags & SCF_DO_STCLASS_OR) {
3198 /* false positive possible if the class is case-folded */
3200 ANYOF_BITMAP_SET(data->start_class, uc);
3202 data->start_class->flags |= ANYOF_UNICODE_ALL;
3203 data->start_class->flags &= ~ANYOF_EOS;
3204 cl_and(data->start_class, and_withp);
3206 flags &= ~SCF_DO_STCLASS;
3208 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3209 I32 l = STR_LEN(scan);
3210 UV uc = *((U8*)STRING(scan));
3212 /* Search for fixed substrings supports EXACT only. */
3213 if (flags & SCF_DO_SUBSTR) {
3215 SCAN_COMMIT(pRExC_state, data, minlenp);
3218 const U8 * const s = (U8 *)STRING(scan);
3219 l = utf8_length(s, s + l);
3220 uc = utf8_to_uvchr(s, NULL);
3223 if (flags & SCF_DO_SUBSTR)
3225 if (flags & SCF_DO_STCLASS_AND) {
3226 /* Check whether it is compatible with what we know already! */
3229 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3230 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3231 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3235 ANYOF_CLASS_ZERO(data->start_class);
3236 ANYOF_BITMAP_ZERO(data->start_class);
3238 ANYOF_BITMAP_SET(data->start_class, uc);
3239 data->start_class->flags &= ~ANYOF_EOS;
3240 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3241 if (OP(scan) == EXACTFL) {
3242 data->start_class->flags |= ANYOF_LOCALE;
3246 /* Also set the other member of the fold pair. In case
3247 * that unicode semantics is called for at runtime, use
3248 * the full latin1 fold. (Can't do this for locale,
3249 * because not known until runtime */
3250 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3253 else if (uc >= 0x100) {
3255 for (i = 0; i < 256; i++){
3256 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3257 ANYOF_BITMAP_SET(data->start_class, i);
3262 else if (flags & SCF_DO_STCLASS_OR) {
3263 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3264 /* false positive possible if the class is case-folded.
3265 Assume that the locale settings are the same... */
3267 ANYOF_BITMAP_SET(data->start_class, uc);
3268 if (OP(scan) != EXACTFL) {
3270 /* And set the other member of the fold pair, but
3271 * can't do that in locale because not known until
3273 ANYOF_BITMAP_SET(data->start_class,
3274 PL_fold_latin1[uc]);
3277 data->start_class->flags &= ~ANYOF_EOS;
3279 cl_and(data->start_class, and_withp);
3281 flags &= ~SCF_DO_STCLASS;
3283 else if (REGNODE_VARIES(OP(scan))) {
3284 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3285 I32 f = flags, pos_before = 0;
3286 regnode * const oscan = scan;
3287 struct regnode_charclass_class this_class;
3288 struct regnode_charclass_class *oclass = NULL;
3289 I32 next_is_eval = 0;
3291 switch (PL_regkind[OP(scan)]) {
3292 case WHILEM: /* End of (?:...)* . */
3293 scan = NEXTOPER(scan);
3296 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3297 next = NEXTOPER(scan);
3298 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3300 maxcount = REG_INFTY;
3301 next = regnext(scan);
3302 scan = NEXTOPER(scan);
3306 if (flags & SCF_DO_SUBSTR)
3311 if (flags & SCF_DO_STCLASS) {
3313 maxcount = REG_INFTY;
3314 next = regnext(scan);
3315 scan = NEXTOPER(scan);
3318 is_inf = is_inf_internal = 1;
3319 scan = regnext(scan);
3320 if (flags & SCF_DO_SUBSTR) {
3321 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3322 data->longest = &(data->longest_float);
3324 goto optimize_curly_tail;
3326 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3327 && (scan->flags == stopparen))
3332 mincount = ARG1(scan);
3333 maxcount = ARG2(scan);
3335 next = regnext(scan);
3336 if (OP(scan) == CURLYX) {
3337 I32 lp = (data ? *(data->last_closep) : 0);
3338 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3340 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3341 next_is_eval = (OP(scan) == EVAL);
3343 if (flags & SCF_DO_SUBSTR) {
3344 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3345 pos_before = data->pos_min;
3349 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3351 data->flags |= SF_IS_INF;
3353 if (flags & SCF_DO_STCLASS) {
3354 cl_init(&this_class);
3355 oclass = data->start_class;
3356 data->start_class = &this_class;
3357 f |= SCF_DO_STCLASS_AND;
3358 f &= ~SCF_DO_STCLASS_OR;
3360 /* Exclude from super-linear cache processing any {n,m}
3361 regops for which the combination of input pos and regex
3362 pos is not enough information to determine if a match
3365 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3366 regex pos at the \s*, the prospects for a match depend not
3367 only on the input position but also on how many (bar\s*)
3368 repeats into the {4,8} we are. */
3369 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3370 f &= ~SCF_WHILEM_VISITED_POS;
3372 /* This will finish on WHILEM, setting scan, or on NULL: */
3373 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3374 last, data, stopparen, recursed, NULL,
3376 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3378 if (flags & SCF_DO_STCLASS)
3379 data->start_class = oclass;
3380 if (mincount == 0 || minnext == 0) {
3381 if (flags & SCF_DO_STCLASS_OR) {
3382 cl_or(data->start_class, &this_class);
3384 else if (flags & SCF_DO_STCLASS_AND) {
3385 /* Switch to OR mode: cache the old value of
3386 * data->start_class */
3388 StructCopy(data->start_class, and_withp,
3389 struct regnode_charclass_class);
3390 flags &= ~SCF_DO_STCLASS_AND;
3391 StructCopy(&this_class, data->start_class,
3392 struct regnode_charclass_class);
3393 flags |= SCF_DO_STCLASS_OR;
3394 data->start_class->flags |= ANYOF_EOS;
3396 } else { /* Non-zero len */
3397 if (flags & SCF_DO_STCLASS_OR) {
3398 cl_or(data->start_class, &this_class);
3399 cl_and(data->start_class, and_withp);
3401 else if (flags & SCF_DO_STCLASS_AND)
3402 cl_and(data->start_class, &this_class);
3403 flags &= ~SCF_DO_STCLASS;
3405 if (!scan) /* It was not CURLYX, but CURLY. */
3407 if ( /* ? quantifier ok, except for (?{ ... }) */
3408 (next_is_eval || !(mincount == 0 && maxcount == 1))
3409 && (minnext == 0) && (deltanext == 0)
3410 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3411 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3413 ckWARNreg(RExC_parse,
3414 "Quantifier unexpected on zero-length expression");
3417 min += minnext * mincount;
3418 is_inf_internal |= ((maxcount == REG_INFTY
3419 && (minnext + deltanext) > 0)
3420 || deltanext == I32_MAX);
3421 is_inf |= is_inf_internal;
3422 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3424 /* Try powerful optimization CURLYX => CURLYN. */
3425 if ( OP(oscan) == CURLYX && data
3426 && data->flags & SF_IN_PAR
3427 && !(data->flags & SF_HAS_EVAL)
3428 && !deltanext && minnext == 1 ) {
3429 /* Try to optimize to CURLYN. */
3430 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3431 regnode * const nxt1 = nxt;
3438 if (!REGNODE_SIMPLE(OP(nxt))
3439 && !(PL_regkind[OP(nxt)] == EXACT
3440 && STR_LEN(nxt) == 1))
3446 if (OP(nxt) != CLOSE)
3448 if (RExC_open_parens) {
3449 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3450 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3452 /* Now we know that nxt2 is the only contents: */
3453 oscan->flags = (U8)ARG(nxt);
3455 OP(nxt1) = NOTHING; /* was OPEN. */
3458 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3459 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3460 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3461 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3462 OP(nxt + 1) = OPTIMIZED; /* was count. */
3463 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3468 /* Try optimization CURLYX => CURLYM. */
3469 if ( OP(oscan) == CURLYX && data
3470 && !(data->flags & SF_HAS_PAR)
3471 && !(data->flags & SF_HAS_EVAL)
3472 && !deltanext /* atom is fixed width */
3473 && minnext != 0 /* CURLYM can't handle zero width */
3475 /* XXXX How to optimize if data == 0? */
3476 /* Optimize to a simpler form. */
3477 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3481 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3482 && (OP(nxt2) != WHILEM))
3484 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3485 /* Need to optimize away parenths. */
3486 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3487 /* Set the parenth number. */
3488 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3490 oscan->flags = (U8)ARG(nxt);
3491 if (RExC_open_parens) {
3492 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3493 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3495 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3496 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3499 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3500 OP(nxt + 1) = OPTIMIZED; /* was count. */
3501 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3502 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3505 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3506 regnode *nnxt = regnext(nxt1);
3508 if (reg_off_by_arg[OP(nxt1)])
3509 ARG_SET(nxt1, nxt2 - nxt1);
3510 else if (nxt2 - nxt1 < U16_MAX)
3511 NEXT_OFF(nxt1) = nxt2 - nxt1;
3513 OP(nxt) = NOTHING; /* Cannot beautify */
3518 /* Optimize again: */
3519 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3520 NULL, stopparen, recursed, NULL, 0,depth+1);
3525 else if ((OP(oscan) == CURLYX)
3526 && (flags & SCF_WHILEM_VISITED_POS)
3527 /* See the comment on a similar expression above.
3528 However, this time it's not a subexpression
3529 we care about, but the expression itself. */
3530 && (maxcount == REG_INFTY)
3531 && data && ++data->whilem_c < 16) {
3532 /* This stays as CURLYX, we can put the count/of pair. */
3533 /* Find WHILEM (as in regexec.c) */
3534 regnode *nxt = oscan + NEXT_OFF(oscan);
3536 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3538 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3539 | (RExC_whilem_seen << 4)); /* On WHILEM */
3541 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3543 if (flags & SCF_DO_SUBSTR) {
3544 SV *last_str = NULL;
3545 int counted = mincount != 0;
3547 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3548 #if defined(SPARC64_GCC_WORKAROUND)
3551 const char *s = NULL;
3554 if (pos_before >= data->last_start_min)
3557 b = data->last_start_min;
3560 s = SvPV_const(data->last_found, l);
3561 old = b - data->last_start_min;
3564 I32 b = pos_before >= data->last_start_min
3565 ? pos_before : data->last_start_min;
3567 const char * const s = SvPV_const(data->last_found, l);
3568 I32 old = b - data->last_start_min;
3572 old = utf8_hop((U8*)s, old) - (U8*)s;
3574 /* Get the added string: */
3575 last_str = newSVpvn_utf8(s + old, l, UTF);
3576 if (deltanext == 0 && pos_before == b) {
3577 /* What was added is a constant string */
3579 SvGROW(last_str, (mincount * l) + 1);
3580 repeatcpy(SvPVX(last_str) + l,
3581 SvPVX_const(last_str), l, mincount - 1);
3582 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3583 /* Add additional parts. */
3584 SvCUR_set(data->last_found,
3585 SvCUR(data->last_found) - l);
3586 sv_catsv(data->last_found, last_str);
3588 SV * sv = data->last_found;
3590 SvUTF8(sv) && SvMAGICAL(sv) ?
3591 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3592 if (mg && mg->mg_len >= 0)
3593 mg->mg_len += CHR_SVLEN(last_str) - l;
3595 data->last_end += l * (mincount - 1);
3598 /* start offset must point into the last copy */
3599 data->last_start_min += minnext * (mincount - 1);
3600 data->last_start_max += is_inf ? I32_MAX
3601 : (maxcount - 1) * (minnext + data->pos_delta);
3604 /* It is counted once already... */
3605 data->pos_min += minnext * (mincount - counted);
3606 data->pos_delta += - counted * deltanext +
3607 (minnext + deltanext) * maxcount - minnext * mincount;
3608 if (mincount != maxcount) {
3609 /* Cannot extend fixed substrings found inside
3611 SCAN_COMMIT(pRExC_state,data,minlenp);
3612 if (mincount && last_str) {
3613 SV * const sv = data->last_found;
3614 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3615 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3619 sv_setsv(sv, last_str);
3620 data->last_end = data->pos_min;
3621 data->last_start_min =
3622 data->pos_min - CHR_SVLEN(last_str);
3623 data->last_start_max = is_inf
3625 : data->pos_min + data->pos_delta
3626 - CHR_SVLEN(last_str);
3628 data->longest = &(data->longest_float);
3630 SvREFCNT_dec(last_str);
3632 if (data && (fl & SF_HAS_EVAL))
3633 data->flags |= SF_HAS_EVAL;
3634 optimize_curly_tail:
3635 if (OP(oscan) != CURLYX) {
3636 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3638 NEXT_OFF(oscan) += NEXT_OFF(next);
3641 default: /* REF, ANYOFV, and CLUMP only? */
3642 if (flags & SCF_DO_SUBSTR) {
3643 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3644 data->longest = &(data->longest_float);
3646 is_inf = is_inf_internal = 1;
3647 if (flags & SCF_DO_STCLASS_OR)
3648 cl_anything(data->start_class);
3649 flags &= ~SCF_DO_STCLASS;
3653 else if (OP(scan) == LNBREAK) {
3654 if (flags & SCF_DO_STCLASS) {
3656 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3657 if (flags & SCF_DO_STCLASS_AND) {
3658 for (value = 0; value < 256; value++)
3659 if (!is_VERTWS_cp(value))
3660 ANYOF_BITMAP_CLEAR(data->start_class, value);
3663 for (value = 0; value < 256; value++)
3664 if (is_VERTWS_cp(value))
3665 ANYOF_BITMAP_SET(data->start_class, value);
3667 if (flags & SCF_DO_STCLASS_OR)
3668 cl_and(data->start_class, and_withp);
3669 flags &= ~SCF_DO_STCLASS;
3673 if (flags & SCF_DO_SUBSTR) {
3674 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3676 data->pos_delta += 1;
3677 data->longest = &(data->longest_float);
3680 else if (OP(scan) == FOLDCHAR) {
3681 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3682 flags &= ~SCF_DO_STCLASS;
3685 if (flags & SCF_DO_SUBSTR) {
3686 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3688 data->pos_delta += d;
3689 data->longest = &(data->longest_float);
3692 else if (REGNODE_SIMPLE(OP(scan))) {
3695 if (flags & SCF_DO_SUBSTR) {
3696 SCAN_COMMIT(pRExC_state,data,minlenp);
3700 if (flags & SCF_DO_STCLASS) {
3701 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3703 /* Some of the logic below assumes that switching
3704 locale on will only add false positives. */
3705 switch (PL_regkind[OP(scan)]) {
3709 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3710 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3711 cl_anything(data->start_class);
3714 if (OP(scan) == SANY)
3716 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3717 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3718 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3719 cl_anything(data->start_class);
3721 if (flags & SCF_DO_STCLASS_AND || !value)
3722 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3725 if (flags & SCF_DO_STCLASS_AND)
3726 cl_and(data->start_class,
3727 (struct regnode_charclass_class*)scan);
3729 cl_or(data->start_class,
3730 (struct regnode_charclass_class*)scan);
3733 if (flags & SCF_DO_STCLASS_AND) {
3734 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3735 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3736 if (OP(scan) == ALNUMU) {
3737 for (value = 0; value < 256; value++) {
3738 if (!isWORDCHAR_L1(value)) {
3739 ANYOF_BITMAP_CLEAR(data->start_class, value);
3743 for (value = 0; value < 256; value++) {
3744 if (!isALNUM(value)) {
3745 ANYOF_BITMAP_CLEAR(data->start_class, value);
3752 if (data->start_class->flags & ANYOF_LOCALE)
3753 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3754 else if (OP(scan) == ALNUMU) {
3755 for (value = 0; value < 256; value++) {
3756 if (isWORDCHAR_L1(value)) {
3757 ANYOF_BITMAP_SET(data->start_class, value);
3761 for (value = 0; value < 256; value++) {
3762 if (isALNUM(value)) {
3763 ANYOF_BITMAP_SET(data->start_class, value);
3770 if (flags & SCF_DO_STCLASS_AND) {
3771 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3772 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3773 if (OP(scan) == NALNUMU) {
3774 for (value = 0; value < 256; value++) {
3775 if (isWORDCHAR_L1(value)) {
3776 ANYOF_BITMAP_CLEAR(data->start_class, value);
3780 for (value = 0; value < 256; value++) {
3781 if (isALNUM(value)) {
3782 ANYOF_BITMAP_CLEAR(data->start_class, value);
3789 if (data->start_class->flags & ANYOF_LOCALE)
3790 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3792 if (OP(scan) == NALNUMU) {
3793 for (value = 0; value < 256; value++) {
3794 if (! isWORDCHAR_L1(value)) {
3795 ANYOF_BITMAP_SET(data->start_class, value);
3799 for (value = 0; value < 256; value++) {
3800 if (! isALNUM(value)) {
3801 ANYOF_BITMAP_SET(data->start_class, value);
3809 if (flags & SCF_DO_STCLASS_AND) {
3810 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3811 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3812 if (OP(scan) == SPACEU) {
3813 for (value = 0; value < 256; value++) {
3814 if (!isSPACE_L1(value)) {
3815 ANYOF_BITMAP_CLEAR(data->start_class, value);
3819 for (value = 0; value < 256; value++) {
3820 if (!isSPACE(value)) {
3821 ANYOF_BITMAP_CLEAR(data->start_class, value);
3828 if (data->start_class->flags & ANYOF_LOCALE) {
3829 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3831 else if (OP(scan) == SPACEU) {
3832 for (value = 0; value < 256; value++) {
3833 if (isSPACE_L1(value)) {
3834 ANYOF_BITMAP_SET(data->start_class, value);
3838 for (value = 0; value < 256; value++) {
3839 if (isSPACE(value)) {
3840 ANYOF_BITMAP_SET(data->start_class, value);
3847 if (flags & SCF_DO_STCLASS_AND) {
3848 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3849 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3850 if (OP(scan) == NSPACEU) {
3851 for (value = 0; value < 256; value++) {
3852 if (isSPACE_L1(value)) {
3853 ANYOF_BITMAP_CLEAR(data->start_class, value);
3857 for (value = 0; value < 256; value++) {
3858 if (isSPACE(value)) {
3859 ANYOF_BITMAP_CLEAR(data->start_class, value);
3866 if (data->start_class->flags & ANYOF_LOCALE)
3867 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3868 else if (OP(scan) == NSPACEU) {
3869 for (value = 0; value < 256; value++) {
3870 if (!isSPACE_L1(value)) {
3871 ANYOF_BITMAP_SET(data->start_class, value);
3876 for (value = 0; value < 256; value++) {
3877 if (!isSPACE(value)) {
3878 ANYOF_BITMAP_SET(data->start_class, value);
3885 if (flags & SCF_DO_STCLASS_AND) {
3886 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3887 for (value = 0; value < 256; value++)
3888 if (!isDIGIT(value))
3889 ANYOF_BITMAP_CLEAR(data->start_class, value);
3892 if (data->start_class->flags & ANYOF_LOCALE)
3893 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3895 for (value = 0; value < 256; value++)
3897 ANYOF_BITMAP_SET(data->start_class, value);
3902 if (flags & SCF_DO_STCLASS_AND) {
3903 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3904 for (value = 0; value < 256; value++)
3906 ANYOF_BITMAP_CLEAR(data->start_class, value);
3909 if (data->start_class->flags & ANYOF_LOCALE)
3910 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3912 for (value = 0; value < 256; value++)
3913 if (!isDIGIT(value))
3914 ANYOF_BITMAP_SET(data->start_class, value);
3918 CASE_SYNST_FNC(VERTWS);
3919 CASE_SYNST_FNC(HORIZWS);
3922 if (flags & SCF_DO_STCLASS_OR)
3923 cl_and(data->start_class, and_withp);
3924 flags &= ~SCF_DO_STCLASS;
3927 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3928 data->flags |= (OP(scan) == MEOL
3932 else if ( PL_regkind[OP(scan)] == BRANCHJ
3933 /* Lookbehind, or need to calculate parens/evals/stclass: */
3934 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3935 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3936 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3937 || OP(scan) == UNLESSM )
3939 /* Negative Lookahead/lookbehind
3940 In this case we can't do fixed string optimisation.
3943 I32 deltanext, minnext, fake = 0;
3945 struct regnode_charclass_class intrnl;
3948 data_fake.flags = 0;
3950 data_fake.whilem_c = data->whilem_c;
3951 data_fake.last_closep = data->last_closep;
3954 data_fake.last_closep = &fake;
3955 data_fake.pos_delta = delta;
3956 if ( flags & SCF_DO_STCLASS && !scan->flags
3957 && OP(scan) == IFMATCH ) { /* Lookahead */
3959 data_fake.start_class = &intrnl;
3960 f |= SCF_DO_STCLASS_AND;
3962 if (flags & SCF_WHILEM_VISITED_POS)
3963 f |= SCF_WHILEM_VISITED_POS;
3964 next = regnext(scan);
3965 nscan = NEXTOPER(NEXTOPER(scan));
3966 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3967 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3970 FAIL("Variable length lookbehind not implemented");
3972 else if (minnext > (I32)U8_MAX) {
3973 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3975 scan->flags = (U8)minnext;
3978 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3980 if (data_fake.flags & SF_HAS_EVAL)
3981 data->flags |= SF_HAS_EVAL;
3982 data->whilem_c = data_fake.whilem_c;
3984 if (f & SCF_DO_STCLASS_AND) {
3985 if (flags & SCF_DO_STCLASS_OR) {
3986 /* OR before, AND after: ideally we would recurse with
3987 * data_fake to get the AND applied by study of the
3988 * remainder of the pattern, and then derecurse;
3989 * *** HACK *** for now just treat as "no information".
3990 * See [perl #56690].
3992 cl_init(data->start_class);
3994 /* AND before and after: combine and continue */
3995 const int was = (data->start_class->flags & ANYOF_EOS);
3997 cl_and(data->start_class, &intrnl);
3999 data->start_class->flags |= ANYOF_EOS;
4003 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4005 /* Positive Lookahead/lookbehind
4006 In this case we can do fixed string optimisation,
4007 but we must be careful about it. Note in the case of
4008 lookbehind the positions will be offset by the minimum
4009 length of the pattern, something we won't know about
4010 until after the recurse.
4012 I32 deltanext, fake = 0;
4014 struct regnode_charclass_class intrnl;
4016 /* We use SAVEFREEPV so that when the full compile
4017 is finished perl will clean up the allocated
4018 minlens when it's all done. This way we don't
4019 have to worry about freeing them when we know
4020 they wont be used, which would be a pain.
4023 Newx( minnextp, 1, I32 );
4024 SAVEFREEPV(minnextp);
4027 StructCopy(data, &data_fake, scan_data_t);
4028 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4031 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4032 data_fake.last_found=newSVsv(data->last_found);
4036 data_fake.last_closep = &fake;
4037 data_fake.flags = 0;
4038 data_fake.pos_delta = delta;
4040 data_fake.flags |= SF_IS_INF;
4041 if ( flags & SCF_DO_STCLASS && !scan->flags
4042 && OP(scan) == IFMATCH ) { /* Lookahead */
4044 data_fake.start_class = &intrnl;
4045 f |= SCF_DO_STCLASS_AND;
4047 if (flags & SCF_WHILEM_VISITED_POS)
4048 f |= SCF_WHILEM_VISITED_POS;
4049 next = regnext(scan);
4050 nscan = NEXTOPER(NEXTOPER(scan));
4052 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4053 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4056 FAIL("Variable length lookbehind not implemented");
4058 else if (*minnextp > (I32)U8_MAX) {
4059 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4061 scan->flags = (U8)*minnextp;
4066 if (f & SCF_DO_STCLASS_AND) {
4067 const int was = (data->start_class->flags & ANYOF_EOS);
4069 cl_and(data->start_class, &intrnl);
4071 data->start_class->flags |= ANYOF_EOS;
4074 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4076 if (data_fake.flags & SF_HAS_EVAL)
4077 data->flags |= SF_HAS_EVAL;
4078 data->whilem_c = data_fake.whilem_c;
4079 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4080 if (RExC_rx->minlen<*minnextp)
4081 RExC_rx->minlen=*minnextp;
4082 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4083 SvREFCNT_dec(data_fake.last_found);
4085 if ( data_fake.minlen_fixed != minlenp )
4087 data->offset_fixed= data_fake.offset_fixed;
4088 data->minlen_fixed= data_fake.minlen_fixed;
4089 data->lookbehind_fixed+= scan->flags;
4091 if ( data_fake.minlen_float != minlenp )
4093 data->minlen_float= data_fake.minlen_float;
4094 data->offset_float_min=data_fake.offset_float_min;
4095 data->offset_float_max=data_fake.offset_float_max;
4096 data->lookbehind_float+= scan->flags;
4105 else if (OP(scan) == OPEN) {
4106 if (stopparen != (I32)ARG(scan))
4109 else if (OP(scan) == CLOSE) {
4110 if (stopparen == (I32)ARG(scan)) {
4113 if ((I32)ARG(scan) == is_par) {
4114 next = regnext(scan);
4116 if ( next && (OP(next) != WHILEM) && next < last)
4117 is_par = 0; /* Disable optimization */
4120 *(data->last_closep) = ARG(scan);
4122 else if (OP(scan) == EVAL) {
4124 data->flags |= SF_HAS_EVAL;
4126 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4127 if (flags & SCF_DO_SUBSTR) {
4128 SCAN_COMMIT(pRExC_state,data,minlenp);
4129 flags &= ~SCF_DO_SUBSTR;
4131 if (data && OP(scan)==ACCEPT) {
4132 data->flags |= SCF_SEEN_ACCEPT;
4137 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4139 if (flags & SCF_DO_SUBSTR) {
4140 SCAN_COMMIT(pRExC_state,data,minlenp);
4141 data->longest = &(data->longest_float);
4143 is_inf = is_inf_internal = 1;
4144 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4145 cl_anything(data->start_class);
4146 flags &= ~SCF_DO_STCLASS;
4148 else if (OP(scan) == GPOS) {
4149 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4150 !(delta || is_inf || (data && data->pos_delta)))
4152 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4153 RExC_rx->extflags |= RXf_ANCH_GPOS;
4154 if (RExC_rx->gofs < (U32)min)
4155 RExC_rx->gofs = min;
4157 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4161 #ifdef TRIE_STUDY_OPT
4162 #ifdef FULL_TRIE_STUDY
4163 else if (PL_regkind[OP(scan)] == TRIE) {
4164 /* NOTE - There is similar code to this block above for handling
4165 BRANCH nodes on the initial study. If you change stuff here
4167 regnode *trie_node= scan;
4168 regnode *tail= regnext(scan);
4169 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4170 I32 max1 = 0, min1 = I32_MAX;
4171 struct regnode_charclass_class accum;
4173 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4174 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4175 if (flags & SCF_DO_STCLASS)
4176 cl_init_zero(&accum);
4182 const regnode *nextbranch= NULL;
4185 for ( word=1 ; word <= trie->wordcount ; word++)
4187 I32 deltanext=0, minnext=0, f = 0, fake;
4188 struct regnode_charclass_class this_class;
4190 data_fake.flags = 0;
4192 data_fake.whilem_c = data->whilem_c;
4193 data_fake.last_closep = data->last_closep;
4196 data_fake.last_closep = &fake;
4197 data_fake.pos_delta = delta;
4198 if (flags & SCF_DO_STCLASS) {
4199 cl_init(&this_class);
4200 data_fake.start_class = &this_class;
4201 f = SCF_DO_STCLASS_AND;
4203 if (flags & SCF_WHILEM_VISITED_POS)
4204 f |= SCF_WHILEM_VISITED_POS;
4206 if (trie->jump[word]) {
4208 nextbranch = trie_node + trie->jump[0];
4209 scan= trie_node + trie->jump[word];
4210 /* We go from the jump point to the branch that follows
4211 it. Note this means we need the vestigal unused branches
4212 even though they arent otherwise used.
4214 minnext = study_chunk(pRExC_state, &scan, minlenp,
4215 &deltanext, (regnode *)nextbranch, &data_fake,
4216 stopparen, recursed, NULL, f,depth+1);
4218 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4219 nextbranch= regnext((regnode*)nextbranch);
4221 if (min1 > (I32)(minnext + trie->minlen))
4222 min1 = minnext + trie->minlen;
4223 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4224 max1 = minnext + deltanext + trie->maxlen;
4225 if (deltanext == I32_MAX)
4226 is_inf = is_inf_internal = 1;
4228 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4230 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4231 if ( stopmin > min + min1)
4232 stopmin = min + min1;
4233 flags &= ~SCF_DO_SUBSTR;
4235 data->flags |= SCF_SEEN_ACCEPT;
4238 if (data_fake.flags & SF_HAS_EVAL)
4239 data->flags |= SF_HAS_EVAL;
4240 data->whilem_c = data_fake.whilem_c;
4242 if (flags & SCF_DO_STCLASS)
4243 cl_or(&accum, &this_class);
4246 if (flags & SCF_DO_SUBSTR) {
4247 data->pos_min += min1;
4248 data->pos_delta += max1 - min1;
4249 if (max1 != min1 || is_inf)
4250 data->longest = &(data->longest_float);
4253 delta += max1 - min1;
4254 if (flags & SCF_DO_STCLASS_OR) {
4255 cl_or(data->start_class, &accum);
4257 cl_and(data->start_class, and_withp);
4258 flags &= ~SCF_DO_STCLASS;
4261 else if (flags & SCF_DO_STCLASS_AND) {
4263 cl_and(data->start_class, &accum);
4264 flags &= ~SCF_DO_STCLASS;
4267 /* Switch to OR mode: cache the old value of
4268 * data->start_class */
4270 StructCopy(data->start_class, and_withp,
4271 struct regnode_charclass_class);
4272 flags &= ~SCF_DO_STCLASS_AND;
4273 StructCopy(&accum, data->start_class,
4274 struct regnode_charclass_class);
4275 flags |= SCF_DO_STCLASS_OR;
4276 data->start_class->flags |= ANYOF_EOS;
4283 else if (PL_regkind[OP(scan)] == TRIE) {
4284 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4287 min += trie->minlen;
4288 delta += (trie->maxlen - trie->minlen);
4289 flags &= ~SCF_DO_STCLASS; /* xxx */
4290 if (flags & SCF_DO_SUBSTR) {
4291 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4292 data->pos_min += trie->minlen;
4293 data->pos_delta += (trie->maxlen - trie->minlen);
4294 if (trie->maxlen != trie->minlen)
4295 data->longest = &(data->longest_float);
4297 if (trie->jump) /* no more substrings -- for now /grr*/
4298 flags &= ~SCF_DO_SUBSTR;
4300 #endif /* old or new */
4301 #endif /* TRIE_STUDY_OPT */
4303 /* Else: zero-length, ignore. */
4304 scan = regnext(scan);
4309 stopparen = frame->stop;
4310 frame = frame->prev;
4311 goto fake_study_recurse;
4316 DEBUG_STUDYDATA("pre-fin:",data,depth);
4319 *deltap = is_inf_internal ? I32_MAX : delta;
4320 if (flags & SCF_DO_SUBSTR && is_inf)
4321 data->pos_delta = I32_MAX - data->pos_min;
4322 if (is_par > (I32)U8_MAX)
4324 if (is_par && pars==1 && data) {
4325 data->flags |= SF_IN_PAR;
4326 data->flags &= ~SF_HAS_PAR;
4328 else if (pars && data) {
4329 data->flags |= SF_HAS_PAR;
4330 data->flags &= ~SF_IN_PAR;
4332 if (flags & SCF_DO_STCLASS_OR)
4333 cl_and(data->start_class, and_withp);
4334 if (flags & SCF_TRIE_RESTUDY)
4335 data->flags |= SCF_TRIE_RESTUDY;
4337 DEBUG_STUDYDATA("post-fin:",data,depth);
4339 return min < stopmin ? min : stopmin;
4343 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4345 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4347 PERL_ARGS_ASSERT_ADD_DATA;
4349 Renewc(RExC_rxi->data,
4350 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4351 char, struct reg_data);
4353 Renew(RExC_rxi->data->what, count + n, U8);
4355 Newx(RExC_rxi->data->what, n, U8);
4356 RExC_rxi->data->count = count + n;
4357 Copy(s, RExC_rxi->data->what + count, n, U8);
4361 /*XXX: todo make this not included in a non debugging perl */
4362 #ifndef PERL_IN_XSUB_RE
4364 Perl_reginitcolors(pTHX)
4367 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4369 char *t = savepv(s);
4373 t = strchr(t, '\t');
4379 PL_colors[i] = t = (char *)"";
4384 PL_colors[i++] = (char *)"";
4391 #ifdef TRIE_STUDY_OPT
4392 #define CHECK_RESTUDY_GOTO \
4394 (data.flags & SCF_TRIE_RESTUDY) \
4398 #define CHECK_RESTUDY_GOTO
4402 - pregcomp - compile a regular expression into internal code
4404 * We can't allocate space until we know how big the compiled form will be,
4405 * but we can't compile it (and thus know how big it is) until we've got a
4406 * place to put the code. So we cheat: we compile it twice, once with code
4407 * generation turned off and size counting turned on, and once "for real".
4408 * This also means that we don't allocate space until we are sure that the
4409 * thing really will compile successfully, and we never have to move the
4410 * code and thus invalidate pointers into it. (Note that it has to be in
4411 * one piece because free() must be able to free it all.) [NB: not true in perl]
4413 * Beware that the optimization-preparation code in here knows about some
4414 * of the structure of the compiled regexp. [I'll say.]
4419 #ifndef PERL_IN_XSUB_RE
4420 #define RE_ENGINE_PTR &PL_core_reg_engine
4422 extern const struct regexp_engine my_reg_engine;
4423 #define RE_ENGINE_PTR &my_reg_engine
4426 #ifndef PERL_IN_XSUB_RE
4428 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4431 HV * const table = GvHV(PL_hintgv);
4433 PERL_ARGS_ASSERT_PREGCOMP;
4435 /* Dispatch a request to compile a regexp to correct
4438 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4439 GET_RE_DEBUG_FLAGS_DECL;
4440 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4441 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4443 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4446 return CALLREGCOMP_ENG(eng, pattern, flags);
4449 return Perl_re_compile(aTHX_ pattern, flags);
4454 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4459 register regexp_internal *ri;
4468 /* these are all flags - maybe they should be turned
4469 * into a single int with different bit masks */
4470 I32 sawlookahead = 0;
4473 bool used_setjump = FALSE;
4478 RExC_state_t RExC_state;
4479 RExC_state_t * const pRExC_state = &RExC_state;
4480 #ifdef TRIE_STUDY_OPT
4482 RExC_state_t copyRExC_state;
4484 GET_RE_DEBUG_FLAGS_DECL;
4486 PERL_ARGS_ASSERT_RE_COMPILE;
4488 DEBUG_r(if (!PL_colorset) reginitcolors());
4490 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4491 RExC_uni_semantics = 0;
4493 /****************** LONG JUMP TARGET HERE***********************/
4494 /* Longjmp back to here if have to switch in midstream to utf8 */
4495 if (! RExC_orig_utf8) {
4496 JMPENV_PUSH(jump_ret);
4497 used_setjump = TRUE;
4500 if (jump_ret == 0) { /* First time through */
4501 exp = SvPV(pattern, plen);
4503 /* ignore the utf8ness if the pattern is 0 length */
4505 RExC_utf8 = RExC_orig_utf8 = 0;
4509 SV *dsv= sv_newmortal();
4510 RE_PV_QUOTED_DECL(s, RExC_utf8,
4511 dsv, exp, plen, 60);
4512 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4513 PL_colors[4],PL_colors[5],s);
4516 else { /* longjumped back */
4519 /* If the cause for the longjmp was other than changing to utf8, pop
4520 * our own setjmp, and longjmp to the correct handler */
4521 if (jump_ret != UTF8_LONGJMP) {
4523 JMPENV_JUMP(jump_ret);
4528 /* It's possible to write a regexp in ascii that represents Unicode
4529 codepoints outside of the byte range, such as via \x{100}. If we
4530 detect such a sequence we have to convert the entire pattern to utf8
4531 and then recompile, as our sizing calculation will have been based
4532 on 1 byte == 1 character, but we will need to use utf8 to encode
4533 at least some part of the pattern, and therefore must convert the whole
4536 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4537 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4538 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4540 RExC_orig_utf8 = RExC_utf8 = 1;
4544 #ifdef TRIE_STUDY_OPT
4548 /* Set to use unicode semantics if the pattern is in utf8 and has the
4549 * 'depends' charset specified, as it means unicode when utf8 */
4550 pm_flags = orig_pm_flags;
4552 if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
4553 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4557 RExC_flags = pm_flags;
4561 RExC_in_lookbehind = 0;
4562 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4563 RExC_seen_evals = 0;
4566 /* First pass: determine size, legality. */
4574 RExC_emit = &PL_regdummy;
4575 RExC_whilem_seen = 0;
4576 RExC_open_parens = NULL;
4577 RExC_close_parens = NULL;
4579 RExC_paren_names = NULL;
4581 RExC_paren_name_list = NULL;
4583 RExC_recurse = NULL;
4584 RExC_recurse_count = 0;
4586 #if 0 /* REGC() is (currently) a NOP at the first pass.
4587 * Clever compilers notice this and complain. --jhi */
4588 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4590 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4591 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4592 RExC_precomp = NULL;
4596 /* Here, finished first pass. Get rid of any added setjmp */
4602 PerlIO_printf(Perl_debug_log,
4603 "Required size %"IVdf" nodes\n"
4604 "Starting second pass (creation)\n",
4607 RExC_lastparse=NULL;
4610 /* The first pass could have found things that force Unicode semantics */
4611 if ((RExC_utf8 || RExC_uni_semantics)
4612 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
4614 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4617 /* Small enough for pointer-storage convention?
4618 If extralen==0, this means that we will not need long jumps. */
4619 if (RExC_size >= 0x10000L && RExC_extralen)
4620 RExC_size += RExC_extralen;
4623 if (RExC_whilem_seen > 15)
4624 RExC_whilem_seen = 15;
4626 /* Allocate space and zero-initialize. Note, the two step process
4627 of zeroing when in debug mode, thus anything assigned has to
4628 happen after that */
4629 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4630 r = (struct regexp*)SvANY(rx);
4631 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4632 char, regexp_internal);
4633 if ( r == NULL || ri == NULL )
4634 FAIL("Regexp out of space");
4636 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4637 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4639 /* bulk initialize base fields with 0. */
4640 Zero(ri, sizeof(regexp_internal), char);
4643 /* non-zero initialization begins here */
4645 r->engine= RE_ENGINE_PTR;
4646 r->extflags = pm_flags;
4648 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4649 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
4651 /* The caret is output if there are any defaults: if not all the STD
4652 * flags are set, or if no character set specifier is needed */
4654 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4656 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4657 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4658 >> RXf_PMf_STD_PMMOD_SHIFT);
4659 const char *fptr = STD_PAT_MODS; /*"msix"*/
4661 /* Allocate for the worst case, which is all the std flags are turned
4662 * on. If more precision is desired, we could do a population count of
4663 * the flags set. This could be done with a small lookup table, or by
4664 * shifting, masking and adding, or even, when available, assembly
4665 * language for a machine-language population count.
4666 * We never output a minus, as all those are defaults, so are
4667 * covered by the caret */
4668 const STRLEN wraplen = plen + has_p + has_runon
4669 + has_default /* If needs a caret */
4671 /* If needs a character set specifier */
4672 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
4673 + (sizeof(STD_PAT_MODS) - 1)
4674 + (sizeof("(?:)") - 1);
4676 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4678 SvFLAGS(rx) |= SvUTF8(pattern);
4681 /* If a default, cover it using the caret */
4683 *p++= DEFAULT_PAT_MOD;
4687 const char* const name = get_regex_charset_name(r->extflags, &len);
4688 Copy(name, p, len, char);
4692 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4695 while((ch = *fptr++)) {
4703 Copy(RExC_precomp, p, plen, char);
4704 assert ((RX_WRAPPED(rx) - p) < 16);
4705 r->pre_prefix = p - RX_WRAPPED(rx);
4711 SvCUR_set(rx, p - SvPVX_const(rx));
4715 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4717 if (RExC_seen & REG_SEEN_RECURSE) {
4718 Newxz(RExC_open_parens, RExC_npar,regnode *);
4719 SAVEFREEPV(RExC_open_parens);
4720 Newxz(RExC_close_parens,RExC_npar,regnode *);
4721 SAVEFREEPV(RExC_close_parens);
4724 /* Useful during FAIL. */
4725 #ifdef RE_TRACK_PATTERN_OFFSETS
4726 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4727 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4728 "%s %"UVuf" bytes for offset annotations.\n",
4729 ri->u.offsets ? "Got" : "Couldn't get",
4730 (UV)((2*RExC_size+1) * sizeof(U32))));
4732 SetProgLen(ri,RExC_size);
4737 /* Second pass: emit code. */
4738 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4743 RExC_emit_start = ri->program;
4744 RExC_emit = ri->program;
4745 RExC_emit_bound = ri->program + RExC_size + 1;
4747 /* Store the count of eval-groups for security checks: */
4748 RExC_rx->seen_evals = RExC_seen_evals;
4749 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4750 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4754 /* XXXX To minimize changes to RE engine we always allocate
4755 3-units-long substrs field. */
4756 Newx(r->substrs, 1, struct reg_substr_data);
4757 if (RExC_recurse_count) {
4758 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4759 SAVEFREEPV(RExC_recurse);
4763 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4764 Zero(r->substrs, 1, struct reg_substr_data);
4766 #ifdef TRIE_STUDY_OPT
4768 StructCopy(&zero_scan_data, &data, scan_data_t);
4769 copyRExC_state = RExC_state;
4772 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4774 RExC_state = copyRExC_state;
4775 if (seen & REG_TOP_LEVEL_BRANCHES)
4776 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4778 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4779 if (data.last_found) {
4780 SvREFCNT_dec(data.longest_fixed);
4781 SvREFCNT_dec(data.longest_float);
4782 SvREFCNT_dec(data.last_found);
4784 StructCopy(&zero_scan_data, &data, scan_data_t);
4787 StructCopy(&zero_scan_data, &data, scan_data_t);
4790 /* Dig out information for optimizations. */
4791 r->extflags = RExC_flags; /* was pm_op */
4792 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4795 SvUTF8_on(rx); /* Unicode in it? */
4796 ri->regstclass = NULL;
4797 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4798 r->intflags |= PREGf_NAUGHTY;
4799 scan = ri->program + 1; /* First BRANCH. */
4801 /* testing for BRANCH here tells us whether there is "must appear"
4802 data in the pattern. If there is then we can use it for optimisations */
4803 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4805 STRLEN longest_float_length, longest_fixed_length;
4806 struct regnode_charclass_class ch_class; /* pointed to by data */
4808 I32 last_close = 0; /* pointed to by data */
4809 regnode *first= scan;
4810 regnode *first_next= regnext(first);
4812 * Skip introductions and multiplicators >= 1
4813 * so that we can extract the 'meat' of the pattern that must
4814 * match in the large if() sequence following.
4815 * NOTE that EXACT is NOT covered here, as it is normally
4816 * picked up by the optimiser separately.
4818 * This is unfortunate as the optimiser isnt handling lookahead
4819 * properly currently.
4822 while ((OP(first) == OPEN && (sawopen = 1)) ||
4823 /* An OR of *one* alternative - should not happen now. */
4824 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4825 /* for now we can't handle lookbehind IFMATCH*/
4826 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4827 (OP(first) == PLUS) ||
4828 (OP(first) == MINMOD) ||
4829 /* An {n,m} with n>0 */
4830 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4831 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4834 * the only op that could be a regnode is PLUS, all the rest
4835 * will be regnode_1 or regnode_2.
4838 if (OP(first) == PLUS)
4841 first += regarglen[OP(first)];
4843 first = NEXTOPER(first);
4844 first_next= regnext(first);
4847 /* Starting-point info. */
4849 DEBUG_PEEP("first:",first,0);
4850 /* Ignore EXACT as we deal with it later. */
4851 if (PL_regkind[OP(first)] == EXACT) {
4852 if (OP(first) == EXACT)
4853 NOOP; /* Empty, get anchored substr later. */
4855 ri->regstclass = first;
4858 else if (PL_regkind[OP(first)] == TRIE &&
4859 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4862 /* this can happen only on restudy */
4863 if ( OP(first) == TRIE ) {
4864 struct regnode_1 *trieop = (struct regnode_1 *)
4865 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4866 StructCopy(first,trieop,struct regnode_1);
4867 trie_op=(regnode *)trieop;
4869 struct regnode_charclass *trieop = (struct regnode_charclass *)
4870 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4871 StructCopy(first,trieop,struct regnode_charclass);
4872 trie_op=(regnode *)trieop;
4875 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4876 ri->regstclass = trie_op;
4879 else if (REGNODE_SIMPLE(OP(first)))
4880 ri->regstclass = first;
4881 else if (PL_regkind[OP(first)] == BOUND ||
4882 PL_regkind[OP(first)] == NBOUND)
4883 ri->regstclass = first;
4884 else if (PL_regkind[OP(first)] == BOL) {
4885 r->extflags |= (OP(first) == MBOL
4887 : (OP(first) == SBOL
4890 first = NEXTOPER(first);
4893 else if (OP(first) == GPOS) {
4894 r->extflags |= RXf_ANCH_GPOS;
4895 first = NEXTOPER(first);
4898 else if ((!sawopen || !RExC_sawback) &&
4899 (OP(first) == STAR &&
4900 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4901 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4903 /* turn .* into ^.* with an implied $*=1 */
4905 (OP(NEXTOPER(first)) == REG_ANY)
4908 r->extflags |= type;
4909 r->intflags |= PREGf_IMPLICIT;
4910 first = NEXTOPER(first);
4913 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4914 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4915 /* x+ must match at the 1st pos of run of x's */
4916 r->intflags |= PREGf_SKIP;
4918 /* Scan is after the zeroth branch, first is atomic matcher. */
4919 #ifdef TRIE_STUDY_OPT
4922 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4923 (IV)(first - scan + 1))
4927 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4928 (IV)(first - scan + 1))
4934 * If there's something expensive in the r.e., find the
4935 * longest literal string that must appear and make it the
4936 * regmust. Resolve ties in favor of later strings, since
4937 * the regstart check works with the beginning of the r.e.
4938 * and avoiding duplication strengthens checking. Not a
4939 * strong reason, but sufficient in the absence of others.
4940 * [Now we resolve ties in favor of the earlier string if
4941 * it happens that c_offset_min has been invalidated, since the
4942 * earlier string may buy us something the later one won't.]
4945 data.longest_fixed = newSVpvs("");
4946 data.longest_float = newSVpvs("");
4947 data.last_found = newSVpvs("");
4948 data.longest = &(data.longest_fixed);
4950 if (!ri->regstclass) {
4952 data.start_class = &ch_class;
4953 stclass_flag = SCF_DO_STCLASS_AND;
4954 } else /* XXXX Check for BOUND? */
4956 data.last_closep = &last_close;
4958 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4959 &data, -1, NULL, NULL,
4960 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4966 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4967 && data.last_start_min == 0 && data.last_end > 0
4968 && !RExC_seen_zerolen
4969 && !(RExC_seen & REG_SEEN_VERBARG)
4970 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4971 r->extflags |= RXf_CHECK_ALL;
4972 scan_commit(pRExC_state, &data,&minlen,0);
4973 SvREFCNT_dec(data.last_found);
4975 /* Note that code very similar to this but for anchored string
4976 follows immediately below, changes may need to be made to both.
4979 longest_float_length = CHR_SVLEN(data.longest_float);
4980 if (longest_float_length
4981 || (data.flags & SF_FL_BEFORE_EOL
4982 && (!(data.flags & SF_FL_BEFORE_MEOL)
4983 || (RExC_flags & RXf_PMf_MULTILINE))))
4987 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4988 && data.offset_fixed == data.offset_float_min
4989 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4990 goto remove_float; /* As in (a)+. */
4992 /* copy the information about the longest float from the reg_scan_data
4993 over to the program. */
4994 if (SvUTF8(data.longest_float)) {
4995 r->float_utf8 = data.longest_float;
4996 r->float_substr = NULL;
4998 r->float_substr = data.longest_float;
4999 r->float_utf8 = NULL;
5001 /* float_end_shift is how many chars that must be matched that
5002 follow this item. We calculate it ahead of time as once the
5003 lookbehind offset is added in we lose the ability to correctly
5005 ml = data.minlen_float ? *(data.minlen_float)
5006 : (I32)longest_float_length;
5007 r->float_end_shift = ml - data.offset_float_min
5008 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5009 + data.lookbehind_float;
5010 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5011 r->float_max_offset = data.offset_float_max;
5012 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5013 r->float_max_offset -= data.lookbehind_float;
5015 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5016 && (!(data.flags & SF_FL_BEFORE_MEOL)
5017 || (RExC_flags & RXf_PMf_MULTILINE)));
5018 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5022 r->float_substr = r->float_utf8 = NULL;
5023 SvREFCNT_dec(data.longest_float);
5024 longest_float_length = 0;
5027 /* Note that code very similar to this but for floating string
5028 is immediately above, changes may need to be made to both.
5031 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5032 if (longest_fixed_length
5033 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5034 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5035 || (RExC_flags & RXf_PMf_MULTILINE))))
5039 /* copy the information about the longest fixed
5040 from the reg_scan_data over to the program. */
5041 if (SvUTF8(data.longest_fixed)) {
5042 r->anchored_utf8 = data.longest_fixed;
5043 r->anchored_substr = NULL;
5045 r->anchored_substr = data.longest_fixed;
5046 r->anchored_utf8 = NULL;
5048 /* fixed_end_shift is how many chars that must be matched that
5049 follow this item. We calculate it ahead of time as once the
5050 lookbehind offset is added in we lose the ability to correctly
5052 ml = data.minlen_fixed ? *(data.minlen_fixed)
5053 : (I32)longest_fixed_length;
5054 r->anchored_end_shift = ml - data.offset_fixed
5055 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5056 + data.lookbehind_fixed;
5057 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5059 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5060 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5061 || (RExC_flags & RXf_PMf_MULTILINE)));
5062 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5065 r->anchored_substr = r->anchored_utf8 = NULL;
5066 SvREFCNT_dec(data.longest_fixed);
5067 longest_fixed_length = 0;
5070 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5071 ri->regstclass = NULL;
5073 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5075 && !(data.start_class->flags & ANYOF_EOS)
5076 && !cl_is_anything(data.start_class))
5078 const U32 n = add_data(pRExC_state, 1, "f");
5079 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5081 Newx(RExC_rxi->data->data[n], 1,
5082 struct regnode_charclass_class);
5083 StructCopy(data.start_class,
5084 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5085 struct regnode_charclass_class);
5086 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5087 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5088 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5089 regprop(r, sv, (regnode*)data.start_class);
5090 PerlIO_printf(Perl_debug_log,
5091 "synthetic stclass \"%s\".\n",
5092 SvPVX_const(sv));});
5095 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5096 if (longest_fixed_length > longest_float_length) {
5097 r->check_end_shift = r->anchored_end_shift;
5098 r->check_substr = r->anchored_substr;
5099 r->check_utf8 = r->anchored_utf8;
5100 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5101 if (r->extflags & RXf_ANCH_SINGLE)
5102 r->extflags |= RXf_NOSCAN;
5105 r->check_end_shift = r->float_end_shift;
5106 r->check_substr = r->float_substr;
5107 r->check_utf8 = r->float_utf8;
5108 r->check_offset_min = r->float_min_offset;
5109 r->check_offset_max = r->float_max_offset;
5111 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5112 This should be changed ASAP! */
5113 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5114 r->extflags |= RXf_USE_INTUIT;
5115 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5116 r->extflags |= RXf_INTUIT_TAIL;
5118 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5119 if ( (STRLEN)minlen < longest_float_length )
5120 minlen= longest_float_length;
5121 if ( (STRLEN)minlen < longest_fixed_length )
5122 minlen= longest_fixed_length;
5126 /* Several toplevels. Best we can is to set minlen. */
5128 struct regnode_charclass_class ch_class;
5131 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5133 scan = ri->program + 1;
5135 data.start_class = &ch_class;
5136 data.last_closep = &last_close;
5139 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5140 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5144 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5145 = r->float_substr = r->float_utf8 = NULL;
5147 if (!(data.start_class->flags & ANYOF_EOS)
5148 && !cl_is_anything(data.start_class))
5150 const U32 n = add_data(pRExC_state, 1, "f");
5151 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5153 Newx(RExC_rxi->data->data[n], 1,
5154 struct regnode_charclass_class);
5155 StructCopy(data.start_class,
5156 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5157 struct regnode_charclass_class);
5158 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5159 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5160 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5161 regprop(r, sv, (regnode*)data.start_class);
5162 PerlIO_printf(Perl_debug_log,
5163 "synthetic stclass \"%s\".\n",
5164 SvPVX_const(sv));});
5168 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5169 the "real" pattern. */
5171 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5172 (IV)minlen, (IV)r->minlen);
5174 r->minlenret = minlen;
5175 if (r->minlen < minlen)
5178 if (RExC_seen & REG_SEEN_GPOS)
5179 r->extflags |= RXf_GPOS_SEEN;
5180 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5181 r->extflags |= RXf_LOOKBEHIND_SEEN;
5182 if (RExC_seen & REG_SEEN_EVAL)
5183 r->extflags |= RXf_EVAL_SEEN;
5184 if (RExC_seen & REG_SEEN_CANY)
5185 r->extflags |= RXf_CANY_SEEN;
5186 if (RExC_seen & REG_SEEN_VERBARG)
5187 r->intflags |= PREGf_VERBARG_SEEN;
5188 if (RExC_seen & REG_SEEN_CUTGROUP)
5189 r->intflags |= PREGf_CUTGROUP_SEEN;
5190 if (RExC_paren_names)
5191 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5193 RXp_PAREN_NAMES(r) = NULL;
5195 #ifdef STUPID_PATTERN_CHECKS
5196 if (RX_PRELEN(rx) == 0)
5197 r->extflags |= RXf_NULL;
5198 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5199 /* XXX: this should happen BEFORE we compile */
5200 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5201 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5202 r->extflags |= RXf_WHITE;
5203 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5204 r->extflags |= RXf_START_ONLY;
5206 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5207 /* XXX: this should happen BEFORE we compile */
5208 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5210 regnode *first = ri->program + 1;
5213 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5214 r->extflags |= RXf_NULL;
5215 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5216 r->extflags |= RXf_START_ONLY;
5217 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5218 && OP(regnext(first)) == END)
5219 r->extflags |= RXf_WHITE;
5223 if (RExC_paren_names) {
5224 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5225 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5228 ri->name_list_idx = 0;
5230 if (RExC_recurse_count) {
5231 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5232 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5233 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5236 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5237 /* assume we don't need to swap parens around before we match */
5240 PerlIO_printf(Perl_debug_log,"Final program:\n");
5243 #ifdef RE_TRACK_PATTERN_OFFSETS
5244 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5245 const U32 len = ri->u.offsets[0];
5247 GET_RE_DEBUG_FLAGS_DECL;
5248 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5249 for (i = 1; i <= len; i++) {
5250 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5251 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5252 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5254 PerlIO_printf(Perl_debug_log, "\n");
5260 #undef RE_ENGINE_PTR
5264 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5267 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5269 PERL_UNUSED_ARG(value);
5271 if (flags & RXapif_FETCH) {
5272 return reg_named_buff_fetch(rx, key, flags);
5273 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5274 Perl_croak_no_modify(aTHX);
5276 } else if (flags & RXapif_EXISTS) {
5277 return reg_named_buff_exists(rx, key, flags)
5280 } else if (flags & RXapif_REGNAMES) {
5281 return reg_named_buff_all(rx, flags);
5282 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5283 return reg_named_buff_scalar(rx, flags);
5285 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5291 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5294 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5295 PERL_UNUSED_ARG(lastkey);
5297 if (flags & RXapif_FIRSTKEY)
5298 return reg_named_buff_firstkey(rx, flags);
5299 else if (flags & RXapif_NEXTKEY)
5300 return reg_named_buff_nextkey(rx, flags);
5302 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5308 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5311 AV *retarray = NULL;
5313 struct regexp *const rx = (struct regexp *)SvANY(r);
5315 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5317 if (flags & RXapif_ALL)
5320 if (rx && RXp_PAREN_NAMES(rx)) {
5321 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5324 SV* sv_dat=HeVAL(he_str);
5325 I32 *nums=(I32*)SvPVX(sv_dat);
5326 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5327 if ((I32)(rx->nparens) >= nums[i]
5328 && rx->offs[nums[i]].start != -1
5329 && rx->offs[nums[i]].end != -1)
5332 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5336 ret = newSVsv(&PL_sv_undef);
5339 av_push(retarray, ret);
5342 return newRV_noinc(MUTABLE_SV(retarray));
5349 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5352 struct regexp *const rx = (struct regexp *)SvANY(r);
5354 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5356 if (rx && RXp_PAREN_NAMES(rx)) {
5357 if (flags & RXapif_ALL) {
5358 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5360 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5374 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5376 struct regexp *const rx = (struct regexp *)SvANY(r);
5378 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5380 if ( rx && RXp_PAREN_NAMES(rx) ) {
5381 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5383 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5390 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5392 struct regexp *const rx = (struct regexp *)SvANY(r);
5393 GET_RE_DEBUG_FLAGS_DECL;
5395 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5397 if (rx && RXp_PAREN_NAMES(rx)) {
5398 HV *hv = RXp_PAREN_NAMES(rx);
5400 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5403 SV* sv_dat = HeVAL(temphe);
5404 I32 *nums = (I32*)SvPVX(sv_dat);
5405 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5406 if ((I32)(rx->lastparen) >= nums[i] &&
5407 rx->offs[nums[i]].start != -1 &&
5408 rx->offs[nums[i]].end != -1)
5414 if (parno || flags & RXapif_ALL) {
5415 return newSVhek(HeKEY_hek(temphe));
5423 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5428 struct regexp *const rx = (struct regexp *)SvANY(r);
5430 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5432 if (rx && RXp_PAREN_NAMES(rx)) {
5433 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5434 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5435 } else if (flags & RXapif_ONE) {
5436 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5437 av = MUTABLE_AV(SvRV(ret));
5438 length = av_len(av);
5440 return newSViv(length + 1);
5442 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5446 return &PL_sv_undef;
5450 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5452 struct regexp *const rx = (struct regexp *)SvANY(r);
5455 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5457 if (rx && RXp_PAREN_NAMES(rx)) {
5458 HV *hv= RXp_PAREN_NAMES(rx);
5460 (void)hv_iterinit(hv);
5461 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5464 SV* sv_dat = HeVAL(temphe);
5465 I32 *nums = (I32*)SvPVX(sv_dat);
5466 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5467 if ((I32)(rx->lastparen) >= nums[i] &&
5468 rx->offs[nums[i]].start != -1 &&
5469 rx->offs[nums[i]].end != -1)
5475 if (parno || flags & RXapif_ALL) {
5476 av_push(av, newSVhek(HeKEY_hek(temphe)));
5481 return newRV_noinc(MUTABLE_SV(av));
5485 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5488 struct regexp *const rx = (struct regexp *)SvANY(r);
5493 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5496 sv_setsv(sv,&PL_sv_undef);
5500 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5502 i = rx->offs[0].start;
5506 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5508 s = rx->subbeg + rx->offs[0].end;
5509 i = rx->sublen - rx->offs[0].end;
5512 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5513 (s1 = rx->offs[paren].start) != -1 &&
5514 (t1 = rx->offs[paren].end) != -1)
5518 s = rx->subbeg + s1;
5520 sv_setsv(sv,&PL_sv_undef);
5523 assert(rx->sublen >= (s - rx->subbeg) + i );
5525 const int oldtainted = PL_tainted;
5527 sv_setpvn(sv, s, i);
5528 PL_tainted = oldtainted;
5529 if ( (rx->extflags & RXf_CANY_SEEN)
5530 ? (RXp_MATCH_UTF8(rx)
5531 && (!i || is_utf8_string((U8*)s, i)))
5532 : (RXp_MATCH_UTF8(rx)) )
5539 if (RXp_MATCH_TAINTED(rx)) {
5540 if (SvTYPE(sv) >= SVt_PVMG) {
5541 MAGIC* const mg = SvMAGIC(sv);
5544 SvMAGIC_set(sv, mg->mg_moremagic);
5546 if ((mgt = SvMAGIC(sv))) {
5547 mg->mg_moremagic = mgt;
5548 SvMAGIC_set(sv, mg);
5558 sv_setsv(sv,&PL_sv_undef);
5564 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5565 SV const * const value)
5567 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5569 PERL_UNUSED_ARG(rx);
5570 PERL_UNUSED_ARG(paren);
5571 PERL_UNUSED_ARG(value);
5574 Perl_croak_no_modify(aTHX);
5578 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5581 struct regexp *const rx = (struct regexp *)SvANY(r);
5585 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5587 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5589 /* $` / ${^PREMATCH} */
5590 case RX_BUFF_IDX_PREMATCH:
5591 if (rx->offs[0].start != -1) {
5592 i = rx->offs[0].start;
5600 /* $' / ${^POSTMATCH} */
5601 case RX_BUFF_IDX_POSTMATCH:
5602 if (rx->offs[0].end != -1) {
5603 i = rx->sublen - rx->offs[0].end;
5605 s1 = rx->offs[0].end;
5611 /* $& / ${^MATCH}, $1, $2, ... */
5613 if (paren <= (I32)rx->nparens &&
5614 (s1 = rx->offs[paren].start) != -1 &&
5615 (t1 = rx->offs[paren].end) != -1)
5620 if (ckWARN(WARN_UNINITIALIZED))
5621 report_uninit((const SV *)sv);
5626 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5627 const char * const s = rx->subbeg + s1;
5632 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5639 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5641 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5642 PERL_UNUSED_ARG(rx);
5646 return newSVpvs("Regexp");
5649 /* Scans the name of a named buffer from the pattern.
5650 * If flags is REG_RSN_RETURN_NULL returns null.
5651 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5652 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5653 * to the parsed name as looked up in the RExC_paren_names hash.
5654 * If there is an error throws a vFAIL().. type exception.
5657 #define REG_RSN_RETURN_NULL 0
5658 #define REG_RSN_RETURN_NAME 1
5659 #define REG_RSN_RETURN_DATA 2
5662 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5664 char *name_start = RExC_parse;
5666 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5668 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5669 /* skip IDFIRST by using do...while */
5672 RExC_parse += UTF8SKIP(RExC_parse);
5673 } while (isALNUM_utf8((U8*)RExC_parse));
5677 } while (isALNUM(*RExC_parse));
5682 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5683 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5684 if ( flags == REG_RSN_RETURN_NAME)
5686 else if (flags==REG_RSN_RETURN_DATA) {
5689 if ( ! sv_name ) /* should not happen*/
5690 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5691 if (RExC_paren_names)
5692 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5694 sv_dat = HeVAL(he_str);
5696 vFAIL("Reference to nonexistent named group");
5700 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5707 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5708 int rem=(int)(RExC_end - RExC_parse); \
5717 if (RExC_lastparse!=RExC_parse) \
5718 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5721 iscut ? "..." : "<" \
5724 PerlIO_printf(Perl_debug_log,"%16s",""); \
5727 num = RExC_size + 1; \
5729 num=REG_NODE_NUM(RExC_emit); \
5730 if (RExC_lastnum!=num) \
5731 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5733 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5734 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5735 (int)((depth*2)), "", \
5739 RExC_lastparse=RExC_parse; \
5744 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5745 DEBUG_PARSE_MSG((funcname)); \
5746 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5748 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5749 DEBUG_PARSE_MSG((funcname)); \
5750 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5753 /* This section of code defines the inversion list object and its methods. The
5754 * interfaces are highly subject to change, so as much as possible is static to
5755 * this file. An inversion list is here implemented as a malloc'd C array with
5756 * some added info. More will be coming when functionality is added later.
5758 * Some of the methods should always be private to the implementation, and some
5759 * should eventually be made public */
5761 #define INVLIST_INITIAL_LEN 10
5762 #define INVLIST_ARRAY_KEY "array"
5763 #define INVLIST_MAX_KEY "max"
5764 #define INVLIST_LEN_KEY "len"
5766 PERL_STATIC_INLINE UV*
5767 S_invlist_array(pTHX_ HV* const invlist)
5769 /* Returns the pointer to the inversion list's array. Every time the
5770 * length changes, this needs to be called in case malloc or realloc moved
5773 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5775 PERL_ARGS_ASSERT_INVLIST_ARRAY;
5777 if (list_ptr == NULL) {
5778 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5782 return INT2PTR(UV *, SvUV(*list_ptr));
5785 PERL_STATIC_INLINE void
5786 S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
5788 PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
5790 /* Sets the array stored in the inversion list to the memory beginning with
5793 if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
5794 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5799 PERL_STATIC_INLINE UV
5800 S_invlist_len(pTHX_ HV* const invlist)
5802 /* Returns the current number of elements in the inversion list's array */
5804 SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
5806 PERL_ARGS_ASSERT_INVLIST_LEN;
5808 if (len_ptr == NULL) {
5809 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5813 return SvUV(*len_ptr);
5816 PERL_STATIC_INLINE UV
5817 S_invlist_max(pTHX_ HV* const invlist)
5819 /* Returns the maximum number of elements storable in the inversion list's
5820 * array, without having to realloc() */
5822 SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
5824 PERL_ARGS_ASSERT_INVLIST_MAX;
5826 if (max_ptr == NULL) {
5827 Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
5831 return SvUV(*max_ptr);
5834 PERL_STATIC_INLINE void
5835 S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
5837 /* Sets the current number of elements stored in the inversion list */
5839 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
5841 if (len != 0 && len > invlist_max(invlist)) {
5842 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
5845 if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
5846 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5851 PERL_STATIC_INLINE void
5852 S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
5855 /* Sets the maximum number of elements storable in the inversion list
5856 * without having to realloc() */
5858 PERL_ARGS_ASSERT_INVLIST_SET_MAX;
5860 if (max < invlist_len(invlist)) {
5861 Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
5864 if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
5865 Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
5870 #ifndef PERL_IN_XSUB_RE
5872 Perl__new_invlist(pTHX_ IV initial_size)
5875 /* Return a pointer to a newly constructed inversion list, with enough
5876 * space to store 'initial_size' elements. If that number is negative, a
5877 * system default is used instead */
5879 HV* invlist = newHV();
5882 if (initial_size < 0) {
5883 initial_size = INVLIST_INITIAL_LEN;
5886 /* Allocate the initial space */
5887 Newx(list, initial_size, UV);
5888 invlist_set_array(invlist, list);
5890 /* set_len has to come before set_max, as the latter inspects the len */
5891 invlist_set_len(invlist, 0);
5892 invlist_set_max(invlist, initial_size);
5898 PERL_STATIC_INLINE void
5899 S_invlist_destroy(pTHX_ HV* const invlist)
5901 /* Inversion list destructor */
5903 SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
5905 PERL_ARGS_ASSERT_INVLIST_DESTROY;
5907 if (list_ptr != NULL) {
5908 UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
5914 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
5916 /* Change the maximum size of an inversion list (up or down) */
5920 const UV old_max = invlist_max(invlist);
5922 PERL_ARGS_ASSERT_INVLIST_EXTEND;
5924 if (old_max == new_max) { /* If a no-op */
5928 array = orig_array = invlist_array(invlist);
5929 Renew(array, new_max, UV);
5931 /* If the size change moved the list in memory, set the new one */
5932 if (array != orig_array) {
5933 invlist_set_array(invlist, array);
5936 invlist_set_max(invlist, new_max);
5940 PERL_STATIC_INLINE void
5941 S_invlist_trim(pTHX_ HV* const invlist)
5943 PERL_ARGS_ASSERT_INVLIST_TRIM;
5945 /* Change the length of the inversion list to how many entries it currently
5948 invlist_extend(invlist, invlist_len(invlist));
5951 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
5954 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
5956 #ifndef PERL_IN_XSUB_RE
5958 Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
5960 /* Subject to change or removal. Append the range from 'start' to 'end' at
5961 * the end of the inversion list. The range must be above any existing
5964 UV* array = invlist_array(invlist);
5965 UV max = invlist_max(invlist);
5966 UV len = invlist_len(invlist);
5968 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
5972 /* Here, the existing list is non-empty. The current max entry in the
5973 * list is generally the first value not in the set, except when the
5974 * set extends to the end of permissible values, in which case it is
5975 * the first entry in that final set, and so this call is an attempt to
5976 * append out-of-order */
5978 UV final_element = len - 1;
5979 if (array[final_element] > start
5980 || ELEMENT_IN_INVLIST_SET(final_element))
5982 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
5985 /* Here, it is a legal append. If the new range begins with the first
5986 * value not in the set, it is extending the set, so the new first
5987 * value not in the set is one greater than the newly extended range.
5989 if (array[final_element] == start) {
5990 if (end != UV_MAX) {
5991 array[final_element] = end + 1;
5994 /* But if the end is the maximum representable on the machine,
5995 * just let the range that this would extend have no end */
5996 invlist_set_len(invlist, len - 1);
6002 /* Here the new range doesn't extend any existing set. Add it */
6004 len += 2; /* Includes an element each for the start and end of range */
6006 /* If overflows the existing space, extend, which may cause the array to be
6009 invlist_extend(invlist, len);
6010 array = invlist_array(invlist);
6013 invlist_set_len(invlist, len);
6015 /* The next item on the list starts the range, the one after that is
6016 * one past the new range. */
6017 array[len - 2] = start;
6018 if (end != UV_MAX) {
6019 array[len - 1] = end + 1;
6022 /* But if the end is the maximum representable on the machine, just let
6023 * the range have no end */
6024 invlist_set_len(invlist, len - 1);
6030 S_invlist_union(pTHX_ HV* const a, HV* const b)
6032 /* Return a new inversion list which is the union of two inversion lists.
6033 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6034 * Richard Gillam, published by Addison-Wesley, and explained at some
6035 * length there. The preface says to incorporate its examples into your
6036 * code at your own risk.
6038 * The algorithm is like a merge sort.
6040 * XXX A potential performance improvement is to keep track as we go along
6041 * if only one of the inputs contributes to the result, meaning the other
6042 * is a subset of that one. In that case, we can skip the final copy and
6043 * return the larger of the input lists */
6045 UV* array_a = invlist_array(a); /* a's array */
6046 UV* array_b = invlist_array(b);
6047 UV len_a = invlist_len(a); /* length of a's array */
6048 UV len_b = invlist_len(b);
6050 HV* u; /* the resulting union */
6054 UV i_a = 0; /* current index into a's array */
6058 /* running count, as explained in the algorithm source book; items are
6059 * stopped accumulating and are output when the count changes to/from 0.
6060 * The count is incremented when we start a range that's in the set, and
6061 * decremented when we start a range that's not in the set. So its range
6062 * is 0 to 2. Only when the count is zero is something not in the set.
6066 PERL_ARGS_ASSERT_INVLIST_UNION;
6068 /* Size the union for the worst case: that the sets are completely
6070 u = _new_invlist(len_a + len_b);
6071 array_u = invlist_array(u);
6073 /* Go through each list item by item, stopping when exhausted one of
6075 while (i_a < len_a && i_b < len_b) {
6076 UV cp; /* The element to potentially add to the union's array */
6077 bool cp_in_set; /* is it in the the input list's set or not */
6079 /* We need to take one or the other of the two inputs for the union.
6080 * Since we are merging two sorted lists, we take the smaller of the
6081 * next items. In case of a tie, we take the one that is in its set
6082 * first. If we took one not in the set first, it would decrement the
6083 * count, possibly to 0 which would cause it to be output as ending the
6084 * range, and the next time through we would take the same number, and
6085 * output it again as beginning the next range. By doing it the
6086 * opposite way, there is no possibility that the count will be
6087 * momentarily decremented to 0, and thus the two adjoining ranges will
6088 * be seamlessly merged. (In a tie and both are in the set or both not
6089 * in the set, it doesn't matter which we take first.) */
6090 if (array_a[i_a] < array_b[i_b]
6091 || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a)))
6093 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6097 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6101 /* Here, have chosen which of the two inputs to look at. Only output
6102 * if the running count changes to/from 0, which marks the
6103 * beginning/end of a range in that's in the set */
6106 array_u[i_u++] = cp;
6113 array_u[i_u++] = cp;
6118 /* Here, we are finished going through at least one of the lists, which
6119 * means there is something remaining in at most one. We check if the list
6120 * that hasn't been exhausted is positioned such that we are in the middle
6121 * of a range in its set or not. (We are in the set if the next item in
6122 * the array marks the beginning of something not in the set) If in the
6123 * set, we decrement 'count'; if 0, there is potentially more to output.
6124 * There are four cases:
6125 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6126 * in the union is entirely from the non-exhausted set.
6127 * 2) Both were in their sets, count is 2. Nothing further should
6128 * be output, as everything that remains will be in the exhausted
6129 * list's set, hence in the union; decrementing to 1 but not 0 insures
6131 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6132 * Nothing further should be output because the union includes
6133 * everything from the exhausted set. Not decrementing insures that.
6134 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6135 * decrementing to 0 insures that we look at the remainder of the
6136 * non-exhausted set */
6137 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6138 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6143 /* The final length is what we've output so far, plus what else is about to
6144 * be output. (If 'count' is non-zero, then the input list we exhausted
6145 * has everything remaining up to the machine's limit in its set, and hence
6146 * in the union, so there will be no further output. */
6149 /* At most one of the subexpressions will be non-zero */
6150 len_u += (len_a - i_a) + (len_b - i_b);
6153 /* Set result to final length, which can change the pointer to array_u, so
6155 if (len_u != invlist_len(u)) {
6156 invlist_set_len(u, len_u);
6158 array_u = invlist_array(u);
6161 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6162 * the other) ended with everything above it not in its set. That means
6163 * that the remaining part of the union is precisely the same as the
6164 * non-exhausted list, so can just copy it unchanged. (If both list were
6165 * exhausted at the same time, then the operations below will be both 0.)
6168 IV copy_count; /* At most one will have a non-zero copy count */
6169 if ((copy_count = len_a - i_a) > 0) {
6170 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6172 else if ((copy_count = len_b - i_b) > 0) {
6173 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6181 S_invlist_intersection(pTHX_ HV* const a, HV* const b)
6183 /* Return the intersection of two inversion lists. The basis for this
6184 * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
6185 * by Addison-Wesley, and explained at some length there. The preface says
6186 * to incorporate its examples into your code at your own risk.
6188 * The algorithm is like a merge sort, and is essentially the same as the
6192 UV* array_a = invlist_array(a); /* a's array */
6193 UV* array_b = invlist_array(b);
6194 UV len_a = invlist_len(a); /* length of a's array */
6195 UV len_b = invlist_len(b);
6197 HV* r; /* the resulting intersection */
6201 UV i_a = 0; /* current index into a's array */
6205 /* running count, as explained in the algorithm source book; items are
6206 * stopped accumulating and are output when the count changes to/from 2.
6207 * The count is incremented when we start a range that's in the set, and
6208 * decremented when we start a range that's not in the set. So its range
6209 * is 0 to 2. Only when the count is 2 is something in the intersection.
6213 PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
6215 /* Size the intersection for the worst case: that the intersection ends up
6216 * fragmenting everything to be completely disjoint */
6217 r= _new_invlist(len_a + len_b);
6218 array_r = invlist_array(r);
6220 /* Go through each list item by item, stopping when exhausted one of
6222 while (i_a < len_a && i_b < len_b) {
6223 UV cp; /* The element to potentially add to the intersection's
6225 bool cp_in_set; /* Is it in the input list's set or not */
6227 /* We need to take one or the other of the two inputs for the union.
6228 * Since we are merging two sorted lists, we take the smaller of the
6229 * next items. In case of a tie, we take the one that is not in its
6230 * set first (a difference from the union algorithm). If we took one
6231 * in the set first, it would increment the count, possibly to 2 which
6232 * would cause it to be output as starting a range in the intersection,
6233 * and the next time through we would take that same number, and output
6234 * it again as ending the set. By doing it the opposite of this, we
6235 * there is no possibility that the count will be momentarily
6236 * incremented to 2. (In a tie and both are in the set or both not in
6237 * the set, it doesn't matter which we take first.) */
6238 if (array_a[i_a] < array_b[i_b]
6239 || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
6241 cp_in_set = ELEMENT_IN_INVLIST_SET(i_a);
6245 cp_in_set = ELEMENT_IN_INVLIST_SET(i_b);
6249 /* Here, have chosen which of the two inputs to look at. Only output
6250 * if the running count changes to/from 2, which marks the
6251 * beginning/end of a range that's in the intersection */
6255 array_r[i_r++] = cp;
6260 array_r[i_r++] = cp;
6266 /* Here, we are finished going through at least one of the sets, which
6267 * means there is something remaining in at most one. See the comments in
6269 if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
6270 || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
6275 /* The final length is what we've output so far plus what else is in the
6276 * intersection. Only one of the subexpressions below will be non-zero */
6279 len_r += (len_a - i_a) + (len_b - i_b);
6282 /* Set result to final length, which can change the pointer to array_r, so
6284 if (len_r != invlist_len(r)) {
6285 invlist_set_len(r, len_r);
6287 array_r = invlist_array(r);
6290 /* Finish outputting any remaining */
6291 if (count == 2) { /* Only one of will have a non-zero copy count */
6293 if ((copy_count = len_a - i_a) > 0) {
6294 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
6296 else if ((copy_count = len_b - i_b) > 0) {
6297 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
6305 S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end)
6307 /* Add the range from 'start' to 'end' inclusive to the inversion list's
6308 * set. A pointer to the inversion list is returned. This may actually be
6309 * a new list, in which case the passed in one has been destroyed. The
6310 * passed in inversion list can be NULL, in which case a new one is created
6311 * with just the one range in it */
6317 if (invlist == NULL) {
6318 invlist = _new_invlist(2);
6322 len = invlist_len(invlist);
6325 /* If comes after the final entry, can just append it to the end */
6327 || start >= invlist_array(invlist)
6328 [invlist_len(invlist) - 1])
6330 _append_range_to_invlist(invlist, start, end);
6334 /* Here, can't just append things, create and return a new inversion list
6335 * which is the union of this range and the existing inversion list */
6336 range_invlist = _new_invlist(2);
6337 _append_range_to_invlist(range_invlist, start, end);
6339 added_invlist = invlist_union(invlist, range_invlist);
6341 /* The passed in list can be freed, as well as our temporary */
6342 invlist_destroy(range_invlist);
6343 if (invlist != added_invlist) {
6344 invlist_destroy(invlist);
6347 return added_invlist;
6350 PERL_STATIC_INLINE HV*
6351 S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) {
6352 return add_range_to_invlist(invlist, cp, cp);
6355 /* End of inversion list object */
6358 - reg - regular expression, i.e. main body or parenthesized thing
6360 * Caller must absorb opening parenthesis.
6362 * Combining parenthesis handling with the base level of regular expression
6363 * is a trifle forced, but the need to tie the tails of the branches to what
6364 * follows makes it hard to avoid.
6366 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
6368 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
6370 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
6374 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
6375 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
6378 register regnode *ret; /* Will be the head of the group. */
6379 register regnode *br;
6380 register regnode *lastbr;
6381 register regnode *ender = NULL;
6382 register I32 parno = 0;
6384 U32 oregflags = RExC_flags;
6385 bool have_branch = 0;
6387 I32 freeze_paren = 0;
6388 I32 after_freeze = 0;
6390 /* for (?g), (?gc), and (?o) warnings; warning
6391 about (?c) will warn about (?g) -- japhy */
6393 #define WASTED_O 0x01
6394 #define WASTED_G 0x02
6395 #define WASTED_C 0x04
6396 #define WASTED_GC (0x02|0x04)
6397 I32 wastedflags = 0x00;
6399 char * parse_start = RExC_parse; /* MJD */
6400 char * const oregcomp_parse = RExC_parse;
6402 GET_RE_DEBUG_FLAGS_DECL;
6404 PERL_ARGS_ASSERT_REG;
6405 DEBUG_PARSE("reg ");
6407 *flagp = 0; /* Tentatively. */
6410 /* Make an OPEN node, if parenthesized. */
6412 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
6413 char *start_verb = RExC_parse;
6414 STRLEN verb_len = 0;
6415 char *start_arg = NULL;
6416 unsigned char op = 0;
6418 int internal_argval = 0; /* internal_argval is only useful if !argok */
6419 while ( *RExC_parse && *RExC_parse != ')' ) {
6420 if ( *RExC_parse == ':' ) {
6421 start_arg = RExC_parse + 1;
6427 verb_len = RExC_parse - start_verb;
6430 while ( *RExC_parse && *RExC_parse != ')' )
6432 if ( *RExC_parse != ')' )
6433 vFAIL("Unterminated verb pattern argument");
6434 if ( RExC_parse == start_arg )
6437 if ( *RExC_parse != ')' )
6438 vFAIL("Unterminated verb pattern");
6441 switch ( *start_verb ) {
6442 case 'A': /* (*ACCEPT) */
6443 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
6445 internal_argval = RExC_nestroot;
6448 case 'C': /* (*COMMIT) */
6449 if ( memEQs(start_verb,verb_len,"COMMIT") )
6452 case 'F': /* (*FAIL) */
6453 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
6458 case ':': /* (*:NAME) */
6459 case 'M': /* (*MARK:NAME) */
6460 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
6465 case 'P': /* (*PRUNE) */
6466 if ( memEQs(start_verb,verb_len,"PRUNE") )
6469 case 'S': /* (*SKIP) */
6470 if ( memEQs(start_verb,verb_len,"SKIP") )
6473 case 'T': /* (*THEN) */
6474 /* [19:06] <TimToady> :: is then */
6475 if ( memEQs(start_verb,verb_len,"THEN") ) {
6477 RExC_seen |= REG_SEEN_CUTGROUP;
6483 vFAIL3("Unknown verb pattern '%.*s'",
6484 verb_len, start_verb);
6487 if ( start_arg && internal_argval ) {
6488 vFAIL3("Verb pattern '%.*s' may not have an argument",
6489 verb_len, start_verb);
6490 } else if ( argok < 0 && !start_arg ) {
6491 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
6492 verb_len, start_verb);
6494 ret = reganode(pRExC_state, op, internal_argval);
6495 if ( ! internal_argval && ! SIZE_ONLY ) {
6497 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
6498 ARG(ret) = add_data( pRExC_state, 1, "S" );
6499 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
6506 if (!internal_argval)
6507 RExC_seen |= REG_SEEN_VERBARG;
6508 } else if ( start_arg ) {
6509 vFAIL3("Verb pattern '%.*s' may not have an argument",
6510 verb_len, start_verb);
6512 ret = reg_node(pRExC_state, op);
6514 nextchar(pRExC_state);
6517 if (*RExC_parse == '?') { /* (?...) */
6518 bool is_logical = 0;
6519 const char * const seqstart = RExC_parse;
6520 bool has_use_defaults = FALSE;
6523 paren = *RExC_parse++;
6524 ret = NULL; /* For look-ahead/behind. */
6527 case 'P': /* (?P...) variants for those used to PCRE/Python */
6528 paren = *RExC_parse++;
6529 if ( paren == '<') /* (?P<...>) named capture */
6531 else if (paren == '>') { /* (?P>name) named recursion */
6532 goto named_recursion;
6534 else if (paren == '=') { /* (?P=...) named backref */
6535 /* this pretty much dupes the code for \k<NAME> in regatom(), if
6536 you change this make sure you change that */
6537 char* name_start = RExC_parse;
6539 SV *sv_dat = reg_scan_name(pRExC_state,
6540 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6541 if (RExC_parse == name_start || *RExC_parse != ')')
6542 vFAIL2("Sequence %.3s... not terminated",parse_start);
6545 num = add_data( pRExC_state, 1, "S" );
6546 RExC_rxi->data->data[num]=(void*)sv_dat;
6547 SvREFCNT_inc_simple_void(sv_dat);
6550 ret = reganode(pRExC_state,
6553 : (MORE_ASCII_RESTRICTED)
6555 : (AT_LEAST_UNI_SEMANTICS)
6563 Set_Node_Offset(ret, parse_start+1);
6564 Set_Node_Cur_Length(ret); /* MJD */
6566 nextchar(pRExC_state);
6570 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6572 case '<': /* (?<...) */
6573 if (*RExC_parse == '!')
6575 else if (*RExC_parse != '=')
6581 case '\'': /* (?'...') */
6582 name_start= RExC_parse;
6583 svname = reg_scan_name(pRExC_state,
6584 SIZE_ONLY ? /* reverse test from the others */
6585 REG_RSN_RETURN_NAME :
6586 REG_RSN_RETURN_NULL);
6587 if (RExC_parse == name_start) {
6589 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6592 if (*RExC_parse != paren)
6593 vFAIL2("Sequence (?%c... not terminated",
6594 paren=='>' ? '<' : paren);
6598 if (!svname) /* shouldn't happen */
6600 "panic: reg_scan_name returned NULL");
6601 if (!RExC_paren_names) {
6602 RExC_paren_names= newHV();
6603 sv_2mortal(MUTABLE_SV(RExC_paren_names));
6605 RExC_paren_name_list= newAV();
6606 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
6609 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
6611 sv_dat = HeVAL(he_str);
6613 /* croak baby croak */
6615 "panic: paren_name hash element allocation failed");
6616 } else if ( SvPOK(sv_dat) ) {
6617 /* (?|...) can mean we have dupes so scan to check
6618 its already been stored. Maybe a flag indicating
6619 we are inside such a construct would be useful,
6620 but the arrays are likely to be quite small, so
6621 for now we punt -- dmq */
6622 IV count = SvIV(sv_dat);
6623 I32 *pv = (I32*)SvPVX(sv_dat);
6625 for ( i = 0 ; i < count ; i++ ) {
6626 if ( pv[i] == RExC_npar ) {
6632 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
6633 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
6634 pv[count] = RExC_npar;
6635 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
6638 (void)SvUPGRADE(sv_dat,SVt_PVNV);
6639 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
6641 SvIV_set(sv_dat, 1);
6644 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
6645 SvREFCNT_dec(svname);
6648 /*sv_dump(sv_dat);*/
6650 nextchar(pRExC_state);
6652 goto capturing_parens;
6654 RExC_seen |= REG_SEEN_LOOKBEHIND;
6655 RExC_in_lookbehind++;
6657 case '=': /* (?=...) */
6658 RExC_seen_zerolen++;
6660 case '!': /* (?!...) */
6661 RExC_seen_zerolen++;
6662 if (*RExC_parse == ')') {
6663 ret=reg_node(pRExC_state, OPFAIL);
6664 nextchar(pRExC_state);
6668 case '|': /* (?|...) */
6669 /* branch reset, behave like a (?:...) except that
6670 buffers in alternations share the same numbers */
6672 after_freeze = freeze_paren = RExC_npar;
6674 case ':': /* (?:...) */
6675 case '>': /* (?>...) */
6677 case '$': /* (?$...) */
6678 case '@': /* (?@...) */
6679 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
6681 case '#': /* (?#...) */
6682 while (*RExC_parse && *RExC_parse != ')')
6684 if (*RExC_parse != ')')
6685 FAIL("Sequence (?#... not terminated");
6686 nextchar(pRExC_state);
6689 case '0' : /* (?0) */
6690 case 'R' : /* (?R) */
6691 if (*RExC_parse != ')')
6692 FAIL("Sequence (?R) not terminated");
6693 ret = reg_node(pRExC_state, GOSTART);
6694 *flagp |= POSTPONED;
6695 nextchar(pRExC_state);
6698 { /* named and numeric backreferences */
6700 case '&': /* (?&NAME) */
6701 parse_start = RExC_parse - 1;
6704 SV *sv_dat = reg_scan_name(pRExC_state,
6705 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6706 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6708 goto gen_recurse_regop;
6711 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6713 vFAIL("Illegal pattern");
6715 goto parse_recursion;
6717 case '-': /* (?-1) */
6718 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6719 RExC_parse--; /* rewind to let it be handled later */
6723 case '1': case '2': case '3': case '4': /* (?1) */
6724 case '5': case '6': case '7': case '8': case '9':
6727 num = atoi(RExC_parse);
6728 parse_start = RExC_parse - 1; /* MJD */
6729 if (*RExC_parse == '-')
6731 while (isDIGIT(*RExC_parse))
6733 if (*RExC_parse!=')')
6734 vFAIL("Expecting close bracket");
6737 if ( paren == '-' ) {
6739 Diagram of capture buffer numbering.
6740 Top line is the normal capture buffer numbers
6741 Bottom line is the negative indexing as from
6745 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6749 num = RExC_npar + num;
6752 vFAIL("Reference to nonexistent group");
6754 } else if ( paren == '+' ) {
6755 num = RExC_npar + num - 1;
6758 ret = reganode(pRExC_state, GOSUB, num);
6760 if (num > (I32)RExC_rx->nparens) {
6762 vFAIL("Reference to nonexistent group");
6764 ARG2L_SET( ret, RExC_recurse_count++);
6766 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6767 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6771 RExC_seen |= REG_SEEN_RECURSE;
6772 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6773 Set_Node_Offset(ret, parse_start); /* MJD */
6775 *flagp |= POSTPONED;
6776 nextchar(pRExC_state);
6778 } /* named and numeric backreferences */
6781 case '?': /* (??...) */
6783 if (*RExC_parse != '{') {
6785 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6788 *flagp |= POSTPONED;
6789 paren = *RExC_parse++;
6791 case '{': /* (?{...}) */
6796 char *s = RExC_parse;
6798 RExC_seen_zerolen++;
6799 RExC_seen |= REG_SEEN_EVAL;
6800 while (count && (c = *RExC_parse)) {
6811 if (*RExC_parse != ')') {
6813 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6817 OP_4tree *sop, *rop;
6818 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6821 Perl_save_re_context(aTHX);
6822 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6823 sop->op_private |= OPpREFCOUNTED;
6824 /* re_dup will OpREFCNT_inc */
6825 OpREFCNT_set(sop, 1);
6828 n = add_data(pRExC_state, 3, "nop");
6829 RExC_rxi->data->data[n] = (void*)rop;
6830 RExC_rxi->data->data[n+1] = (void*)sop;
6831 RExC_rxi->data->data[n+2] = (void*)pad;
6834 else { /* First pass */
6835 if (PL_reginterp_cnt < ++RExC_seen_evals
6837 /* No compiled RE interpolated, has runtime
6838 components ===> unsafe. */
6839 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6840 if (PL_tainting && PL_tainted)
6841 FAIL("Eval-group in insecure regular expression");
6842 #if PERL_VERSION > 8
6843 if (IN_PERL_COMPILETIME)
6848 nextchar(pRExC_state);
6850 ret = reg_node(pRExC_state, LOGICAL);
6853 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6854 /* deal with the length of this later - MJD */
6857 ret = reganode(pRExC_state, EVAL, n);
6858 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6859 Set_Node_Offset(ret, parse_start);
6862 case '(': /* (?(?{...})...) and (?(?=...)...) */
6865 if (RExC_parse[0] == '?') { /* (?(?...)) */
6866 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6867 || RExC_parse[1] == '<'
6868 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6871 ret = reg_node(pRExC_state, LOGICAL);
6874 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6878 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6879 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6881 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6882 char *name_start= RExC_parse++;
6884 SV *sv_dat=reg_scan_name(pRExC_state,
6885 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6886 if (RExC_parse == name_start || *RExC_parse != ch)
6887 vFAIL2("Sequence (?(%c... not terminated",
6888 (ch == '>' ? '<' : ch));
6891 num = add_data( pRExC_state, 1, "S" );
6892 RExC_rxi->data->data[num]=(void*)sv_dat;
6893 SvREFCNT_inc_simple_void(sv_dat);
6895 ret = reganode(pRExC_state,NGROUPP,num);
6896 goto insert_if_check_paren;
6898 else if (RExC_parse[0] == 'D' &&
6899 RExC_parse[1] == 'E' &&
6900 RExC_parse[2] == 'F' &&
6901 RExC_parse[3] == 'I' &&
6902 RExC_parse[4] == 'N' &&
6903 RExC_parse[5] == 'E')
6905 ret = reganode(pRExC_state,DEFINEP,0);
6908 goto insert_if_check_paren;
6910 else if (RExC_parse[0] == 'R') {
6913 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6914 parno = atoi(RExC_parse++);
6915 while (isDIGIT(*RExC_parse))
6917 } else if (RExC_parse[0] == '&') {
6920 sv_dat = reg_scan_name(pRExC_state,
6921 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6922 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6924 ret = reganode(pRExC_state,INSUBP,parno);
6925 goto insert_if_check_paren;
6927 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6930 parno = atoi(RExC_parse++);
6932 while (isDIGIT(*RExC_parse))
6934 ret = reganode(pRExC_state, GROUPP, parno);
6936 insert_if_check_paren:
6937 if ((c = *nextchar(pRExC_state)) != ')')
6938 vFAIL("Switch condition not recognized");
6940 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6941 br = regbranch(pRExC_state, &flags, 1,depth+1);
6943 br = reganode(pRExC_state, LONGJMP, 0);
6945 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6946 c = *nextchar(pRExC_state);
6951 vFAIL("(?(DEFINE)....) does not allow branches");
6952 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6953 regbranch(pRExC_state, &flags, 1,depth+1);
6954 REGTAIL(pRExC_state, ret, lastbr);
6957 c = *nextchar(pRExC_state);
6962 vFAIL("Switch (?(condition)... contains too many branches");
6963 ender = reg_node(pRExC_state, TAIL);
6964 REGTAIL(pRExC_state, br, ender);
6966 REGTAIL(pRExC_state, lastbr, ender);
6967 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6970 REGTAIL(pRExC_state, ret, ender);
6971 RExC_size++; /* XXX WHY do we need this?!!
6972 For large programs it seems to be required
6973 but I can't figure out why. -- dmq*/
6977 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6981 RExC_parse--; /* for vFAIL to print correctly */
6982 vFAIL("Sequence (? incomplete");
6984 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6986 has_use_defaults = TRUE;
6987 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6988 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
6989 ? REGEX_UNICODE_CHARSET
6990 : REGEX_DEPENDS_CHARSET);
6994 parse_flags: /* (?i) */
6996 U32 posflags = 0, negflags = 0;
6997 U32 *flagsp = &posflags;
6998 bool has_charset_modifier = 0;
6999 regex_charset cs = REGEX_DEPENDS_CHARSET;
7001 while (*RExC_parse) {
7002 /* && strchr("iogcmsx", *RExC_parse) */
7003 /* (?g), (?gc) and (?o) are useless here
7004 and must be globally applied -- japhy */
7005 switch (*RExC_parse) {
7006 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
7007 case LOCALE_PAT_MOD:
7008 if (has_charset_modifier || flagsp == &negflags) {
7009 goto fail_modifiers;
7011 cs = REGEX_LOCALE_CHARSET;
7012 has_charset_modifier = 1;
7014 case UNICODE_PAT_MOD:
7015 if (has_charset_modifier || flagsp == &negflags) {
7016 goto fail_modifiers;
7018 cs = REGEX_UNICODE_CHARSET;
7019 has_charset_modifier = 1;
7021 case ASCII_RESTRICT_PAT_MOD:
7022 if (has_charset_modifier || flagsp == &negflags) {
7023 goto fail_modifiers;
7025 if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
7026 /* Doubled modifier implies more restricted */
7027 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
7031 cs = REGEX_ASCII_RESTRICTED_CHARSET;
7033 has_charset_modifier = 1;
7035 case DEPENDS_PAT_MOD:
7036 if (has_use_defaults
7037 || has_charset_modifier
7038 || flagsp == &negflags)
7040 goto fail_modifiers;
7043 /* The dual charset means unicode semantics if the
7044 * pattern (or target, not known until runtime) are
7045 * utf8, or something in the pattern indicates unicode
7047 cs = (RExC_utf8 || RExC_uni_semantics)
7048 ? REGEX_UNICODE_CHARSET
7049 : REGEX_DEPENDS_CHARSET;
7050 has_charset_modifier = 1;
7052 case ONCE_PAT_MOD: /* 'o' */
7053 case GLOBAL_PAT_MOD: /* 'g' */
7054 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7055 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
7056 if (! (wastedflags & wflagbit) ) {
7057 wastedflags |= wflagbit;
7060 "Useless (%s%c) - %suse /%c modifier",
7061 flagsp == &negflags ? "?-" : "?",
7063 flagsp == &negflags ? "don't " : "",
7070 case CONTINUE_PAT_MOD: /* 'c' */
7071 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
7072 if (! (wastedflags & WASTED_C) ) {
7073 wastedflags |= WASTED_GC;
7076 "Useless (%sc) - %suse /gc modifier",
7077 flagsp == &negflags ? "?-" : "?",
7078 flagsp == &negflags ? "don't " : ""
7083 case KEEPCOPY_PAT_MOD: /* 'p' */
7084 if (flagsp == &negflags) {
7086 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
7088 *flagsp |= RXf_PMf_KEEPCOPY;
7092 /* A flag is a default iff it is following a minus, so
7093 * if there is a minus, it means will be trying to
7094 * re-specify a default which is an error */
7095 if (has_use_defaults || flagsp == &negflags) {
7098 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7102 wastedflags = 0; /* reset so (?g-c) warns twice */
7108 RExC_flags |= posflags;
7109 RExC_flags &= ~negflags;
7110 set_regex_charset(&RExC_flags, cs);
7112 oregflags |= posflags;
7113 oregflags &= ~negflags;
7114 set_regex_charset(&oregflags, cs);
7116 nextchar(pRExC_state);
7127 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7132 }} /* one for the default block, one for the switch */
7139 ret = reganode(pRExC_state, OPEN, parno);
7142 RExC_nestroot = parno;
7143 if (RExC_seen & REG_SEEN_RECURSE
7144 && !RExC_open_parens[parno-1])
7146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7147 "Setting open paren #%"IVdf" to %d\n",
7148 (IV)parno, REG_NODE_NUM(ret)));
7149 RExC_open_parens[parno-1]= ret;
7152 Set_Node_Length(ret, 1); /* MJD */
7153 Set_Node_Offset(ret, RExC_parse); /* MJD */
7161 /* Pick up the branches, linking them together. */
7162 parse_start = RExC_parse; /* MJD */
7163 br = regbranch(pRExC_state, &flags, 1,depth+1);
7165 /* branch_len = (paren != 0); */
7169 if (*RExC_parse == '|') {
7170 if (!SIZE_ONLY && RExC_extralen) {
7171 reginsert(pRExC_state, BRANCHJ, br, depth+1);
7174 reginsert(pRExC_state, BRANCH, br, depth+1);
7175 Set_Node_Length(br, paren != 0);
7176 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
7180 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
7182 else if (paren == ':') {
7183 *flagp |= flags&SIMPLE;
7185 if (is_open) { /* Starts with OPEN. */
7186 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
7188 else if (paren != '?') /* Not Conditional */
7190 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7192 while (*RExC_parse == '|') {
7193 if (!SIZE_ONLY && RExC_extralen) {
7194 ender = reganode(pRExC_state, LONGJMP,0);
7195 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
7198 RExC_extralen += 2; /* Account for LONGJMP. */
7199 nextchar(pRExC_state);
7201 if (RExC_npar > after_freeze)
7202 after_freeze = RExC_npar;
7203 RExC_npar = freeze_paren;
7205 br = regbranch(pRExC_state, &flags, 0, depth+1);
7209 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
7211 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
7214 if (have_branch || paren != ':') {
7215 /* Make a closing node, and hook it on the end. */
7218 ender = reg_node(pRExC_state, TAIL);
7221 ender = reganode(pRExC_state, CLOSE, parno);
7222 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
7223 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7224 "Setting close paren #%"IVdf" to %d\n",
7225 (IV)parno, REG_NODE_NUM(ender)));
7226 RExC_close_parens[parno-1]= ender;
7227 if (RExC_nestroot == parno)
7230 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
7231 Set_Node_Length(ender,1); /* MJD */
7237 *flagp &= ~HASWIDTH;
7240 ender = reg_node(pRExC_state, SUCCEED);
7243 ender = reg_node(pRExC_state, END);
7245 assert(!RExC_opend); /* there can only be one! */
7250 REGTAIL(pRExC_state, lastbr, ender);
7252 if (have_branch && !SIZE_ONLY) {
7254 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
7256 /* Hook the tails of the branches to the closing node. */
7257 for (br = ret; br; br = regnext(br)) {
7258 const U8 op = PL_regkind[OP(br)];
7260 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
7262 else if (op == BRANCHJ) {
7263 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
7271 static const char parens[] = "=!<,>";
7273 if (paren && (p = strchr(parens, paren))) {
7274 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
7275 int flag = (p - parens) > 1;
7278 node = SUSPEND, flag = 0;
7279 reginsert(pRExC_state, node,ret, depth+1);
7280 Set_Node_Cur_Length(ret);
7281 Set_Node_Offset(ret, parse_start + 1);
7283 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
7287 /* Check for proper termination. */
7289 RExC_flags = oregflags;
7290 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
7291 RExC_parse = oregcomp_parse;
7292 vFAIL("Unmatched (");
7295 else if (!paren && RExC_parse < RExC_end) {
7296 if (*RExC_parse == ')') {
7298 vFAIL("Unmatched )");
7301 FAIL("Junk on end of regexp"); /* "Can't happen". */
7305 if (RExC_in_lookbehind) {
7306 RExC_in_lookbehind--;
7308 if (after_freeze > RExC_npar)
7309 RExC_npar = after_freeze;
7314 - regbranch - one alternative of an | operator
7316 * Implements the concatenation operator.
7319 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
7322 register regnode *ret;
7323 register regnode *chain = NULL;
7324 register regnode *latest;
7325 I32 flags = 0, c = 0;
7326 GET_RE_DEBUG_FLAGS_DECL;
7328 PERL_ARGS_ASSERT_REGBRANCH;
7330 DEBUG_PARSE("brnc");
7335 if (!SIZE_ONLY && RExC_extralen)
7336 ret = reganode(pRExC_state, BRANCHJ,0);
7338 ret = reg_node(pRExC_state, BRANCH);
7339 Set_Node_Length(ret, 1);
7343 if (!first && SIZE_ONLY)
7344 RExC_extralen += 1; /* BRANCHJ */
7346 *flagp = WORST; /* Tentatively. */
7349 nextchar(pRExC_state);
7350 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
7352 latest = regpiece(pRExC_state, &flags,depth+1);
7353 if (latest == NULL) {
7354 if (flags & TRYAGAIN)
7358 else if (ret == NULL)
7360 *flagp |= flags&(HASWIDTH|POSTPONED);
7361 if (chain == NULL) /* First piece. */
7362 *flagp |= flags&SPSTART;
7365 REGTAIL(pRExC_state, chain, latest);
7370 if (chain == NULL) { /* Loop ran zero times. */
7371 chain = reg_node(pRExC_state, NOTHING);
7376 *flagp |= flags&SIMPLE;
7383 - regpiece - something followed by possible [*+?]
7385 * Note that the branching code sequences used for ? and the general cases
7386 * of * and + are somewhat optimized: they use the same NOTHING node as
7387 * both the endmarker for their branch list and the body of the last branch.
7388 * It might seem that this node could be dispensed with entirely, but the
7389 * endmarker role is not redundant.
7392 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7395 register regnode *ret;
7397 register char *next;
7399 const char * const origparse = RExC_parse;
7401 I32 max = REG_INFTY;
7403 const char *maxpos = NULL;
7404 GET_RE_DEBUG_FLAGS_DECL;
7406 PERL_ARGS_ASSERT_REGPIECE;
7408 DEBUG_PARSE("piec");
7410 ret = regatom(pRExC_state, &flags,depth+1);
7412 if (flags & TRYAGAIN)
7419 if (op == '{' && regcurly(RExC_parse)) {
7421 parse_start = RExC_parse; /* MJD */
7422 next = RExC_parse + 1;
7423 while (isDIGIT(*next) || *next == ',') {
7432 if (*next == '}') { /* got one */
7436 min = atoi(RExC_parse);
7440 maxpos = RExC_parse;
7442 if (!max && *maxpos != '0')
7443 max = REG_INFTY; /* meaning "infinity" */
7444 else if (max >= REG_INFTY)
7445 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
7447 nextchar(pRExC_state);
7450 if ((flags&SIMPLE)) {
7451 RExC_naughty += 2 + RExC_naughty / 2;
7452 reginsert(pRExC_state, CURLY, ret, depth+1);
7453 Set_Node_Offset(ret, parse_start+1); /* MJD */
7454 Set_Node_Cur_Length(ret);
7457 regnode * const w = reg_node(pRExC_state, WHILEM);
7460 REGTAIL(pRExC_state, ret, w);
7461 if (!SIZE_ONLY && RExC_extralen) {
7462 reginsert(pRExC_state, LONGJMP,ret, depth+1);
7463 reginsert(pRExC_state, NOTHING,ret, depth+1);
7464 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
7466 reginsert(pRExC_state, CURLYX,ret, depth+1);
7468 Set_Node_Offset(ret, parse_start+1);
7469 Set_Node_Length(ret,
7470 op == '{' ? (RExC_parse - parse_start) : 1);
7472 if (!SIZE_ONLY && RExC_extralen)
7473 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
7474 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
7476 RExC_whilem_seen++, RExC_extralen += 3;
7477 RExC_naughty += 4 + RExC_naughty; /* compound interest */
7486 vFAIL("Can't do {n,m} with n > m");
7488 ARG1_SET(ret, (U16)min);
7489 ARG2_SET(ret, (U16)max);
7501 #if 0 /* Now runtime fix should be reliable. */
7503 /* if this is reinstated, don't forget to put this back into perldiag:
7505 =item Regexp *+ operand could be empty at {#} in regex m/%s/
7507 (F) The part of the regexp subject to either the * or + quantifier
7508 could match an empty string. The {#} shows in the regular
7509 expression about where the problem was discovered.
7513 if (!(flags&HASWIDTH) && op != '?')
7514 vFAIL("Regexp *+ operand could be empty");
7517 parse_start = RExC_parse;
7518 nextchar(pRExC_state);
7520 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
7522 if (op == '*' && (flags&SIMPLE)) {
7523 reginsert(pRExC_state, STAR, ret, depth+1);
7527 else if (op == '*') {
7531 else if (op == '+' && (flags&SIMPLE)) {
7532 reginsert(pRExC_state, PLUS, ret, depth+1);
7536 else if (op == '+') {
7540 else if (op == '?') {
7545 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
7546 ckWARN3reg(RExC_parse,
7547 "%.*s matches null string many times",
7548 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
7552 if (RExC_parse < RExC_end && *RExC_parse == '?') {
7553 nextchar(pRExC_state);
7554 reginsert(pRExC_state, MINMOD, ret, depth+1);
7555 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
7557 #ifndef REG_ALLOW_MINMOD_SUSPEND
7560 if (RExC_parse < RExC_end && *RExC_parse == '+') {
7562 nextchar(pRExC_state);
7563 ender = reg_node(pRExC_state, SUCCEED);
7564 REGTAIL(pRExC_state, ret, ender);
7565 reginsert(pRExC_state, SUSPEND, ret, depth+1);
7567 ender = reg_node(pRExC_state, TAIL);
7568 REGTAIL(pRExC_state, ret, ender);
7572 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
7574 vFAIL("Nested quantifiers");
7581 /* reg_namedseq(pRExC_state,UVp)
7583 This is expected to be called by a parser routine that has
7584 recognized '\N' and needs to handle the rest. RExC_parse is
7585 expected to point at the first char following the N at the time
7588 The \N may be inside (indicated by valuep not being NULL) or outside a
7591 \N may begin either a named sequence, or if outside a character class, mean
7592 to match a non-newline. For non single-quoted regexes, the tokenizer has
7593 attempted to decide which, and in the case of a named sequence converted it
7594 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
7595 where c1... are the characters in the sequence. For single-quoted regexes,
7596 the tokenizer passes the \N sequence through unchanged; this code will not
7597 attempt to determine this nor expand those. The net effect is that if the
7598 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
7599 signals that this \N occurrence means to match a non-newline.
7601 Only the \N{U+...} form should occur in a character class, for the same
7602 reason that '.' inside a character class means to just match a period: it
7603 just doesn't make sense.
7605 If valuep is non-null then it is assumed that we are parsing inside
7606 of a charclass definition and the first codepoint in the resolved
7607 string is returned via *valuep and the routine will return NULL.
7608 In this mode if a multichar string is returned from the charnames
7609 handler, a warning will be issued, and only the first char in the
7610 sequence will be examined. If the string returned is zero length
7611 then the value of *valuep is undefined and NON-NULL will
7612 be returned to indicate failure. (This will NOT be a valid pointer
7615 If valuep is null then it is assumed that we are parsing normal text and a
7616 new EXACT node is inserted into the program containing the resolved string,
7617 and a pointer to the new node is returned. But if the string is zero length
7618 a NOTHING node is emitted instead.
7620 On success RExC_parse is set to the char following the endbrace.
7621 Parsing failures will generate a fatal error via vFAIL(...)
7624 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
7626 char * endbrace; /* '}' following the name */
7627 regnode *ret = NULL;
7629 char* parse_start = RExC_parse - 2; /* points to the '\N' */
7633 GET_RE_DEBUG_FLAGS_DECL;
7635 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
7639 /* The [^\n] meaning of \N ignores spaces and comments under the /x
7640 * modifier. The other meaning does not */
7641 p = (RExC_flags & RXf_PMf_EXTENDED)
7642 ? regwhite( pRExC_state, RExC_parse )
7645 /* Disambiguate between \N meaning a named character versus \N meaning
7646 * [^\n]. The former is assumed when it can't be the latter. */
7647 if (*p != '{' || regcurly(p)) {
7650 /* no bare \N in a charclass */
7651 vFAIL("\\N in a character class must be a named character: \\N{...}");
7653 nextchar(pRExC_state);
7654 ret = reg_node(pRExC_state, REG_ANY);
7655 *flagp |= HASWIDTH|SIMPLE;
7658 Set_Node_Length(ret, 1); /* MJD */
7662 /* Here, we have decided it should be a named sequence */
7664 /* The test above made sure that the next real character is a '{', but
7665 * under the /x modifier, it could be separated by space (or a comment and
7666 * \n) and this is not allowed (for consistency with \x{...} and the
7667 * tokenizer handling of \N{NAME}). */
7668 if (*RExC_parse != '{') {
7669 vFAIL("Missing braces on \\N{}");
7672 RExC_parse++; /* Skip past the '{' */
7674 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
7675 || ! (endbrace == RExC_parse /* nothing between the {} */
7676 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
7677 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
7679 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
7680 vFAIL("\\N{NAME} must be resolved by the lexer");
7683 if (endbrace == RExC_parse) { /* empty: \N{} */
7685 RExC_parse = endbrace + 1;
7686 return reg_node(pRExC_state,NOTHING);
7690 ckWARNreg(RExC_parse,
7691 "Ignoring zero length \\N{} in character class"
7693 RExC_parse = endbrace + 1;
7696 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
7699 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
7700 RExC_parse += 2; /* Skip past the 'U+' */
7702 if (valuep) { /* In a bracketed char class */
7703 /* We only pay attention to the first char of
7704 multichar strings being returned. I kinda wonder
7705 if this makes sense as it does change the behaviour
7706 from earlier versions, OTOH that behaviour was broken
7707 as well. XXX Solution is to recharacterize as
7708 [rest-of-class]|multi1|multi2... */
7710 STRLEN length_of_hex;
7711 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7712 | PERL_SCAN_DISALLOW_PREFIX
7713 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7715 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7716 if (endchar < endbrace) {
7717 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7720 length_of_hex = (STRLEN)(endchar - RExC_parse);
7721 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7723 /* The tokenizer should have guaranteed validity, but it's possible to
7724 * bypass it by using single quoting, so check */
7725 if (length_of_hex == 0
7726 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7728 RExC_parse += length_of_hex; /* Includes all the valid */
7729 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7730 ? UTF8SKIP(RExC_parse)
7732 /* Guard against malformed utf8 */
7733 if (RExC_parse >= endchar) RExC_parse = endchar;
7734 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7737 RExC_parse = endbrace + 1;
7738 if (endchar == endbrace) return NULL;
7740 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7742 else { /* Not a char class */
7743 char *s; /* String to put in generated EXACT node */
7744 STRLEN len = 0; /* Its current byte length */
7745 char *endchar; /* Points to '.' or '}' ending cur char in the input
7747 ret = reg_node(pRExC_state,
7748 (U8) ((! FOLD) ? EXACT
7751 : (MORE_ASCII_RESTRICTED)
7753 : (AT_LEAST_UNI_SEMANTICS)
7758 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7759 * the input which is of the form now 'c1.c2.c3...}' until find the
7760 * ending brace or exceed length 255. The characters that exceed this
7761 * limit are dropped. The limit could be relaxed should it become
7762 * desirable by reparsing this as (?:\N{NAME}), so could generate
7763 * multiple EXACT nodes, as is done for just regular input. But this
7764 * is primarily a named character, and not intended to be a huge long
7765 * string, so 255 bytes should be good enough */
7767 STRLEN length_of_hex;
7768 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7769 | PERL_SCAN_DISALLOW_PREFIX
7770 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7771 UV cp; /* Ord of current character */
7772 bool use_this_char_fold = FOLD;
7774 /* Code points are separated by dots. If none, there is only one
7775 * code point, and is terminated by the brace */
7776 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7778 /* The values are Unicode even on EBCDIC machines */
7779 length_of_hex = (STRLEN)(endchar - RExC_parse);
7780 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7781 if ( length_of_hex == 0
7782 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7784 RExC_parse += length_of_hex; /* Includes all the valid */
7785 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7786 ? UTF8SKIP(RExC_parse)
7788 /* Guard against malformed utf8 */
7789 if (RExC_parse >= endchar) RExC_parse = endchar;
7790 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7793 /* XXX ? Change to ANYOF node
7795 && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
7796 && is_TRICKYFOLD_cp(cp))
7801 /* Under /aa, we can't mix ASCII with non- in a fold. If we are
7802 * folding, and the source isn't ASCII, look through all the
7803 * characters it folds to. If any one of them is ASCII, forbid
7804 * this fold. (cp is uni, so the 127 below is correct even for
7805 * EBCDIC). Similarly under locale rules, we don't mix under 256
7806 * with above 255. XXX It really doesn't make sense to have \N{}
7807 * which means a Unicode rules under locale. I (khw) think this
7808 * should be warned about, but the counter argument is that people
7809 * who have programmed around Perl's earlier lack of specifying the
7810 * rules and used \N{} to force Unicode things in a local
7811 * environment shouldn't get suddenly a warning */
7812 if (use_this_char_fold) {
7813 if (LOC && cp < 256) { /* Fold not known until run-time */
7814 use_this_char_fold = FALSE;
7816 else if ((cp > 127 && MORE_ASCII_RESTRICTED)
7817 || (cp > 255 && LOC))
7819 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
7824 (void) toFOLD_uni(cp, tmpbuf, &foldlen);
7829 || (LOC && (UTF8_IS_INVARIANT(*s)
7830 || UTF8_IS_DOWNGRADEABLE_START(*s))))
7832 use_this_char_fold = FALSE;
7840 if (! use_this_char_fold) { /* Not folding, just append to the
7844 /* Quit before adding this character if would exceed limit */
7845 if (len + UNISKIP(cp) > U8_MAX) break;
7847 unilen = reguni(pRExC_state, cp, s);
7852 } else { /* Folding, output the folded equivalent */
7853 STRLEN foldlen,numlen;
7854 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7855 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7857 /* Quit before exceeding size limit */
7858 if (len + foldlen > U8_MAX) break;
7860 for (foldbuf = tmpbuf;
7864 cp = utf8_to_uvchr(foldbuf, &numlen);
7866 const STRLEN unilen = reguni(pRExC_state, cp, s);
7869 /* In EBCDIC the numlen and unilen can differ. */
7871 if (numlen >= foldlen)
7875 break; /* "Can't happen." */
7879 /* Point to the beginning of the next character in the sequence. */
7880 RExC_parse = endchar + 1;
7882 /* Quit if no more characters */
7883 if (RExC_parse >= endbrace) break;
7888 if (RExC_parse < endbrace) {
7889 ckWARNreg(RExC_parse - 1,
7890 "Using just the first characters returned by \\N{}");
7893 RExC_size += STR_SZ(len);
7896 RExC_emit += STR_SZ(len);
7899 RExC_parse = endbrace + 1;
7901 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7902 with malformed in t/re/pat_advanced.t */
7904 Set_Node_Cur_Length(ret); /* MJD */
7905 nextchar(pRExC_state);
7915 * It returns the code point in utf8 for the value in *encp.
7916 * value: a code value in the source encoding
7917 * encp: a pointer to an Encode object
7919 * If the result from Encode is not a single character,
7920 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7923 S_reg_recode(pTHX_ const char value, SV **encp)
7926 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7927 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7928 const STRLEN newlen = SvCUR(sv);
7929 UV uv = UNICODE_REPLACEMENT;
7931 PERL_ARGS_ASSERT_REG_RECODE;
7935 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7938 if (!newlen || numlen != newlen) {
7939 uv = UNICODE_REPLACEMENT;
7947 - regatom - the lowest level
7949 Try to identify anything special at the start of the pattern. If there
7950 is, then handle it as required. This may involve generating a single regop,
7951 such as for an assertion; or it may involve recursing, such as to
7952 handle a () structure.
7954 If the string doesn't start with something special then we gobble up
7955 as much literal text as we can.
7957 Once we have been able to handle whatever type of thing started the
7958 sequence, we return.
7960 Note: we have to be careful with escapes, as they can be both literal
7961 and special, and in the case of \10 and friends can either, depending
7962 on context. Specifically there are two separate switches for handling
7963 escape sequences, with the one for handling literal escapes requiring
7964 a dummy entry for all of the special escapes that are actually handled
7969 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7972 register regnode *ret = NULL;
7974 char *parse_start = RExC_parse;
7976 GET_RE_DEBUG_FLAGS_DECL;
7977 DEBUG_PARSE("atom");
7978 *flagp = WORST; /* Tentatively. */
7980 PERL_ARGS_ASSERT_REGATOM;
7983 switch ((U8)*RExC_parse) {
7985 RExC_seen_zerolen++;
7986 nextchar(pRExC_state);
7987 if (RExC_flags & RXf_PMf_MULTILINE)
7988 ret = reg_node(pRExC_state, MBOL);
7989 else if (RExC_flags & RXf_PMf_SINGLELINE)
7990 ret = reg_node(pRExC_state, SBOL);
7992 ret = reg_node(pRExC_state, BOL);
7993 Set_Node_Length(ret, 1); /* MJD */
7996 nextchar(pRExC_state);
7998 RExC_seen_zerolen++;
7999 if (RExC_flags & RXf_PMf_MULTILINE)
8000 ret = reg_node(pRExC_state, MEOL);
8001 else if (RExC_flags & RXf_PMf_SINGLELINE)
8002 ret = reg_node(pRExC_state, SEOL);
8004 ret = reg_node(pRExC_state, EOL);
8005 Set_Node_Length(ret, 1); /* MJD */
8008 nextchar(pRExC_state);
8009 if (RExC_flags & RXf_PMf_SINGLELINE)
8010 ret = reg_node(pRExC_state, SANY);
8012 ret = reg_node(pRExC_state, REG_ANY);
8013 *flagp |= HASWIDTH|SIMPLE;
8015 Set_Node_Length(ret, 1); /* MJD */
8019 char * const oregcomp_parse = ++RExC_parse;
8020 ret = regclass(pRExC_state,depth+1);
8021 if (*RExC_parse != ']') {
8022 RExC_parse = oregcomp_parse;
8023 vFAIL("Unmatched [");
8025 nextchar(pRExC_state);
8026 *flagp |= HASWIDTH|SIMPLE;
8027 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8031 nextchar(pRExC_state);
8032 ret = reg(pRExC_state, 1, &flags,depth+1);
8034 if (flags & TRYAGAIN) {
8035 if (RExC_parse == RExC_end) {
8036 /* Make parent create an empty node if needed. */
8044 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8048 if (flags & TRYAGAIN) {
8052 vFAIL("Internal urp");
8053 /* Supposed to be caught earlier. */
8056 if (!regcurly(RExC_parse)) {
8065 vFAIL("Quantifier follows nothing");
8067 case LATIN_SMALL_LETTER_SHARP_S:
8068 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8069 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8070 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
8071 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
8072 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
8077 len=0; /* silence a spurious compiler warning */
8078 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
8079 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
8080 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
8081 ret = reganode(pRExC_state, FOLDCHAR, cp);
8082 Set_Node_Length(ret, 1); /* MJD */
8083 nextchar(pRExC_state); /* kill whitespace under /x */
8091 This switch handles escape sequences that resolve to some kind
8092 of special regop and not to literal text. Escape sequnces that
8093 resolve to literal text are handled below in the switch marked
8096 Every entry in this switch *must* have a corresponding entry
8097 in the literal escape switch. However, the opposite is not
8098 required, as the default for this switch is to jump to the
8099 literal text handling code.
8101 switch ((U8)*++RExC_parse) {
8102 case LATIN_SMALL_LETTER_SHARP_S:
8103 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8104 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8106 /* Special Escapes */
8108 RExC_seen_zerolen++;
8109 ret = reg_node(pRExC_state, SBOL);
8111 goto finish_meta_pat;
8113 ret = reg_node(pRExC_state, GPOS);
8114 RExC_seen |= REG_SEEN_GPOS;
8116 goto finish_meta_pat;
8118 RExC_seen_zerolen++;
8119 ret = reg_node(pRExC_state, KEEPS);
8121 /* XXX:dmq : disabling in-place substitution seems to
8122 * be necessary here to avoid cases of memory corruption, as
8123 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
8125 RExC_seen |= REG_SEEN_LOOKBEHIND;
8126 goto finish_meta_pat;
8128 ret = reg_node(pRExC_state, SEOL);
8130 RExC_seen_zerolen++; /* Do not optimize RE away */
8131 goto finish_meta_pat;
8133 ret = reg_node(pRExC_state, EOS);
8135 RExC_seen_zerolen++; /* Do not optimize RE away */
8136 goto finish_meta_pat;
8138 ret = reg_node(pRExC_state, CANY);
8139 RExC_seen |= REG_SEEN_CANY;
8140 *flagp |= HASWIDTH|SIMPLE;
8141 goto finish_meta_pat;
8143 ret = reg_node(pRExC_state, CLUMP);
8145 goto finish_meta_pat;
8147 switch (get_regex_charset(RExC_flags)) {
8148 case REGEX_LOCALE_CHARSET:
8151 case REGEX_UNICODE_CHARSET:
8154 case REGEX_ASCII_RESTRICTED_CHARSET:
8155 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8158 case REGEX_DEPENDS_CHARSET:
8164 ret = reg_node(pRExC_state, op);
8165 *flagp |= HASWIDTH|SIMPLE;
8166 goto finish_meta_pat;
8168 switch (get_regex_charset(RExC_flags)) {
8169 case REGEX_LOCALE_CHARSET:
8172 case REGEX_UNICODE_CHARSET:
8175 case REGEX_ASCII_RESTRICTED_CHARSET:
8176 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8179 case REGEX_DEPENDS_CHARSET:
8185 ret = reg_node(pRExC_state, op);
8186 *flagp |= HASWIDTH|SIMPLE;
8187 goto finish_meta_pat;
8189 RExC_seen_zerolen++;
8190 RExC_seen |= REG_SEEN_LOOKBEHIND;
8191 switch (get_regex_charset(RExC_flags)) {
8192 case REGEX_LOCALE_CHARSET:
8195 case REGEX_UNICODE_CHARSET:
8198 case REGEX_ASCII_RESTRICTED_CHARSET:
8199 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8202 case REGEX_DEPENDS_CHARSET:
8208 ret = reg_node(pRExC_state, op);
8209 FLAGS(ret) = get_regex_charset(RExC_flags);
8211 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8212 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
8214 goto finish_meta_pat;
8216 RExC_seen_zerolen++;
8217 RExC_seen |= REG_SEEN_LOOKBEHIND;
8218 switch (get_regex_charset(RExC_flags)) {
8219 case REGEX_LOCALE_CHARSET:
8222 case REGEX_UNICODE_CHARSET:
8225 case REGEX_ASCII_RESTRICTED_CHARSET:
8226 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8229 case REGEX_DEPENDS_CHARSET:
8235 ret = reg_node(pRExC_state, op);
8236 FLAGS(ret) = get_regex_charset(RExC_flags);
8238 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
8239 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
8241 goto finish_meta_pat;
8243 switch (get_regex_charset(RExC_flags)) {
8244 case REGEX_LOCALE_CHARSET:
8247 case REGEX_UNICODE_CHARSET:
8250 case REGEX_ASCII_RESTRICTED_CHARSET:
8251 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8254 case REGEX_DEPENDS_CHARSET:
8260 ret = reg_node(pRExC_state, op);
8261 *flagp |= HASWIDTH|SIMPLE;
8262 goto finish_meta_pat;
8264 switch (get_regex_charset(RExC_flags)) {
8265 case REGEX_LOCALE_CHARSET:
8268 case REGEX_UNICODE_CHARSET:
8271 case REGEX_ASCII_RESTRICTED_CHARSET:
8272 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8275 case REGEX_DEPENDS_CHARSET:
8281 ret = reg_node(pRExC_state, op);
8282 *flagp |= HASWIDTH|SIMPLE;
8283 goto finish_meta_pat;
8285 switch (get_regex_charset(RExC_flags)) {
8286 case REGEX_LOCALE_CHARSET:
8289 case REGEX_ASCII_RESTRICTED_CHARSET:
8290 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8293 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8294 case REGEX_UNICODE_CHARSET:
8300 ret = reg_node(pRExC_state, op);
8301 *flagp |= HASWIDTH|SIMPLE;
8302 goto finish_meta_pat;
8304 switch (get_regex_charset(RExC_flags)) {
8305 case REGEX_LOCALE_CHARSET:
8308 case REGEX_ASCII_RESTRICTED_CHARSET:
8309 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
8312 case REGEX_DEPENDS_CHARSET: /* No difference between these */
8313 case REGEX_UNICODE_CHARSET:
8319 ret = reg_node(pRExC_state, op);
8320 *flagp |= HASWIDTH|SIMPLE;
8321 goto finish_meta_pat;
8323 ret = reg_node(pRExC_state, LNBREAK);
8324 *flagp |= HASWIDTH|SIMPLE;
8325 goto finish_meta_pat;
8327 ret = reg_node(pRExC_state, HORIZWS);
8328 *flagp |= HASWIDTH|SIMPLE;
8329 goto finish_meta_pat;
8331 ret = reg_node(pRExC_state, NHORIZWS);
8332 *flagp |= HASWIDTH|SIMPLE;
8333 goto finish_meta_pat;
8335 ret = reg_node(pRExC_state, VERTWS);
8336 *flagp |= HASWIDTH|SIMPLE;
8337 goto finish_meta_pat;
8339 ret = reg_node(pRExC_state, NVERTWS);
8340 *flagp |= HASWIDTH|SIMPLE;
8342 nextchar(pRExC_state);
8343 Set_Node_Length(ret, 2); /* MJD */
8348 char* const oldregxend = RExC_end;
8350 char* parse_start = RExC_parse - 2;
8353 if (RExC_parse[1] == '{') {
8354 /* a lovely hack--pretend we saw [\pX] instead */
8355 RExC_end = strchr(RExC_parse, '}');
8357 const U8 c = (U8)*RExC_parse;
8359 RExC_end = oldregxend;
8360 vFAIL2("Missing right brace on \\%c{}", c);
8365 RExC_end = RExC_parse + 2;
8366 if (RExC_end > oldregxend)
8367 RExC_end = oldregxend;
8371 ret = regclass(pRExC_state,depth+1);
8373 RExC_end = oldregxend;
8376 Set_Node_Offset(ret, parse_start + 2);
8377 Set_Node_Cur_Length(ret);
8378 nextchar(pRExC_state);
8379 *flagp |= HASWIDTH|SIMPLE;
8383 /* Handle \N and \N{NAME} here and not below because it can be
8384 multicharacter. join_exact() will join them up later on.
8385 Also this makes sure that things like /\N{BLAH}+/ and
8386 \N{BLAH} being multi char Just Happen. dmq*/
8388 ret= reg_namedseq(pRExC_state, NULL, flagp);
8390 case 'k': /* Handle \k<NAME> and \k'NAME' */
8393 char ch= RExC_parse[1];
8394 if (ch != '<' && ch != '\'' && ch != '{') {
8396 vFAIL2("Sequence %.2s... not terminated",parse_start);
8398 /* this pretty much dupes the code for (?P=...) in reg(), if
8399 you change this make sure you change that */
8400 char* name_start = (RExC_parse += 2);
8402 SV *sv_dat = reg_scan_name(pRExC_state,
8403 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8404 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
8405 if (RExC_parse == name_start || *RExC_parse != ch)
8406 vFAIL2("Sequence %.3s... not terminated",parse_start);
8409 num = add_data( pRExC_state, 1, "S" );
8410 RExC_rxi->data->data[num]=(void*)sv_dat;
8411 SvREFCNT_inc_simple_void(sv_dat);
8415 ret = reganode(pRExC_state,
8418 : (MORE_ASCII_RESTRICTED)
8420 : (AT_LEAST_UNI_SEMANTICS)
8428 /* override incorrect value set in reganode MJD */
8429 Set_Node_Offset(ret, parse_start+1);
8430 Set_Node_Cur_Length(ret); /* MJD */
8431 nextchar(pRExC_state);
8437 case '1': case '2': case '3': case '4':
8438 case '5': case '6': case '7': case '8': case '9':
8441 bool isg = *RExC_parse == 'g';
8446 if (*RExC_parse == '{') {
8450 if (*RExC_parse == '-') {
8454 if (hasbrace && !isDIGIT(*RExC_parse)) {
8455 if (isrel) RExC_parse--;
8457 goto parse_named_seq;
8459 num = atoi(RExC_parse);
8460 if (isg && num == 0)
8461 vFAIL("Reference to invalid group 0");
8463 num = RExC_npar - num;
8465 vFAIL("Reference to nonexistent or unclosed group");
8467 if (!isg && num > 9 && num >= RExC_npar)
8470 char * const parse_start = RExC_parse - 1; /* MJD */
8471 while (isDIGIT(*RExC_parse))
8473 if (parse_start == RExC_parse - 1)
8474 vFAIL("Unterminated \\g... pattern");
8476 if (*RExC_parse != '}')
8477 vFAIL("Unterminated \\g{...} pattern");
8481 if (num > (I32)RExC_rx->nparens)
8482 vFAIL("Reference to nonexistent group");
8485 ret = reganode(pRExC_state,
8488 : (MORE_ASCII_RESTRICTED)
8490 : (AT_LEAST_UNI_SEMANTICS)
8498 /* override incorrect value set in reganode MJD */
8499 Set_Node_Offset(ret, parse_start+1);
8500 Set_Node_Cur_Length(ret); /* MJD */
8502 nextchar(pRExC_state);
8507 if (RExC_parse >= RExC_end)
8508 FAIL("Trailing \\");
8511 /* Do not generate "unrecognized" warnings here, we fall
8512 back into the quick-grab loop below */
8519 if (RExC_flags & RXf_PMf_EXTENDED) {
8520 if ( reg_skipcomment( pRExC_state ) )
8527 register STRLEN len;
8532 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
8533 regnode * orig_emit;
8535 parse_start = RExC_parse - 1;
8541 orig_emit = RExC_emit; /* Save the original output node position in
8542 case we need to output a different node
8544 ret = reg_node(pRExC_state,
8545 (U8) ((! FOLD) ? EXACT
8548 : (MORE_ASCII_RESTRICTED)
8550 : (AT_LEAST_UNI_SEMANTICS)
8555 for (len = 0, p = RExC_parse - 1;
8556 len < 127 && p < RExC_end;
8559 char * const oldp = p;
8561 if (RExC_flags & RXf_PMf_EXTENDED)
8562 p = regwhite( pRExC_state, p );
8564 case LATIN_SMALL_LETTER_SHARP_S:
8565 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8566 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8567 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8568 goto normal_default;
8578 /* Literal Escapes Switch
8580 This switch is meant to handle escape sequences that
8581 resolve to a literal character.
8583 Every escape sequence that represents something
8584 else, like an assertion or a char class, is handled
8585 in the switch marked 'Special Escapes' above in this
8586 routine, but also has an entry here as anything that
8587 isn't explicitly mentioned here will be treated as
8588 an unescaped equivalent literal.
8592 /* These are all the special escapes. */
8593 case LATIN_SMALL_LETTER_SHARP_S:
8594 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
8595 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
8596 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
8597 goto normal_default;
8598 case 'A': /* Start assertion */
8599 case 'b': case 'B': /* Word-boundary assertion*/
8600 case 'C': /* Single char !DANGEROUS! */
8601 case 'd': case 'D': /* digit class */
8602 case 'g': case 'G': /* generic-backref, pos assertion */
8603 case 'h': case 'H': /* HORIZWS */
8604 case 'k': case 'K': /* named backref, keep marker */
8605 case 'N': /* named char sequence */
8606 case 'p': case 'P': /* Unicode property */
8607 case 'R': /* LNBREAK */
8608 case 's': case 'S': /* space class */
8609 case 'v': case 'V': /* VERTWS */
8610 case 'w': case 'W': /* word class */
8611 case 'X': /* eXtended Unicode "combining character sequence" */
8612 case 'z': case 'Z': /* End of line/string assertion */
8616 /* Anything after here is an escape that resolves to a
8617 literal. (Except digits, which may or may not)
8636 ender = ASCII_TO_NATIVE('\033');
8640 ender = ASCII_TO_NATIVE('\007');
8645 STRLEN brace_len = len;
8647 const char* error_msg;
8649 bool valid = grok_bslash_o(p,
8656 RExC_parse = p; /* going to die anyway; point
8657 to exact spot of failure */
8664 if (PL_encoding && ender < 0x100) {
8665 goto recode_encoding;
8674 char* const e = strchr(p, '}');
8678 vFAIL("Missing right brace on \\x{}");
8681 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8682 | PERL_SCAN_DISALLOW_PREFIX;
8683 STRLEN numlen = e - p - 1;
8684 ender = grok_hex(p + 1, &numlen, &flags, NULL);
8691 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8693 ender = grok_hex(p, &numlen, &flags, NULL);
8696 if (PL_encoding && ender < 0x100)
8697 goto recode_encoding;
8701 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
8703 case '0': case '1': case '2': case '3':case '4':
8704 case '5': case '6': case '7': case '8':case '9':
8706 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
8708 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8710 ender = grok_oct(p, &numlen, &flags, NULL);
8720 if (PL_encoding && ender < 0x100)
8721 goto recode_encoding;
8725 SV* enc = PL_encoding;
8726 ender = reg_recode((const char)(U8)ender, &enc);
8727 if (!enc && SIZE_ONLY)
8728 ckWARNreg(p, "Invalid escape in the specified encoding");
8734 FAIL("Trailing \\");
8737 if (!SIZE_ONLY&& isALPHA(*p)) {
8738 /* Include any { following the alpha to emphasize
8739 * that it could be part of an escape at some point
8741 int len = (*(p + 1) == '{') ? 2 : 1;
8742 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
8744 goto normal_default;
8749 if (UTF8_IS_START(*p) && UTF) {
8751 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
8752 &numlen, UTF8_ALLOW_DEFAULT);
8758 } /* End of switch on the literal */
8760 /* Certain characters are problematic because their folded
8761 * length is so different from their original length that it
8762 * isn't handleable by the optimizer. They are therefore not
8763 * placed in an EXACTish node; and are here handled specially.
8764 * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
8765 * putting it in a special node keeps regexec from having to
8766 * deal with a non-utf8 multi-char fold */
8768 && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
8769 && is_TRICKYFOLD_cp(ender))
8771 /* If is in middle of outputting characters into an
8772 * EXACTish node, go output what we have so far, and
8773 * position the parse so that this will be called again
8776 p = RExC_parse + len - 1;
8781 /* Here we are ready to output our tricky fold
8782 * character. What's done is to pretend it's in a
8783 * [bracketed] class, and let the code that deals with
8784 * those handle it, as that code has all the
8785 * intelligence necessary. First save the current
8786 * parse state, get rid of the already allocated EXACT
8787 * node that the ANYOFV node will replace, and point
8788 * the parse to a buffer which we fill with the
8789 * character we want the regclass code to think is
8791 char* const oldregxend = RExC_end;
8793 RExC_emit = orig_emit;
8794 RExC_parse = tmpbuf;
8796 tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
8797 tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
8798 RExC_end = RExC_parse + 2;
8801 tmpbuf[0] = (char) ender;
8802 RExC_end = RExC_parse + 1;
8805 ret = regclass(pRExC_state,depth+1);
8807 /* Here, have parsed the buffer. Reset the parse to
8808 * the actual input, and return */
8809 RExC_end = oldregxend;
8812 Set_Node_Offset(ret, RExC_parse);
8813 Set_Node_Cur_Length(ret);
8814 nextchar(pRExC_state);
8815 *flagp |= HASWIDTH|SIMPLE;
8820 if ( RExC_flags & RXf_PMf_EXTENDED)
8821 p = regwhite( pRExC_state, p );
8823 /* Prime the casefolded buffer. Locale rules, which apply
8824 * only to code points < 256, aren't known until execution,
8825 * so for them, just output the original character using
8827 if (LOC && ender < 256) {
8828 if (UNI_IS_INVARIANT(ender)) {
8829 *tmpbuf = (U8) ender;
8832 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
8833 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
8837 else if (isASCII(ender)) { /* Note: Here can't also be LOC
8839 ender = toLOWER(ender);
8840 *tmpbuf = (U8) ender;
8843 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
8845 /* Locale and /aa require more selectivity about the
8846 * fold, so are handled below. Otherwise, here, just
8848 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
8851 /* Under locale rules or /aa we are not to mix,
8852 * respectively, ords < 256 or ASCII with non-. So
8853 * reject folds that mix them, using only the
8854 * non-folded code point. So do the fold to a
8855 * temporary, and inspect each character in it. */
8856 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
8858 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
8859 U8* e = s + foldlen;
8860 bool fold_ok = TRUE;
8864 || (LOC && (UTF8_IS_INVARIANT(*s)
8865 || UTF8_IS_DOWNGRADEABLE_START(*s))))
8873 Copy(trialbuf, tmpbuf, foldlen, U8);
8877 uvuni_to_utf8(tmpbuf, ender);
8878 foldlen = UNISKIP(ender);
8882 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
8887 /* Emit all the Unicode characters. */
8889 for (foldbuf = tmpbuf;
8891 foldlen -= numlen) {
8892 ender = utf8_to_uvchr(foldbuf, &numlen);
8894 const STRLEN unilen = reguni(pRExC_state, ender, s);
8897 /* In EBCDIC the numlen
8898 * and unilen can differ. */
8900 if (numlen >= foldlen)
8904 break; /* "Can't happen." */
8908 const STRLEN unilen = reguni(pRExC_state, ender, s);
8917 REGC((char)ender, s++);
8923 /* Emit all the Unicode characters. */
8925 for (foldbuf = tmpbuf;
8927 foldlen -= numlen) {
8928 ender = utf8_to_uvchr(foldbuf, &numlen);
8930 const STRLEN unilen = reguni(pRExC_state, ender, s);
8933 /* In EBCDIC the numlen
8934 * and unilen can differ. */
8936 if (numlen >= foldlen)
8944 const STRLEN unilen = reguni(pRExC_state, ender, s);
8953 REGC((char)ender, s++);
8955 loopdone: /* Jumped to when encounters something that shouldn't be in
8958 Set_Node_Cur_Length(ret); /* MJD */
8959 nextchar(pRExC_state);
8961 /* len is STRLEN which is unsigned, need to copy to signed */
8964 vFAIL("Internal disaster");
8968 if (len == 1 && UNI_IS_INVARIANT(ender))
8972 RExC_size += STR_SZ(len);
8975 RExC_emit += STR_SZ(len);
8983 /* Jumped to when an unrecognized character set is encountered */
8985 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
8990 S_regwhite( RExC_state_t *pRExC_state, char *p )
8992 const char *e = RExC_end;
8994 PERL_ARGS_ASSERT_REGWHITE;
8999 else if (*p == '#') {
9008 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9016 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9017 Character classes ([:foo:]) can also be negated ([:^foo:]).
9018 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9019 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9020 but trigger failures because they are currently unimplemented. */
9022 #define POSIXCC_DONE(c) ((c) == ':')
9023 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9024 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9027 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9030 I32 namedclass = OOB_NAMEDCLASS;
9032 PERL_ARGS_ASSERT_REGPPOSIXCC;
9034 if (value == '[' && RExC_parse + 1 < RExC_end &&
9035 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9036 POSIXCC(UCHARAT(RExC_parse))) {
9037 const char c = UCHARAT(RExC_parse);
9038 char* const s = RExC_parse++;
9040 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9042 if (RExC_parse == RExC_end)
9043 /* Grandfather lone [:, [=, [. */
9046 const char* const t = RExC_parse++; /* skip over the c */
9049 if (UCHARAT(RExC_parse) == ']') {
9050 const char *posixcc = s + 1;
9051 RExC_parse++; /* skip over the ending ] */
9054 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9055 const I32 skip = t - posixcc;
9057 /* Initially switch on the length of the name. */
9060 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9061 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9064 /* Names all of length 5. */
9065 /* alnum alpha ascii blank cntrl digit graph lower
9066 print punct space upper */
9067 /* Offset 4 gives the best switch position. */
9068 switch (posixcc[4]) {
9070 if (memEQ(posixcc, "alph", 4)) /* alpha */
9071 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9074 if (memEQ(posixcc, "spac", 4)) /* space */
9075 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9078 if (memEQ(posixcc, "grap", 4)) /* graph */
9079 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9082 if (memEQ(posixcc, "asci", 4)) /* ascii */
9083 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9086 if (memEQ(posixcc, "blan", 4)) /* blank */
9087 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9090 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9091 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9094 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9095 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9098 if (memEQ(posixcc, "lowe", 4)) /* lower */
9099 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9100 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9101 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9104 if (memEQ(posixcc, "digi", 4)) /* digit */
9105 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9106 else if (memEQ(posixcc, "prin", 4)) /* print */
9107 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9108 else if (memEQ(posixcc, "punc", 4)) /* punct */
9109 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9114 if (memEQ(posixcc, "xdigit", 6))
9115 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9119 if (namedclass == OOB_NAMEDCLASS)
9120 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9122 assert (posixcc[skip] == ':');
9123 assert (posixcc[skip+1] == ']');
9124 } else if (!SIZE_ONLY) {
9125 /* [[=foo=]] and [[.foo.]] are still future. */
9127 /* adjust RExC_parse so the warning shows after
9129 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9131 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9134 /* Maternal grandfather:
9135 * "[:" ending in ":" but not in ":]" */
9145 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
9149 PERL_ARGS_ASSERT_CHECKPOSIXCC;
9151 if (POSIXCC(UCHARAT(RExC_parse))) {
9152 const char *s = RExC_parse;
9153 const char c = *s++;
9157 if (*s && c == *s && s[1] == ']') {
9159 "POSIX syntax [%c %c] belongs inside character classes",
9162 /* [[=foo=]] and [[.foo.]] are still future. */
9163 if (POSIXCC_NOTYET(c)) {
9164 /* adjust RExC_parse so the error shows after
9166 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
9168 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9174 /* No locale test, and always Unicode semantics */
9175 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
9177 for (value = 0; value < 256; value++) \
9179 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9183 case ANYOF_N##NAME: \
9184 for (value = 0; value < 256; value++) \
9186 stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9191 /* Like the above, but there are differences if we are in uni-8-bit or not, so
9192 * there are two tests passed in, to use depending on that. There aren't any
9193 * cases where the label is different from the name, so no need for that
9195 #define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
9197 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
9198 else if (UNI_SEMANTICS) { \
9199 for (value = 0; value < 256; value++) { \
9200 if (TEST_8(value)) stored += \
9201 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9205 for (value = 0; value < 128; value++) { \
9206 if (TEST_7(UNI_TO_NATIVE(value))) stored += \
9207 set_regclass_bit(pRExC_state, ret, \
9208 (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9214 case ANYOF_N##NAME: \
9215 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
9216 else if (UNI_SEMANTICS) { \
9217 for (value = 0; value < 256; value++) { \
9218 if (! TEST_8(value)) stored += \
9219 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
9223 for (value = 0; value < 128; value++) { \
9224 if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
9225 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9227 if (AT_LEAST_ASCII_RESTRICTED) { \
9228 for (value = 128; value < 256; value++) { \
9229 stored += set_regclass_bit( \
9230 pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
9232 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
9235 /* For a non-ut8 target string with DEPENDS semantics, all above \
9236 * ASCII Latin1 code points match the complement of any of the \
9237 * classes. But in utf8, they have their Unicode semantics, so \
9238 * can't just set them in the bitmap, or else regexec.c will think \
9239 * they matched when they shouldn't. */ \
9240 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
9248 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9251 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
9252 * Locale folding is done at run-time, so this function should not be
9253 * called for nodes that are for locales.
9255 * This function sets the bit corresponding to the fold of the input
9256 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
9259 * It also knows about the characters that are in the bitmap that have
9260 * folds that are matchable only outside it, and sets the appropriate lists
9263 * It returns the number of bits that actually changed from 0 to 1 */
9268 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
9270 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
9273 /* It assumes the bit for 'value' has already been set */
9274 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
9275 ANYOF_BITMAP_SET(node, fold);
9278 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
9279 /* Certain Latin1 characters have matches outside the bitmap. To get
9280 * here, 'value' is one of those characters. None of these matches is
9281 * valid for ASCII characters under /aa, which have been excluded by
9282 * the 'if' above. The matches fall into three categories:
9283 * 1) They are singly folded-to or -from an above 255 character, as
9284 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
9286 * 2) They are part of a multi-char fold with another character in the
9287 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
9288 * 3) They are part of a multi-char fold with a character not in the
9289 * bitmap, such as various ligatures.
9290 * We aren't dealing fully with multi-char folds, except we do deal
9291 * with the pattern containing a character that has a multi-char fold
9292 * (not so much the inverse).
9293 * For types 1) and 3), the matches only happen when the target string
9294 * is utf8; that's not true for 2), and we set a flag for it.
9296 * The code below adds to the passed in inversion list the single fold
9297 * closures for 'value'. The values are hard-coded here so that an
9298 * innocent-looking character class, like /[ks]/i won't have to go out
9299 * to disk to find the possible matches. XXX It would be better to
9300 * generate these via regen, in case a new version of the Unicode
9301 * standard adds new mappings, though that is not really likely. */
9306 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
9310 /* LATIN SMALL LETTER LONG S */
9311 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
9314 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9315 GREEK_SMALL_LETTER_MU);
9316 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9317 GREEK_CAPITAL_LETTER_MU);
9319 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9320 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9322 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
9323 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
9324 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9325 PL_fold_latin1[value]);
9328 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9329 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
9330 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9332 case LATIN_SMALL_LETTER_SHARP_S:
9334 /* Under /d and /u, this can match the two chars "ss" */
9335 if (! MORE_ASCII_RESTRICTED) {
9336 add_alternate(alternate_ptr, (U8 *) "ss", 2);
9338 /* And under /u, it can match even if the target is not
9340 if (UNI_SEMANTICS) {
9341 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9349 /* These all are targets of multi-character folds, which can
9350 * occur with only non-Latin1 characters in the fold, so they
9351 * can match if the target string isn't UTF-8 */
9352 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
9360 /* These all are targets of multi-character folds, which occur
9361 * only with a non-Latin1 character as part of the fold, so
9362 * they can't match unless the target string is in UTF-8, so no
9363 * action here is necessary */
9366 /* Use deprecated warning to increase the chances of this
9368 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
9372 else if (DEPENDS_SEMANTICS
9374 && PL_fold_latin1[value] != value)
9376 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
9377 * folds only when the target string is in UTF-8. We add the fold
9378 * here to the list of things to match outside the bitmap, which
9379 * won't be looked at unless it is UTF8 (or else if something else
9380 * says to look even if not utf8, but those things better not happen
9381 * under DEPENDS semantics. */
9382 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
9389 PERL_STATIC_INLINE U8
9390 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr)
9392 /* This inline function sets a bit in the bitmap if not already set, and if
9393 * appropriate, its fold, returning the number of bits that actually
9394 * changed from 0 to 1 */
9398 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
9400 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
9404 ANYOF_BITMAP_SET(node, value);
9407 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
9408 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
9415 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
9417 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
9418 * alternate list, pointed to by 'alternate_ptr'. This is an array of
9419 * the multi-character folds of characters in the node */
9422 PERL_ARGS_ASSERT_ADD_ALTERNATE;
9424 if (! *alternate_ptr) {
9425 *alternate_ptr = newAV();
9427 sv = newSVpvn_utf8((char*)string, len, TRUE);
9428 av_push(*alternate_ptr, sv);
9433 parse a class specification and produce either an ANYOF node that
9434 matches the pattern or perhaps will be optimized into an EXACTish node
9435 instead. The node contains a bit map for the first 256 characters, with the
9436 corresponding bit set if that character is in the list. For characters
9437 above 255, a range list is used */
9440 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
9443 register UV nextvalue;
9444 register IV prevvalue = OOB_UNICODE;
9445 register IV range = 0;
9446 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
9447 register regnode *ret;
9450 char *rangebegin = NULL;
9451 bool need_class = 0;
9453 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9454 than just initialized. */
9457 /* code points this node matches that can't be stored in the bitmap */
9458 HV* nonbitmap = NULL;
9460 /* The items that are to match that aren't stored in the bitmap, but are a
9461 * result of things that are stored there. This is the fold closure of
9462 * such a character, either because it has DEPENDS semantics and shouldn't
9463 * be matched unless the target string is utf8, or is a code point that is
9464 * too large for the bit map, as for example, the fold of the MICRO SIGN is
9465 * above 255. This all is solely for performance reasons. By having this
9466 * code know the outside-the-bitmap folds that the bitmapped characters are
9467 * involved with, we don't have to go out to disk to find the list of
9468 * matches, unless the character class includes code points that aren't
9469 * storable in the bit map. That means that a character class with an 's'
9470 * in it, for example, doesn't need to go out to disk to find everything
9471 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
9472 * empty unless there is something whose fold we don't know about, and will
9473 * have to go out to the disk to find. */
9474 HV* l1_fold_invlist = NULL;
9476 /* List of multi-character folds that are matched by this node */
9477 AV* unicode_alternate = NULL;
9479 UV literal_endpoint = 0;
9481 UV stored = 0; /* how many chars stored in the bitmap */
9483 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
9484 case we need to change the emitted regop to an EXACT. */
9485 const char * orig_parse = RExC_parse;
9486 GET_RE_DEBUG_FLAGS_DECL;
9488 PERL_ARGS_ASSERT_REGCLASS;
9490 PERL_UNUSED_ARG(depth);
9493 DEBUG_PARSE("clas");
9495 /* Assume we are going to generate an ANYOF node. */
9496 ret = reganode(pRExC_state, ANYOF, 0);
9500 ANYOF_FLAGS(ret) = 0;
9503 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
9507 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
9511 RExC_size += ANYOF_SKIP;
9512 #ifdef ANYOF_ADD_LOC_SKIP
9514 RExC_size += ANYOF_ADD_LOC_SKIP;
9517 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
9520 RExC_emit += ANYOF_SKIP;
9522 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
9523 #ifdef ANYOF_ADD_LOC_SKIP
9524 RExC_emit += ANYOF_ADD_LOC_SKIP;
9527 ANYOF_BITMAP_ZERO(ret);
9528 listsv = newSVpvs("# comment\n");
9529 initial_listsv_len = SvCUR(listsv);
9532 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9534 if (!SIZE_ONLY && POSIXCC(nextvalue))
9535 checkposixcc(pRExC_state);
9537 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
9538 if (UCHARAT(RExC_parse) == ']')
9542 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
9546 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9549 rangebegin = RExC_parse;
9551 value = utf8n_to_uvchr((U8*)RExC_parse,
9552 RExC_end - RExC_parse,
9553 &numlen, UTF8_ALLOW_DEFAULT);
9554 RExC_parse += numlen;
9557 value = UCHARAT(RExC_parse++);
9559 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
9560 if (value == '[' && POSIXCC(nextvalue))
9561 namedclass = regpposixcc(pRExC_state, value);
9562 else if (value == '\\') {
9564 value = utf8n_to_uvchr((U8*)RExC_parse,
9565 RExC_end - RExC_parse,
9566 &numlen, UTF8_ALLOW_DEFAULT);
9567 RExC_parse += numlen;
9570 value = UCHARAT(RExC_parse++);
9571 /* Some compilers cannot handle switching on 64-bit integer
9572 * values, therefore value cannot be an UV. Yes, this will
9573 * be a problem later if we want switch on Unicode.
9574 * A similar issue a little bit later when switching on
9575 * namedclass. --jhi */
9576 switch ((I32)value) {
9577 case 'w': namedclass = ANYOF_ALNUM; break;
9578 case 'W': namedclass = ANYOF_NALNUM; break;
9579 case 's': namedclass = ANYOF_SPACE; break;
9580 case 'S': namedclass = ANYOF_NSPACE; break;
9581 case 'd': namedclass = ANYOF_DIGIT; break;
9582 case 'D': namedclass = ANYOF_NDIGIT; break;
9583 case 'v': namedclass = ANYOF_VERTWS; break;
9584 case 'V': namedclass = ANYOF_NVERTWS; break;
9585 case 'h': namedclass = ANYOF_HORIZWS; break;
9586 case 'H': namedclass = ANYOF_NHORIZWS; break;
9587 case 'N': /* Handle \N{NAME} in class */
9589 /* We only pay attention to the first char of
9590 multichar strings being returned. I kinda wonder
9591 if this makes sense as it does change the behaviour
9592 from earlier versions, OTOH that behaviour was broken
9594 UV v; /* value is register so we cant & it /grrr */
9595 if (reg_namedseq(pRExC_state, &v, NULL)) {
9605 if (RExC_parse >= RExC_end)
9606 vFAIL2("Empty \\%c{}", (U8)value);
9607 if (*RExC_parse == '{') {
9608 const U8 c = (U8)value;
9609 e = strchr(RExC_parse++, '}');
9611 vFAIL2("Missing right brace on \\%c{}", c);
9612 while (isSPACE(UCHARAT(RExC_parse)))
9614 if (e == RExC_parse)
9615 vFAIL2("Empty \\%c{}", c);
9617 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
9625 if (UCHARAT(RExC_parse) == '^') {
9628 value = value == 'p' ? 'P' : 'p'; /* toggle */
9629 while (isSPACE(UCHARAT(RExC_parse))) {
9635 /* Add the property name to the list. If /i matching, give
9636 * a different name which consists of the normal name
9637 * sandwiched between two underscores and '_i'. The design
9638 * is discussed in the commit message for this. */
9639 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
9640 (value=='p' ? '+' : '!'),
9649 /* The \p could match something in the Latin1 range, hence
9650 * something that isn't utf8 */
9651 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
9652 namedclass = ANYOF_MAX; /* no official name, but it's named */
9654 /* \p means they want Unicode semantics */
9655 RExC_uni_semantics = 1;
9658 case 'n': value = '\n'; break;
9659 case 'r': value = '\r'; break;
9660 case 't': value = '\t'; break;
9661 case 'f': value = '\f'; break;
9662 case 'b': value = '\b'; break;
9663 case 'e': value = ASCII_TO_NATIVE('\033');break;
9664 case 'a': value = ASCII_TO_NATIVE('\007');break;
9666 RExC_parse--; /* function expects to be pointed at the 'o' */
9668 const char* error_msg;
9669 bool valid = grok_bslash_o(RExC_parse,
9674 RExC_parse += numlen;
9679 if (PL_encoding && value < 0x100) {
9680 goto recode_encoding;
9684 if (*RExC_parse == '{') {
9685 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9686 | PERL_SCAN_DISALLOW_PREFIX;
9687 char * const e = strchr(RExC_parse++, '}');
9689 vFAIL("Missing right brace on \\x{}");
9691 numlen = e - RExC_parse;
9692 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9696 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9698 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
9699 RExC_parse += numlen;
9701 if (PL_encoding && value < 0x100)
9702 goto recode_encoding;
9705 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
9707 case '0': case '1': case '2': case '3': case '4':
9708 case '5': case '6': case '7':
9710 /* Take 1-3 octal digits */
9711 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9713 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
9714 RExC_parse += numlen;
9715 if (PL_encoding && value < 0x100)
9716 goto recode_encoding;
9721 SV* enc = PL_encoding;
9722 value = reg_recode((const char)(U8)value, &enc);
9723 if (!enc && SIZE_ONLY)
9724 ckWARNreg(RExC_parse,
9725 "Invalid escape in the specified encoding");
9729 /* Allow \_ to not give an error */
9730 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
9731 ckWARN2reg(RExC_parse,
9732 "Unrecognized escape \\%c in character class passed through",
9737 } /* end of \blah */
9743 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
9745 /* What matches in a locale is not known until runtime, so need to
9746 * (one time per class) allocate extra space to pass to regexec.
9747 * The space will contain a bit for each named class that is to be
9748 * matched against. This isn't needed for \p{} and pseudo-classes,
9749 * as they are not affected by locale, and hence are dealt with
9751 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
9754 #ifdef ANYOF_CLASS_ADD_SKIP
9755 RExC_size += ANYOF_CLASS_ADD_SKIP;
9759 #ifdef ANYOF_CLASS_ADD_SKIP
9760 RExC_emit += ANYOF_CLASS_ADD_SKIP;
9762 ANYOF_CLASS_ZERO(ret);
9764 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
9767 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
9768 * literal, as is the character that began the false range, i.e.
9769 * the 'a' in the examples */
9773 RExC_parse >= rangebegin ?
9774 RExC_parse - rangebegin : 0;
9775 ckWARN4reg(RExC_parse,
9776 "False [] range \"%*.*s\"",
9780 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9781 if (prevvalue < 256) {
9783 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
9786 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
9790 range = 0; /* this was not a true range */
9796 const char *what = NULL;
9799 /* Possible truncation here but in some 64-bit environments
9800 * the compiler gets heartburn about switch on 64-bit values.
9801 * A similar issue a little earlier when switching on value.
9803 switch ((I32)namedclass) {
9805 case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
9806 case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
9807 case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
9808 case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
9809 case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
9810 case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
9811 case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
9812 case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
9813 case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
9814 case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
9815 /* \s, \w match all unicode if utf8. */
9816 case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
9817 case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
9818 case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
9819 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
9820 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
9823 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
9825 for (value = 0; value < 128; value++)
9827 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9830 what = NULL; /* Doesn't match outside ascii, so
9831 don't want to add +utf8:: */
9835 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
9837 for (value = 128; value < 256; value++)
9839 set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
9841 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9847 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
9849 /* consecutive digits assumed */
9850 for (value = '0'; value <= '9'; value++)
9852 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9859 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
9861 /* consecutive digits assumed */
9862 for (value = 0; value < '0'; value++)
9864 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9865 for (value = '9' + 1; value < 256; value++)
9867 set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
9871 if (AT_LEAST_ASCII_RESTRICTED ) {
9872 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
9876 /* this is to handle \p and \P */
9879 vFAIL("Invalid [::] class");
9882 if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
9883 /* Strings such as "+utf8::isWord\n" */
9884 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
9889 } /* end of namedclass \blah */
9892 if (prevvalue > (IV)value) /* b-a */ {
9893 const int w = RExC_parse - rangebegin;
9894 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
9895 range = 0; /* not a valid range */
9899 prevvalue = value; /* save the beginning of the range */
9900 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
9901 RExC_parse[1] != ']') {
9904 /* a bad range like \w-, [:word:]- ? */
9905 if (namedclass > OOB_NAMEDCLASS) {
9906 if (ckWARN(WARN_REGEXP)) {
9908 RExC_parse >= rangebegin ?
9909 RExC_parse - rangebegin : 0;
9911 "False [] range \"%*.*s\"",
9916 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
9918 range = 1; /* yeah, it's a range! */
9919 continue; /* but do it the next time */
9923 /* non-Latin1 code point implies unicode semantics. Must be set in
9924 * pass1 so is there for the whole of pass 2 */
9926 RExC_uni_semantics = 1;
9929 /* now is the next time */
9931 if (prevvalue < 256) {
9932 const IV ceilvalue = value < 256 ? value : 255;
9935 /* In EBCDIC [\x89-\x91] should include
9936 * the \x8e but [i-j] should not. */
9937 if (literal_endpoint == 2 &&
9938 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
9939 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
9941 if (isLOWER(prevvalue)) {
9942 for (i = prevvalue; i <= ceilvalue; i++)
9943 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9945 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9948 for (i = prevvalue; i <= ceilvalue; i++)
9949 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
9951 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9957 for (i = prevvalue; i <= ceilvalue; i++) {
9958 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
9962 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
9963 const UV natvalue = NATIVE_TO_UNI(value);
9964 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
9967 literal_endpoint = 0;
9971 range = 0; /* this range (if it was one) is done now */
9978 /****** !SIZE_ONLY AFTER HERE *********/
9980 /* If folding and there are code points above 255, we calculate all
9981 * characters that could fold to or from the ones already on the list */
9982 if (FOLD && nonbitmap) {
9985 HV* fold_intersection;
9988 /* This is a list of all the characters that participate in folds
9989 * (except marks, etc in multi-char folds */
9990 if (! PL_utf8_foldable) {
9991 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
9992 PL_utf8_foldable = _swash_to_invlist(swash);
9995 /* This is a hash that for a particular fold gives all characters
9996 * that are involved in it */
9997 if (! PL_utf8_foldclosures) {
9999 /* If we were unable to find any folds, then we likely won't be
10000 * able to find the closures. So just create an empty list.
10001 * Folding will effectively be restricted to the non-Unicode rules
10002 * hard-coded into Perl. (This case happens legitimately during
10003 * compilation of Perl itself before the Unicode tables are
10005 if (invlist_len(PL_utf8_foldable) == 0) {
10006 PL_utf8_foldclosures = _new_invlist(0);
10008 /* If the folds haven't been read in, call a fold function
10010 if (! PL_utf8_tofold) {
10011 U8 dummy[UTF8_MAXBYTES+1];
10013 to_utf8_fold((U8*) "A", dummy, &dummy_len);
10015 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
10019 /* Only the characters in this class that participate in folds need
10020 * be checked. Get the intersection of this class and all the
10021 * possible characters that are foldable. This can quickly narrow
10022 * down a large class */
10023 fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
10025 /* Now look at the foldable characters in this class individually */
10026 fold_list = invlist_array(fold_intersection);
10027 for (i = 0; i < invlist_len(fold_intersection); i++) {
10030 /* The next entry is the beginning of the range that is in the
10032 UV start = fold_list[i++];
10035 /* The next entry is the beginning of the next range, which
10036 * isn't in the class, so the end of the current range is one
10037 * less than that */
10038 UV end = fold_list[i] - 1;
10040 /* Look at every character in the range */
10041 for (j = start; j <= end; j++) {
10044 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10046 const UV f = to_uni_fold(j, foldbuf, &foldlen);
10048 if (foldlen > (STRLEN)UNISKIP(f)) {
10050 /* Any multicharacter foldings (disallowed in
10051 * lookbehind patterns) require the following
10052 * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
10053 * E folds into "pq" and F folds into "rst", all other
10054 * characters fold to single characters. We save away
10055 * these multicharacter foldings, to be later saved as
10056 * part of the additional "s" data. */
10057 if (! RExC_in_lookbehind) {
10059 U8* e = foldbuf + foldlen;
10061 /* If any of the folded characters of this are in
10062 * the Latin1 range, tell the regex engine that
10063 * this can match a non-utf8 target string. The
10064 * only multi-byte fold whose source is in the
10065 * Latin1 range (U+00DF) applies only when the
10066 * target string is utf8, or under unicode rules */
10067 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
10070 /* Can't mix ascii with non- under /aa */
10071 if (MORE_ASCII_RESTRICTED
10072 && (isASCII(*loc) != isASCII(j)))
10074 goto end_multi_fold;
10076 if (UTF8_IS_INVARIANT(*loc)
10077 || UTF8_IS_DOWNGRADEABLE_START(*loc))
10079 /* Can't mix above and below 256 under
10082 goto end_multi_fold;
10085 |= ANYOF_NONBITMAP_NON_UTF8;
10088 loc += UTF8SKIP(loc);
10092 add_alternate(&unicode_alternate, foldbuf, foldlen);
10097 /* Single character fold. Add everything in its fold
10098 * closure to the list that this node should match */
10101 /* The fold closures data structure is a hash with the
10102 * keys being every character that is folded to, like
10103 * 'k', and the values each an array of everything that
10104 * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
10105 if ((listp = hv_fetch(PL_utf8_foldclosures,
10106 (char *) foldbuf, foldlen, FALSE)))
10108 AV* list = (AV*) *listp;
10110 for (k = 0; k <= av_len(list); k++) {
10111 SV** c_p = av_fetch(list, k, FALSE);
10114 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
10118 /* /aa doesn't allow folds between ASCII and
10119 * non-; /l doesn't allow them between above
10121 if ((MORE_ASCII_RESTRICTED
10122 && (isASCII(c) != isASCII(j)))
10123 || (LOC && ((c < 256) != (j < 256))))
10128 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
10129 stored += set_regclass_bit(pRExC_state,
10132 &l1_fold_invlist, &unicode_alternate);
10134 /* It may be that the code point is already
10135 * in this range or already in the bitmap,
10136 * in which case we need do nothing */
10137 else if ((c < start || c > end)
10139 || ! ANYOF_BITMAP_TEST(ret, c)))
10141 nonbitmap = add_cp_to_invlist(nonbitmap, c);
10148 invlist_destroy(fold_intersection);
10151 /* Combine the two lists into one. */
10152 if (l1_fold_invlist) {
10154 nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
10157 nonbitmap = l1_fold_invlist;
10161 /* Here, we have calculated what code points should be in the character
10162 * class. Now we can see about various optimizations. Fold calculation
10163 * needs to take place before inversion. Otherwise /[^k]/i would invert to
10164 * include K, which under /i would match k. */
10166 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
10167 * set the FOLD flag yet, so this this does optimize those. It doesn't
10168 * optimize locale. Doing so perhaps could be done as long as there is
10169 * nothing like \w in it; some thought also would have to be given to the
10170 * interaction with above 0x100 chars */
10172 && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
10173 && ! unicode_alternate
10175 && SvCUR(listsv) == initial_listsv_len)
10177 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
10178 ANYOF_BITMAP(ret)[value] ^= 0xFF;
10179 stored = 256 - stored;
10181 /* The inversion means that everything above 255 is matched; and at the
10182 * same time we clear the invert flag */
10183 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
10186 /* Folding in the bitmap is taken care of above, but not for locale (for
10187 * which we have to wait to see what folding is in effect at runtime), and
10188 * for things not in the bitmap. Set run-time fold flag for these */
10189 if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
10190 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
10193 /* A single character class can be "optimized" into an EXACTish node.
10194 * Note that since we don't currently count how many characters there are
10195 * outside the bitmap, we are XXX missing optimization possibilities for
10196 * them. This optimization can't happen unless this is a truly single
10197 * character class, which means that it can't be an inversion into a
10198 * many-character class, and there must be no possibility of there being
10199 * things outside the bitmap. 'stored' (only) for locales doesn't include
10200 * \w, etc, so have to make a special test that they aren't present
10202 * Similarly A 2-character class of the very special form like [bB] can be
10203 * optimized into an EXACTFish node, but only for non-locales, and for
10204 * characters which only have the two folds; so things like 'fF' and 'Ii'
10205 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
10208 && ! unicode_alternate
10209 && SvCUR(listsv) == initial_listsv_len
10210 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
10211 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10212 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
10213 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
10214 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
10215 /* If the latest code point has a fold whose
10216 * bit is set, it must be the only other one */
10217 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
10218 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
10220 /* Note that the information needed to decide to do this optimization
10221 * is not currently available until the 2nd pass, and that the actually
10222 * used EXACTish node takes less space than the calculated ANYOF node,
10223 * and hence the amount of space calculated in the first pass is larger
10224 * than actually used, so this optimization doesn't gain us any space.
10225 * But an EXACT node is faster than an ANYOF node, and can be combined
10226 * with any adjacent EXACT nodes later by the optimizer for further
10227 * gains. The speed of executing an EXACTF is similar to an ANYOF
10228 * node, so the optimization advantage comes from the ability to join
10229 * it to adjacent EXACT nodes */
10231 const char * cur_parse= RExC_parse;
10233 RExC_emit = (regnode *)orig_emit;
10234 RExC_parse = (char *)orig_parse;
10238 /* A locale node with one point can be folded; all the other cases
10239 * with folding will have two points, since we calculate them above
10241 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
10247 } /* else 2 chars in the bit map: the folds of each other */
10248 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
10250 /* To join adjacent nodes, they must be the exact EXACTish type.
10251 * Try to use the most likely type, by using EXACTFU if the regex
10252 * calls for them, or is required because the character is
10256 else { /* Otherwise, more likely to be EXACTF type */
10260 ret = reg_node(pRExC_state, op);
10261 RExC_parse = (char *)cur_parse;
10262 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
10263 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
10264 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
10266 RExC_emit += STR_SZ(2);
10269 *STRING(ret)= (char)value;
10271 RExC_emit += STR_SZ(1);
10273 SvREFCNT_dec(listsv);
10278 UV* nonbitmap_array = invlist_array(nonbitmap);
10279 UV nonbitmap_len = invlist_len(nonbitmap);
10282 /* Here have the full list of items to match that aren't in the
10283 * bitmap. Convert to the structure that the rest of the code is
10284 * expecting. XXX That rest of the code should convert to this
10286 for (i = 0; i < nonbitmap_len; i++) {
10288 /* The next entry is the beginning of the range that is in the
10290 UV start = nonbitmap_array[i++];
10293 /* The next entry is the beginning of the next range, which isn't
10294 * in the class, so the end of the current range is one less than
10295 * that. But if there is no next range, it means that the range
10296 * begun by 'start' extends to infinity, which for this platform
10297 * ends at UV_MAX */
10298 if (i == nonbitmap_len) {
10302 end = nonbitmap_array[i] - 1;
10305 if (start == end) {
10306 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
10309 /* The \t sets the whole range */
10310 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
10315 invlist_destroy(nonbitmap);
10318 if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
10319 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
10320 SvREFCNT_dec(listsv);
10321 SvREFCNT_dec(unicode_alternate);
10325 AV * const av = newAV();
10327 /* The 0th element stores the character class description
10328 * in its textual form: used later (regexec.c:Perl_regclass_swash())
10329 * to initialize the appropriate swash (which gets stored in
10330 * the 1st element), and also useful for dumping the regnode.
10331 * The 2nd element stores the multicharacter foldings,
10332 * used later (regexec.c:S_reginclass()). */
10333 av_store(av, 0, listsv);
10334 av_store(av, 1, NULL);
10335 av_store(av, 2, MUTABLE_SV(unicode_alternate));
10336 if (unicode_alternate) { /* This node is variable length */
10339 rv = newRV_noinc(MUTABLE_SV(av));
10340 n = add_data(pRExC_state, 1, "s");
10341 RExC_rxi->data->data[n] = (void*)rv;
10349 /* reg_skipcomment()
10351 Absorbs an /x style # comments from the input stream.
10352 Returns true if there is more text remaining in the stream.
10353 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
10354 terminates the pattern without including a newline.
10356 Note its the callers responsibility to ensure that we are
10357 actually in /x mode
10362 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
10366 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
10368 while (RExC_parse < RExC_end)
10369 if (*RExC_parse++ == '\n') {
10374 /* we ran off the end of the pattern without ending
10375 the comment, so we have to add an \n when wrapping */
10376 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10384 Advances the parse position, and optionally absorbs
10385 "whitespace" from the inputstream.
10387 Without /x "whitespace" means (?#...) style comments only,
10388 with /x this means (?#...) and # comments and whitespace proper.
10390 Returns the RExC_parse point from BEFORE the scan occurs.
10392 This is the /x friendly way of saying RExC_parse++.
10396 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
10398 char* const retval = RExC_parse++;
10400 PERL_ARGS_ASSERT_NEXTCHAR;
10403 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
10404 RExC_parse[2] == '#') {
10405 while (*RExC_parse != ')') {
10406 if (RExC_parse == RExC_end)
10407 FAIL("Sequence (?#... not terminated");
10413 if (RExC_flags & RXf_PMf_EXTENDED) {
10414 if (isSPACE(*RExC_parse)) {
10418 else if (*RExC_parse == '#') {
10419 if ( reg_skipcomment( pRExC_state ) )
10428 - reg_node - emit a node
10430 STATIC regnode * /* Location. */
10431 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
10434 register regnode *ptr;
10435 regnode * const ret = RExC_emit;
10436 GET_RE_DEBUG_FLAGS_DECL;
10438 PERL_ARGS_ASSERT_REG_NODE;
10441 SIZE_ALIGN(RExC_size);
10445 if (RExC_emit >= RExC_emit_bound)
10446 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10448 NODE_ALIGN_FILL(ret);
10450 FILL_ADVANCE_NODE(ptr, op);
10451 #ifdef RE_TRACK_PATTERN_OFFSETS
10452 if (RExC_offsets) { /* MJD */
10453 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
10454 "reg_node", __LINE__,
10456 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
10457 ? "Overwriting end of array!\n" : "OK",
10458 (UV)(RExC_emit - RExC_emit_start),
10459 (UV)(RExC_parse - RExC_start),
10460 (UV)RExC_offsets[0]));
10461 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
10469 - reganode - emit a node with an argument
10471 STATIC regnode * /* Location. */
10472 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
10475 register regnode *ptr;
10476 regnode * const ret = RExC_emit;
10477 GET_RE_DEBUG_FLAGS_DECL;
10479 PERL_ARGS_ASSERT_REGANODE;
10482 SIZE_ALIGN(RExC_size);
10487 assert(2==regarglen[op]+1);
10489 Anything larger than this has to allocate the extra amount.
10490 If we changed this to be:
10492 RExC_size += (1 + regarglen[op]);
10494 then it wouldn't matter. Its not clear what side effect
10495 might come from that so its not done so far.
10500 if (RExC_emit >= RExC_emit_bound)
10501 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
10503 NODE_ALIGN_FILL(ret);
10505 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
10506 #ifdef RE_TRACK_PATTERN_OFFSETS
10507 if (RExC_offsets) { /* MJD */
10508 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10512 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
10513 "Overwriting end of array!\n" : "OK",
10514 (UV)(RExC_emit - RExC_emit_start),
10515 (UV)(RExC_parse - RExC_start),
10516 (UV)RExC_offsets[0]));
10517 Set_Cur_Node_Offset;
10525 - reguni - emit (if appropriate) a Unicode character
10528 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
10532 PERL_ARGS_ASSERT_REGUNI;
10534 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
10538 - reginsert - insert an operator in front of already-emitted operand
10540 * Means relocating the operand.
10543 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
10546 register regnode *src;
10547 register regnode *dst;
10548 register regnode *place;
10549 const int offset = regarglen[(U8)op];
10550 const int size = NODE_STEP_REGNODE + offset;
10551 GET_RE_DEBUG_FLAGS_DECL;
10553 PERL_ARGS_ASSERT_REGINSERT;
10554 PERL_UNUSED_ARG(depth);
10555 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
10556 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
10565 if (RExC_open_parens) {
10567 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
10568 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
10569 if ( RExC_open_parens[paren] >= opnd ) {
10570 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
10571 RExC_open_parens[paren] += size;
10573 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
10575 if ( RExC_close_parens[paren] >= opnd ) {
10576 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
10577 RExC_close_parens[paren] += size;
10579 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
10584 while (src > opnd) {
10585 StructCopy(--src, --dst, regnode);
10586 #ifdef RE_TRACK_PATTERN_OFFSETS
10587 if (RExC_offsets) { /* MJD 20010112 */
10588 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
10592 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
10593 ? "Overwriting end of array!\n" : "OK",
10594 (UV)(src - RExC_emit_start),
10595 (UV)(dst - RExC_emit_start),
10596 (UV)RExC_offsets[0]));
10597 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
10598 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
10604 place = opnd; /* Op node, where operand used to be. */
10605 #ifdef RE_TRACK_PATTERN_OFFSETS
10606 if (RExC_offsets) { /* MJD */
10607 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
10611 (UV)(place - RExC_emit_start) > RExC_offsets[0]
10612 ? "Overwriting end of array!\n" : "OK",
10613 (UV)(place - RExC_emit_start),
10614 (UV)(RExC_parse - RExC_start),
10615 (UV)RExC_offsets[0]));
10616 Set_Node_Offset(place, RExC_parse);
10617 Set_Node_Length(place, 1);
10620 src = NEXTOPER(place);
10621 FILL_ADVANCE_NODE(place, op);
10622 Zero(src, offset, regnode);
10626 - regtail - set the next-pointer at the end of a node chain of p to val.
10627 - SEE ALSO: regtail_study
10629 /* TODO: All three parms should be const */
10631 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10634 register regnode *scan;
10635 GET_RE_DEBUG_FLAGS_DECL;
10637 PERL_ARGS_ASSERT_REGTAIL;
10639 PERL_UNUSED_ARG(depth);
10645 /* Find last node. */
10648 regnode * const temp = regnext(scan);
10650 SV * const mysv=sv_newmortal();
10651 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
10652 regprop(RExC_rx, mysv, scan);
10653 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
10654 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
10655 (temp == NULL ? "->" : ""),
10656 (temp == NULL ? PL_reg_name[OP(val)] : "")
10664 if (reg_off_by_arg[OP(scan)]) {
10665 ARG_SET(scan, val - scan);
10668 NEXT_OFF(scan) = val - scan;
10674 - regtail_study - set the next-pointer at the end of a node chain of p to val.
10675 - Look for optimizable sequences at the same time.
10676 - currently only looks for EXACT chains.
10678 This is experimental code. The idea is to use this routine to perform
10679 in place optimizations on branches and groups as they are constructed,
10680 with the long term intention of removing optimization from study_chunk so
10681 that it is purely analytical.
10683 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
10684 to control which is which.
10687 /* TODO: All four parms should be const */
10690 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
10693 register regnode *scan;
10695 #ifdef EXPERIMENTAL_INPLACESCAN
10698 GET_RE_DEBUG_FLAGS_DECL;
10700 PERL_ARGS_ASSERT_REGTAIL_STUDY;
10706 /* Find last node. */
10710 regnode * const temp = regnext(scan);
10711 #ifdef EXPERIMENTAL_INPLACESCAN
10712 if (PL_regkind[OP(scan)] == EXACT)
10713 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
10717 switch (OP(scan)) {
10723 if( exact == PSEUDO )
10725 else if ( exact != OP(scan) )
10734 SV * const mysv=sv_newmortal();
10735 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
10736 regprop(RExC_rx, mysv, scan);
10737 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
10738 SvPV_nolen_const(mysv),
10739 REG_NODE_NUM(scan),
10740 PL_reg_name[exact]);
10747 SV * const mysv_val=sv_newmortal();
10748 DEBUG_PARSE_MSG("");
10749 regprop(RExC_rx, mysv_val, val);
10750 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
10751 SvPV_nolen_const(mysv_val),
10752 (IV)REG_NODE_NUM(val),
10756 if (reg_off_by_arg[OP(scan)]) {
10757 ARG_SET(scan, val - scan);
10760 NEXT_OFF(scan) = val - scan;
10768 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
10772 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
10778 for (bit=0; bit<32; bit++) {
10779 if (flags & (1<<bit)) {
10780 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
10783 if (!set++ && lead)
10784 PerlIO_printf(Perl_debug_log, "%s",lead);
10785 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
10788 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
10789 if (!set++ && lead) {
10790 PerlIO_printf(Perl_debug_log, "%s",lead);
10793 case REGEX_UNICODE_CHARSET:
10794 PerlIO_printf(Perl_debug_log, "UNICODE");
10796 case REGEX_LOCALE_CHARSET:
10797 PerlIO_printf(Perl_debug_log, "LOCALE");
10799 case REGEX_ASCII_RESTRICTED_CHARSET:
10800 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
10802 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
10803 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
10806 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
10812 PerlIO_printf(Perl_debug_log, "\n");
10814 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
10820 Perl_regdump(pTHX_ const regexp *r)
10824 SV * const sv = sv_newmortal();
10825 SV *dsv= sv_newmortal();
10826 RXi_GET_DECL(r,ri);
10827 GET_RE_DEBUG_FLAGS_DECL;
10829 PERL_ARGS_ASSERT_REGDUMP;
10831 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
10833 /* Header fields of interest. */
10834 if (r->anchored_substr) {
10835 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
10836 RE_SV_DUMPLEN(r->anchored_substr), 30);
10837 PerlIO_printf(Perl_debug_log,
10838 "anchored %s%s at %"IVdf" ",
10839 s, RE_SV_TAIL(r->anchored_substr),
10840 (IV)r->anchored_offset);
10841 } else if (r->anchored_utf8) {
10842 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
10843 RE_SV_DUMPLEN(r->anchored_utf8), 30);
10844 PerlIO_printf(Perl_debug_log,
10845 "anchored utf8 %s%s at %"IVdf" ",
10846 s, RE_SV_TAIL(r->anchored_utf8),
10847 (IV)r->anchored_offset);
10849 if (r->float_substr) {
10850 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
10851 RE_SV_DUMPLEN(r->float_substr), 30);
10852 PerlIO_printf(Perl_debug_log,
10853 "floating %s%s at %"IVdf"..%"UVuf" ",
10854 s, RE_SV_TAIL(r->float_substr),
10855 (IV)r->float_min_offset, (UV)r->float_max_offset);
10856 } else if (r->float_utf8) {
10857 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
10858 RE_SV_DUMPLEN(r->float_utf8), 30);
10859 PerlIO_printf(Perl_debug_log,
10860 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
10861 s, RE_SV_TAIL(r->float_utf8),
10862 (IV)r->float_min_offset, (UV)r->float_max_offset);
10864 if (r->check_substr || r->check_utf8)
10865 PerlIO_printf(Perl_debug_log,
10867 (r->check_substr == r->float_substr
10868 && r->check_utf8 == r->float_utf8
10869 ? "(checking floating" : "(checking anchored"));
10870 if (r->extflags & RXf_NOSCAN)
10871 PerlIO_printf(Perl_debug_log, " noscan");
10872 if (r->extflags & RXf_CHECK_ALL)
10873 PerlIO_printf(Perl_debug_log, " isall");
10874 if (r->check_substr || r->check_utf8)
10875 PerlIO_printf(Perl_debug_log, ") ");
10877 if (ri->regstclass) {
10878 regprop(r, sv, ri->regstclass);
10879 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
10881 if (r->extflags & RXf_ANCH) {
10882 PerlIO_printf(Perl_debug_log, "anchored");
10883 if (r->extflags & RXf_ANCH_BOL)
10884 PerlIO_printf(Perl_debug_log, "(BOL)");
10885 if (r->extflags & RXf_ANCH_MBOL)
10886 PerlIO_printf(Perl_debug_log, "(MBOL)");
10887 if (r->extflags & RXf_ANCH_SBOL)
10888 PerlIO_printf(Perl_debug_log, "(SBOL)");
10889 if (r->extflags & RXf_ANCH_GPOS)
10890 PerlIO_printf(Perl_debug_log, "(GPOS)");
10891 PerlIO_putc(Perl_debug_log, ' ');
10893 if (r->extflags & RXf_GPOS_SEEN)
10894 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
10895 if (r->intflags & PREGf_SKIP)
10896 PerlIO_printf(Perl_debug_log, "plus ");
10897 if (r->intflags & PREGf_IMPLICIT)
10898 PerlIO_printf(Perl_debug_log, "implicit ");
10899 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
10900 if (r->extflags & RXf_EVAL_SEEN)
10901 PerlIO_printf(Perl_debug_log, "with eval ");
10902 PerlIO_printf(Perl_debug_log, "\n");
10903 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
10905 PERL_ARGS_ASSERT_REGDUMP;
10906 PERL_UNUSED_CONTEXT;
10907 PERL_UNUSED_ARG(r);
10908 #endif /* DEBUGGING */
10912 - regprop - printable representation of opcode
10914 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
10917 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
10918 if (flags & ANYOF_INVERT) \
10919 /*make sure the invert info is in each */ \
10920 sv_catpvs(sv, "^"); \
10926 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
10931 RXi_GET_DECL(prog,progi);
10932 GET_RE_DEBUG_FLAGS_DECL;
10934 PERL_ARGS_ASSERT_REGPROP;
10938 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
10939 /* It would be nice to FAIL() here, but this may be called from
10940 regexec.c, and it would be hard to supply pRExC_state. */
10941 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
10942 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
10944 k = PL_regkind[OP(o)];
10947 sv_catpvs(sv, " ");
10948 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
10949 * is a crude hack but it may be the best for now since
10950 * we have no flag "this EXACTish node was UTF-8"
10952 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
10953 PERL_PV_ESCAPE_UNI_DETECT |
10954 PERL_PV_ESCAPE_NONASCII |
10955 PERL_PV_PRETTY_ELLIPSES |
10956 PERL_PV_PRETTY_LTGT |
10957 PERL_PV_PRETTY_NOCLEAR
10959 } else if (k == TRIE) {
10960 /* print the details of the trie in dumpuntil instead, as
10961 * progi->data isn't available here */
10962 const char op = OP(o);
10963 const U32 n = ARG(o);
10964 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
10965 (reg_ac_data *)progi->data->data[n] :
10967 const reg_trie_data * const trie
10968 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
10970 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
10971 DEBUG_TRIE_COMPILE_r(
10972 Perl_sv_catpvf(aTHX_ sv,
10973 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
10974 (UV)trie->startstate,
10975 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
10976 (UV)trie->wordcount,
10979 (UV)TRIE_CHARCOUNT(trie),
10980 (UV)trie->uniquecharcount
10983 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
10985 int rangestart = -1;
10986 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
10987 sv_catpvs(sv, "[");
10988 for (i = 0; i <= 256; i++) {
10989 if (i < 256 && BITMAP_TEST(bitmap,i)) {
10990 if (rangestart == -1)
10992 } else if (rangestart != -1) {
10993 if (i <= rangestart + 3)
10994 for (; rangestart < i; rangestart++)
10995 put_byte(sv, rangestart);
10997 put_byte(sv, rangestart);
10998 sv_catpvs(sv, "-");
10999 put_byte(sv, i - 1);
11004 sv_catpvs(sv, "]");
11007 } else if (k == CURLY) {
11008 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
11009 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
11010 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
11012 else if (k == WHILEM && o->flags) /* Ordinal/of */
11013 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
11014 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
11015 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
11016 if ( RXp_PAREN_NAMES(prog) ) {
11017 if ( k != REF || (OP(o) < NREF)) {
11018 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
11019 SV **name= av_fetch(list, ARG(o), 0 );
11021 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11024 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
11025 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
11026 I32 *nums=(I32*)SvPVX(sv_dat);
11027 SV **name= av_fetch(list, nums[0], 0 );
11030 for ( n=0; n<SvIVX(sv_dat); n++ ) {
11031 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
11032 (n ? "," : ""), (IV)nums[n]);
11034 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
11038 } else if (k == GOSUB)
11039 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
11040 else if (k == VERB) {
11042 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
11043 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
11044 } else if (k == LOGICAL)
11045 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
11046 else if (k == FOLDCHAR)
11047 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
11048 else if (k == ANYOF) {
11049 int i, rangestart = -1;
11050 const U8 flags = ANYOF_FLAGS(o);
11053 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
11054 static const char * const anyofs[] = {
11087 if (flags & ANYOF_LOCALE)
11088 sv_catpvs(sv, "{loc}");
11089 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
11090 sv_catpvs(sv, "{i}");
11091 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
11092 if (flags & ANYOF_INVERT)
11093 sv_catpvs(sv, "^");
11095 /* output what the standard cp 0-255 bitmap matches */
11096 for (i = 0; i <= 256; i++) {
11097 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
11098 if (rangestart == -1)
11100 } else if (rangestart != -1) {
11101 if (i <= rangestart + 3)
11102 for (; rangestart < i; rangestart++)
11103 put_byte(sv, rangestart);
11105 put_byte(sv, rangestart);
11106 sv_catpvs(sv, "-");
11107 put_byte(sv, i - 1);
11114 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11115 /* output any special charclass tests (used entirely under use locale) */
11116 if (ANYOF_CLASS_TEST_ANY_SET(o))
11117 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
11118 if (ANYOF_CLASS_TEST(o,i)) {
11119 sv_catpv(sv, anyofs[i]);
11123 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
11125 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
11126 sv_catpvs(sv, "{non-utf8-latin1-all}");
11129 /* output information about the unicode matching */
11130 if (flags & ANYOF_UNICODE_ALL)
11131 sv_catpvs(sv, "{unicode_all}");
11132 else if (ANYOF_NONBITMAP(o))
11133 sv_catpvs(sv, "{unicode}");
11134 if (flags & ANYOF_NONBITMAP_NON_UTF8)
11135 sv_catpvs(sv, "{outside bitmap}");
11139 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
11143 U8 s[UTF8_MAXBYTES_CASE+1];
11145 for (i = 0; i <= 256; i++) { /* just the first 256 */
11146 uvchr_to_utf8(s, i);
11148 if (i < 256 && swash_fetch(sw, s, TRUE)) {
11149 if (rangestart == -1)
11151 } else if (rangestart != -1) {
11152 if (i <= rangestart + 3)
11153 for (; rangestart < i; rangestart++) {
11154 const U8 * const e = uvchr_to_utf8(s,rangestart);
11156 for(p = s; p < e; p++)
11160 const U8 *e = uvchr_to_utf8(s,rangestart);
11162 for (p = s; p < e; p++)
11164 sv_catpvs(sv, "-");
11165 e = uvchr_to_utf8(s, i-1);
11166 for (p = s; p < e; p++)
11173 sv_catpvs(sv, "..."); /* et cetera */
11177 char *s = savesvpv(lv);
11178 char * const origs = s;
11180 while (*s && *s != '\n')
11184 const char * const t = ++s;
11202 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
11204 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
11205 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
11207 PERL_UNUSED_CONTEXT;
11208 PERL_UNUSED_ARG(sv);
11209 PERL_UNUSED_ARG(o);
11210 PERL_UNUSED_ARG(prog);
11211 #endif /* DEBUGGING */
11215 Perl_re_intuit_string(pTHX_ REGEXP * const r)
11216 { /* Assume that RE_INTUIT is set */
11218 struct regexp *const prog = (struct regexp *)SvANY(r);
11219 GET_RE_DEBUG_FLAGS_DECL;
11221 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
11222 PERL_UNUSED_CONTEXT;
11226 const char * const s = SvPV_nolen_const(prog->check_substr
11227 ? prog->check_substr : prog->check_utf8);
11229 if (!PL_colorset) reginitcolors();
11230 PerlIO_printf(Perl_debug_log,
11231 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
11233 prog->check_substr ? "" : "utf8 ",
11234 PL_colors[5],PL_colors[0],
11237 (strlen(s) > 60 ? "..." : ""));
11240 return prog->check_substr ? prog->check_substr : prog->check_utf8;
11246 handles refcounting and freeing the perl core regexp structure. When
11247 it is necessary to actually free the structure the first thing it
11248 does is call the 'free' method of the regexp_engine associated to
11249 the regexp, allowing the handling of the void *pprivate; member
11250 first. (This routine is not overridable by extensions, which is why
11251 the extensions free is called first.)
11253 See regdupe and regdupe_internal if you change anything here.
11255 #ifndef PERL_IN_XSUB_RE
11257 Perl_pregfree(pTHX_ REGEXP *r)
11263 Perl_pregfree2(pTHX_ REGEXP *rx)
11266 struct regexp *const r = (struct regexp *)SvANY(rx);
11267 GET_RE_DEBUG_FLAGS_DECL;
11269 PERL_ARGS_ASSERT_PREGFREE2;
11271 if (r->mother_re) {
11272 ReREFCNT_dec(r->mother_re);
11274 CALLREGFREE_PVT(rx); /* free the private data */
11275 SvREFCNT_dec(RXp_PAREN_NAMES(r));
11278 SvREFCNT_dec(r->anchored_substr);
11279 SvREFCNT_dec(r->anchored_utf8);
11280 SvREFCNT_dec(r->float_substr);
11281 SvREFCNT_dec(r->float_utf8);
11282 Safefree(r->substrs);
11284 RX_MATCH_COPY_FREE(rx);
11285 #ifdef PERL_OLD_COPY_ON_WRITE
11286 SvREFCNT_dec(r->saved_copy);
11293 This is a hacky workaround to the structural issue of match results
11294 being stored in the regexp structure which is in turn stored in
11295 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
11296 could be PL_curpm in multiple contexts, and could require multiple
11297 result sets being associated with the pattern simultaneously, such
11298 as when doing a recursive match with (??{$qr})
11300 The solution is to make a lightweight copy of the regexp structure
11301 when a qr// is returned from the code executed by (??{$qr}) this
11302 lightweight copy doesn't actually own any of its data except for
11303 the starp/end and the actual regexp structure itself.
11309 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
11311 struct regexp *ret;
11312 struct regexp *const r = (struct regexp *)SvANY(rx);
11313 register const I32 npar = r->nparens+1;
11315 PERL_ARGS_ASSERT_REG_TEMP_COPY;
11318 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
11319 ret = (struct regexp *)SvANY(ret_x);
11321 (void)ReREFCNT_inc(rx);
11322 /* We can take advantage of the existing "copied buffer" mechanism in SVs
11323 by pointing directly at the buffer, but flagging that the allocated
11324 space in the copy is zero. As we've just done a struct copy, it's now
11325 a case of zero-ing that, rather than copying the current length. */
11326 SvPV_set(ret_x, RX_WRAPPED(rx));
11327 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
11328 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
11329 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
11330 SvLEN_set(ret_x, 0);
11331 SvSTASH_set(ret_x, NULL);
11332 SvMAGIC_set(ret_x, NULL);
11333 Newx(ret->offs, npar, regexp_paren_pair);
11334 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11336 Newx(ret->substrs, 1, struct reg_substr_data);
11337 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11339 SvREFCNT_inc_void(ret->anchored_substr);
11340 SvREFCNT_inc_void(ret->anchored_utf8);
11341 SvREFCNT_inc_void(ret->float_substr);
11342 SvREFCNT_inc_void(ret->float_utf8);
11344 /* check_substr and check_utf8, if non-NULL, point to either their
11345 anchored or float namesakes, and don't hold a second reference. */
11347 RX_MATCH_COPIED_off(ret_x);
11348 #ifdef PERL_OLD_COPY_ON_WRITE
11349 ret->saved_copy = NULL;
11351 ret->mother_re = rx;
11357 /* regfree_internal()
11359 Free the private data in a regexp. This is overloadable by
11360 extensions. Perl takes care of the regexp structure in pregfree(),
11361 this covers the *pprivate pointer which technically perl doesn't
11362 know about, however of course we have to handle the
11363 regexp_internal structure when no extension is in use.
11365 Note this is called before freeing anything in the regexp
11370 Perl_regfree_internal(pTHX_ REGEXP * const rx)
11373 struct regexp *const r = (struct regexp *)SvANY(rx);
11374 RXi_GET_DECL(r,ri);
11375 GET_RE_DEBUG_FLAGS_DECL;
11377 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
11383 SV *dsv= sv_newmortal();
11384 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
11385 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
11386 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
11387 PL_colors[4],PL_colors[5],s);
11390 #ifdef RE_TRACK_PATTERN_OFFSETS
11392 Safefree(ri->u.offsets); /* 20010421 MJD */
11395 int n = ri->data->count;
11396 PAD* new_comppad = NULL;
11401 /* If you add a ->what type here, update the comment in regcomp.h */
11402 switch (ri->data->what[n]) {
11407 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
11410 Safefree(ri->data->data[n]);
11413 new_comppad = MUTABLE_AV(ri->data->data[n]);
11416 if (new_comppad == NULL)
11417 Perl_croak(aTHX_ "panic: pregfree comppad");
11418 PAD_SAVE_LOCAL(old_comppad,
11419 /* Watch out for global destruction's random ordering. */
11420 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
11423 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
11426 op_free((OP_4tree*)ri->data->data[n]);
11428 PAD_RESTORE_LOCAL(old_comppad);
11429 SvREFCNT_dec(MUTABLE_SV(new_comppad));
11430 new_comppad = NULL;
11435 { /* Aho Corasick add-on structure for a trie node.
11436 Used in stclass optimization only */
11438 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
11440 refcount = --aho->refcount;
11443 PerlMemShared_free(aho->states);
11444 PerlMemShared_free(aho->fail);
11445 /* do this last!!!! */
11446 PerlMemShared_free(ri->data->data[n]);
11447 PerlMemShared_free(ri->regstclass);
11453 /* trie structure. */
11455 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
11457 refcount = --trie->refcount;
11460 PerlMemShared_free(trie->charmap);
11461 PerlMemShared_free(trie->states);
11462 PerlMemShared_free(trie->trans);
11464 PerlMemShared_free(trie->bitmap);
11466 PerlMemShared_free(trie->jump);
11467 PerlMemShared_free(trie->wordinfo);
11468 /* do this last!!!! */
11469 PerlMemShared_free(ri->data->data[n]);
11474 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
11477 Safefree(ri->data->what);
11478 Safefree(ri->data);
11484 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
11485 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
11486 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
11489 re_dup - duplicate a regexp.
11491 This routine is expected to clone a given regexp structure. It is only
11492 compiled under USE_ITHREADS.
11494 After all of the core data stored in struct regexp is duplicated
11495 the regexp_engine.dupe method is used to copy any private data
11496 stored in the *pprivate pointer. This allows extensions to handle
11497 any duplication it needs to do.
11499 See pregfree() and regfree_internal() if you change anything here.
11501 #if defined(USE_ITHREADS)
11502 #ifndef PERL_IN_XSUB_RE
11504 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
11508 const struct regexp *r = (const struct regexp *)SvANY(sstr);
11509 struct regexp *ret = (struct regexp *)SvANY(dstr);
11511 PERL_ARGS_ASSERT_RE_DUP_GUTS;
11513 npar = r->nparens+1;
11514 Newx(ret->offs, npar, regexp_paren_pair);
11515 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
11517 /* no need to copy these */
11518 Newx(ret->swap, npar, regexp_paren_pair);
11521 if (ret->substrs) {
11522 /* Do it this way to avoid reading from *r after the StructCopy().
11523 That way, if any of the sv_dup_inc()s dislodge *r from the L1
11524 cache, it doesn't matter. */
11525 const bool anchored = r->check_substr
11526 ? r->check_substr == r->anchored_substr
11527 : r->check_utf8 == r->anchored_utf8;
11528 Newx(ret->substrs, 1, struct reg_substr_data);
11529 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
11531 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
11532 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
11533 ret->float_substr = sv_dup_inc(ret->float_substr, param);
11534 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
11536 /* check_substr and check_utf8, if non-NULL, point to either their
11537 anchored or float namesakes, and don't hold a second reference. */
11539 if (ret->check_substr) {
11541 assert(r->check_utf8 == r->anchored_utf8);
11542 ret->check_substr = ret->anchored_substr;
11543 ret->check_utf8 = ret->anchored_utf8;
11545 assert(r->check_substr == r->float_substr);
11546 assert(r->check_utf8 == r->float_utf8);
11547 ret->check_substr = ret->float_substr;
11548 ret->check_utf8 = ret->float_utf8;
11550 } else if (ret->check_utf8) {
11552 ret->check_utf8 = ret->anchored_utf8;
11554 ret->check_utf8 = ret->float_utf8;
11559 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
11562 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
11564 if (RX_MATCH_COPIED(dstr))
11565 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
11567 ret->subbeg = NULL;
11568 #ifdef PERL_OLD_COPY_ON_WRITE
11569 ret->saved_copy = NULL;
11572 if (ret->mother_re) {
11573 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
11574 /* Our storage points directly to our mother regexp, but that's
11575 1: a buffer in a different thread
11576 2: something we no longer hold a reference on
11577 so we need to copy it locally. */
11578 /* Note we need to sue SvCUR() on our mother_re, because it, in
11579 turn, may well be pointing to its own mother_re. */
11580 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
11581 SvCUR(ret->mother_re)+1));
11582 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
11584 ret->mother_re = NULL;
11588 #endif /* PERL_IN_XSUB_RE */
11593 This is the internal complement to regdupe() which is used to copy
11594 the structure pointed to by the *pprivate pointer in the regexp.
11595 This is the core version of the extension overridable cloning hook.
11596 The regexp structure being duplicated will be copied by perl prior
11597 to this and will be provided as the regexp *r argument, however
11598 with the /old/ structures pprivate pointer value. Thus this routine
11599 may override any copying normally done by perl.
11601 It returns a pointer to the new regexp_internal structure.
11605 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
11608 struct regexp *const r = (struct regexp *)SvANY(rx);
11609 regexp_internal *reti;
11611 RXi_GET_DECL(r,ri);
11613 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
11615 npar = r->nparens+1;
11618 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
11619 Copy(ri->program, reti->program, len+1, regnode);
11622 reti->regstclass = NULL;
11625 struct reg_data *d;
11626 const int count = ri->data->count;
11629 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
11630 char, struct reg_data);
11631 Newx(d->what, count, U8);
11634 for (i = 0; i < count; i++) {
11635 d->what[i] = ri->data->what[i];
11636 switch (d->what[i]) {
11637 /* legal options are one of: sSfpontTua
11638 see also regcomp.h and pregfree() */
11639 case 'a': /* actually an AV, but the dup function is identical. */
11642 case 'p': /* actually an AV, but the dup function is identical. */
11643 case 'u': /* actually an HV, but the dup function is identical. */
11644 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
11647 /* This is cheating. */
11648 Newx(d->data[i], 1, struct regnode_charclass_class);
11649 StructCopy(ri->data->data[i], d->data[i],
11650 struct regnode_charclass_class);
11651 reti->regstclass = (regnode*)d->data[i];
11654 /* Compiled op trees are readonly and in shared memory,
11655 and can thus be shared without duplication. */
11657 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
11661 /* Trie stclasses are readonly and can thus be shared
11662 * without duplication. We free the stclass in pregfree
11663 * when the corresponding reg_ac_data struct is freed.
11665 reti->regstclass= ri->regstclass;
11669 ((reg_trie_data*)ri->data->data[i])->refcount++;
11673 d->data[i] = ri->data->data[i];
11676 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
11685 reti->name_list_idx = ri->name_list_idx;
11687 #ifdef RE_TRACK_PATTERN_OFFSETS
11688 if (ri->u.offsets) {
11689 Newx(reti->u.offsets, 2*len+1, U32);
11690 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
11693 SetProgLen(reti,len);
11696 return (void*)reti;
11699 #endif /* USE_ITHREADS */
11701 #ifndef PERL_IN_XSUB_RE
11704 - regnext - dig the "next" pointer out of a node
11707 Perl_regnext(pTHX_ register regnode *p)
11710 register I32 offset;
11715 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
11716 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
11719 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
11728 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
11731 STRLEN l1 = strlen(pat1);
11732 STRLEN l2 = strlen(pat2);
11735 const char *message;
11737 PERL_ARGS_ASSERT_RE_CROAK2;
11743 Copy(pat1, buf, l1 , char);
11744 Copy(pat2, buf + l1, l2 , char);
11745 buf[l1 + l2] = '\n';
11746 buf[l1 + l2 + 1] = '\0';
11748 /* ANSI variant takes additional second argument */
11749 va_start(args, pat2);
11753 msv = vmess(buf, &args);
11755 message = SvPV_const(msv,l1);
11758 Copy(message, buf, l1 , char);
11759 buf[l1-1] = '\0'; /* Overwrite \n */
11760 Perl_croak(aTHX_ "%s", buf);
11763 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
11765 #ifndef PERL_IN_XSUB_RE
11767 Perl_save_re_context(pTHX)
11771 struct re_save_state *state;
11773 SAVEVPTR(PL_curcop);
11774 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
11776 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
11777 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
11778 SSPUSHUV(SAVEt_RE_STATE);
11780 Copy(&PL_reg_state, state, 1, struct re_save_state);
11782 PL_reg_start_tmp = 0;
11783 PL_reg_start_tmpl = 0;
11784 PL_reg_oldsaved = NULL;
11785 PL_reg_oldsavedlen = 0;
11786 PL_reg_maxiter = 0;
11787 PL_reg_leftiter = 0;
11788 PL_reg_poscache = NULL;
11789 PL_reg_poscache_size = 0;
11790 #ifdef PERL_OLD_COPY_ON_WRITE
11794 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
11796 const REGEXP * const rx = PM_GETRE(PL_curpm);
11799 for (i = 1; i <= RX_NPARENS(rx); i++) {
11800 char digits[TYPE_CHARS(long)];
11801 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
11802 GV *const *const gvp
11803 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
11806 GV * const gv = *gvp;
11807 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
11817 clear_re(pTHX_ void *r)
11820 ReREFCNT_dec((REGEXP *)r);
11826 S_put_byte(pTHX_ SV *sv, int c)
11828 PERL_ARGS_ASSERT_PUT_BYTE;
11830 /* Our definition of isPRINT() ignores locales, so only bytes that are
11831 not part of UTF-8 are considered printable. I assume that the same
11832 holds for UTF-EBCDIC.
11833 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
11834 which Wikipedia says:
11836 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
11837 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
11838 identical, to the ASCII delete (DEL) or rubout control character.
11839 ) So the old condition can be simplified to !isPRINT(c) */
11842 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
11845 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
11849 const char string = c;
11850 if (c == '-' || c == ']' || c == '\\' || c == '^')
11851 sv_catpvs(sv, "\\");
11852 sv_catpvn(sv, &string, 1);
11857 #define CLEAR_OPTSTART \
11858 if (optstart) STMT_START { \
11859 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
11863 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
11865 STATIC const regnode *
11866 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
11867 const regnode *last, const regnode *plast,
11868 SV* sv, I32 indent, U32 depth)
11871 register U8 op = PSEUDO; /* Arbitrary non-END op. */
11872 register const regnode *next;
11873 const regnode *optstart= NULL;
11875 RXi_GET_DECL(r,ri);
11876 GET_RE_DEBUG_FLAGS_DECL;
11878 PERL_ARGS_ASSERT_DUMPUNTIL;
11880 #ifdef DEBUG_DUMPUNTIL
11881 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
11882 last ? last-start : 0,plast ? plast-start : 0);
11885 if (plast && plast < last)
11888 while (PL_regkind[op] != END && (!last || node < last)) {
11889 /* While that wasn't END last time... */
11892 if (op == CLOSE || op == WHILEM)
11894 next = regnext((regnode *)node);
11897 if (OP(node) == OPTIMIZED) {
11898 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
11905 regprop(r, sv, node);
11906 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
11907 (int)(2*indent + 1), "", SvPVX_const(sv));
11909 if (OP(node) != OPTIMIZED) {
11910 if (next == NULL) /* Next ptr. */
11911 PerlIO_printf(Perl_debug_log, " (0)");
11912 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
11913 PerlIO_printf(Perl_debug_log, " (FAIL)");
11915 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
11916 (void)PerlIO_putc(Perl_debug_log, '\n');
11920 if (PL_regkind[(U8)op] == BRANCHJ) {
11923 register const regnode *nnode = (OP(next) == LONGJMP
11924 ? regnext((regnode *)next)
11926 if (last && nnode > last)
11928 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
11931 else if (PL_regkind[(U8)op] == BRANCH) {
11933 DUMPUNTIL(NEXTOPER(node), next);
11935 else if ( PL_regkind[(U8)op] == TRIE ) {
11936 const regnode *this_trie = node;
11937 const char op = OP(node);
11938 const U32 n = ARG(node);
11939 const reg_ac_data * const ac = op>=AHOCORASICK ?
11940 (reg_ac_data *)ri->data->data[n] :
11942 const reg_trie_data * const trie =
11943 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
11945 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
11947 const regnode *nextbranch= NULL;
11950 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
11951 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
11953 PerlIO_printf(Perl_debug_log, "%*s%s ",
11954 (int)(2*(indent+3)), "",
11955 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
11956 PL_colors[0], PL_colors[1],
11957 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
11958 PERL_PV_PRETTY_ELLIPSES |
11959 PERL_PV_PRETTY_LTGT
11964 U16 dist= trie->jump[word_idx+1];
11965 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
11966 (UV)((dist ? this_trie + dist : next) - start));
11969 nextbranch= this_trie + trie->jump[0];
11970 DUMPUNTIL(this_trie + dist, nextbranch);
11972 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
11973 nextbranch= regnext((regnode *)nextbranch);
11975 PerlIO_printf(Perl_debug_log, "\n");
11978 if (last && next > last)
11983 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
11984 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
11985 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
11987 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
11989 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
11991 else if ( op == PLUS || op == STAR) {
11992 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
11994 else if (PL_regkind[(U8)op] == ANYOF) {
11995 /* arglen 1 + class block */
11996 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
11997 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
11998 node = NEXTOPER(node);
12000 else if (PL_regkind[(U8)op] == EXACT) {
12001 /* Literal string, where present. */
12002 node += NODE_SZ_STR(node) - 1;
12003 node = NEXTOPER(node);
12006 node = NEXTOPER(node);
12007 node += regarglen[(U8)op];
12009 if (op == CURLYX || op == OPEN)
12013 #ifdef DEBUG_DUMPUNTIL
12014 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
12019 #endif /* DEBUGGING */
12023 * c-indentation-style: bsd
12024 * c-basic-offset: 4
12025 * indent-tabs-mode: t
12028 * ex: set ts=8 sts=4 sw=4 noet: