5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 REGEXP *rx_sv; /* The SV that is the regexp. */
114 regexp *rx; /* perl core regexp structure */
115 regexp_internal *rxi; /* internal data for regexp object pprivate field */
116 char *start; /* Start of input for compile */
117 char *end; /* End of input for compile */
118 char *parse; /* Input-scan pointer. */
119 I32 whilem_seen; /* number of WHILEM in this expr */
120 regnode *emit_start; /* Start of emitted-code area */
121 regnode *emit_bound; /* First regnode outside of the allocated space */
122 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
123 I32 naughty; /* How bad is this pattern? */
124 I32 sawback; /* Did we see \1, ...? */
126 I32 size; /* Code size. */
127 I32 npar; /* Capture buffer count, (OPEN). */
128 I32 cpar; /* Capture buffer count, (CLOSE). */
129 I32 nestroot; /* root parens we are in - used by accept */
133 regnode **open_parens; /* pointers to open parens */
134 regnode **close_parens; /* pointers to close parens */
135 regnode *opend; /* END node in program */
136 I32 utf8; /* whether the pattern is utf8 or not */
137 I32 orig_utf8; /* whether the pattern was originally in utf8 */
138 /* XXX use this for future optimisation of case
139 * where pattern must be upgraded to utf8. */
140 I32 uni_semantics; /* If a d charset modifier should use unicode
141 rules, even if the pattern is not in
143 HV *paren_names; /* Paren names */
145 regnode **recurse; /* Recurse regops */
146 I32 recurse_count; /* Number of recurse regops */
149 I32 override_recoding;
151 char *starttry; /* -Dr: where regtry was called. */
152 #define RExC_starttry (pRExC_state->starttry)
155 const char *lastparse;
157 AV *paren_name_list; /* idx -> name */
158 #define RExC_lastparse (pRExC_state->lastparse)
159 #define RExC_lastnum (pRExC_state->lastnum)
160 #define RExC_paren_name_list (pRExC_state->paren_name_list)
164 #define RExC_flags (pRExC_state->flags)
165 #define RExC_precomp (pRExC_state->precomp)
166 #define RExC_rx_sv (pRExC_state->rx_sv)
167 #define RExC_rx (pRExC_state->rx)
168 #define RExC_rxi (pRExC_state->rxi)
169 #define RExC_start (pRExC_state->start)
170 #define RExC_end (pRExC_state->end)
171 #define RExC_parse (pRExC_state->parse)
172 #define RExC_whilem_seen (pRExC_state->whilem_seen)
173 #ifdef RE_TRACK_PATTERN_OFFSETS
174 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
176 #define RExC_emit (pRExC_state->emit)
177 #define RExC_emit_start (pRExC_state->emit_start)
178 #define RExC_emit_bound (pRExC_state->emit_bound)
179 #define RExC_naughty (pRExC_state->naughty)
180 #define RExC_sawback (pRExC_state->sawback)
181 #define RExC_seen (pRExC_state->seen)
182 #define RExC_size (pRExC_state->size)
183 #define RExC_npar (pRExC_state->npar)
184 #define RExC_nestroot (pRExC_state->nestroot)
185 #define RExC_extralen (pRExC_state->extralen)
186 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
187 #define RExC_seen_evals (pRExC_state->seen_evals)
188 #define RExC_utf8 (pRExC_state->utf8)
189 #define RExC_uni_semantics (pRExC_state->uni_semantics)
190 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
191 #define RExC_open_parens (pRExC_state->open_parens)
192 #define RExC_close_parens (pRExC_state->close_parens)
193 #define RExC_opend (pRExC_state->opend)
194 #define RExC_paren_names (pRExC_state->paren_names)
195 #define RExC_recurse (pRExC_state->recurse)
196 #define RExC_recurse_count (pRExC_state->recurse_count)
197 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
198 #define RExC_contains_locale (pRExC_state->contains_locale)
199 #define RExC_override_recoding (pRExC_state->override_recoding)
202 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
203 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
204 ((*s) == '{' && regcurly(s)))
207 #undef SPSTART /* dratted cpp namespace... */
210 * Flags to be passed up and down.
212 #define WORST 0 /* Worst case. */
213 #define HASWIDTH 0x01 /* Known to match non-null strings. */
215 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
216 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
218 #define SPSTART 0x04 /* Starts with * or +. */
219 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
220 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
222 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
224 /* whether trie related optimizations are enabled */
225 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
226 #define TRIE_STUDY_OPT
227 #define FULL_TRIE_STUDY
233 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
234 #define PBITVAL(paren) (1 << ((paren) & 7))
235 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
236 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
237 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
239 /* If not already in utf8, do a longjmp back to the beginning */
240 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
241 #define REQUIRE_UTF8 STMT_START { \
242 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
245 /* About scan_data_t.
247 During optimisation we recurse through the regexp program performing
248 various inplace (keyhole style) optimisations. In addition study_chunk
249 and scan_commit populate this data structure with information about
250 what strings MUST appear in the pattern. We look for the longest
251 string that must appear at a fixed location, and we look for the
252 longest string that may appear at a floating location. So for instance
257 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
258 strings (because they follow a .* construct). study_chunk will identify
259 both FOO and BAR as being the longest fixed and floating strings respectively.
261 The strings can be composites, for instance
265 will result in a composite fixed substring 'foo'.
267 For each string some basic information is maintained:
269 - offset or min_offset
270 This is the position the string must appear at, or not before.
271 It also implicitly (when combined with minlenp) tells us how many
272 characters must match before the string we are searching for.
273 Likewise when combined with minlenp and the length of the string it
274 tells us how many characters must appear after the string we have
278 Only used for floating strings. This is the rightmost point that
279 the string can appear at. If set to I32 max it indicates that the
280 string can occur infinitely far to the right.
283 A pointer to the minimum length of the pattern that the string
284 was found inside. This is important as in the case of positive
285 lookahead or positive lookbehind we can have multiple patterns
290 The minimum length of the pattern overall is 3, the minimum length
291 of the lookahead part is 3, but the minimum length of the part that
292 will actually match is 1. So 'FOO's minimum length is 3, but the
293 minimum length for the F is 1. This is important as the minimum length
294 is used to determine offsets in front of and behind the string being
295 looked for. Since strings can be composites this is the length of the
296 pattern at the time it was committed with a scan_commit. Note that
297 the length is calculated by study_chunk, so that the minimum lengths
298 are not known until the full pattern has been compiled, thus the
299 pointer to the value.
303 In the case of lookbehind the string being searched for can be
304 offset past the start point of the final matching string.
305 If this value was just blithely removed from the min_offset it would
306 invalidate some of the calculations for how many chars must match
307 before or after (as they are derived from min_offset and minlen and
308 the length of the string being searched for).
309 When the final pattern is compiled and the data is moved from the
310 scan_data_t structure into the regexp structure the information
311 about lookbehind is factored in, with the information that would
312 have been lost precalculated in the end_shift field for the
315 The fields pos_min and pos_delta are used to store the minimum offset
316 and the delta to the maximum offset at the current point in the pattern.
320 typedef struct scan_data_t {
321 /*I32 len_min; unused */
322 /*I32 len_delta; unused */
326 I32 last_end; /* min value, <0 unless valid. */
329 SV **longest; /* Either &l_fixed, or &l_float. */
330 SV *longest_fixed; /* longest fixed string found in pattern */
331 I32 offset_fixed; /* offset where it starts */
332 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
333 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
334 SV *longest_float; /* longest floating string found in pattern */
335 I32 offset_float_min; /* earliest point in string it can appear */
336 I32 offset_float_max; /* latest point in string it can appear */
337 I32 *minlen_float; /* pointer to the minlen relevant to the string */
338 I32 lookbehind_float; /* is the position of the string modified by LB */
342 struct regnode_charclass_class *start_class;
346 * Forward declarations for pregcomp()'s friends.
349 static const scan_data_t zero_scan_data =
350 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
352 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
353 #define SF_BEFORE_SEOL 0x0001
354 #define SF_BEFORE_MEOL 0x0002
355 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
356 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
359 # define SF_FIX_SHIFT_EOL (0+2)
360 # define SF_FL_SHIFT_EOL (0+4)
362 # define SF_FIX_SHIFT_EOL (+2)
363 # define SF_FL_SHIFT_EOL (+4)
366 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
367 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
369 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
370 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
371 #define SF_IS_INF 0x0040
372 #define SF_HAS_PAR 0x0080
373 #define SF_IN_PAR 0x0100
374 #define SF_HAS_EVAL 0x0200
375 #define SCF_DO_SUBSTR 0x0400
376 #define SCF_DO_STCLASS_AND 0x0800
377 #define SCF_DO_STCLASS_OR 0x1000
378 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
379 #define SCF_WHILEM_VISITED_POS 0x2000
381 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
382 #define SCF_SEEN_ACCEPT 0x8000
384 #define UTF cBOOL(RExC_utf8)
386 /* The enums for all these are ordered so things work out correctly */
387 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
388 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
389 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
390 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
391 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
392 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
393 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
395 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
397 #define OOB_UNICODE 12345678
398 #define OOB_NAMEDCLASS -1
400 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
401 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
404 /* length of regex to show in messages that don't mark a position within */
405 #define RegexLengthToShowInErrorMessages 127
408 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
409 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
410 * op/pragma/warn/regcomp.
412 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
413 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
415 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
418 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
419 * arg. Show regex, up to a maximum length. If it's too long, chop and add
422 #define _FAIL(code) STMT_START { \
423 const char *ellipses = ""; \
424 IV len = RExC_end - RExC_precomp; \
427 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
428 if (len > RegexLengthToShowInErrorMessages) { \
429 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
430 len = RegexLengthToShowInErrorMessages - 10; \
436 #define FAIL(msg) _FAIL( \
437 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
438 msg, (int)len, RExC_precomp, ellipses))
440 #define FAIL2(msg,arg) _FAIL( \
441 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
442 arg, (int)len, RExC_precomp, ellipses))
445 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
447 #define Simple_vFAIL(m) STMT_START { \
448 const IV offset = RExC_parse - RExC_precomp; \
449 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
450 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
456 #define vFAIL(m) STMT_START { \
458 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
463 * Like Simple_vFAIL(), but accepts two arguments.
465 #define Simple_vFAIL2(m,a1) STMT_START { \
466 const IV offset = RExC_parse - RExC_precomp; \
467 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
468 (int)offset, RExC_precomp, RExC_precomp + offset); \
472 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
474 #define vFAIL2(m,a1) STMT_START { \
476 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
477 Simple_vFAIL2(m, a1); \
482 * Like Simple_vFAIL(), but accepts three arguments.
484 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
485 const IV offset = RExC_parse - RExC_precomp; \
486 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
487 (int)offset, RExC_precomp, RExC_precomp + offset); \
491 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
493 #define vFAIL3(m,a1,a2) STMT_START { \
495 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
496 Simple_vFAIL3(m, a1, a2); \
500 * Like Simple_vFAIL(), but accepts four arguments.
502 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
503 const IV offset = RExC_parse - RExC_precomp; \
504 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
505 (int)offset, RExC_precomp, RExC_precomp + offset); \
508 #define ckWARNreg(loc,m) STMT_START { \
509 const IV offset = loc - RExC_precomp; \
510 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
511 (int)offset, RExC_precomp, RExC_precomp + offset); \
514 #define ckWARNregdep(loc,m) STMT_START { \
515 const IV offset = loc - RExC_precomp; \
516 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
518 (int)offset, RExC_precomp, RExC_precomp + offset); \
521 #define ckWARN2regdep(loc,m, a1) STMT_START { \
522 const IV offset = loc - RExC_precomp; \
523 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
525 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
528 #define ckWARN2reg(loc, m, a1) STMT_START { \
529 const IV offset = loc - RExC_precomp; \
530 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
531 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
534 #define vWARN3(loc, m, a1, a2) STMT_START { \
535 const IV offset = loc - RExC_precomp; \
536 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
537 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
540 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
541 const IV offset = loc - RExC_precomp; \
542 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
543 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
546 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
547 const IV offset = loc - RExC_precomp; \
548 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
549 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
552 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
553 const IV offset = loc - RExC_precomp; \
554 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
555 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
558 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
559 const IV offset = loc - RExC_precomp; \
560 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
561 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
565 /* Allow for side effects in s */
566 #define REGC(c,s) STMT_START { \
567 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
570 /* Macros for recording node offsets. 20001227 mjd@plover.com
571 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
572 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
573 * Element 0 holds the number n.
574 * Position is 1 indexed.
576 #ifndef RE_TRACK_PATTERN_OFFSETS
577 #define Set_Node_Offset_To_R(node,byte)
578 #define Set_Node_Offset(node,byte)
579 #define Set_Cur_Node_Offset
580 #define Set_Node_Length_To_R(node,len)
581 #define Set_Node_Length(node,len)
582 #define Set_Node_Cur_Length(node)
583 #define Node_Offset(n)
584 #define Node_Length(n)
585 #define Set_Node_Offset_Length(node,offset,len)
586 #define ProgLen(ri) ri->u.proglen
587 #define SetProgLen(ri,x) ri->u.proglen = x
589 #define ProgLen(ri) ri->u.offsets[0]
590 #define SetProgLen(ri,x) ri->u.offsets[0] = x
591 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
593 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
594 __LINE__, (int)(node), (int)(byte))); \
596 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
598 RExC_offsets[2*(node)-1] = (byte); \
603 #define Set_Node_Offset(node,byte) \
604 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
605 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
607 #define Set_Node_Length_To_R(node,len) STMT_START { \
609 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
610 __LINE__, (int)(node), (int)(len))); \
612 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
614 RExC_offsets[2*(node)] = (len); \
619 #define Set_Node_Length(node,len) \
620 Set_Node_Length_To_R((node)-RExC_emit_start, len)
621 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
622 #define Set_Node_Cur_Length(node) \
623 Set_Node_Length(node, RExC_parse - parse_start)
625 /* Get offsets and lengths */
626 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
627 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
629 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
630 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
631 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
635 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
636 #define EXPERIMENTAL_INPLACESCAN
637 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
639 #define DEBUG_STUDYDATA(str,data,depth) \
640 DEBUG_OPTIMISE_MORE_r(if(data){ \
641 PerlIO_printf(Perl_debug_log, \
642 "%*s" str "Pos:%"IVdf"/%"IVdf \
643 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
644 (int)(depth)*2, "", \
645 (IV)((data)->pos_min), \
646 (IV)((data)->pos_delta), \
647 (UV)((data)->flags), \
648 (IV)((data)->whilem_c), \
649 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
650 is_inf ? "INF " : "" \
652 if ((data)->last_found) \
653 PerlIO_printf(Perl_debug_log, \
654 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
655 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
656 SvPVX_const((data)->last_found), \
657 (IV)((data)->last_end), \
658 (IV)((data)->last_start_min), \
659 (IV)((data)->last_start_max), \
660 ((data)->longest && \
661 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
662 SvPVX_const((data)->longest_fixed), \
663 (IV)((data)->offset_fixed), \
664 ((data)->longest && \
665 (data)->longest==&((data)->longest_float)) ? "*" : "", \
666 SvPVX_const((data)->longest_float), \
667 (IV)((data)->offset_float_min), \
668 (IV)((data)->offset_float_max) \
670 PerlIO_printf(Perl_debug_log,"\n"); \
673 static void clear_re(pTHX_ void *r);
675 /* Mark that we cannot extend a found fixed substring at this point.
676 Update the longest found anchored substring and the longest found
677 floating substrings if needed. */
680 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
682 const STRLEN l = CHR_SVLEN(data->last_found);
683 const STRLEN old_l = CHR_SVLEN(*data->longest);
684 GET_RE_DEBUG_FLAGS_DECL;
686 PERL_ARGS_ASSERT_SCAN_COMMIT;
688 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
689 SvSetMagicSV(*data->longest, data->last_found);
690 if (*data->longest == data->longest_fixed) {
691 data->offset_fixed = l ? data->last_start_min : data->pos_min;
692 if (data->flags & SF_BEFORE_EOL)
694 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
696 data->flags &= ~SF_FIX_BEFORE_EOL;
697 data->minlen_fixed=minlenp;
698 data->lookbehind_fixed=0;
700 else { /* *data->longest == data->longest_float */
701 data->offset_float_min = l ? data->last_start_min : data->pos_min;
702 data->offset_float_max = (l
703 ? data->last_start_max
704 : data->pos_min + data->pos_delta);
705 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
706 data->offset_float_max = I32_MAX;
707 if (data->flags & SF_BEFORE_EOL)
709 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
711 data->flags &= ~SF_FL_BEFORE_EOL;
712 data->minlen_float=minlenp;
713 data->lookbehind_float=0;
716 SvCUR_set(data->last_found, 0);
718 SV * const sv = data->last_found;
719 if (SvUTF8(sv) && SvMAGICAL(sv)) {
720 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
726 data->flags &= ~SF_BEFORE_EOL;
727 DEBUG_STUDYDATA("commit: ",data,0);
730 /* Can match anything (initialization) */
732 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
734 PERL_ARGS_ASSERT_CL_ANYTHING;
736 ANYOF_BITMAP_SETALL(cl);
737 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
738 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
740 /* If any portion of the regex is to operate under locale rules,
741 * initialization includes it. The reason this isn't done for all regexes
742 * is that the optimizer was written under the assumption that locale was
743 * all-or-nothing. Given the complexity and lack of documentation in the
744 * optimizer, and that there are inadequate test cases for locale, so many
745 * parts of it may not work properly, it is safest to avoid locale unless
747 if (RExC_contains_locale) {
748 ANYOF_CLASS_SETALL(cl); /* /l uses class */
749 cl->flags |= ANYOF_LOCALE;
752 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
756 /* Can match anything (initialization) */
758 S_cl_is_anything(const struct regnode_charclass_class *cl)
762 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
764 for (value = 0; value <= ANYOF_MAX; value += 2)
765 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
767 if (!(cl->flags & ANYOF_UNICODE_ALL))
769 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
774 /* Can match anything (initialization) */
776 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
778 PERL_ARGS_ASSERT_CL_INIT;
780 Zero(cl, 1, struct regnode_charclass_class);
782 cl_anything(pRExC_state, cl);
783 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
786 /* These two functions currently do the exact same thing */
787 #define cl_init_zero S_cl_init
789 /* 'AND' a given class with another one. Can create false positives. 'cl'
790 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
791 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
793 S_cl_and(struct regnode_charclass_class *cl,
794 const struct regnode_charclass_class *and_with)
796 PERL_ARGS_ASSERT_CL_AND;
798 assert(and_with->type == ANYOF);
800 /* I (khw) am not sure all these restrictions are necessary XXX */
801 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
802 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
803 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
804 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
805 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
808 if (and_with->flags & ANYOF_INVERT)
809 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
810 cl->bitmap[i] &= ~and_with->bitmap[i];
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] &= and_with->bitmap[i];
814 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
816 if (and_with->flags & ANYOF_INVERT) {
818 /* Here, the and'ed node is inverted. Get the AND of the flags that
819 * aren't affected by the inversion. Those that are affected are
820 * handled individually below */
821 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
822 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
823 cl->flags |= affected_flags;
825 /* We currently don't know how to deal with things that aren't in the
826 * bitmap, but we know that the intersection is no greater than what
827 * is already in cl, so let there be false positives that get sorted
828 * out after the synthetic start class succeeds, and the node is
829 * matched for real. */
831 /* The inversion of these two flags indicate that the resulting
832 * intersection doesn't have them */
833 if (and_with->flags & ANYOF_UNICODE_ALL) {
834 cl->flags &= ~ANYOF_UNICODE_ALL;
836 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
837 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
840 else { /* and'd node is not inverted */
841 U8 outside_bitmap_but_not_utf8; /* Temp variable */
843 if (! ANYOF_NONBITMAP(and_with)) {
845 /* Here 'and_with' doesn't match anything outside the bitmap
846 * (except possibly ANYOF_UNICODE_ALL), which means the
847 * intersection can't either, except for ANYOF_UNICODE_ALL, in
848 * which case we don't know what the intersection is, but it's no
849 * greater than what cl already has, so can just leave it alone,
850 * with possible false positives */
851 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
852 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
853 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
856 else if (! ANYOF_NONBITMAP(cl)) {
858 /* Here, 'and_with' does match something outside the bitmap, and cl
859 * doesn't have a list of things to match outside the bitmap. If
860 * cl can match all code points above 255, the intersection will
861 * be those above-255 code points that 'and_with' matches. If cl
862 * can't match all Unicode code points, it means that it can't
863 * match anything outside the bitmap (since the 'if' that got us
864 * into this block tested for that), so we leave the bitmap empty.
866 if (cl->flags & ANYOF_UNICODE_ALL) {
867 ARG_SET(cl, ARG(and_with));
869 /* and_with's ARG may match things that don't require UTF8.
870 * And now cl's will too, in spite of this being an 'and'. See
871 * the comments below about the kludge */
872 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
876 /* Here, both 'and_with' and cl match something outside the
877 * bitmap. Currently we do not do the intersection, so just match
878 * whatever cl had at the beginning. */
882 /* Take the intersection of the two sets of flags. However, the
883 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
884 * kludge around the fact that this flag is not treated like the others
885 * which are initialized in cl_anything(). The way the optimizer works
886 * is that the synthetic start class (SSC) is initialized to match
887 * anything, and then the first time a real node is encountered, its
888 * values are AND'd with the SSC's with the result being the values of
889 * the real node. However, there are paths through the optimizer where
890 * the AND never gets called, so those initialized bits are set
891 * inappropriately, which is not usually a big deal, as they just cause
892 * false positives in the SSC, which will just mean a probably
893 * imperceptible slow down in execution. However this bit has a
894 * higher false positive consequence in that it can cause utf8.pm,
895 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
896 * bigger slowdown and also causes significant extra memory to be used.
897 * In order to prevent this, the code now takes a different tack. The
898 * bit isn't set unless some part of the regular expression needs it,
899 * but once set it won't get cleared. This means that these extra
900 * modules won't get loaded unless there was some path through the
901 * pattern that would have required them anyway, and so any false
902 * positives that occur by not ANDing them out when they could be
903 * aren't as severe as they would be if we treated this bit like all
905 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
906 & ANYOF_NONBITMAP_NON_UTF8;
907 cl->flags &= and_with->flags;
908 cl->flags |= outside_bitmap_but_not_utf8;
912 /* 'OR' a given class with another one. Can create false positives. 'cl'
913 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
914 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
916 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
918 PERL_ARGS_ASSERT_CL_OR;
920 if (or_with->flags & ANYOF_INVERT) {
922 /* Here, the or'd node is to be inverted. This means we take the
923 * complement of everything not in the bitmap, but currently we don't
924 * know what that is, so give up and match anything */
925 if (ANYOF_NONBITMAP(or_with)) {
926 cl_anything(pRExC_state, cl);
929 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
930 * <= (B1 | !B2) | (CL1 | !CL2)
931 * which is wasteful if CL2 is small, but we ignore CL2:
932 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
933 * XXXX Can we handle case-fold? Unclear:
934 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
935 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
937 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
938 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
939 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
942 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
943 cl->bitmap[i] |= ~or_with->bitmap[i];
944 } /* XXXX: logic is complicated otherwise */
946 cl_anything(pRExC_state, cl);
949 /* And, we can just take the union of the flags that aren't affected
950 * by the inversion */
951 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
953 /* For the remaining flags:
954 ANYOF_UNICODE_ALL and inverted means to not match anything above
955 255, which means that the union with cl should just be
956 what cl has in it, so can ignore this flag
957 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
958 is 127-255 to match them, but then invert that, so the
959 union with cl should just be what cl has in it, so can
962 } else { /* 'or_with' is not inverted */
963 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
964 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
965 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
966 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
969 /* OR char bitmap and class bitmap separately */
970 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
971 cl->bitmap[i] |= or_with->bitmap[i];
972 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
973 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
974 cl->classflags[i] |= or_with->classflags[i];
975 cl->flags |= ANYOF_CLASS;
978 else { /* XXXX: logic is complicated, leave it along for a moment. */
979 cl_anything(pRExC_state, cl);
982 if (ANYOF_NONBITMAP(or_with)) {
984 /* Use the added node's outside-the-bit-map match if there isn't a
985 * conflict. If there is a conflict (both nodes match something
986 * outside the bitmap, but what they match outside is not the same
987 * pointer, and hence not easily compared until XXX we extend
988 * inversion lists this far), give up and allow the start class to
989 * match everything outside the bitmap. If that stuff is all above
990 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
991 if (! ANYOF_NONBITMAP(cl)) {
992 ARG_SET(cl, ARG(or_with));
994 else if (ARG(cl) != ARG(or_with)) {
996 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
997 cl_anything(pRExC_state, cl);
1000 cl->flags |= ANYOF_UNICODE_ALL;
1005 /* Take the union */
1006 cl->flags |= or_with->flags;
1010 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1011 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1012 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1013 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1018 dump_trie(trie,widecharmap,revcharmap)
1019 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1020 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1022 These routines dump out a trie in a somewhat readable format.
1023 The _interim_ variants are used for debugging the interim
1024 tables that are used to generate the final compressed
1025 representation which is what dump_trie expects.
1027 Part of the reason for their existence is to provide a form
1028 of documentation as to how the different representations function.
1033 Dumps the final compressed table form of the trie to Perl_debug_log.
1034 Used for debugging make_trie().
1038 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1039 AV *revcharmap, U32 depth)
1042 SV *sv=sv_newmortal();
1043 int colwidth= widecharmap ? 6 : 4;
1045 GET_RE_DEBUG_FLAGS_DECL;
1047 PERL_ARGS_ASSERT_DUMP_TRIE;
1049 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1050 (int)depth * 2 + 2,"",
1051 "Match","Base","Ofs" );
1053 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1054 SV ** const tmp = av_fetch( revcharmap, state, 0);
1056 PerlIO_printf( Perl_debug_log, "%*s",
1058 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1059 PL_colors[0], PL_colors[1],
1060 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1061 PERL_PV_ESCAPE_FIRSTCHAR
1066 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1067 (int)depth * 2 + 2,"");
1069 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1070 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1071 PerlIO_printf( Perl_debug_log, "\n");
1073 for( state = 1 ; state < trie->statecount ; state++ ) {
1074 const U32 base = trie->states[ state ].trans.base;
1076 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1078 if ( trie->states[ state ].wordnum ) {
1079 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1081 PerlIO_printf( Perl_debug_log, "%6s", "" );
1084 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1089 while( ( base + ofs < trie->uniquecharcount ) ||
1090 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1091 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1094 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1096 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1097 if ( ( base + ofs >= trie->uniquecharcount ) &&
1098 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1099 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1101 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1103 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1105 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1109 PerlIO_printf( Perl_debug_log, "]");
1112 PerlIO_printf( Perl_debug_log, "\n" );
1114 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1115 for (word=1; word <= trie->wordcount; word++) {
1116 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1117 (int)word, (int)(trie->wordinfo[word].prev),
1118 (int)(trie->wordinfo[word].len));
1120 PerlIO_printf(Perl_debug_log, "\n" );
1123 Dumps a fully constructed but uncompressed trie in list form.
1124 List tries normally only are used for construction when the number of
1125 possible chars (trie->uniquecharcount) is very high.
1126 Used for debugging make_trie().
1129 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1130 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1134 SV *sv=sv_newmortal();
1135 int colwidth= widecharmap ? 6 : 4;
1136 GET_RE_DEBUG_FLAGS_DECL;
1138 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1140 /* print out the table precompression. */
1141 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1142 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1143 "------:-----+-----------------\n" );
1145 for( state=1 ; state < next_alloc ; state ++ ) {
1148 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1149 (int)depth * 2 + 2,"", (UV)state );
1150 if ( ! trie->states[ state ].wordnum ) {
1151 PerlIO_printf( Perl_debug_log, "%5s| ","");
1153 PerlIO_printf( Perl_debug_log, "W%4x| ",
1154 trie->states[ state ].wordnum
1157 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1158 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1160 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1162 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1163 PL_colors[0], PL_colors[1],
1164 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1165 PERL_PV_ESCAPE_FIRSTCHAR
1167 TRIE_LIST_ITEM(state,charid).forid,
1168 (UV)TRIE_LIST_ITEM(state,charid).newstate
1171 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1172 (int)((depth * 2) + 14), "");
1175 PerlIO_printf( Perl_debug_log, "\n");
1180 Dumps a fully constructed but uncompressed trie in table form.
1181 This is the normal DFA style state transition table, with a few
1182 twists to facilitate compression later.
1183 Used for debugging make_trie().
1186 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1187 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1192 SV *sv=sv_newmortal();
1193 int colwidth= widecharmap ? 6 : 4;
1194 GET_RE_DEBUG_FLAGS_DECL;
1196 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1199 print out the table precompression so that we can do a visual check
1200 that they are identical.
1203 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1205 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1206 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1208 PerlIO_printf( Perl_debug_log, "%*s",
1210 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1211 PL_colors[0], PL_colors[1],
1212 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1213 PERL_PV_ESCAPE_FIRSTCHAR
1219 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1221 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1222 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1225 PerlIO_printf( Perl_debug_log, "\n" );
1227 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1229 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1230 (int)depth * 2 + 2,"",
1231 (UV)TRIE_NODENUM( state ) );
1233 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1234 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1236 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1238 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1240 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1241 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1243 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1244 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1252 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1253 startbranch: the first branch in the whole branch sequence
1254 first : start branch of sequence of branch-exact nodes.
1255 May be the same as startbranch
1256 last : Thing following the last branch.
1257 May be the same as tail.
1258 tail : item following the branch sequence
1259 count : words in the sequence
1260 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1261 depth : indent depth
1263 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1265 A trie is an N'ary tree where the branches are determined by digital
1266 decomposition of the key. IE, at the root node you look up the 1st character and
1267 follow that branch repeat until you find the end of the branches. Nodes can be
1268 marked as "accepting" meaning they represent a complete word. Eg:
1272 would convert into the following structure. Numbers represent states, letters
1273 following numbers represent valid transitions on the letter from that state, if
1274 the number is in square brackets it represents an accepting state, otherwise it
1275 will be in parenthesis.
1277 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1281 (1) +-i->(6)-+-s->[7]
1283 +-s->(3)-+-h->(4)-+-e->[5]
1285 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1287 This shows that when matching against the string 'hers' we will begin at state 1
1288 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1289 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1290 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1291 single traverse. We store a mapping from accepting to state to which word was
1292 matched, and then when we have multiple possibilities we try to complete the
1293 rest of the regex in the order in which they occured in the alternation.
1295 The only prior NFA like behaviour that would be changed by the TRIE support is
1296 the silent ignoring of duplicate alternations which are of the form:
1298 / (DUPE|DUPE) X? (?{ ... }) Y /x
1300 Thus EVAL blocks following a trie may be called a different number of times with
1301 and without the optimisation. With the optimisations dupes will be silently
1302 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1303 the following demonstrates:
1305 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1307 which prints out 'word' three times, but
1309 'words'=~/(word|word|word)(?{ print $1 })S/
1311 which doesnt print it out at all. This is due to other optimisations kicking in.
1313 Example of what happens on a structural level:
1315 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1317 1: CURLYM[1] {1,32767}(18)
1328 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1329 and should turn into:
1331 1: CURLYM[1] {1,32767}(18)
1333 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1341 Cases where tail != last would be like /(?foo|bar)baz/:
1351 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1352 and would end up looking like:
1355 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1362 d = uvuni_to_utf8_flags(d, uv, 0);
1364 is the recommended Unicode-aware way of saying
1369 #define TRIE_STORE_REVCHAR(val) \
1372 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1373 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1374 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1375 SvCUR_set(zlopp, kapow - flrbbbbb); \
1378 av_push(revcharmap, zlopp); \
1380 char ooooff = (char)val; \
1381 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1385 #define TRIE_READ_CHAR STMT_START { \
1388 /* if it is UTF then it is either already folded, or does not need folding */ \
1389 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1391 else if (folder == PL_fold_latin1) { \
1392 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1393 if ( foldlen > 0 ) { \
1394 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1400 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1401 skiplen = UNISKIP(uvc); \
1402 foldlen -= skiplen; \
1403 scan = foldbuf + skiplen; \
1406 /* raw data, will be folded later if needed */ \
1414 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1415 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1416 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1417 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1419 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1420 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1421 TRIE_LIST_CUR( state )++; \
1424 #define TRIE_LIST_NEW(state) STMT_START { \
1425 Newxz( trie->states[ state ].trans.list, \
1426 4, reg_trie_trans_le ); \
1427 TRIE_LIST_CUR( state ) = 1; \
1428 TRIE_LIST_LEN( state ) = 4; \
1431 #define TRIE_HANDLE_WORD(state) STMT_START { \
1432 U16 dupe= trie->states[ state ].wordnum; \
1433 regnode * const noper_next = regnext( noper ); \
1436 /* store the word for dumping */ \
1438 if (OP(noper) != NOTHING) \
1439 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1441 tmp = newSVpvn_utf8( "", 0, UTF ); \
1442 av_push( trie_words, tmp ); \
1446 trie->wordinfo[curword].prev = 0; \
1447 trie->wordinfo[curword].len = wordlen; \
1448 trie->wordinfo[curword].accept = state; \
1450 if ( noper_next < tail ) { \
1452 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1453 trie->jump[curword] = (U16)(noper_next - convert); \
1455 jumper = noper_next; \
1457 nextbranch= regnext(cur); \
1461 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1462 /* chain, so that when the bits of chain are later */\
1463 /* linked together, the dups appear in the chain */\
1464 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1465 trie->wordinfo[dupe].prev = curword; \
1467 /* we haven't inserted this word yet. */ \
1468 trie->states[ state ].wordnum = curword; \
1473 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1474 ( ( base + charid >= ucharcount \
1475 && base + charid < ubound \
1476 && state == trie->trans[ base - ucharcount + charid ].check \
1477 && trie->trans[ base - ucharcount + charid ].next ) \
1478 ? trie->trans[ base - ucharcount + charid ].next \
1479 : ( state==1 ? special : 0 ) \
1483 #define MADE_JUMP_TRIE 2
1484 #define MADE_EXACT_TRIE 4
1487 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1490 /* first pass, loop through and scan words */
1491 reg_trie_data *trie;
1492 HV *widecharmap = NULL;
1493 AV *revcharmap = newAV();
1495 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1500 regnode *jumper = NULL;
1501 regnode *nextbranch = NULL;
1502 regnode *convert = NULL;
1503 U32 *prev_states; /* temp array mapping each state to previous one */
1504 /* we just use folder as a flag in utf8 */
1505 const U8 * folder = NULL;
1508 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1509 AV *trie_words = NULL;
1510 /* along with revcharmap, this only used during construction but both are
1511 * useful during debugging so we store them in the struct when debugging.
1514 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1515 STRLEN trie_charcount=0;
1517 SV *re_trie_maxbuff;
1518 GET_RE_DEBUG_FLAGS_DECL;
1520 PERL_ARGS_ASSERT_MAKE_TRIE;
1522 PERL_UNUSED_ARG(depth);
1529 case EXACTFU_TRICKYFOLD:
1530 case EXACTFU: folder = PL_fold_latin1; break;
1531 case EXACTF: folder = PL_fold; break;
1532 case EXACTFL: folder = PL_fold_locale; break;
1533 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1536 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1538 trie->startstate = 1;
1539 trie->wordcount = word_count;
1540 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1541 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1543 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1544 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1545 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1548 trie_words = newAV();
1551 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1552 if (!SvIOK(re_trie_maxbuff)) {
1553 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1556 PerlIO_printf( Perl_debug_log,
1557 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1558 (int)depth * 2 + 2, "",
1559 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1560 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1564 /* Find the node we are going to overwrite */
1565 if ( first == startbranch && OP( last ) != BRANCH ) {
1566 /* whole branch chain */
1569 /* branch sub-chain */
1570 convert = NEXTOPER( first );
1573 /* -- First loop and Setup --
1575 We first traverse the branches and scan each word to determine if it
1576 contains widechars, and how many unique chars there are, this is
1577 important as we have to build a table with at least as many columns as we
1580 We use an array of integers to represent the character codes 0..255
1581 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1582 native representation of the character value as the key and IV's for the
1585 *TODO* If we keep track of how many times each character is used we can
1586 remap the columns so that the table compression later on is more
1587 efficient in terms of memory by ensuring the most common value is in the
1588 middle and the least common are on the outside. IMO this would be better
1589 than a most to least common mapping as theres a decent chance the most
1590 common letter will share a node with the least common, meaning the node
1591 will not be compressible. With a middle is most common approach the worst
1592 case is when we have the least common nodes twice.
1596 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1597 regnode * const noper = NEXTOPER( cur );
1598 const U8 *uc = (U8*)STRING( noper );
1599 const U8 * const e = uc + STR_LEN( noper );
1601 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1603 const U8 *scan = (U8*)NULL;
1604 U32 wordlen = 0; /* required init */
1606 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1608 if (OP(noper) == NOTHING) {
1612 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1613 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1614 regardless of encoding */
1615 if (OP( noper ) == EXACTFU_SS) {
1616 /* false positives are ok, so just set this */
1617 TRIE_BITMAP_SET(trie,0xDF);
1620 for ( ; uc < e ; uc += len ) {
1621 TRIE_CHARCOUNT(trie)++;
1626 U8 folded= folder[ (U8) uvc ];
1627 if ( !trie->charmap[ folded ] ) {
1628 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1629 TRIE_STORE_REVCHAR( folded );
1632 if ( !trie->charmap[ uvc ] ) {
1633 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1634 TRIE_STORE_REVCHAR( uvc );
1637 /* store the codepoint in the bitmap, and its folded
1639 TRIE_BITMAP_SET(trie, uvc);
1641 /* store the folded codepoint */
1642 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1645 /* store first byte of utf8 representation of
1646 variant codepoints */
1647 if (! UNI_IS_INVARIANT(uvc)) {
1648 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1651 set_bit = 0; /* We've done our bit :-) */
1656 widecharmap = newHV();
1658 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1661 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1663 if ( !SvTRUE( *svpp ) ) {
1664 sv_setiv( *svpp, ++trie->uniquecharcount );
1665 TRIE_STORE_REVCHAR(uvc);
1669 if( cur == first ) {
1670 trie->minlen = chars;
1671 trie->maxlen = chars;
1672 } else if (chars < trie->minlen) {
1673 trie->minlen = chars;
1674 } else if (chars > trie->maxlen) {
1675 trie->maxlen = chars;
1677 if (OP( noper ) == EXACTFU_SS) {
1678 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1679 if (trie->minlen > 1)
1682 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1683 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1684 * - We assume that any such sequence might match a 2 byte string */
1685 if (trie->minlen > 2 )
1689 } /* end first pass */
1690 DEBUG_TRIE_COMPILE_r(
1691 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1692 (int)depth * 2 + 2,"",
1693 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1694 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1695 (int)trie->minlen, (int)trie->maxlen )
1699 We now know what we are dealing with in terms of unique chars and
1700 string sizes so we can calculate how much memory a naive
1701 representation using a flat table will take. If it's over a reasonable
1702 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1703 conservative but potentially much slower representation using an array
1706 At the end we convert both representations into the same compressed
1707 form that will be used in regexec.c for matching with. The latter
1708 is a form that cannot be used to construct with but has memory
1709 properties similar to the list form and access properties similar
1710 to the table form making it both suitable for fast searches and
1711 small enough that its feasable to store for the duration of a program.
1713 See the comment in the code where the compressed table is produced
1714 inplace from the flat tabe representation for an explanation of how
1715 the compression works.
1720 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1723 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1725 Second Pass -- Array Of Lists Representation
1727 Each state will be represented by a list of charid:state records
1728 (reg_trie_trans_le) the first such element holds the CUR and LEN
1729 points of the allocated array. (See defines above).
1731 We build the initial structure using the lists, and then convert
1732 it into the compressed table form which allows faster lookups
1733 (but cant be modified once converted).
1736 STRLEN transcount = 1;
1738 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1739 "%*sCompiling trie using list compiler\n",
1740 (int)depth * 2 + 2, ""));
1742 trie->states = (reg_trie_state *)
1743 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1744 sizeof(reg_trie_state) );
1748 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1750 regnode * const noper = NEXTOPER( cur );
1751 U8 *uc = (U8*)STRING( noper );
1752 const U8 * const e = uc + STR_LEN( noper );
1753 U32 state = 1; /* required init */
1754 U16 charid = 0; /* sanity init */
1755 U8 *scan = (U8*)NULL; /* sanity init */
1756 STRLEN foldlen = 0; /* required init */
1757 U32 wordlen = 0; /* required init */
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1761 if (OP(noper) != NOTHING) {
1762 for ( ; uc < e ; uc += len ) {
1767 charid = trie->charmap[ uvc ];
1769 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1773 charid=(U16)SvIV( *svpp );
1776 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1783 if ( !trie->states[ state ].trans.list ) {
1784 TRIE_LIST_NEW( state );
1786 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1787 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1788 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1793 newstate = next_alloc++;
1794 prev_states[newstate] = state;
1795 TRIE_LIST_PUSH( state, charid, newstate );
1800 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1804 TRIE_HANDLE_WORD(state);
1806 } /* end second pass */
1808 /* next alloc is the NEXT state to be allocated */
1809 trie->statecount = next_alloc;
1810 trie->states = (reg_trie_state *)
1811 PerlMemShared_realloc( trie->states,
1813 * sizeof(reg_trie_state) );
1815 /* and now dump it out before we compress it */
1816 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1817 revcharmap, next_alloc,
1821 trie->trans = (reg_trie_trans *)
1822 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1829 for( state=1 ; state < next_alloc ; state ++ ) {
1833 DEBUG_TRIE_COMPILE_MORE_r(
1834 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1838 if (trie->states[state].trans.list) {
1839 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1843 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1844 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1845 if ( forid < minid ) {
1847 } else if ( forid > maxid ) {
1851 if ( transcount < tp + maxid - minid + 1) {
1853 trie->trans = (reg_trie_trans *)
1854 PerlMemShared_realloc( trie->trans,
1856 * sizeof(reg_trie_trans) );
1857 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1859 base = trie->uniquecharcount + tp - minid;
1860 if ( maxid == minid ) {
1862 for ( ; zp < tp ; zp++ ) {
1863 if ( ! trie->trans[ zp ].next ) {
1864 base = trie->uniquecharcount + zp - minid;
1865 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1866 trie->trans[ zp ].check = state;
1872 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1873 trie->trans[ tp ].check = state;
1878 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1879 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1880 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1881 trie->trans[ tid ].check = state;
1883 tp += ( maxid - minid + 1 );
1885 Safefree(trie->states[ state ].trans.list);
1888 DEBUG_TRIE_COMPILE_MORE_r(
1889 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1892 trie->states[ state ].trans.base=base;
1894 trie->lasttrans = tp + 1;
1898 Second Pass -- Flat Table Representation.
1900 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1901 We know that we will need Charcount+1 trans at most to store the data
1902 (one row per char at worst case) So we preallocate both structures
1903 assuming worst case.
1905 We then construct the trie using only the .next slots of the entry
1908 We use the .check field of the first entry of the node temporarily to
1909 make compression both faster and easier by keeping track of how many non
1910 zero fields are in the node.
1912 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1915 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1916 number representing the first entry of the node, and state as a
1917 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1918 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1919 are 2 entrys per node. eg:
1927 The table is internally in the right hand, idx form. However as we also
1928 have to deal with the states array which is indexed by nodenum we have to
1929 use TRIE_NODENUM() to convert.
1932 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1933 "%*sCompiling trie using table compiler\n",
1934 (int)depth * 2 + 2, ""));
1936 trie->trans = (reg_trie_trans *)
1937 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1938 * trie->uniquecharcount + 1,
1939 sizeof(reg_trie_trans) );
1940 trie->states = (reg_trie_state *)
1941 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1942 sizeof(reg_trie_state) );
1943 next_alloc = trie->uniquecharcount + 1;
1946 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1948 regnode * const noper = NEXTOPER( cur );
1949 const U8 *uc = (U8*)STRING( noper );
1950 const U8 * const e = uc + STR_LEN( noper );
1952 U32 state = 1; /* required init */
1954 U16 charid = 0; /* sanity init */
1955 U32 accept_state = 0; /* sanity init */
1956 U8 *scan = (U8*)NULL; /* sanity init */
1958 STRLEN foldlen = 0; /* required init */
1959 U32 wordlen = 0; /* required init */
1961 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1964 if ( OP(noper) != NOTHING ) {
1965 for ( ; uc < e ; uc += len ) {
1970 charid = trie->charmap[ uvc ];
1972 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1973 charid = svpp ? (U16)SvIV(*svpp) : 0;
1977 if ( !trie->trans[ state + charid ].next ) {
1978 trie->trans[ state + charid ].next = next_alloc;
1979 trie->trans[ state ].check++;
1980 prev_states[TRIE_NODENUM(next_alloc)]
1981 = TRIE_NODENUM(state);
1982 next_alloc += trie->uniquecharcount;
1984 state = trie->trans[ state + charid ].next;
1986 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1988 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1991 accept_state = TRIE_NODENUM( state );
1992 TRIE_HANDLE_WORD(accept_state);
1994 } /* end second pass */
1996 /* and now dump it out before we compress it */
1997 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1999 next_alloc, depth+1));
2003 * Inplace compress the table.*
2005 For sparse data sets the table constructed by the trie algorithm will
2006 be mostly 0/FAIL transitions or to put it another way mostly empty.
2007 (Note that leaf nodes will not contain any transitions.)
2009 This algorithm compresses the tables by eliminating most such
2010 transitions, at the cost of a modest bit of extra work during lookup:
2012 - Each states[] entry contains a .base field which indicates the
2013 index in the state[] array wheres its transition data is stored.
2015 - If .base is 0 there are no valid transitions from that node.
2017 - If .base is nonzero then charid is added to it to find an entry in
2020 -If trans[states[state].base+charid].check!=state then the
2021 transition is taken to be a 0/Fail transition. Thus if there are fail
2022 transitions at the front of the node then the .base offset will point
2023 somewhere inside the previous nodes data (or maybe even into a node
2024 even earlier), but the .check field determines if the transition is
2028 The following process inplace converts the table to the compressed
2029 table: We first do not compress the root node 1,and mark all its
2030 .check pointers as 1 and set its .base pointer as 1 as well. This
2031 allows us to do a DFA construction from the compressed table later,
2032 and ensures that any .base pointers we calculate later are greater
2035 - We set 'pos' to indicate the first entry of the second node.
2037 - We then iterate over the columns of the node, finding the first and
2038 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2039 and set the .check pointers accordingly, and advance pos
2040 appropriately and repreat for the next node. Note that when we copy
2041 the next pointers we have to convert them from the original
2042 NODEIDX form to NODENUM form as the former is not valid post
2045 - If a node has no transitions used we mark its base as 0 and do not
2046 advance the pos pointer.
2048 - If a node only has one transition we use a second pointer into the
2049 structure to fill in allocated fail transitions from other states.
2050 This pointer is independent of the main pointer and scans forward
2051 looking for null transitions that are allocated to a state. When it
2052 finds one it writes the single transition into the "hole". If the
2053 pointer doesnt find one the single transition is appended as normal.
2055 - Once compressed we can Renew/realloc the structures to release the
2058 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2059 specifically Fig 3.47 and the associated pseudocode.
2063 const U32 laststate = TRIE_NODENUM( next_alloc );
2066 trie->statecount = laststate;
2068 for ( state = 1 ; state < laststate ; state++ ) {
2070 const U32 stateidx = TRIE_NODEIDX( state );
2071 const U32 o_used = trie->trans[ stateidx ].check;
2072 U32 used = trie->trans[ stateidx ].check;
2073 trie->trans[ stateidx ].check = 0;
2075 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2076 if ( flag || trie->trans[ stateidx + charid ].next ) {
2077 if ( trie->trans[ stateidx + charid ].next ) {
2079 for ( ; zp < pos ; zp++ ) {
2080 if ( ! trie->trans[ zp ].next ) {
2084 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2085 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2086 trie->trans[ zp ].check = state;
2087 if ( ++zp > pos ) pos = zp;
2094 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2096 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2097 trie->trans[ pos ].check = state;
2102 trie->lasttrans = pos + 1;
2103 trie->states = (reg_trie_state *)
2104 PerlMemShared_realloc( trie->states, laststate
2105 * sizeof(reg_trie_state) );
2106 DEBUG_TRIE_COMPILE_MORE_r(
2107 PerlIO_printf( Perl_debug_log,
2108 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2109 (int)depth * 2 + 2,"",
2110 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2113 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2116 } /* end table compress */
2118 DEBUG_TRIE_COMPILE_MORE_r(
2119 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2120 (int)depth * 2 + 2, "",
2121 (UV)trie->statecount,
2122 (UV)trie->lasttrans)
2124 /* resize the trans array to remove unused space */
2125 trie->trans = (reg_trie_trans *)
2126 PerlMemShared_realloc( trie->trans, trie->lasttrans
2127 * sizeof(reg_trie_trans) );
2129 { /* Modify the program and insert the new TRIE node */
2130 U8 nodetype =(U8)(flags & 0xFF);
2134 regnode *optimize = NULL;
2135 #ifdef RE_TRACK_PATTERN_OFFSETS
2138 U32 mjd_nodelen = 0;
2139 #endif /* RE_TRACK_PATTERN_OFFSETS */
2140 #endif /* DEBUGGING */
2142 This means we convert either the first branch or the first Exact,
2143 depending on whether the thing following (in 'last') is a branch
2144 or not and whther first is the startbranch (ie is it a sub part of
2145 the alternation or is it the whole thing.)
2146 Assuming its a sub part we convert the EXACT otherwise we convert
2147 the whole branch sequence, including the first.
2149 /* Find the node we are going to overwrite */
2150 if ( first != startbranch || OP( last ) == BRANCH ) {
2151 /* branch sub-chain */
2152 NEXT_OFF( first ) = (U16)(last - first);
2153 #ifdef RE_TRACK_PATTERN_OFFSETS
2155 mjd_offset= Node_Offset((convert));
2156 mjd_nodelen= Node_Length((convert));
2159 /* whole branch chain */
2161 #ifdef RE_TRACK_PATTERN_OFFSETS
2164 const regnode *nop = NEXTOPER( convert );
2165 mjd_offset= Node_Offset((nop));
2166 mjd_nodelen= Node_Length((nop));
2170 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2171 (int)depth * 2 + 2, "",
2172 (UV)mjd_offset, (UV)mjd_nodelen)
2175 /* But first we check to see if there is a common prefix we can
2176 split out as an EXACT and put in front of the TRIE node. */
2177 trie->startstate= 1;
2178 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2180 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2184 const U32 base = trie->states[ state ].trans.base;
2186 if ( trie->states[state].wordnum )
2189 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2190 if ( ( base + ofs >= trie->uniquecharcount ) &&
2191 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2192 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2194 if ( ++count > 1 ) {
2195 SV **tmp = av_fetch( revcharmap, ofs, 0);
2196 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2197 if ( state == 1 ) break;
2199 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2201 PerlIO_printf(Perl_debug_log,
2202 "%*sNew Start State=%"UVuf" Class: [",
2203 (int)depth * 2 + 2, "",
2206 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2207 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2209 TRIE_BITMAP_SET(trie,*ch);
2211 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2213 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2217 TRIE_BITMAP_SET(trie,*ch);
2219 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2220 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2226 SV **tmp = av_fetch( revcharmap, idx, 0);
2228 char *ch = SvPV( *tmp, len );
2230 SV *sv=sv_newmortal();
2231 PerlIO_printf( Perl_debug_log,
2232 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2233 (int)depth * 2 + 2, "",
2235 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2236 PL_colors[0], PL_colors[1],
2237 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2238 PERL_PV_ESCAPE_FIRSTCHAR
2243 OP( convert ) = nodetype;
2244 str=STRING(convert);
2247 STR_LEN(convert) += len;
2253 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2258 trie->prefixlen = (state-1);
2260 regnode *n = convert+NODE_SZ_STR(convert);
2261 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2262 trie->startstate = state;
2263 trie->minlen -= (state - 1);
2264 trie->maxlen -= (state - 1);
2266 /* At least the UNICOS C compiler choked on this
2267 * being argument to DEBUG_r(), so let's just have
2270 #ifdef PERL_EXT_RE_BUILD
2276 regnode *fix = convert;
2277 U32 word = trie->wordcount;
2279 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2280 while( ++fix < n ) {
2281 Set_Node_Offset_Length(fix, 0, 0);
2284 SV ** const tmp = av_fetch( trie_words, word, 0 );
2286 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2287 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2289 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2297 NEXT_OFF(convert) = (U16)(tail - convert);
2298 DEBUG_r(optimize= n);
2304 if ( trie->maxlen ) {
2305 NEXT_OFF( convert ) = (U16)(tail - convert);
2306 ARG_SET( convert, data_slot );
2307 /* Store the offset to the first unabsorbed branch in
2308 jump[0], which is otherwise unused by the jump logic.
2309 We use this when dumping a trie and during optimisation. */
2311 trie->jump[0] = (U16)(nextbranch - convert);
2313 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2314 * and there is a bitmap
2315 * and the first "jump target" node we found leaves enough room
2316 * then convert the TRIE node into a TRIEC node, with the bitmap
2317 * embedded inline in the opcode - this is hypothetically faster.
2319 if ( !trie->states[trie->startstate].wordnum
2321 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2323 OP( convert ) = TRIEC;
2324 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2325 PerlMemShared_free(trie->bitmap);
2328 OP( convert ) = TRIE;
2330 /* store the type in the flags */
2331 convert->flags = nodetype;
2335 + regarglen[ OP( convert ) ];
2337 /* XXX We really should free up the resource in trie now,
2338 as we won't use them - (which resources?) dmq */
2340 /* needed for dumping*/
2341 DEBUG_r(if (optimize) {
2342 regnode *opt = convert;
2344 while ( ++opt < optimize) {
2345 Set_Node_Offset_Length(opt,0,0);
2348 Try to clean up some of the debris left after the
2351 while( optimize < jumper ) {
2352 mjd_nodelen += Node_Length((optimize));
2353 OP( optimize ) = OPTIMIZED;
2354 Set_Node_Offset_Length(optimize,0,0);
2357 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2359 } /* end node insert */
2361 /* Finish populating the prev field of the wordinfo array. Walk back
2362 * from each accept state until we find another accept state, and if
2363 * so, point the first word's .prev field at the second word. If the
2364 * second already has a .prev field set, stop now. This will be the
2365 * case either if we've already processed that word's accept state,
2366 * or that state had multiple words, and the overspill words were
2367 * already linked up earlier.
2374 for (word=1; word <= trie->wordcount; word++) {
2376 if (trie->wordinfo[word].prev)
2378 state = trie->wordinfo[word].accept;
2380 state = prev_states[state];
2383 prev = trie->states[state].wordnum;
2387 trie->wordinfo[word].prev = prev;
2389 Safefree(prev_states);
2393 /* and now dump out the compressed format */
2394 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2396 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2398 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2399 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2401 SvREFCNT_dec(revcharmap);
2405 : trie->startstate>1
2411 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2413 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2415 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2416 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2419 We find the fail state for each state in the trie, this state is the longest proper
2420 suffix of the current state's 'word' that is also a proper prefix of another word in our
2421 trie. State 1 represents the word '' and is thus the default fail state. This allows
2422 the DFA not to have to restart after its tried and failed a word at a given point, it
2423 simply continues as though it had been matching the other word in the first place.
2425 'abcdgu'=~/abcdefg|cdgu/
2426 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2427 fail, which would bring us to the state representing 'd' in the second word where we would
2428 try 'g' and succeed, proceeding to match 'cdgu'.
2430 /* add a fail transition */
2431 const U32 trie_offset = ARG(source);
2432 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2434 const U32 ucharcount = trie->uniquecharcount;
2435 const U32 numstates = trie->statecount;
2436 const U32 ubound = trie->lasttrans + ucharcount;
2440 U32 base = trie->states[ 1 ].trans.base;
2443 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2444 GET_RE_DEBUG_FLAGS_DECL;
2446 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2448 PERL_UNUSED_ARG(depth);
2452 ARG_SET( stclass, data_slot );
2453 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2454 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2455 aho->trie=trie_offset;
2456 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2457 Copy( trie->states, aho->states, numstates, reg_trie_state );
2458 Newxz( q, numstates, U32);
2459 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2462 /* initialize fail[0..1] to be 1 so that we always have
2463 a valid final fail state */
2464 fail[ 0 ] = fail[ 1 ] = 1;
2466 for ( charid = 0; charid < ucharcount ; charid++ ) {
2467 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2469 q[ q_write ] = newstate;
2470 /* set to point at the root */
2471 fail[ q[ q_write++ ] ]=1;
2474 while ( q_read < q_write) {
2475 const U32 cur = q[ q_read++ % numstates ];
2476 base = trie->states[ cur ].trans.base;
2478 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2479 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2481 U32 fail_state = cur;
2484 fail_state = fail[ fail_state ];
2485 fail_base = aho->states[ fail_state ].trans.base;
2486 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2488 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2489 fail[ ch_state ] = fail_state;
2490 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2492 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2494 q[ q_write++ % numstates] = ch_state;
2498 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2499 when we fail in state 1, this allows us to use the
2500 charclass scan to find a valid start char. This is based on the principle
2501 that theres a good chance the string being searched contains lots of stuff
2502 that cant be a start char.
2504 fail[ 0 ] = fail[ 1 ] = 0;
2505 DEBUG_TRIE_COMPILE_r({
2506 PerlIO_printf(Perl_debug_log,
2507 "%*sStclass Failtable (%"UVuf" states): 0",
2508 (int)(depth * 2), "", (UV)numstates
2510 for( q_read=1; q_read<numstates; q_read++ ) {
2511 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2513 PerlIO_printf(Perl_debug_log, "\n");
2516 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2521 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2522 * These need to be revisited when a newer toolchain becomes available.
2524 #if defined(__sparc64__) && defined(__GNUC__)
2525 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2526 # undef SPARC64_GCC_WORKAROUND
2527 # define SPARC64_GCC_WORKAROUND 1
2531 #define DEBUG_PEEP(str,scan,depth) \
2532 DEBUG_OPTIMISE_r({if (scan){ \
2533 SV * const mysv=sv_newmortal(); \
2534 regnode *Next = regnext(scan); \
2535 regprop(RExC_rx, mysv, scan); \
2536 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2537 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2538 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2542 /* The below joins as many adjacent EXACTish nodes as possible into a single
2543 * one, and looks for problematic sequences of characters whose folds vs.
2544 * non-folds have sufficiently different lengths, that the optimizer would be
2545 * fooled into rejecting legitimate matches of them, and the trie construction
2546 * code can't cope with them. The joining is only done if:
2547 * 1) there is room in the current conglomerated node to entirely contain the
2549 * 2) they are the exact same node type
2551 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2552 * these get optimized out
2554 * If there are problematic code sequences, *min_subtract is set to the delta
2555 * that the minimum size of the node can be less than its actual size. And,
2556 * the node type of the result is changed to reflect that it contains these
2559 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2560 * and contains LATIN SMALL LETTER SHARP S
2562 * This is as good a place as any to discuss the design of handling these
2563 * problematic sequences. It's been wrong in Perl for a very long time. There
2564 * are three code points in Unicode whose folded lengths differ so much from
2565 * the un-folded lengths that it causes problems for the optimizer and trie
2566 * construction. Why only these are problematic, and not others where lengths
2567 * also differ is something I (khw) do not understand. New versions of Unicode
2568 * might add more such code points. Hopefully the logic in fold_grind.t that
2569 * figures out what to test (in part by verifying that each size-combination
2570 * gets tested) will catch any that do come along, so they can be added to the
2571 * special handling below. The chances of new ones are actually rather small,
2572 * as most, if not all, of the world's scripts that have casefolding have
2573 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2574 * made to allow compatibility with pre-existing standards, and almost all of
2575 * those have already been dealt with. These would otherwise be the most
2576 * likely candidates for generating further tricky sequences. In other words,
2577 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2578 * with pre-existing standards, and there aren't many of those left.
2580 * The previous designs for dealing with these involved assigning a special
2581 * node for them. This approach doesn't work, as evidenced by this example:
2582 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2583 * Both these fold to "sss", but if the pattern is parsed to create a node of
2584 * that would match just the \xDF, it won't be able to handle the case where a
2585 * successful match would have to cross the node's boundary. The new approach
2586 * that hopefully generally solves the problem generates an EXACTFU_SS node
2589 * There are a number of components to the approach (a lot of work for just
2590 * three code points!):
2591 * 1) This routine examines each EXACTFish node that could contain the
2592 * problematic sequences. It returns in *min_subtract how much to
2593 * subtract from the the actual length of the string to get a real minimum
2594 * for one that could match it. This number is usually 0 except for the
2595 * problematic sequences. This delta is used by the caller to adjust the
2596 * min length of the match, and the delta between min and max, so that the
2597 * optimizer doesn't reject these possibilities based on size constraints.
2598 * 2) These sequences are not currently correctly handled by the trie code
2599 * either, so it changes the joined node type to ops that are not handled
2600 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2601 * 3) This is sufficient for the two Greek sequences (described below), but
2602 * the one involving the Sharp s (\xDF) needs more. The node type
2603 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2604 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2605 * case where there is a possible fold length change. That means that a
2606 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2607 * itself with length changes, and so can be processed faster. regexec.c
2608 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2609 * is pre-folded by regcomp.c. This saves effort in regex matching.
2610 * However, probably mostly for historical reasons, the pre-folding isn't
2611 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2612 * nodes, as what they fold to isn't known until runtime.) The fold
2613 * possibilities for the non-UTF8 patterns are quite simple, except for
2614 * the sharp s. All the ones that don't involve a UTF-8 target string
2615 * are members of a fold-pair, and arrays are set up for all of them
2616 * that quickly find the other member of the pair. It might actually
2617 * be faster to pre-fold these, but it isn't currently done, except for
2618 * the sharp s. Code elsewhere in this file makes sure that it gets
2619 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2620 * issues described in the next item.
2621 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2622 * 'ss' or not is not knowable at compile time. It will match iff the
2623 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2624 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2625 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2626 * described in item 3). An assumption that the optimizer part of
2627 * regexec.c (probably unwittingly) makes is that a character in the
2628 * pattern corresponds to at most a single character in the target string.
2629 * (And I do mean character, and not byte here, unlike other parts of the
2630 * documentation that have never been updated to account for multibyte
2631 * Unicode.) This assumption is wrong only in this case, as all other
2632 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2633 * virtue of having this file pre-fold UTF-8 patterns. I'm
2634 * reluctant to try to change this assumption, so instead the code punts.
2635 * This routine examines EXACTF nodes for the sharp s, and returns a
2636 * boolean indicating whether or not the node is an EXACTF node that
2637 * contains a sharp s. When it is true, the caller sets a flag that later
2638 * causes the optimizer in this file to not set values for the floating
2639 * and fixed string lengths, and thus avoids the optimizer code in
2640 * regexec.c that makes the invalid assumption. Thus, there is no
2641 * optimization based on string lengths for EXACTF nodes that contain the
2642 * sharp s. This only happens for /id rules (which means the pattern
2646 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2647 if (PL_regkind[OP(scan)] == EXACT) \
2648 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2651 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2652 /* Merge several consecutive EXACTish nodes into one. */
2653 regnode *n = regnext(scan);
2655 regnode *next = scan + NODE_SZ_STR(scan);
2659 regnode *stop = scan;
2660 GET_RE_DEBUG_FLAGS_DECL;
2662 PERL_UNUSED_ARG(depth);
2665 PERL_ARGS_ASSERT_JOIN_EXACT;
2666 #ifndef EXPERIMENTAL_INPLACESCAN
2667 PERL_UNUSED_ARG(flags);
2668 PERL_UNUSED_ARG(val);
2670 DEBUG_PEEP("join",scan,depth);
2672 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2673 * EXACT ones that are mergeable to the current one. */
2675 && (PL_regkind[OP(n)] == NOTHING
2676 || (stringok && OP(n) == OP(scan)))
2678 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2681 if (OP(n) == TAIL || n > next)
2683 if (PL_regkind[OP(n)] == NOTHING) {
2684 DEBUG_PEEP("skip:",n,depth);
2685 NEXT_OFF(scan) += NEXT_OFF(n);
2686 next = n + NODE_STEP_REGNODE;
2693 else if (stringok) {
2694 const unsigned int oldl = STR_LEN(scan);
2695 regnode * const nnext = regnext(n);
2697 if (oldl + STR_LEN(n) > U8_MAX)
2700 DEBUG_PEEP("merg",n,depth);
2703 NEXT_OFF(scan) += NEXT_OFF(n);
2704 STR_LEN(scan) += STR_LEN(n);
2705 next = n + NODE_SZ_STR(n);
2706 /* Now we can overwrite *n : */
2707 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2715 #ifdef EXPERIMENTAL_INPLACESCAN
2716 if (flags && !NEXT_OFF(n)) {
2717 DEBUG_PEEP("atch", val, depth);
2718 if (reg_off_by_arg[OP(n)]) {
2719 ARG_SET(n, val - n);
2722 NEXT_OFF(n) = val - n;
2730 *has_exactf_sharp_s = FALSE;
2732 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2733 * can now analyze for sequences of problematic code points. (Prior to
2734 * this final joining, sequences could have been split over boundaries, and
2735 * hence missed). The sequences only happen in folding, hence for any
2736 * non-EXACT EXACTish node */
2737 if (OP(scan) != EXACT) {
2739 U8 * s0 = (U8*) STRING(scan);
2740 U8 * const s_end = s0 + STR_LEN(scan);
2742 /* The below is perhaps overboard, but this allows us to save a test
2743 * each time through the loop at the expense of a mask. This is
2744 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2745 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2746 * This uses an exclusive 'or' to find that bit and then inverts it to
2747 * form a mask, with just a single 0, in the bit position where 'S' and
2749 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2750 const U8 s_masked = 's' & S_or_s_mask;
2752 /* One pass is made over the node's string looking for all the
2753 * possibilities. to avoid some tests in the loop, there are two main
2754 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2758 /* There are two problematic Greek code points in Unicode
2761 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2762 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2768 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2769 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2771 * This means that in case-insensitive matching (or "loose
2772 * matching", as Unicode calls it), an EXACTF of length six (the
2773 * UTF-8 encoded byte length of the above casefolded versions) can
2774 * match a target string of length two (the byte length of UTF-8
2775 * encoded U+0390 or U+03B0). This would rather mess up the
2776 * minimum length computation. (there are other code points that
2777 * also fold to these two sequences, but the delta is smaller)
2779 * If these sequences are found, the minimum length is decreased by
2780 * four (six minus two).
2782 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2783 * LETTER SHARP S. We decrease the min length by 1 for each
2784 * occurrence of 'ss' found */
2786 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2787 # define U390_first_byte 0xb4
2788 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2789 # define U3B0_first_byte 0xb5
2790 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2792 # define U390_first_byte 0xce
2793 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2794 # define U3B0_first_byte 0xcf
2795 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2797 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2798 yields a net of 0 */
2799 /* Examine the string for one of the problematic sequences */
2801 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2802 * sequence we are looking for is 2 */
2806 /* Look for the first byte in each problematic sequence */
2808 /* We don't have to worry about other things that fold to
2809 * 's' (such as the long s, U+017F), as all above-latin1
2810 * code points have been pre-folded */
2814 /* Current character is an 's' or 'S'. If next one is
2815 * as well, we have the dreaded sequence */
2816 if (((*(s+1) & S_or_s_mask) == s_masked)
2817 /* These two node types don't have special handling
2819 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2822 OP(scan) = EXACTFU_SS;
2823 s++; /* No need to look at this character again */
2827 case U390_first_byte:
2828 if (s_end - s >= len
2830 /* The 1's are because are skipping comparing the
2832 && memEQ(s + 1, U390_tail, len - 1))
2834 goto greek_sequence;
2838 case U3B0_first_byte:
2839 if (! (s_end - s >= len
2840 && memEQ(s + 1, U3B0_tail, len - 1)))
2847 /* This can't currently be handled by trie's, so change
2848 * the node type to indicate this. If EXACTFA and
2849 * EXACTFL were ever to be handled by trie's, this
2850 * would have to be changed. If this node has already
2851 * been changed to EXACTFU_SS in this loop, leave it as
2852 * is. (I (khw) think it doesn't matter in regexec.c
2853 * for UTF patterns, but no need to change it */
2854 if (OP(scan) == EXACTFU) {
2855 OP(scan) = EXACTFU_TRICKYFOLD;
2857 s += 6; /* We already know what this sequence is. Skip
2863 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2865 /* Here, the pattern is not UTF-8. We need to look only for the
2866 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2867 * in the final position. Otherwise we can stop looking 1 byte
2868 * earlier because have to find both the first and second 's' */
2869 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2871 for (s = s0; s < upper; s++) {
2876 && ((*(s+1) & S_or_s_mask) == s_masked))
2880 /* EXACTF nodes need to know that the minimum
2881 * length changed so that a sharp s in the string
2882 * can match this ss in the pattern, but they
2883 * remain EXACTF nodes, as they are not trie'able,
2884 * so don't have to invent a new node type to
2885 * exclude them from the trie code */
2886 if (OP(scan) != EXACTF) {
2887 OP(scan) = EXACTFU_SS;
2892 case LATIN_SMALL_LETTER_SHARP_S:
2893 if (OP(scan) == EXACTF) {
2894 *has_exactf_sharp_s = TRUE;
2903 /* Allow dumping but overwriting the collection of skipped
2904 * ops and/or strings with fake optimized ops */
2905 n = scan + NODE_SZ_STR(scan);
2913 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2917 /* REx optimizer. Converts nodes into quicker variants "in place".
2918 Finds fixed substrings. */
2920 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2921 to the position after last scanned or to NULL. */
2923 #define INIT_AND_WITHP \
2924 assert(!and_withp); \
2925 Newx(and_withp,1,struct regnode_charclass_class); \
2926 SAVEFREEPV(and_withp)
2928 /* this is a chain of data about sub patterns we are processing that
2929 need to be handled separately/specially in study_chunk. Its so
2930 we can simulate recursion without losing state. */
2932 typedef struct scan_frame {
2933 regnode *last; /* last node to process in this frame */
2934 regnode *next; /* next node to process when last is reached */
2935 struct scan_frame *prev; /*previous frame*/
2936 I32 stop; /* what stopparen do we use */
2940 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2942 #define CASE_SYNST_FNC(nAmE) \
2944 if (flags & SCF_DO_STCLASS_AND) { \
2945 for (value = 0; value < 256; value++) \
2946 if (!is_ ## nAmE ## _cp(value)) \
2947 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2950 for (value = 0; value < 256; value++) \
2951 if (is_ ## nAmE ## _cp(value)) \
2952 ANYOF_BITMAP_SET(data->start_class, value); \
2956 if (flags & SCF_DO_STCLASS_AND) { \
2957 for (value = 0; value < 256; value++) \
2958 if (is_ ## nAmE ## _cp(value)) \
2959 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2962 for (value = 0; value < 256; value++) \
2963 if (!is_ ## nAmE ## _cp(value)) \
2964 ANYOF_BITMAP_SET(data->start_class, value); \
2971 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2972 I32 *minlenp, I32 *deltap,
2977 struct regnode_charclass_class *and_withp,
2978 U32 flags, U32 depth)
2979 /* scanp: Start here (read-write). */
2980 /* deltap: Write maxlen-minlen here. */
2981 /* last: Stop before this one. */
2982 /* data: string data about the pattern */
2983 /* stopparen: treat close N as END */
2984 /* recursed: which subroutines have we recursed into */
2985 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2988 I32 min = 0, pars = 0, code;
2989 regnode *scan = *scanp, *next;
2991 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2992 int is_inf_internal = 0; /* The studied chunk is infinite */
2993 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2994 scan_data_t data_fake;
2995 SV *re_trie_maxbuff = NULL;
2996 regnode *first_non_open = scan;
2997 I32 stopmin = I32_MAX;
2998 scan_frame *frame = NULL;
2999 GET_RE_DEBUG_FLAGS_DECL;
3001 PERL_ARGS_ASSERT_STUDY_CHUNK;
3004 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3008 while (first_non_open && OP(first_non_open) == OPEN)
3009 first_non_open=regnext(first_non_open);
3014 while ( scan && OP(scan) != END && scan < last ){
3015 UV min_subtract = 0; /* How much to subtract from the minimum node
3016 length to get a real minimum (because the
3017 folded version may be shorter) */
3018 bool has_exactf_sharp_s = FALSE;
3019 /* Peephole optimizer: */
3020 DEBUG_STUDYDATA("Peep:", data,depth);
3021 DEBUG_PEEP("Peep",scan,depth);
3023 /* Its not clear to khw or hv why this is done here, and not in the
3024 * clauses that deal with EXACT nodes. khw's guess is that it's
3025 * because of a previous design */
3026 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3028 /* Follow the next-chain of the current node and optimize
3029 away all the NOTHINGs from it. */
3030 if (OP(scan) != CURLYX) {
3031 const int max = (reg_off_by_arg[OP(scan)]
3033 /* I32 may be smaller than U16 on CRAYs! */
3034 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3035 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3039 /* Skip NOTHING and LONGJMP. */
3040 while ((n = regnext(n))
3041 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3042 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3043 && off + noff < max)
3045 if (reg_off_by_arg[OP(scan)])
3048 NEXT_OFF(scan) = off;
3053 /* The principal pseudo-switch. Cannot be a switch, since we
3054 look into several different things. */
3055 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3056 || OP(scan) == IFTHEN) {
3057 next = regnext(scan);
3059 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3061 if (OP(next) == code || code == IFTHEN) {
3062 /* NOTE - There is similar code to this block below for handling
3063 TRIE nodes on a re-study. If you change stuff here check there
3065 I32 max1 = 0, min1 = I32_MAX, num = 0;
3066 struct regnode_charclass_class accum;
3067 regnode * const startbranch=scan;
3069 if (flags & SCF_DO_SUBSTR)
3070 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3071 if (flags & SCF_DO_STCLASS)
3072 cl_init_zero(pRExC_state, &accum);
3074 while (OP(scan) == code) {
3075 I32 deltanext, minnext, f = 0, fake;
3076 struct regnode_charclass_class this_class;
3079 data_fake.flags = 0;
3081 data_fake.whilem_c = data->whilem_c;
3082 data_fake.last_closep = data->last_closep;
3085 data_fake.last_closep = &fake;
3087 data_fake.pos_delta = delta;
3088 next = regnext(scan);
3089 scan = NEXTOPER(scan);
3091 scan = NEXTOPER(scan);
3092 if (flags & SCF_DO_STCLASS) {
3093 cl_init(pRExC_state, &this_class);
3094 data_fake.start_class = &this_class;
3095 f = SCF_DO_STCLASS_AND;
3097 if (flags & SCF_WHILEM_VISITED_POS)
3098 f |= SCF_WHILEM_VISITED_POS;
3100 /* we suppose the run is continuous, last=next...*/
3101 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3103 stopparen, recursed, NULL, f,depth+1);
3106 if (max1 < minnext + deltanext)
3107 max1 = minnext + deltanext;
3108 if (deltanext == I32_MAX)
3109 is_inf = is_inf_internal = 1;
3111 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3113 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3114 if ( stopmin > minnext)
3115 stopmin = min + min1;
3116 flags &= ~SCF_DO_SUBSTR;
3118 data->flags |= SCF_SEEN_ACCEPT;
3121 if (data_fake.flags & SF_HAS_EVAL)
3122 data->flags |= SF_HAS_EVAL;
3123 data->whilem_c = data_fake.whilem_c;
3125 if (flags & SCF_DO_STCLASS)
3126 cl_or(pRExC_state, &accum, &this_class);
3128 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3130 if (flags & SCF_DO_SUBSTR) {
3131 data->pos_min += min1;
3132 data->pos_delta += max1 - min1;
3133 if (max1 != min1 || is_inf)
3134 data->longest = &(data->longest_float);
3137 delta += max1 - min1;
3138 if (flags & SCF_DO_STCLASS_OR) {
3139 cl_or(pRExC_state, data->start_class, &accum);
3141 cl_and(data->start_class, and_withp);
3142 flags &= ~SCF_DO_STCLASS;
3145 else if (flags & SCF_DO_STCLASS_AND) {
3147 cl_and(data->start_class, &accum);
3148 flags &= ~SCF_DO_STCLASS;
3151 /* Switch to OR mode: cache the old value of
3152 * data->start_class */
3154 StructCopy(data->start_class, and_withp,
3155 struct regnode_charclass_class);
3156 flags &= ~SCF_DO_STCLASS_AND;
3157 StructCopy(&accum, data->start_class,
3158 struct regnode_charclass_class);
3159 flags |= SCF_DO_STCLASS_OR;
3160 data->start_class->flags |= ANYOF_EOS;
3164 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3167 Assuming this was/is a branch we are dealing with: 'scan' now
3168 points at the item that follows the branch sequence, whatever
3169 it is. We now start at the beginning of the sequence and look
3176 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3178 If we can find such a subsequence we need to turn the first
3179 element into a trie and then add the subsequent branch exact
3180 strings to the trie.
3184 1. patterns where the whole set of branches can be converted.
3186 2. patterns where only a subset can be converted.
3188 In case 1 we can replace the whole set with a single regop
3189 for the trie. In case 2 we need to keep the start and end
3192 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3193 becomes BRANCH TRIE; BRANCH X;
3195 There is an additional case, that being where there is a
3196 common prefix, which gets split out into an EXACT like node
3197 preceding the TRIE node.
3199 If x(1..n)==tail then we can do a simple trie, if not we make
3200 a "jump" trie, such that when we match the appropriate word
3201 we "jump" to the appropriate tail node. Essentially we turn
3202 a nested if into a case structure of sorts.
3207 if (!re_trie_maxbuff) {
3208 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3209 if (!SvIOK(re_trie_maxbuff))
3210 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3212 if ( SvIV(re_trie_maxbuff)>=0 ) {
3214 regnode *first = (regnode *)NULL;
3215 regnode *last = (regnode *)NULL;
3216 regnode *tail = scan;
3221 SV * const mysv = sv_newmortal(); /* for dumping */
3223 /* var tail is used because there may be a TAIL
3224 regop in the way. Ie, the exacts will point to the
3225 thing following the TAIL, but the last branch will
3226 point at the TAIL. So we advance tail. If we
3227 have nested (?:) we may have to move through several
3231 while ( OP( tail ) == TAIL ) {
3232 /* this is the TAIL generated by (?:) */
3233 tail = regnext( tail );
3238 regprop(RExC_rx, mysv, tail );
3239 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3240 (int)depth * 2 + 2, "",
3241 "Looking for TRIE'able sequences. Tail node is: ",
3242 SvPV_nolen_const( mysv )
3248 Step through the branches
3249 cur represents each branch,
3250 noper is the first thing to be matched as part of that branch
3251 noper_next is the regnext() of that node.
3253 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3254 via a "jump trie" but we also support building with NOJUMPTRIE,
3255 which restricts the trie logic to structures like /FOO|BAR/.
3257 If noper is a trieable nodetype then the branch is a possible optimization
3258 target. If we are building under NOJUMPTRIE then we require that noper_next
3259 is the same as scan (our current position in the regex program).
3261 Once we have two or more consecutive such branches we can create a
3262 trie of the EXACT's contents and stitch it in place into the program.
3264 If the sequence represents all of the branches in the alternation we
3265 replace the entire thing with a single TRIE node.
3267 Otherwise when it is a subsequence we need to stitch it in place and
3268 replace only the relevant branches. This means the first branch has
3269 to remain as it is used by the alternation logic, and its next pointer,
3270 and needs to be repointed at the item on the branch chain following
3271 the last branch we have optimized away.
3273 This could be either a BRANCH, in which case the subsequence is internal,
3274 or it could be the item following the branch sequence in which case the
3275 subsequence is at the end (which does not necessarily mean the first node
3276 is the start of the alternation).
3278 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3281 ----------------+-----------
3285 EXACTFU_SS | EXACTFU
3286 EXACTFU_TRICKYFOLD | EXACTFU
3291 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3292 ( EXACT == (X) ) ? EXACT : \
3293 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3296 /* dont use tail as the end marker for this traverse */
3297 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3298 regnode * const noper = NEXTOPER( cur );
3299 U8 noper_type = OP( noper );
3300 U8 noper_trietype = TRIE_TYPE( noper_type );
3301 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3302 regnode * const noper_next = regnext( noper );
3306 regprop(RExC_rx, mysv, cur);
3307 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3308 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3310 regprop(RExC_rx, mysv, noper);
3311 PerlIO_printf( Perl_debug_log, " -> %s",
3312 SvPV_nolen_const(mysv));
3315 regprop(RExC_rx, mysv, noper_next );
3316 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3317 SvPV_nolen_const(mysv));
3319 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3320 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3323 /* Is noper a trieable nodetype that can be merged with the
3324 * current trie (if there is one)? */
3328 ( noper_trietype == NOTHING )
3330 ( trietype == NOTHING )
3332 ( trietype == noper_trietype )
3335 && noper_next == tail
3339 /* Handle mergable triable node
3340 * Either we are the first node in a new trieable sequence,
3341 * in which case we do some bookkeeping, otherwise we update
3342 * the end pointer. */
3346 trietype = noper_trietype;
3348 if ( trietype == NOTHING )
3349 trietype = noper_trietype;
3352 } /* end handle mergable triable node */
3354 /* handle unmergable node -
3355 * noper may either be a triable node which can not be tried
3356 * together with the current trie, or a non triable node */
3357 if ( last && trietype != NOTHING ) {
3358 /* if last is set then we have found at least two triable branch
3359 * sequences in a row of a similar trietype so we can turn them
3361 make_trie( pRExC_state,
3362 startbranch, first, cur, tail, count,
3363 trietype, depth+1 );
3364 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3368 && noper_next == tail
3371 /* noper is triable, so we can start a new trie sequence */
3374 trietype = noper_trietype;
3376 /* if we already saw a first but the current node is not triable then we have
3377 * to reset the first information. */
3382 } /* end handle unmergable node */
3383 } /* loop over branches */
3385 regprop(RExC_rx, mysv, cur);
3386 PerlIO_printf( Perl_debug_log,
3387 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3388 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3391 if ( last && trietype != NOTHING ) {
3392 /* the last branch of the sequence was part of a trie,
3393 * so we have to construct it here outside of the loop
3395 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3396 #ifdef TRIE_STUDY_OPT
3397 if ( ((made == MADE_EXACT_TRIE &&
3398 startbranch == first)
3399 || ( first_non_open == first )) &&
3401 flags |= SCF_TRIE_RESTUDY;
3402 if ( startbranch == first
3405 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3409 } /* end if ( last) */
3410 } /* TRIE_MAXBUF is non zero */
3415 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3416 scan = NEXTOPER(NEXTOPER(scan));
3417 } else /* single branch is optimized. */
3418 scan = NEXTOPER(scan);
3420 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3421 scan_frame *newframe = NULL;
3426 if (OP(scan) != SUSPEND) {
3427 /* set the pointer */
3428 if (OP(scan) == GOSUB) {
3430 RExC_recurse[ARG2L(scan)] = scan;
3431 start = RExC_open_parens[paren-1];
3432 end = RExC_close_parens[paren-1];
3435 start = RExC_rxi->program + 1;
3439 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3440 SAVEFREEPV(recursed);
3442 if (!PAREN_TEST(recursed,paren+1)) {
3443 PAREN_SET(recursed,paren+1);
3444 Newx(newframe,1,scan_frame);
3446 if (flags & SCF_DO_SUBSTR) {
3447 SCAN_COMMIT(pRExC_state,data,minlenp);
3448 data->longest = &(data->longest_float);
3450 is_inf = is_inf_internal = 1;
3451 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3452 cl_anything(pRExC_state, data->start_class);
3453 flags &= ~SCF_DO_STCLASS;
3456 Newx(newframe,1,scan_frame);
3459 end = regnext(scan);
3464 SAVEFREEPV(newframe);
3465 newframe->next = regnext(scan);
3466 newframe->last = last;
3467 newframe->stop = stopparen;
3468 newframe->prev = frame;
3478 else if (OP(scan) == EXACT) {
3479 I32 l = STR_LEN(scan);
3482 const U8 * const s = (U8*)STRING(scan);
3483 l = utf8_length(s, s + l);
3484 uc = utf8_to_uvchr(s, NULL);
3486 uc = *((U8*)STRING(scan));
3489 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3490 /* The code below prefers earlier match for fixed
3491 offset, later match for variable offset. */
3492 if (data->last_end == -1) { /* Update the start info. */
3493 data->last_start_min = data->pos_min;
3494 data->last_start_max = is_inf
3495 ? I32_MAX : data->pos_min + data->pos_delta;
3497 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3499 SvUTF8_on(data->last_found);
3501 SV * const sv = data->last_found;
3502 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3503 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3504 if (mg && mg->mg_len >= 0)
3505 mg->mg_len += utf8_length((U8*)STRING(scan),
3506 (U8*)STRING(scan)+STR_LEN(scan));
3508 data->last_end = data->pos_min + l;
3509 data->pos_min += l; /* As in the first entry. */
3510 data->flags &= ~SF_BEFORE_EOL;
3512 if (flags & SCF_DO_STCLASS_AND) {
3513 /* Check whether it is compatible with what we know already! */
3517 /* If compatible, we or it in below. It is compatible if is
3518 * in the bitmp and either 1) its bit or its fold is set, or 2)
3519 * it's for a locale. Even if there isn't unicode semantics
3520 * here, at runtime there may be because of matching against a
3521 * utf8 string, so accept a possible false positive for
3522 * latin1-range folds */
3524 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3525 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3526 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3527 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3532 ANYOF_CLASS_ZERO(data->start_class);
3533 ANYOF_BITMAP_ZERO(data->start_class);
3535 ANYOF_BITMAP_SET(data->start_class, uc);
3536 else if (uc >= 0x100) {
3539 /* Some Unicode code points fold to the Latin1 range; as
3540 * XXX temporary code, instead of figuring out if this is
3541 * one, just assume it is and set all the start class bits
3542 * that could be some such above 255 code point's fold
3543 * which will generate fals positives. As the code
3544 * elsewhere that does compute the fold settles down, it
3545 * can be extracted out and re-used here */
3546 for (i = 0; i < 256; i++){
3547 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3548 ANYOF_BITMAP_SET(data->start_class, i);
3552 data->start_class->flags &= ~ANYOF_EOS;
3554 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3556 else if (flags & SCF_DO_STCLASS_OR) {
3557 /* false positive possible if the class is case-folded */
3559 ANYOF_BITMAP_SET(data->start_class, uc);
3561 data->start_class->flags |= ANYOF_UNICODE_ALL;
3562 data->start_class->flags &= ~ANYOF_EOS;
3563 cl_and(data->start_class, and_withp);
3565 flags &= ~SCF_DO_STCLASS;
3567 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3568 I32 l = STR_LEN(scan);
3569 UV uc = *((U8*)STRING(scan));
3571 /* Search for fixed substrings supports EXACT only. */
3572 if (flags & SCF_DO_SUBSTR) {
3574 SCAN_COMMIT(pRExC_state, data, minlenp);
3577 const U8 * const s = (U8 *)STRING(scan);
3578 l = utf8_length(s, s + l);
3579 uc = utf8_to_uvchr(s, NULL);
3581 else if (has_exactf_sharp_s) {
3582 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3584 min += l - min_subtract;
3588 delta += min_subtract;
3589 if (flags & SCF_DO_SUBSTR) {
3590 data->pos_min += l - min_subtract;
3591 if (data->pos_min < 0) {
3594 data->pos_delta += min_subtract;
3596 data->longest = &(data->longest_float);
3599 if (flags & SCF_DO_STCLASS_AND) {
3600 /* Check whether it is compatible with what we know already! */
3603 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3604 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3605 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3609 ANYOF_CLASS_ZERO(data->start_class);
3610 ANYOF_BITMAP_ZERO(data->start_class);
3612 ANYOF_BITMAP_SET(data->start_class, uc);
3613 data->start_class->flags &= ~ANYOF_EOS;
3614 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3615 if (OP(scan) == EXACTFL) {
3616 /* XXX This set is probably no longer necessary, and
3617 * probably wrong as LOCALE now is on in the initial
3619 data->start_class->flags |= ANYOF_LOCALE;
3623 /* Also set the other member of the fold pair. In case
3624 * that unicode semantics is called for at runtime, use
3625 * the full latin1 fold. (Can't do this for locale,
3626 * because not known until runtime) */
3627 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3629 /* All other (EXACTFL handled above) folds except under
3630 * /iaa that include s, S, and sharp_s also may include
3632 if (OP(scan) != EXACTFA) {
3633 if (uc == 's' || uc == 'S') {
3634 ANYOF_BITMAP_SET(data->start_class,
3635 LATIN_SMALL_LETTER_SHARP_S);
3637 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3638 ANYOF_BITMAP_SET(data->start_class, 's');
3639 ANYOF_BITMAP_SET(data->start_class, 'S');
3644 else if (uc >= 0x100) {
3646 for (i = 0; i < 256; i++){
3647 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3648 ANYOF_BITMAP_SET(data->start_class, i);
3653 else if (flags & SCF_DO_STCLASS_OR) {
3654 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3655 /* false positive possible if the class is case-folded.
3656 Assume that the locale settings are the same... */
3658 ANYOF_BITMAP_SET(data->start_class, uc);
3659 if (OP(scan) != EXACTFL) {
3661 /* And set the other member of the fold pair, but
3662 * can't do that in locale because not known until
3664 ANYOF_BITMAP_SET(data->start_class,
3665 PL_fold_latin1[uc]);
3667 /* All folds except under /iaa that include s, S,
3668 * and sharp_s also may include the others */
3669 if (OP(scan) != EXACTFA) {
3670 if (uc == 's' || uc == 'S') {
3671 ANYOF_BITMAP_SET(data->start_class,
3672 LATIN_SMALL_LETTER_SHARP_S);
3674 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3675 ANYOF_BITMAP_SET(data->start_class, 's');
3676 ANYOF_BITMAP_SET(data->start_class, 'S');
3681 data->start_class->flags &= ~ANYOF_EOS;
3683 cl_and(data->start_class, and_withp);
3685 flags &= ~SCF_DO_STCLASS;
3687 else if (REGNODE_VARIES(OP(scan))) {
3688 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3689 I32 f = flags, pos_before = 0;
3690 regnode * const oscan = scan;
3691 struct regnode_charclass_class this_class;
3692 struct regnode_charclass_class *oclass = NULL;
3693 I32 next_is_eval = 0;
3695 switch (PL_regkind[OP(scan)]) {
3696 case WHILEM: /* End of (?:...)* . */
3697 scan = NEXTOPER(scan);
3700 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3701 next = NEXTOPER(scan);
3702 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3704 maxcount = REG_INFTY;
3705 next = regnext(scan);
3706 scan = NEXTOPER(scan);
3710 if (flags & SCF_DO_SUBSTR)
3715 if (flags & SCF_DO_STCLASS) {
3717 maxcount = REG_INFTY;
3718 next = regnext(scan);
3719 scan = NEXTOPER(scan);
3722 is_inf = is_inf_internal = 1;
3723 scan = regnext(scan);
3724 if (flags & SCF_DO_SUBSTR) {
3725 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3726 data->longest = &(data->longest_float);
3728 goto optimize_curly_tail;
3730 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3731 && (scan->flags == stopparen))
3736 mincount = ARG1(scan);
3737 maxcount = ARG2(scan);
3739 next = regnext(scan);
3740 if (OP(scan) == CURLYX) {
3741 I32 lp = (data ? *(data->last_closep) : 0);
3742 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3744 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3745 next_is_eval = (OP(scan) == EVAL);
3747 if (flags & SCF_DO_SUBSTR) {
3748 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3749 pos_before = data->pos_min;
3753 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3755 data->flags |= SF_IS_INF;
3757 if (flags & SCF_DO_STCLASS) {
3758 cl_init(pRExC_state, &this_class);
3759 oclass = data->start_class;
3760 data->start_class = &this_class;
3761 f |= SCF_DO_STCLASS_AND;
3762 f &= ~SCF_DO_STCLASS_OR;
3764 /* Exclude from super-linear cache processing any {n,m}
3765 regops for which the combination of input pos and regex
3766 pos is not enough information to determine if a match
3769 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3770 regex pos at the \s*, the prospects for a match depend not
3771 only on the input position but also on how many (bar\s*)
3772 repeats into the {4,8} we are. */
3773 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3774 f &= ~SCF_WHILEM_VISITED_POS;
3776 /* This will finish on WHILEM, setting scan, or on NULL: */
3777 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3778 last, data, stopparen, recursed, NULL,
3780 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3782 if (flags & SCF_DO_STCLASS)
3783 data->start_class = oclass;
3784 if (mincount == 0 || minnext == 0) {
3785 if (flags & SCF_DO_STCLASS_OR) {
3786 cl_or(pRExC_state, data->start_class, &this_class);
3788 else if (flags & SCF_DO_STCLASS_AND) {
3789 /* Switch to OR mode: cache the old value of
3790 * data->start_class */
3792 StructCopy(data->start_class, and_withp,
3793 struct regnode_charclass_class);
3794 flags &= ~SCF_DO_STCLASS_AND;
3795 StructCopy(&this_class, data->start_class,
3796 struct regnode_charclass_class);
3797 flags |= SCF_DO_STCLASS_OR;
3798 data->start_class->flags |= ANYOF_EOS;
3800 } else { /* Non-zero len */
3801 if (flags & SCF_DO_STCLASS_OR) {
3802 cl_or(pRExC_state, data->start_class, &this_class);
3803 cl_and(data->start_class, and_withp);
3805 else if (flags & SCF_DO_STCLASS_AND)
3806 cl_and(data->start_class, &this_class);
3807 flags &= ~SCF_DO_STCLASS;
3809 if (!scan) /* It was not CURLYX, but CURLY. */
3811 if ( /* ? quantifier ok, except for (?{ ... }) */
3812 (next_is_eval || !(mincount == 0 && maxcount == 1))
3813 && (minnext == 0) && (deltanext == 0)
3814 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3815 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3817 ckWARNreg(RExC_parse,
3818 "Quantifier unexpected on zero-length expression");
3821 min += minnext * mincount;
3822 is_inf_internal |= ((maxcount == REG_INFTY
3823 && (minnext + deltanext) > 0)
3824 || deltanext == I32_MAX);
3825 is_inf |= is_inf_internal;
3826 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3828 /* Try powerful optimization CURLYX => CURLYN. */
3829 if ( OP(oscan) == CURLYX && data
3830 && data->flags & SF_IN_PAR
3831 && !(data->flags & SF_HAS_EVAL)
3832 && !deltanext && minnext == 1 ) {
3833 /* Try to optimize to CURLYN. */
3834 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3835 regnode * const nxt1 = nxt;
3842 if (!REGNODE_SIMPLE(OP(nxt))
3843 && !(PL_regkind[OP(nxt)] == EXACT
3844 && STR_LEN(nxt) == 1))
3850 if (OP(nxt) != CLOSE)
3852 if (RExC_open_parens) {
3853 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3854 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3856 /* Now we know that nxt2 is the only contents: */
3857 oscan->flags = (U8)ARG(nxt);
3859 OP(nxt1) = NOTHING; /* was OPEN. */
3862 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3863 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3864 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3865 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3866 OP(nxt + 1) = OPTIMIZED; /* was count. */
3867 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3872 /* Try optimization CURLYX => CURLYM. */
3873 if ( OP(oscan) == CURLYX && data
3874 && !(data->flags & SF_HAS_PAR)
3875 && !(data->flags & SF_HAS_EVAL)
3876 && !deltanext /* atom is fixed width */
3877 && minnext != 0 /* CURLYM can't handle zero width */
3879 /* XXXX How to optimize if data == 0? */
3880 /* Optimize to a simpler form. */
3881 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3885 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3886 && (OP(nxt2) != WHILEM))
3888 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3889 /* Need to optimize away parenths. */
3890 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3891 /* Set the parenth number. */
3892 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3894 oscan->flags = (U8)ARG(nxt);
3895 if (RExC_open_parens) {
3896 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3897 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3899 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3900 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3903 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3904 OP(nxt + 1) = OPTIMIZED; /* was count. */
3905 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3906 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3909 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3910 regnode *nnxt = regnext(nxt1);
3912 if (reg_off_by_arg[OP(nxt1)])
3913 ARG_SET(nxt1, nxt2 - nxt1);
3914 else if (nxt2 - nxt1 < U16_MAX)
3915 NEXT_OFF(nxt1) = nxt2 - nxt1;
3917 OP(nxt) = NOTHING; /* Cannot beautify */
3922 /* Optimize again: */
3923 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3924 NULL, stopparen, recursed, NULL, 0,depth+1);
3929 else if ((OP(oscan) == CURLYX)
3930 && (flags & SCF_WHILEM_VISITED_POS)
3931 /* See the comment on a similar expression above.
3932 However, this time it's not a subexpression
3933 we care about, but the expression itself. */
3934 && (maxcount == REG_INFTY)
3935 && data && ++data->whilem_c < 16) {
3936 /* This stays as CURLYX, we can put the count/of pair. */
3937 /* Find WHILEM (as in regexec.c) */
3938 regnode *nxt = oscan + NEXT_OFF(oscan);
3940 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3942 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3943 | (RExC_whilem_seen << 4)); /* On WHILEM */
3945 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3947 if (flags & SCF_DO_SUBSTR) {
3948 SV *last_str = NULL;
3949 int counted = mincount != 0;
3951 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3952 #if defined(SPARC64_GCC_WORKAROUND)
3955 const char *s = NULL;
3958 if (pos_before >= data->last_start_min)
3961 b = data->last_start_min;
3964 s = SvPV_const(data->last_found, l);
3965 old = b - data->last_start_min;
3968 I32 b = pos_before >= data->last_start_min
3969 ? pos_before : data->last_start_min;
3971 const char * const s = SvPV_const(data->last_found, l);
3972 I32 old = b - data->last_start_min;
3976 old = utf8_hop((U8*)s, old) - (U8*)s;
3978 /* Get the added string: */
3979 last_str = newSVpvn_utf8(s + old, l, UTF);
3980 if (deltanext == 0 && pos_before == b) {
3981 /* What was added is a constant string */
3983 SvGROW(last_str, (mincount * l) + 1);
3984 repeatcpy(SvPVX(last_str) + l,
3985 SvPVX_const(last_str), l, mincount - 1);
3986 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3987 /* Add additional parts. */
3988 SvCUR_set(data->last_found,
3989 SvCUR(data->last_found) - l);
3990 sv_catsv(data->last_found, last_str);
3992 SV * sv = data->last_found;
3994 SvUTF8(sv) && SvMAGICAL(sv) ?
3995 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3996 if (mg && mg->mg_len >= 0)
3997 mg->mg_len += CHR_SVLEN(last_str) - l;
3999 data->last_end += l * (mincount - 1);
4002 /* start offset must point into the last copy */
4003 data->last_start_min += minnext * (mincount - 1);
4004 data->last_start_max += is_inf ? I32_MAX
4005 : (maxcount - 1) * (minnext + data->pos_delta);
4008 /* It is counted once already... */
4009 data->pos_min += minnext * (mincount - counted);
4010 data->pos_delta += - counted * deltanext +
4011 (minnext + deltanext) * maxcount - minnext * mincount;
4012 if (mincount != maxcount) {
4013 /* Cannot extend fixed substrings found inside
4015 SCAN_COMMIT(pRExC_state,data,minlenp);
4016 if (mincount && last_str) {
4017 SV * const sv = data->last_found;
4018 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4019 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4023 sv_setsv(sv, last_str);
4024 data->last_end = data->pos_min;
4025 data->last_start_min =
4026 data->pos_min - CHR_SVLEN(last_str);
4027 data->last_start_max = is_inf
4029 : data->pos_min + data->pos_delta
4030 - CHR_SVLEN(last_str);
4032 data->longest = &(data->longest_float);
4034 SvREFCNT_dec(last_str);
4036 if (data && (fl & SF_HAS_EVAL))
4037 data->flags |= SF_HAS_EVAL;
4038 optimize_curly_tail:
4039 if (OP(oscan) != CURLYX) {
4040 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4042 NEXT_OFF(oscan) += NEXT_OFF(next);
4045 default: /* REF, ANYOFV, and CLUMP only? */
4046 if (flags & SCF_DO_SUBSTR) {
4047 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4048 data->longest = &(data->longest_float);
4050 is_inf = is_inf_internal = 1;
4051 if (flags & SCF_DO_STCLASS_OR)
4052 cl_anything(pRExC_state, data->start_class);
4053 flags &= ~SCF_DO_STCLASS;
4057 else if (OP(scan) == LNBREAK) {
4058 if (flags & SCF_DO_STCLASS) {
4060 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4061 if (flags & SCF_DO_STCLASS_AND) {
4062 for (value = 0; value < 256; value++)
4063 if (!is_VERTWS_cp(value))
4064 ANYOF_BITMAP_CLEAR(data->start_class, value);
4067 for (value = 0; value < 256; value++)
4068 if (is_VERTWS_cp(value))
4069 ANYOF_BITMAP_SET(data->start_class, value);
4071 if (flags & SCF_DO_STCLASS_OR)
4072 cl_and(data->start_class, and_withp);
4073 flags &= ~SCF_DO_STCLASS;
4077 if (flags & SCF_DO_SUBSTR) {
4078 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4080 data->pos_delta += 1;
4081 data->longest = &(data->longest_float);
4084 else if (REGNODE_SIMPLE(OP(scan))) {
4087 if (flags & SCF_DO_SUBSTR) {
4088 SCAN_COMMIT(pRExC_state,data,minlenp);
4092 if (flags & SCF_DO_STCLASS) {
4093 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4095 /* Some of the logic below assumes that switching
4096 locale on will only add false positives. */
4097 switch (PL_regkind[OP(scan)]) {
4101 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4102 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4103 cl_anything(pRExC_state, data->start_class);
4106 if (OP(scan) == SANY)
4108 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4109 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4110 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4111 cl_anything(pRExC_state, data->start_class);
4113 if (flags & SCF_DO_STCLASS_AND || !value)
4114 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4117 if (flags & SCF_DO_STCLASS_AND)
4118 cl_and(data->start_class,
4119 (struct regnode_charclass_class*)scan);
4121 cl_or(pRExC_state, data->start_class,
4122 (struct regnode_charclass_class*)scan);
4125 if (flags & SCF_DO_STCLASS_AND) {
4126 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4127 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4128 if (OP(scan) == ALNUMU) {
4129 for (value = 0; value < 256; value++) {
4130 if (!isWORDCHAR_L1(value)) {
4131 ANYOF_BITMAP_CLEAR(data->start_class, value);
4135 for (value = 0; value < 256; value++) {
4136 if (!isALNUM(value)) {
4137 ANYOF_BITMAP_CLEAR(data->start_class, value);
4144 if (data->start_class->flags & ANYOF_LOCALE)
4145 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4147 /* Even if under locale, set the bits for non-locale
4148 * in case it isn't a true locale-node. This will
4149 * create false positives if it truly is locale */
4150 if (OP(scan) == ALNUMU) {
4151 for (value = 0; value < 256; value++) {
4152 if (isWORDCHAR_L1(value)) {
4153 ANYOF_BITMAP_SET(data->start_class, value);
4157 for (value = 0; value < 256; value++) {
4158 if (isALNUM(value)) {
4159 ANYOF_BITMAP_SET(data->start_class, value);
4166 if (flags & SCF_DO_STCLASS_AND) {
4167 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4168 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4169 if (OP(scan) == NALNUMU) {
4170 for (value = 0; value < 256; value++) {
4171 if (isWORDCHAR_L1(value)) {
4172 ANYOF_BITMAP_CLEAR(data->start_class, value);
4176 for (value = 0; value < 256; value++) {
4177 if (isALNUM(value)) {
4178 ANYOF_BITMAP_CLEAR(data->start_class, value);
4185 if (data->start_class->flags & ANYOF_LOCALE)
4186 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4188 /* Even if under locale, set the bits for non-locale in
4189 * case it isn't a true locale-node. This will create
4190 * false positives if it truly is locale */
4191 if (OP(scan) == NALNUMU) {
4192 for (value = 0; value < 256; value++) {
4193 if (! isWORDCHAR_L1(value)) {
4194 ANYOF_BITMAP_SET(data->start_class, value);
4198 for (value = 0; value < 256; value++) {
4199 if (! isALNUM(value)) {
4200 ANYOF_BITMAP_SET(data->start_class, value);
4207 if (flags & SCF_DO_STCLASS_AND) {
4208 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4209 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4210 if (OP(scan) == SPACEU) {
4211 for (value = 0; value < 256; value++) {
4212 if (!isSPACE_L1(value)) {
4213 ANYOF_BITMAP_CLEAR(data->start_class, value);
4217 for (value = 0; value < 256; value++) {
4218 if (!isSPACE(value)) {
4219 ANYOF_BITMAP_CLEAR(data->start_class, value);
4226 if (data->start_class->flags & ANYOF_LOCALE) {
4227 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4229 if (OP(scan) == SPACEU) {
4230 for (value = 0; value < 256; value++) {
4231 if (isSPACE_L1(value)) {
4232 ANYOF_BITMAP_SET(data->start_class, value);
4236 for (value = 0; value < 256; value++) {
4237 if (isSPACE(value)) {
4238 ANYOF_BITMAP_SET(data->start_class, value);
4245 if (flags & SCF_DO_STCLASS_AND) {
4246 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4247 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4248 if (OP(scan) == NSPACEU) {
4249 for (value = 0; value < 256; value++) {
4250 if (isSPACE_L1(value)) {
4251 ANYOF_BITMAP_CLEAR(data->start_class, value);
4255 for (value = 0; value < 256; value++) {
4256 if (isSPACE(value)) {
4257 ANYOF_BITMAP_CLEAR(data->start_class, value);
4264 if (data->start_class->flags & ANYOF_LOCALE)
4265 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4266 if (OP(scan) == NSPACEU) {
4267 for (value = 0; value < 256; value++) {
4268 if (!isSPACE_L1(value)) {
4269 ANYOF_BITMAP_SET(data->start_class, value);
4274 for (value = 0; value < 256; value++) {
4275 if (!isSPACE(value)) {
4276 ANYOF_BITMAP_SET(data->start_class, value);
4283 if (flags & SCF_DO_STCLASS_AND) {
4284 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4285 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4286 for (value = 0; value < 256; value++)
4287 if (!isDIGIT(value))
4288 ANYOF_BITMAP_CLEAR(data->start_class, value);
4292 if (data->start_class->flags & ANYOF_LOCALE)
4293 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4294 for (value = 0; value < 256; value++)
4296 ANYOF_BITMAP_SET(data->start_class, value);
4300 if (flags & SCF_DO_STCLASS_AND) {
4301 if (!(data->start_class->flags & ANYOF_LOCALE))
4302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4303 for (value = 0; value < 256; value++)
4305 ANYOF_BITMAP_CLEAR(data->start_class, value);
4308 if (data->start_class->flags & ANYOF_LOCALE)
4309 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4310 for (value = 0; value < 256; value++)
4311 if (!isDIGIT(value))
4312 ANYOF_BITMAP_SET(data->start_class, value);
4315 CASE_SYNST_FNC(VERTWS);
4316 CASE_SYNST_FNC(HORIZWS);
4319 if (flags & SCF_DO_STCLASS_OR)
4320 cl_and(data->start_class, and_withp);
4321 flags &= ~SCF_DO_STCLASS;
4324 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4325 data->flags |= (OP(scan) == MEOL
4329 else if ( PL_regkind[OP(scan)] == BRANCHJ
4330 /* Lookbehind, or need to calculate parens/evals/stclass: */
4331 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4332 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4333 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4334 || OP(scan) == UNLESSM )
4336 /* Negative Lookahead/lookbehind
4337 In this case we can't do fixed string optimisation.
4340 I32 deltanext, minnext, fake = 0;
4342 struct regnode_charclass_class intrnl;
4345 data_fake.flags = 0;
4347 data_fake.whilem_c = data->whilem_c;
4348 data_fake.last_closep = data->last_closep;
4351 data_fake.last_closep = &fake;
4352 data_fake.pos_delta = delta;
4353 if ( flags & SCF_DO_STCLASS && !scan->flags
4354 && OP(scan) == IFMATCH ) { /* Lookahead */
4355 cl_init(pRExC_state, &intrnl);
4356 data_fake.start_class = &intrnl;
4357 f |= SCF_DO_STCLASS_AND;
4359 if (flags & SCF_WHILEM_VISITED_POS)
4360 f |= SCF_WHILEM_VISITED_POS;
4361 next = regnext(scan);
4362 nscan = NEXTOPER(NEXTOPER(scan));
4363 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4364 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4367 FAIL("Variable length lookbehind not implemented");
4369 else if (minnext > (I32)U8_MAX) {
4370 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4372 scan->flags = (U8)minnext;
4375 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4377 if (data_fake.flags & SF_HAS_EVAL)
4378 data->flags |= SF_HAS_EVAL;
4379 data->whilem_c = data_fake.whilem_c;
4381 if (f & SCF_DO_STCLASS_AND) {
4382 if (flags & SCF_DO_STCLASS_OR) {
4383 /* OR before, AND after: ideally we would recurse with
4384 * data_fake to get the AND applied by study of the
4385 * remainder of the pattern, and then derecurse;
4386 * *** HACK *** for now just treat as "no information".
4387 * See [perl #56690].
4389 cl_init(pRExC_state, data->start_class);
4391 /* AND before and after: combine and continue */
4392 const int was = (data->start_class->flags & ANYOF_EOS);
4394 cl_and(data->start_class, &intrnl);
4396 data->start_class->flags |= ANYOF_EOS;
4400 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4402 /* Positive Lookahead/lookbehind
4403 In this case we can do fixed string optimisation,
4404 but we must be careful about it. Note in the case of
4405 lookbehind the positions will be offset by the minimum
4406 length of the pattern, something we won't know about
4407 until after the recurse.
4409 I32 deltanext, fake = 0;
4411 struct regnode_charclass_class intrnl;
4413 /* We use SAVEFREEPV so that when the full compile
4414 is finished perl will clean up the allocated
4415 minlens when it's all done. This way we don't
4416 have to worry about freeing them when we know
4417 they wont be used, which would be a pain.
4420 Newx( minnextp, 1, I32 );
4421 SAVEFREEPV(minnextp);
4424 StructCopy(data, &data_fake, scan_data_t);
4425 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4428 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4429 data_fake.last_found=newSVsv(data->last_found);
4433 data_fake.last_closep = &fake;
4434 data_fake.flags = 0;
4435 data_fake.pos_delta = delta;
4437 data_fake.flags |= SF_IS_INF;
4438 if ( flags & SCF_DO_STCLASS && !scan->flags
4439 && OP(scan) == IFMATCH ) { /* Lookahead */
4440 cl_init(pRExC_state, &intrnl);
4441 data_fake.start_class = &intrnl;
4442 f |= SCF_DO_STCLASS_AND;
4444 if (flags & SCF_WHILEM_VISITED_POS)
4445 f |= SCF_WHILEM_VISITED_POS;
4446 next = regnext(scan);
4447 nscan = NEXTOPER(NEXTOPER(scan));
4449 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4450 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4453 FAIL("Variable length lookbehind not implemented");
4455 else if (*minnextp > (I32)U8_MAX) {
4456 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4458 scan->flags = (U8)*minnextp;
4463 if (f & SCF_DO_STCLASS_AND) {
4464 const int was = (data->start_class->flags & ANYOF_EOS);
4466 cl_and(data->start_class, &intrnl);
4468 data->start_class->flags |= ANYOF_EOS;
4471 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4473 if (data_fake.flags & SF_HAS_EVAL)
4474 data->flags |= SF_HAS_EVAL;
4475 data->whilem_c = data_fake.whilem_c;
4476 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4477 if (RExC_rx->minlen<*minnextp)
4478 RExC_rx->minlen=*minnextp;
4479 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4480 SvREFCNT_dec(data_fake.last_found);
4482 if ( data_fake.minlen_fixed != minlenp )
4484 data->offset_fixed= data_fake.offset_fixed;
4485 data->minlen_fixed= data_fake.minlen_fixed;
4486 data->lookbehind_fixed+= scan->flags;
4488 if ( data_fake.minlen_float != minlenp )
4490 data->minlen_float= data_fake.minlen_float;
4491 data->offset_float_min=data_fake.offset_float_min;
4492 data->offset_float_max=data_fake.offset_float_max;
4493 data->lookbehind_float+= scan->flags;
4502 else if (OP(scan) == OPEN) {
4503 if (stopparen != (I32)ARG(scan))
4506 else if (OP(scan) == CLOSE) {
4507 if (stopparen == (I32)ARG(scan)) {
4510 if ((I32)ARG(scan) == is_par) {
4511 next = regnext(scan);
4513 if ( next && (OP(next) != WHILEM) && next < last)
4514 is_par = 0; /* Disable optimization */
4517 *(data->last_closep) = ARG(scan);
4519 else if (OP(scan) == EVAL) {
4521 data->flags |= SF_HAS_EVAL;
4523 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4524 if (flags & SCF_DO_SUBSTR) {
4525 SCAN_COMMIT(pRExC_state,data,minlenp);
4526 flags &= ~SCF_DO_SUBSTR;
4528 if (data && OP(scan)==ACCEPT) {
4529 data->flags |= SCF_SEEN_ACCEPT;
4534 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4536 if (flags & SCF_DO_SUBSTR) {
4537 SCAN_COMMIT(pRExC_state,data,minlenp);
4538 data->longest = &(data->longest_float);
4540 is_inf = is_inf_internal = 1;
4541 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4542 cl_anything(pRExC_state, data->start_class);
4543 flags &= ~SCF_DO_STCLASS;
4545 else if (OP(scan) == GPOS) {
4546 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4547 !(delta || is_inf || (data && data->pos_delta)))
4549 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4550 RExC_rx->extflags |= RXf_ANCH_GPOS;
4551 if (RExC_rx->gofs < (U32)min)
4552 RExC_rx->gofs = min;
4554 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4558 #ifdef TRIE_STUDY_OPT
4559 #ifdef FULL_TRIE_STUDY
4560 else if (PL_regkind[OP(scan)] == TRIE) {
4561 /* NOTE - There is similar code to this block above for handling
4562 BRANCH nodes on the initial study. If you change stuff here
4564 regnode *trie_node= scan;
4565 regnode *tail= regnext(scan);
4566 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4567 I32 max1 = 0, min1 = I32_MAX;
4568 struct regnode_charclass_class accum;
4570 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4571 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4572 if (flags & SCF_DO_STCLASS)
4573 cl_init_zero(pRExC_state, &accum);
4579 const regnode *nextbranch= NULL;
4582 for ( word=1 ; word <= trie->wordcount ; word++)
4584 I32 deltanext=0, minnext=0, f = 0, fake;
4585 struct regnode_charclass_class this_class;
4587 data_fake.flags = 0;
4589 data_fake.whilem_c = data->whilem_c;
4590 data_fake.last_closep = data->last_closep;
4593 data_fake.last_closep = &fake;
4594 data_fake.pos_delta = delta;
4595 if (flags & SCF_DO_STCLASS) {
4596 cl_init(pRExC_state, &this_class);
4597 data_fake.start_class = &this_class;
4598 f = SCF_DO_STCLASS_AND;
4600 if (flags & SCF_WHILEM_VISITED_POS)
4601 f |= SCF_WHILEM_VISITED_POS;
4603 if (trie->jump[word]) {
4605 nextbranch = trie_node + trie->jump[0];
4606 scan= trie_node + trie->jump[word];
4607 /* We go from the jump point to the branch that follows
4608 it. Note this means we need the vestigal unused branches
4609 even though they arent otherwise used.
4611 minnext = study_chunk(pRExC_state, &scan, minlenp,
4612 &deltanext, (regnode *)nextbranch, &data_fake,
4613 stopparen, recursed, NULL, f,depth+1);
4615 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4616 nextbranch= regnext((regnode*)nextbranch);
4618 if (min1 > (I32)(minnext + trie->minlen))
4619 min1 = minnext + trie->minlen;
4620 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4621 max1 = minnext + deltanext + trie->maxlen;
4622 if (deltanext == I32_MAX)
4623 is_inf = is_inf_internal = 1;
4625 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4627 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4628 if ( stopmin > min + min1)
4629 stopmin = min + min1;
4630 flags &= ~SCF_DO_SUBSTR;
4632 data->flags |= SCF_SEEN_ACCEPT;
4635 if (data_fake.flags & SF_HAS_EVAL)
4636 data->flags |= SF_HAS_EVAL;
4637 data->whilem_c = data_fake.whilem_c;
4639 if (flags & SCF_DO_STCLASS)
4640 cl_or(pRExC_state, &accum, &this_class);
4643 if (flags & SCF_DO_SUBSTR) {
4644 data->pos_min += min1;
4645 data->pos_delta += max1 - min1;
4646 if (max1 != min1 || is_inf)
4647 data->longest = &(data->longest_float);
4650 delta += max1 - min1;
4651 if (flags & SCF_DO_STCLASS_OR) {
4652 cl_or(pRExC_state, data->start_class, &accum);
4654 cl_and(data->start_class, and_withp);
4655 flags &= ~SCF_DO_STCLASS;
4658 else if (flags & SCF_DO_STCLASS_AND) {
4660 cl_and(data->start_class, &accum);
4661 flags &= ~SCF_DO_STCLASS;
4664 /* Switch to OR mode: cache the old value of
4665 * data->start_class */
4667 StructCopy(data->start_class, and_withp,
4668 struct regnode_charclass_class);
4669 flags &= ~SCF_DO_STCLASS_AND;
4670 StructCopy(&accum, data->start_class,
4671 struct regnode_charclass_class);
4672 flags |= SCF_DO_STCLASS_OR;
4673 data->start_class->flags |= ANYOF_EOS;
4680 else if (PL_regkind[OP(scan)] == TRIE) {
4681 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4684 min += trie->minlen;
4685 delta += (trie->maxlen - trie->minlen);
4686 flags &= ~SCF_DO_STCLASS; /* xxx */
4687 if (flags & SCF_DO_SUBSTR) {
4688 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4689 data->pos_min += trie->minlen;
4690 data->pos_delta += (trie->maxlen - trie->minlen);
4691 if (trie->maxlen != trie->minlen)
4692 data->longest = &(data->longest_float);
4694 if (trie->jump) /* no more substrings -- for now /grr*/
4695 flags &= ~SCF_DO_SUBSTR;
4697 #endif /* old or new */
4698 #endif /* TRIE_STUDY_OPT */
4700 /* Else: zero-length, ignore. */
4701 scan = regnext(scan);
4706 stopparen = frame->stop;
4707 frame = frame->prev;
4708 goto fake_study_recurse;
4713 DEBUG_STUDYDATA("pre-fin:",data,depth);
4716 *deltap = is_inf_internal ? I32_MAX : delta;
4717 if (flags & SCF_DO_SUBSTR && is_inf)
4718 data->pos_delta = I32_MAX - data->pos_min;
4719 if (is_par > (I32)U8_MAX)
4721 if (is_par && pars==1 && data) {
4722 data->flags |= SF_IN_PAR;
4723 data->flags &= ~SF_HAS_PAR;
4725 else if (pars && data) {
4726 data->flags |= SF_HAS_PAR;
4727 data->flags &= ~SF_IN_PAR;
4729 if (flags & SCF_DO_STCLASS_OR)
4730 cl_and(data->start_class, and_withp);
4731 if (flags & SCF_TRIE_RESTUDY)
4732 data->flags |= SCF_TRIE_RESTUDY;
4734 DEBUG_STUDYDATA("post-fin:",data,depth);
4736 return min < stopmin ? min : stopmin;
4740 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4742 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4744 PERL_ARGS_ASSERT_ADD_DATA;
4746 Renewc(RExC_rxi->data,
4747 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4748 char, struct reg_data);
4750 Renew(RExC_rxi->data->what, count + n, U8);
4752 Newx(RExC_rxi->data->what, n, U8);
4753 RExC_rxi->data->count = count + n;
4754 Copy(s, RExC_rxi->data->what + count, n, U8);
4758 /*XXX: todo make this not included in a non debugging perl */
4759 #ifndef PERL_IN_XSUB_RE
4761 Perl_reginitcolors(pTHX)
4764 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4766 char *t = savepv(s);
4770 t = strchr(t, '\t');
4776 PL_colors[i] = t = (char *)"";
4781 PL_colors[i++] = (char *)"";
4788 #ifdef TRIE_STUDY_OPT
4789 #define CHECK_RESTUDY_GOTO \
4791 (data.flags & SCF_TRIE_RESTUDY) \
4795 #define CHECK_RESTUDY_GOTO
4799 - pregcomp - compile a regular expression into internal code
4801 * We can't allocate space until we know how big the compiled form will be,
4802 * but we can't compile it (and thus know how big it is) until we've got a
4803 * place to put the code. So we cheat: we compile it twice, once with code
4804 * generation turned off and size counting turned on, and once "for real".
4805 * This also means that we don't allocate space until we are sure that the
4806 * thing really will compile successfully, and we never have to move the
4807 * code and thus invalidate pointers into it. (Note that it has to be in
4808 * one piece because free() must be able to free it all.) [NB: not true in perl]
4810 * Beware that the optimization-preparation code in here knows about some
4811 * of the structure of the compiled regexp. [I'll say.]
4816 #ifndef PERL_IN_XSUB_RE
4817 #define RE_ENGINE_PTR &PL_core_reg_engine
4819 extern const struct regexp_engine my_reg_engine;
4820 #define RE_ENGINE_PTR &my_reg_engine
4823 #ifndef PERL_IN_XSUB_RE
4825 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4828 HV * const table = GvHV(PL_hintgv);
4830 PERL_ARGS_ASSERT_PREGCOMP;
4832 /* Dispatch a request to compile a regexp to correct
4835 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4836 GET_RE_DEBUG_FLAGS_DECL;
4837 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4838 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4840 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4843 return CALLREGCOMP_ENG(eng, pattern, flags);
4846 return Perl_re_compile(aTHX_ pattern, flags);
4851 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4856 register regexp_internal *ri;
4865 /* these are all flags - maybe they should be turned
4866 * into a single int with different bit masks */
4867 I32 sawlookahead = 0;
4870 bool used_setjump = FALSE;
4871 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4876 RExC_state_t RExC_state;
4877 RExC_state_t * const pRExC_state = &RExC_state;
4878 #ifdef TRIE_STUDY_OPT
4880 RExC_state_t copyRExC_state;
4882 GET_RE_DEBUG_FLAGS_DECL;
4884 PERL_ARGS_ASSERT_RE_COMPILE;
4886 DEBUG_r(if (!PL_colorset) reginitcolors());
4888 #ifndef PERL_IN_XSUB_RE
4889 /* Initialize these here instead of as-needed, as is quick and avoids
4890 * having to test them each time otherwise */
4891 if (! PL_AboveLatin1) {
4892 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4893 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4894 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4896 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4897 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4899 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4900 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4902 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4903 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4905 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4907 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4908 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4910 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4912 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4913 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4915 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4916 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4918 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4919 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4921 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4922 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4924 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4925 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4927 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4928 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4930 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4931 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4933 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4934 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4936 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4938 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4939 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4941 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4942 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4946 exp = SvPV(pattern, plen);
4948 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4949 RExC_utf8 = RExC_orig_utf8 = 0;
4952 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4954 RExC_uni_semantics = 0;
4955 RExC_contains_locale = 0;
4957 /****************** LONG JUMP TARGET HERE***********************/
4958 /* Longjmp back to here if have to switch in midstream to utf8 */
4959 if (! RExC_orig_utf8) {
4960 JMPENV_PUSH(jump_ret);
4961 used_setjump = TRUE;
4964 if (jump_ret == 0) { /* First time through */
4968 SV *dsv= sv_newmortal();
4969 RE_PV_QUOTED_DECL(s, RExC_utf8,
4970 dsv, exp, plen, 60);
4971 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4972 PL_colors[4],PL_colors[5],s);
4975 else { /* longjumped back */
4978 /* If the cause for the longjmp was other than changing to utf8, pop
4979 * our own setjmp, and longjmp to the correct handler */
4980 if (jump_ret != UTF8_LONGJMP) {
4982 JMPENV_JUMP(jump_ret);
4987 /* It's possible to write a regexp in ascii that represents Unicode
4988 codepoints outside of the byte range, such as via \x{100}. If we
4989 detect such a sequence we have to convert the entire pattern to utf8
4990 and then recompile, as our sizing calculation will have been based
4991 on 1 byte == 1 character, but we will need to use utf8 to encode
4992 at least some part of the pattern, and therefore must convert the whole
4995 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4996 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4997 exp = (char*)Perl_bytes_to_utf8(aTHX_
4998 (U8*)SvPV_nomg(pattern, plen),
5001 RExC_orig_utf8 = RExC_utf8 = 1;
5005 #ifdef TRIE_STUDY_OPT
5009 pm_flags = orig_pm_flags;
5011 if (initial_charset == REGEX_LOCALE_CHARSET) {
5012 RExC_contains_locale = 1;
5014 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5016 /* Set to use unicode semantics if the pattern is in utf8 and has the
5017 * 'depends' charset specified, as it means unicode when utf8 */
5018 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5022 RExC_flags = pm_flags;
5026 RExC_in_lookbehind = 0;
5027 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5028 RExC_seen_evals = 0;
5030 RExC_override_recoding = 0;
5032 /* First pass: determine size, legality. */
5040 RExC_emit = &PL_regdummy;
5041 RExC_whilem_seen = 0;
5042 RExC_open_parens = NULL;
5043 RExC_close_parens = NULL;
5045 RExC_paren_names = NULL;
5047 RExC_paren_name_list = NULL;
5049 RExC_recurse = NULL;
5050 RExC_recurse_count = 0;
5052 #if 0 /* REGC() is (currently) a NOP at the first pass.
5053 * Clever compilers notice this and complain. --jhi */
5054 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5057 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5059 RExC_lastparse=NULL;
5061 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5062 RExC_precomp = NULL;
5066 /* Here, finished first pass. Get rid of any added setjmp */
5072 PerlIO_printf(Perl_debug_log,
5073 "Required size %"IVdf" nodes\n"
5074 "Starting second pass (creation)\n",
5077 RExC_lastparse=NULL;
5080 /* The first pass could have found things that force Unicode semantics */
5081 if ((RExC_utf8 || RExC_uni_semantics)
5082 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5084 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5087 /* Small enough for pointer-storage convention?
5088 If extralen==0, this means that we will not need long jumps. */
5089 if (RExC_size >= 0x10000L && RExC_extralen)
5090 RExC_size += RExC_extralen;
5093 if (RExC_whilem_seen > 15)
5094 RExC_whilem_seen = 15;
5096 /* Allocate space and zero-initialize. Note, the two step process
5097 of zeroing when in debug mode, thus anything assigned has to
5098 happen after that */
5099 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5100 r = (struct regexp*)SvANY(rx);
5101 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5102 char, regexp_internal);
5103 if ( r == NULL || ri == NULL )
5104 FAIL("Regexp out of space");
5106 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5107 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5109 /* bulk initialize base fields with 0. */
5110 Zero(ri, sizeof(regexp_internal), char);
5113 /* non-zero initialization begins here */
5115 r->engine= RE_ENGINE_PTR;
5116 r->extflags = pm_flags;
5118 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5119 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5121 /* The caret is output if there are any defaults: if not all the STD
5122 * flags are set, or if no character set specifier is needed */
5124 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5126 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5127 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5128 >> RXf_PMf_STD_PMMOD_SHIFT);
5129 const char *fptr = STD_PAT_MODS; /*"msix"*/
5131 /* Allocate for the worst case, which is all the std flags are turned
5132 * on. If more precision is desired, we could do a population count of
5133 * the flags set. This could be done with a small lookup table, or by
5134 * shifting, masking and adding, or even, when available, assembly
5135 * language for a machine-language population count.
5136 * We never output a minus, as all those are defaults, so are
5137 * covered by the caret */
5138 const STRLEN wraplen = plen + has_p + has_runon
5139 + has_default /* If needs a caret */
5141 /* If needs a character set specifier */
5142 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5143 + (sizeof(STD_PAT_MODS) - 1)
5144 + (sizeof("(?:)") - 1);
5146 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5148 SvFLAGS(rx) |= SvUTF8(pattern);
5151 /* If a default, cover it using the caret */
5153 *p++= DEFAULT_PAT_MOD;
5157 const char* const name = get_regex_charset_name(r->extflags, &len);
5158 Copy(name, p, len, char);
5162 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5165 while((ch = *fptr++)) {
5173 Copy(RExC_precomp, p, plen, char);
5174 assert ((RX_WRAPPED(rx) - p) < 16);
5175 r->pre_prefix = p - RX_WRAPPED(rx);
5181 SvCUR_set(rx, p - SvPVX_const(rx));
5185 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5187 if (RExC_seen & REG_SEEN_RECURSE) {
5188 Newxz(RExC_open_parens, RExC_npar,regnode *);
5189 SAVEFREEPV(RExC_open_parens);
5190 Newxz(RExC_close_parens,RExC_npar,regnode *);
5191 SAVEFREEPV(RExC_close_parens);
5194 /* Useful during FAIL. */
5195 #ifdef RE_TRACK_PATTERN_OFFSETS
5196 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5197 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5198 "%s %"UVuf" bytes for offset annotations.\n",
5199 ri->u.offsets ? "Got" : "Couldn't get",
5200 (UV)((2*RExC_size+1) * sizeof(U32))));
5202 SetProgLen(ri,RExC_size);
5207 /* Second pass: emit code. */
5208 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
5213 RExC_emit_start = ri->program;
5214 RExC_emit = ri->program;
5215 RExC_emit_bound = ri->program + RExC_size + 1;
5217 /* Store the count of eval-groups for security checks: */
5218 RExC_rx->seen_evals = RExC_seen_evals;
5219 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5220 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5224 /* XXXX To minimize changes to RE engine we always allocate
5225 3-units-long substrs field. */
5226 Newx(r->substrs, 1, struct reg_substr_data);
5227 if (RExC_recurse_count) {
5228 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5229 SAVEFREEPV(RExC_recurse);
5233 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5234 Zero(r->substrs, 1, struct reg_substr_data);
5236 #ifdef TRIE_STUDY_OPT
5238 StructCopy(&zero_scan_data, &data, scan_data_t);
5239 copyRExC_state = RExC_state;
5242 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5244 RExC_state = copyRExC_state;
5245 if (seen & REG_TOP_LEVEL_BRANCHES)
5246 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5248 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5249 if (data.last_found) {
5250 SvREFCNT_dec(data.longest_fixed);
5251 SvREFCNT_dec(data.longest_float);
5252 SvREFCNT_dec(data.last_found);
5254 StructCopy(&zero_scan_data, &data, scan_data_t);
5257 StructCopy(&zero_scan_data, &data, scan_data_t);
5260 /* Dig out information for optimizations. */
5261 r->extflags = RExC_flags; /* was pm_op */
5262 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5265 SvUTF8_on(rx); /* Unicode in it? */
5266 ri->regstclass = NULL;
5267 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5268 r->intflags |= PREGf_NAUGHTY;
5269 scan = ri->program + 1; /* First BRANCH. */
5271 /* testing for BRANCH here tells us whether there is "must appear"
5272 data in the pattern. If there is then we can use it for optimisations */
5273 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5275 STRLEN longest_float_length, longest_fixed_length;
5276 struct regnode_charclass_class ch_class; /* pointed to by data */
5278 I32 last_close = 0; /* pointed to by data */
5279 regnode *first= scan;
5280 regnode *first_next= regnext(first);
5282 * Skip introductions and multiplicators >= 1
5283 * so that we can extract the 'meat' of the pattern that must
5284 * match in the large if() sequence following.
5285 * NOTE that EXACT is NOT covered here, as it is normally
5286 * picked up by the optimiser separately.
5288 * This is unfortunate as the optimiser isnt handling lookahead
5289 * properly currently.
5292 while ((OP(first) == OPEN && (sawopen = 1)) ||
5293 /* An OR of *one* alternative - should not happen now. */
5294 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5295 /* for now we can't handle lookbehind IFMATCH*/
5296 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5297 (OP(first) == PLUS) ||
5298 (OP(first) == MINMOD) ||
5299 /* An {n,m} with n>0 */
5300 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5301 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5304 * the only op that could be a regnode is PLUS, all the rest
5305 * will be regnode_1 or regnode_2.
5308 if (OP(first) == PLUS)
5311 first += regarglen[OP(first)];
5313 first = NEXTOPER(first);
5314 first_next= regnext(first);
5317 /* Starting-point info. */
5319 DEBUG_PEEP("first:",first,0);
5320 /* Ignore EXACT as we deal with it later. */
5321 if (PL_regkind[OP(first)] == EXACT) {
5322 if (OP(first) == EXACT)
5323 NOOP; /* Empty, get anchored substr later. */
5325 ri->regstclass = first;
5328 else if (PL_regkind[OP(first)] == TRIE &&
5329 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5332 /* this can happen only on restudy */
5333 if ( OP(first) == TRIE ) {
5334 struct regnode_1 *trieop = (struct regnode_1 *)
5335 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5336 StructCopy(first,trieop,struct regnode_1);
5337 trie_op=(regnode *)trieop;
5339 struct regnode_charclass *trieop = (struct regnode_charclass *)
5340 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5341 StructCopy(first,trieop,struct regnode_charclass);
5342 trie_op=(regnode *)trieop;
5345 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5346 ri->regstclass = trie_op;
5349 else if (REGNODE_SIMPLE(OP(first)))
5350 ri->regstclass = first;
5351 else if (PL_regkind[OP(first)] == BOUND ||
5352 PL_regkind[OP(first)] == NBOUND)
5353 ri->regstclass = first;
5354 else if (PL_regkind[OP(first)] == BOL) {
5355 r->extflags |= (OP(first) == MBOL
5357 : (OP(first) == SBOL
5360 first = NEXTOPER(first);
5363 else if (OP(first) == GPOS) {
5364 r->extflags |= RXf_ANCH_GPOS;
5365 first = NEXTOPER(first);
5368 else if ((!sawopen || !RExC_sawback) &&
5369 (OP(first) == STAR &&
5370 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5371 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5373 /* turn .* into ^.* with an implied $*=1 */
5375 (OP(NEXTOPER(first)) == REG_ANY)
5378 r->extflags |= type;
5379 r->intflags |= PREGf_IMPLICIT;
5380 first = NEXTOPER(first);
5383 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5384 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5385 /* x+ must match at the 1st pos of run of x's */
5386 r->intflags |= PREGf_SKIP;
5388 /* Scan is after the zeroth branch, first is atomic matcher. */
5389 #ifdef TRIE_STUDY_OPT
5392 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5393 (IV)(first - scan + 1))
5397 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5398 (IV)(first - scan + 1))
5404 * If there's something expensive in the r.e., find the
5405 * longest literal string that must appear and make it the
5406 * regmust. Resolve ties in favor of later strings, since
5407 * the regstart check works with the beginning of the r.e.
5408 * and avoiding duplication strengthens checking. Not a
5409 * strong reason, but sufficient in the absence of others.
5410 * [Now we resolve ties in favor of the earlier string if
5411 * it happens that c_offset_min has been invalidated, since the
5412 * earlier string may buy us something the later one won't.]
5415 data.longest_fixed = newSVpvs("");
5416 data.longest_float = newSVpvs("");
5417 data.last_found = newSVpvs("");
5418 data.longest = &(data.longest_fixed);
5420 if (!ri->regstclass) {
5421 cl_init(pRExC_state, &ch_class);
5422 data.start_class = &ch_class;
5423 stclass_flag = SCF_DO_STCLASS_AND;
5424 } else /* XXXX Check for BOUND? */
5426 data.last_closep = &last_close;
5428 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5429 &data, -1, NULL, NULL,
5430 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5436 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5437 && data.last_start_min == 0 && data.last_end > 0
5438 && !RExC_seen_zerolen
5439 && !(RExC_seen & REG_SEEN_VERBARG)
5440 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5441 r->extflags |= RXf_CHECK_ALL;
5442 scan_commit(pRExC_state, &data,&minlen,0);
5443 SvREFCNT_dec(data.last_found);
5445 /* Note that code very similar to this but for anchored string
5446 follows immediately below, changes may need to be made to both.
5449 longest_float_length = CHR_SVLEN(data.longest_float);
5450 if (longest_float_length
5451 || (data.flags & SF_FL_BEFORE_EOL
5452 && (!(data.flags & SF_FL_BEFORE_MEOL)
5453 || (RExC_flags & RXf_PMf_MULTILINE))))
5457 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5458 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5459 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5460 && data.offset_fixed == data.offset_float_min
5461 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5462 goto remove_float; /* As in (a)+. */
5464 /* copy the information about the longest float from the reg_scan_data
5465 over to the program. */
5466 if (SvUTF8(data.longest_float)) {
5467 r->float_utf8 = data.longest_float;
5468 r->float_substr = NULL;
5470 r->float_substr = data.longest_float;
5471 r->float_utf8 = NULL;
5473 /* float_end_shift is how many chars that must be matched that
5474 follow this item. We calculate it ahead of time as once the
5475 lookbehind offset is added in we lose the ability to correctly
5477 ml = data.minlen_float ? *(data.minlen_float)
5478 : (I32)longest_float_length;
5479 r->float_end_shift = ml - data.offset_float_min
5480 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5481 + data.lookbehind_float;
5482 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5483 r->float_max_offset = data.offset_float_max;
5484 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5485 r->float_max_offset -= data.lookbehind_float;
5487 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5488 && (!(data.flags & SF_FL_BEFORE_MEOL)
5489 || (RExC_flags & RXf_PMf_MULTILINE)));
5490 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5494 r->float_substr = r->float_utf8 = NULL;
5495 SvREFCNT_dec(data.longest_float);
5496 longest_float_length = 0;
5499 /* Note that code very similar to this but for floating string
5500 is immediately above, changes may need to be made to both.
5503 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5505 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5506 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5507 && (longest_fixed_length
5508 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5509 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5510 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5514 /* copy the information about the longest fixed
5515 from the reg_scan_data over to the program. */
5516 if (SvUTF8(data.longest_fixed)) {
5517 r->anchored_utf8 = data.longest_fixed;
5518 r->anchored_substr = NULL;
5520 r->anchored_substr = data.longest_fixed;
5521 r->anchored_utf8 = NULL;
5523 /* fixed_end_shift is how many chars that must be matched that
5524 follow this item. We calculate it ahead of time as once the
5525 lookbehind offset is added in we lose the ability to correctly
5527 ml = data.minlen_fixed ? *(data.minlen_fixed)
5528 : (I32)longest_fixed_length;
5529 r->anchored_end_shift = ml - data.offset_fixed
5530 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5531 + data.lookbehind_fixed;
5532 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5534 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5535 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5536 || (RExC_flags & RXf_PMf_MULTILINE)));
5537 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5540 r->anchored_substr = r->anchored_utf8 = NULL;
5541 SvREFCNT_dec(data.longest_fixed);
5542 longest_fixed_length = 0;
5545 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5546 ri->regstclass = NULL;
5548 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5550 && !(data.start_class->flags & ANYOF_EOS)
5551 && !cl_is_anything(data.start_class))
5553 const U32 n = add_data(pRExC_state, 1, "f");
5554 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5556 Newx(RExC_rxi->data->data[n], 1,
5557 struct regnode_charclass_class);
5558 StructCopy(data.start_class,
5559 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5560 struct regnode_charclass_class);
5561 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5562 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5563 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5564 regprop(r, sv, (regnode*)data.start_class);
5565 PerlIO_printf(Perl_debug_log,
5566 "synthetic stclass \"%s\".\n",
5567 SvPVX_const(sv));});
5570 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5571 if (longest_fixed_length > longest_float_length) {
5572 r->check_end_shift = r->anchored_end_shift;
5573 r->check_substr = r->anchored_substr;
5574 r->check_utf8 = r->anchored_utf8;
5575 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5576 if (r->extflags & RXf_ANCH_SINGLE)
5577 r->extflags |= RXf_NOSCAN;
5580 r->check_end_shift = r->float_end_shift;
5581 r->check_substr = r->float_substr;
5582 r->check_utf8 = r->float_utf8;
5583 r->check_offset_min = r->float_min_offset;
5584 r->check_offset_max = r->float_max_offset;
5586 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5587 This should be changed ASAP! */
5588 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5589 r->extflags |= RXf_USE_INTUIT;
5590 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5591 r->extflags |= RXf_INTUIT_TAIL;
5593 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5594 if ( (STRLEN)minlen < longest_float_length )
5595 minlen= longest_float_length;
5596 if ( (STRLEN)minlen < longest_fixed_length )
5597 minlen= longest_fixed_length;
5601 /* Several toplevels. Best we can is to set minlen. */
5603 struct regnode_charclass_class ch_class;
5606 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5608 scan = ri->program + 1;
5609 cl_init(pRExC_state, &ch_class);
5610 data.start_class = &ch_class;
5611 data.last_closep = &last_close;
5614 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5615 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5619 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5620 = r->float_substr = r->float_utf8 = NULL;
5622 if (!(data.start_class->flags & ANYOF_EOS)
5623 && !cl_is_anything(data.start_class))
5625 const U32 n = add_data(pRExC_state, 1, "f");
5626 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5628 Newx(RExC_rxi->data->data[n], 1,
5629 struct regnode_charclass_class);
5630 StructCopy(data.start_class,
5631 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5632 struct regnode_charclass_class);
5633 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5634 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5635 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5636 regprop(r, sv, (regnode*)data.start_class);
5637 PerlIO_printf(Perl_debug_log,
5638 "synthetic stclass \"%s\".\n",
5639 SvPVX_const(sv));});
5643 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5644 the "real" pattern. */
5646 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5647 (IV)minlen, (IV)r->minlen);
5649 r->minlenret = minlen;
5650 if (r->minlen < minlen)
5653 if (RExC_seen & REG_SEEN_GPOS)
5654 r->extflags |= RXf_GPOS_SEEN;
5655 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5656 r->extflags |= RXf_LOOKBEHIND_SEEN;
5657 if (RExC_seen & REG_SEEN_EVAL)
5658 r->extflags |= RXf_EVAL_SEEN;
5659 if (RExC_seen & REG_SEEN_CANY)
5660 r->extflags |= RXf_CANY_SEEN;
5661 if (RExC_seen & REG_SEEN_VERBARG)
5662 r->intflags |= PREGf_VERBARG_SEEN;
5663 if (RExC_seen & REG_SEEN_CUTGROUP)
5664 r->intflags |= PREGf_CUTGROUP_SEEN;
5665 if (RExC_paren_names)
5666 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5668 RXp_PAREN_NAMES(r) = NULL;
5670 #ifdef STUPID_PATTERN_CHECKS
5671 if (RX_PRELEN(rx) == 0)
5672 r->extflags |= RXf_NULL;
5673 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5674 /* XXX: this should happen BEFORE we compile */
5675 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5676 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5677 r->extflags |= RXf_WHITE;
5678 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5679 r->extflags |= RXf_START_ONLY;
5681 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5682 /* XXX: this should happen BEFORE we compile */
5683 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5685 regnode *first = ri->program + 1;
5688 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5689 r->extflags |= RXf_NULL;
5690 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5691 r->extflags |= RXf_START_ONLY;
5692 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5693 && OP(regnext(first)) == END)
5694 r->extflags |= RXf_WHITE;
5698 if (RExC_paren_names) {
5699 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5700 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5703 ri->name_list_idx = 0;
5705 if (RExC_recurse_count) {
5706 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5707 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5708 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5711 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5712 /* assume we don't need to swap parens around before we match */
5715 PerlIO_printf(Perl_debug_log,"Final program:\n");
5718 #ifdef RE_TRACK_PATTERN_OFFSETS
5719 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5720 const U32 len = ri->u.offsets[0];
5722 GET_RE_DEBUG_FLAGS_DECL;
5723 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5724 for (i = 1; i <= len; i++) {
5725 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5726 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5727 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5729 PerlIO_printf(Perl_debug_log, "\n");
5735 #undef RE_ENGINE_PTR
5739 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5742 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5744 PERL_UNUSED_ARG(value);
5746 if (flags & RXapif_FETCH) {
5747 return reg_named_buff_fetch(rx, key, flags);
5748 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5749 Perl_croak_no_modify(aTHX);
5751 } else if (flags & RXapif_EXISTS) {
5752 return reg_named_buff_exists(rx, key, flags)
5755 } else if (flags & RXapif_REGNAMES) {
5756 return reg_named_buff_all(rx, flags);
5757 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5758 return reg_named_buff_scalar(rx, flags);
5760 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5766 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5769 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5770 PERL_UNUSED_ARG(lastkey);
5772 if (flags & RXapif_FIRSTKEY)
5773 return reg_named_buff_firstkey(rx, flags);
5774 else if (flags & RXapif_NEXTKEY)
5775 return reg_named_buff_nextkey(rx, flags);
5777 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5783 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5786 AV *retarray = NULL;
5788 struct regexp *const rx = (struct regexp *)SvANY(r);
5790 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5792 if (flags & RXapif_ALL)
5795 if (rx && RXp_PAREN_NAMES(rx)) {
5796 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5799 SV* sv_dat=HeVAL(he_str);
5800 I32 *nums=(I32*)SvPVX(sv_dat);
5801 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5802 if ((I32)(rx->nparens) >= nums[i]
5803 && rx->offs[nums[i]].start != -1
5804 && rx->offs[nums[i]].end != -1)
5807 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5812 ret = newSVsv(&PL_sv_undef);
5815 av_push(retarray, ret);
5818 return newRV_noinc(MUTABLE_SV(retarray));
5825 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5828 struct regexp *const rx = (struct regexp *)SvANY(r);
5830 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5832 if (rx && RXp_PAREN_NAMES(rx)) {
5833 if (flags & RXapif_ALL) {
5834 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5836 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5850 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5852 struct regexp *const rx = (struct regexp *)SvANY(r);
5854 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5856 if ( rx && RXp_PAREN_NAMES(rx) ) {
5857 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5859 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5866 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5868 struct regexp *const rx = (struct regexp *)SvANY(r);
5869 GET_RE_DEBUG_FLAGS_DECL;
5871 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5873 if (rx && RXp_PAREN_NAMES(rx)) {
5874 HV *hv = RXp_PAREN_NAMES(rx);
5876 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5879 SV* sv_dat = HeVAL(temphe);
5880 I32 *nums = (I32*)SvPVX(sv_dat);
5881 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5882 if ((I32)(rx->lastparen) >= nums[i] &&
5883 rx->offs[nums[i]].start != -1 &&
5884 rx->offs[nums[i]].end != -1)
5890 if (parno || flags & RXapif_ALL) {
5891 return newSVhek(HeKEY_hek(temphe));
5899 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5904 struct regexp *const rx = (struct regexp *)SvANY(r);
5906 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5908 if (rx && RXp_PAREN_NAMES(rx)) {
5909 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5910 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5911 } else if (flags & RXapif_ONE) {
5912 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5913 av = MUTABLE_AV(SvRV(ret));
5914 length = av_len(av);
5916 return newSViv(length + 1);
5918 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5922 return &PL_sv_undef;
5926 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5928 struct regexp *const rx = (struct regexp *)SvANY(r);
5931 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5933 if (rx && RXp_PAREN_NAMES(rx)) {
5934 HV *hv= RXp_PAREN_NAMES(rx);
5936 (void)hv_iterinit(hv);
5937 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5940 SV* sv_dat = HeVAL(temphe);
5941 I32 *nums = (I32*)SvPVX(sv_dat);
5942 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5943 if ((I32)(rx->lastparen) >= nums[i] &&
5944 rx->offs[nums[i]].start != -1 &&
5945 rx->offs[nums[i]].end != -1)
5951 if (parno || flags & RXapif_ALL) {
5952 av_push(av, newSVhek(HeKEY_hek(temphe)));
5957 return newRV_noinc(MUTABLE_SV(av));
5961 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5964 struct regexp *const rx = (struct regexp *)SvANY(r);
5969 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5972 sv_setsv(sv,&PL_sv_undef);
5976 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5978 i = rx->offs[0].start;
5982 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5984 s = rx->subbeg + rx->offs[0].end;
5985 i = rx->sublen - rx->offs[0].end;
5988 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5989 (s1 = rx->offs[paren].start) != -1 &&
5990 (t1 = rx->offs[paren].end) != -1)
5994 s = rx->subbeg + s1;
5996 sv_setsv(sv,&PL_sv_undef);
5999 assert(rx->sublen >= (s - rx->subbeg) + i );
6001 const int oldtainted = PL_tainted;
6003 sv_setpvn(sv, s, i);
6004 PL_tainted = oldtainted;
6005 if ( (rx->extflags & RXf_CANY_SEEN)
6006 ? (RXp_MATCH_UTF8(rx)
6007 && (!i || is_utf8_string((U8*)s, i)))
6008 : (RXp_MATCH_UTF8(rx)) )
6015 if (RXp_MATCH_TAINTED(rx)) {
6016 if (SvTYPE(sv) >= SVt_PVMG) {
6017 MAGIC* const mg = SvMAGIC(sv);
6020 SvMAGIC_set(sv, mg->mg_moremagic);
6022 if ((mgt = SvMAGIC(sv))) {
6023 mg->mg_moremagic = mgt;
6024 SvMAGIC_set(sv, mg);
6034 sv_setsv(sv,&PL_sv_undef);
6040 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6041 SV const * const value)
6043 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6045 PERL_UNUSED_ARG(rx);
6046 PERL_UNUSED_ARG(paren);
6047 PERL_UNUSED_ARG(value);
6050 Perl_croak_no_modify(aTHX);
6054 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6057 struct regexp *const rx = (struct regexp *)SvANY(r);
6061 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6063 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6065 /* $` / ${^PREMATCH} */
6066 case RX_BUFF_IDX_PREMATCH:
6067 if (rx->offs[0].start != -1) {
6068 i = rx->offs[0].start;
6076 /* $' / ${^POSTMATCH} */
6077 case RX_BUFF_IDX_POSTMATCH:
6078 if (rx->offs[0].end != -1) {
6079 i = rx->sublen - rx->offs[0].end;
6081 s1 = rx->offs[0].end;
6087 /* $& / ${^MATCH}, $1, $2, ... */
6089 if (paren <= (I32)rx->nparens &&
6090 (s1 = rx->offs[paren].start) != -1 &&
6091 (t1 = rx->offs[paren].end) != -1)
6096 if (ckWARN(WARN_UNINITIALIZED))
6097 report_uninit((const SV *)sv);
6102 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6103 const char * const s = rx->subbeg + s1;
6108 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6115 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6117 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6118 PERL_UNUSED_ARG(rx);
6122 return newSVpvs("Regexp");
6125 /* Scans the name of a named buffer from the pattern.
6126 * If flags is REG_RSN_RETURN_NULL returns null.
6127 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6128 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6129 * to the parsed name as looked up in the RExC_paren_names hash.
6130 * If there is an error throws a vFAIL().. type exception.
6133 #define REG_RSN_RETURN_NULL 0
6134 #define REG_RSN_RETURN_NAME 1
6135 #define REG_RSN_RETURN_DATA 2
6138 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6140 char *name_start = RExC_parse;
6142 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6144 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6145 /* skip IDFIRST by using do...while */
6148 RExC_parse += UTF8SKIP(RExC_parse);
6149 } while (isALNUM_utf8((U8*)RExC_parse));
6153 } while (isALNUM(*RExC_parse));
6158 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6159 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6160 if ( flags == REG_RSN_RETURN_NAME)
6162 else if (flags==REG_RSN_RETURN_DATA) {
6165 if ( ! sv_name ) /* should not happen*/
6166 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6167 if (RExC_paren_names)
6168 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6170 sv_dat = HeVAL(he_str);
6172 vFAIL("Reference to nonexistent named group");
6176 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6177 (unsigned long) flags);
6184 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6185 int rem=(int)(RExC_end - RExC_parse); \
6194 if (RExC_lastparse!=RExC_parse) \
6195 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6198 iscut ? "..." : "<" \
6201 PerlIO_printf(Perl_debug_log,"%16s",""); \
6204 num = RExC_size + 1; \
6206 num=REG_NODE_NUM(RExC_emit); \
6207 if (RExC_lastnum!=num) \
6208 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6210 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6211 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6212 (int)((depth*2)), "", \
6216 RExC_lastparse=RExC_parse; \
6221 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6222 DEBUG_PARSE_MSG((funcname)); \
6223 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6225 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6226 DEBUG_PARSE_MSG((funcname)); \
6227 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6230 /* This section of code defines the inversion list object and its methods. The
6231 * interfaces are highly subject to change, so as much as possible is static to
6232 * this file. An inversion list is here implemented as a malloc'd C UV array
6233 * with some added info that is placed as UVs at the beginning in a header
6234 * portion. An inversion list for Unicode is an array of code points, sorted
6235 * by ordinal number. The zeroth element is the first code point in the list.
6236 * The 1th element is the first element beyond that not in the list. In other
6237 * words, the first range is
6238 * invlist[0]..(invlist[1]-1)
6239 * The other ranges follow. Thus every element whose index is divisible by two
6240 * marks the beginning of a range that is in the list, and every element not
6241 * divisible by two marks the beginning of a range not in the list. A single
6242 * element inversion list that contains the single code point N generally
6243 * consists of two elements
6246 * (The exception is when N is the highest representable value on the
6247 * machine, in which case the list containing just it would be a single
6248 * element, itself. By extension, if the last range in the list extends to
6249 * infinity, then the first element of that range will be in the inversion list
6250 * at a position that is divisible by two, and is the final element in the
6252 * Taking the complement (inverting) an inversion list is quite simple, if the
6253 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6254 * This implementation reserves an element at the beginning of each inversion list
6255 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6256 * beginning of the list is either that element if 0, or the next one if 1.
6258 * More about inversion lists can be found in "Unicode Demystified"
6259 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6260 * More will be coming when functionality is added later.
6262 * The inversion list data structure is currently implemented as an SV pointing
6263 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6264 * array of UV whose memory management is automatically handled by the existing
6265 * facilities for SV's.
6267 * Some of the methods should always be private to the implementation, and some
6268 * should eventually be made public */
6270 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6271 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6273 /* This is a combination of a version and data structure type, so that one
6274 * being passed in can be validated to be an inversion list of the correct
6275 * vintage. When the structure of the header is changed, a new random number
6276 * in the range 2**31-1 should be generated and the new() method changed to
6277 * insert that at this location. Then, if an auxiliary program doesn't change
6278 * correspondingly, it will be discovered immediately */
6279 #define INVLIST_VERSION_ID_OFFSET 2
6280 #define INVLIST_VERSION_ID 1064334010
6282 /* For safety, when adding new elements, remember to #undef them at the end of
6283 * the inversion list code section */
6285 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6286 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6287 * contains the code point U+00000, and begins here. If 1, the inversion list
6288 * doesn't contain U+0000, and it begins at the next UV in the array.
6289 * Inverting an inversion list consists of adding or removing the 0 at the
6290 * beginning of it. By reserving a space for that 0, inversion can be made
6293 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6295 /* Internally things are UVs */
6296 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6297 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6299 #define INVLIST_INITIAL_LEN 10
6301 PERL_STATIC_INLINE UV*
6302 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6304 /* Returns a pointer to the first element in the inversion list's array.
6305 * This is called upon initialization of an inversion list. Where the
6306 * array begins depends on whether the list has the code point U+0000
6307 * in it or not. The other parameter tells it whether the code that
6308 * follows this call is about to put a 0 in the inversion list or not.
6309 * The first element is either the element with 0, if 0, or the next one,
6312 UV* zero = get_invlist_zero_addr(invlist);
6314 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6317 assert(! *get_invlist_len_addr(invlist));
6319 /* 1^1 = 0; 1^0 = 1 */
6320 *zero = 1 ^ will_have_0;
6321 return zero + *zero;
6324 PERL_STATIC_INLINE UV*
6325 S_invlist_array(pTHX_ SV* const invlist)
6327 /* Returns the pointer to the inversion list's array. Every time the
6328 * length changes, this needs to be called in case malloc or realloc moved
6331 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6333 /* Must not be empty. If these fail, you probably didn't check for <len>
6334 * being non-zero before trying to get the array */
6335 assert(*get_invlist_len_addr(invlist));
6336 assert(*get_invlist_zero_addr(invlist) == 0
6337 || *get_invlist_zero_addr(invlist) == 1);
6339 /* The array begins either at the element reserved for zero if the
6340 * list contains 0 (that element will be set to 0), or otherwise the next
6341 * element (in which case the reserved element will be set to 1). */
6342 return (UV *) (get_invlist_zero_addr(invlist)
6343 + *get_invlist_zero_addr(invlist));
6346 PERL_STATIC_INLINE UV*
6347 S_get_invlist_len_addr(pTHX_ SV* invlist)
6349 /* Return the address of the UV that contains the current number
6350 * of used elements in the inversion list */
6352 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6354 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6357 PERL_STATIC_INLINE UV
6358 S_invlist_len(pTHX_ SV* const invlist)
6360 /* Returns the current number of elements stored in the inversion list's
6363 PERL_ARGS_ASSERT_INVLIST_LEN;
6365 return *get_invlist_len_addr(invlist);
6368 PERL_STATIC_INLINE void
6369 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6371 /* Sets the current number of elements stored in the inversion list */
6373 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6375 *get_invlist_len_addr(invlist) = len;
6377 assert(len <= SvLEN(invlist));
6379 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6380 /* If the list contains U+0000, that element is part of the header,
6381 * and should not be counted as part of the array. It will contain
6382 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6384 * SvCUR_set(invlist,
6385 * TO_INTERNAL_SIZE(len
6386 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6387 * But, this is only valid if len is not 0. The consequences of not doing
6388 * this is that the memory allocation code may think that 1 more UV is
6389 * being used than actually is, and so might do an unnecessary grow. That
6390 * seems worth not bothering to make this the precise amount.
6392 * Note that when inverting, SvCUR shouldn't change */
6395 PERL_STATIC_INLINE UV
6396 S_invlist_max(pTHX_ SV* const invlist)
6398 /* Returns the maximum number of elements storable in the inversion list's
6399 * array, without having to realloc() */
6401 PERL_ARGS_ASSERT_INVLIST_MAX;
6403 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6406 PERL_STATIC_INLINE UV*
6407 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6409 /* Return the address of the UV that is reserved to hold 0 if the inversion
6410 * list contains 0. This has to be the last element of the heading, as the
6411 * list proper starts with either it if 0, or the next element if not.
6412 * (But we force it to contain either 0 or 1) */
6414 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6416 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6419 #ifndef PERL_IN_XSUB_RE
6421 Perl__new_invlist(pTHX_ IV initial_size)
6424 /* Return a pointer to a newly constructed inversion list, with enough
6425 * space to store 'initial_size' elements. If that number is negative, a
6426 * system default is used instead */
6430 if (initial_size < 0) {
6431 initial_size = INVLIST_INITIAL_LEN;
6434 /* Allocate the initial space */
6435 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6436 invlist_set_len(new_list, 0);
6438 /* Force iterinit() to be used to get iteration to work */
6439 *get_invlist_iter_addr(new_list) = UV_MAX;
6441 /* This should force a segfault if a method doesn't initialize this
6443 *get_invlist_zero_addr(new_list) = UV_MAX;
6445 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6446 #if HEADER_LENGTH != 4
6447 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6455 S__new_invlist_C_array(pTHX_ UV* list)
6457 /* Return a pointer to a newly constructed inversion list, initialized to
6458 * point to <list>, which has to be in the exact correct inversion list
6459 * form, including internal fields. Thus this is a dangerous routine that
6460 * should not be used in the wrong hands */
6462 SV* invlist = newSV_type(SVt_PV);
6464 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6466 SvPV_set(invlist, (char *) list);
6467 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6468 shouldn't touch it */
6469 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6471 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6472 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6479 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6481 /* Grow the maximum size of an inversion list */
6483 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6485 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6488 PERL_STATIC_INLINE void
6489 S_invlist_trim(pTHX_ SV* const invlist)
6491 PERL_ARGS_ASSERT_INVLIST_TRIM;
6493 /* Change the length of the inversion list to how many entries it currently
6496 SvPV_shrink_to_cur((SV *) invlist);
6499 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6501 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6502 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6504 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6506 #ifndef PERL_IN_XSUB_RE
6508 Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6510 /* Subject to change or removal. Append the range from 'start' to 'end' at
6511 * the end of the inversion list. The range must be above any existing
6515 UV max = invlist_max(invlist);
6516 UV len = invlist_len(invlist);
6518 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6520 if (len == 0) { /* Empty lists must be initialized */
6521 array = _invlist_array_init(invlist, start == 0);
6524 /* Here, the existing list is non-empty. The current max entry in the
6525 * list is generally the first value not in the set, except when the
6526 * set extends to the end of permissible values, in which case it is
6527 * the first entry in that final set, and so this call is an attempt to
6528 * append out-of-order */
6530 UV final_element = len - 1;
6531 array = invlist_array(invlist);
6532 if (array[final_element] > start
6533 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6535 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6536 array[final_element], start,
6537 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6540 /* Here, it is a legal append. If the new range begins with the first
6541 * value not in the set, it is extending the set, so the new first
6542 * value not in the set is one greater than the newly extended range.
6544 if (array[final_element] == start) {
6545 if (end != UV_MAX) {
6546 array[final_element] = end + 1;
6549 /* But if the end is the maximum representable on the machine,
6550 * just let the range that this would extend to have no end */
6551 invlist_set_len(invlist, len - 1);
6557 /* Here the new range doesn't extend any existing set. Add it */
6559 len += 2; /* Includes an element each for the start and end of range */
6561 /* If overflows the existing space, extend, which may cause the array to be
6564 invlist_extend(invlist, len);
6565 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6566 failure in invlist_array() */
6567 array = invlist_array(invlist);
6570 invlist_set_len(invlist, len);
6573 /* The next item on the list starts the range, the one after that is
6574 * one past the new range. */
6575 array[len - 2] = start;
6576 if (end != UV_MAX) {
6577 array[len - 1] = end + 1;
6580 /* But if the end is the maximum representable on the machine, just let
6581 * the range have no end */
6582 invlist_set_len(invlist, len - 1);
6587 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6589 /* Searches the inversion list for the entry that contains the input code
6590 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6591 * return value is the index into the list's array of the range that
6595 IV high = invlist_len(invlist);
6596 const UV * const array = invlist_array(invlist);
6598 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6600 /* If list is empty or the code point is before the first element, return
6602 if (high == 0 || cp < array[0]) {
6606 /* Binary search. What we are looking for is <i> such that
6607 * array[i] <= cp < array[i+1]
6608 * The loop below converges on the i+1. */
6609 while (low < high) {
6610 IV mid = (low + high) / 2;
6611 if (array[mid] <= cp) {
6614 /* We could do this extra test to exit the loop early.
6615 if (cp < array[low]) {
6620 else { /* cp < array[mid] */
6629 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6631 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6632 * but is used when the swash has an inversion list. This makes this much
6633 * faster, as it uses a binary search instead of a linear one. This is
6634 * intimately tied to that function, and perhaps should be in utf8.c,
6635 * except it is intimately tied to inversion lists as well. It assumes
6636 * that <swatch> is all 0's on input */
6639 const IV len = invlist_len(invlist);
6643 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6645 if (len == 0) { /* Empty inversion list */
6649 array = invlist_array(invlist);
6651 /* Find which element it is */
6652 i = invlist_search(invlist, start);
6654 /* We populate from <start> to <end> */
6655 while (current < end) {
6658 /* The inversion list gives the results for every possible code point
6659 * after the first one in the list. Only those ranges whose index is
6660 * even are ones that the inversion list matches. For the odd ones,
6661 * and if the initial code point is not in the list, we have to skip
6662 * forward to the next element */
6663 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6665 if (i >= len) { /* Finished if beyond the end of the array */
6669 if (current >= end) { /* Finished if beyond the end of what we
6674 assert(current >= start);
6676 /* The current range ends one below the next one, except don't go past
6679 upper = (i < len && array[i] < end) ? array[i] : end;
6681 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6682 * for each code point in it */
6683 for (; current < upper; current++) {
6684 const STRLEN offset = (STRLEN)(current - start);
6685 swatch[offset >> 3] |= 1 << (offset & 7);
6688 /* Quit if at the end of the list */
6691 /* But first, have to deal with the highest possible code point on
6692 * the platform. The previous code assumes that <end> is one
6693 * beyond where we want to populate, but that is impossible at the
6694 * platform's infinity, so have to handle it specially */
6695 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6697 const STRLEN offset = (STRLEN)(end - start);
6698 swatch[offset >> 3] |= 1 << (offset & 7);
6703 /* Advance to the next range, which will be for code points not in the
6713 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6715 /* Take the union of two inversion lists and point <output> to it. *output
6716 * should be defined upon input, and if it points to one of the two lists,
6717 * the reference count to that list will be decremented. The first list,
6718 * <a>, may be NULL, in which case a copy of the second list is returned.
6719 * If <complement_b> is TRUE, the union is taken of the complement
6720 * (inversion) of <b> instead of b itself.
6722 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6723 * Richard Gillam, published by Addison-Wesley, and explained at some
6724 * length there. The preface says to incorporate its examples into your
6725 * code at your own risk.
6727 * The algorithm is like a merge sort.
6729 * XXX A potential performance improvement is to keep track as we go along
6730 * if only one of the inputs contributes to the result, meaning the other
6731 * is a subset of that one. In that case, we can skip the final copy and
6732 * return the larger of the input lists, but then outside code might need
6733 * to keep track of whether to free the input list or not */
6735 UV* array_a; /* a's array */
6737 UV len_a; /* length of a's array */
6740 SV* u; /* the resulting union */
6744 UV i_a = 0; /* current index into a's array */
6748 /* running count, as explained in the algorithm source book; items are
6749 * stopped accumulating and are output when the count changes to/from 0.
6750 * The count is incremented when we start a range that's in the set, and
6751 * decremented when we start a range that's not in the set. So its range
6752 * is 0 to 2. Only when the count is zero is something not in the set.
6756 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6759 /* If either one is empty, the union is the other one */
6760 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6767 *output = invlist_clone(b);
6769 _invlist_invert(*output);
6771 } /* else *output already = b; */
6774 else if ((len_b = invlist_len(b)) == 0) {
6779 /* The complement of an empty list is a list that has everything in it,
6780 * so the union with <a> includes everything too */
6785 *output = _new_invlist(1);
6786 _append_range_to_invlist(*output, 0, UV_MAX);
6788 else if (*output != a) {
6789 *output = invlist_clone(a);
6791 /* else *output already = a; */
6795 /* Here both lists exist and are non-empty */
6796 array_a = invlist_array(a);
6797 array_b = invlist_array(b);
6799 /* If are to take the union of 'a' with the complement of b, set it
6800 * up so are looking at b's complement. */
6803 /* To complement, we invert: if the first element is 0, remove it. To
6804 * do this, we just pretend the array starts one later, and clear the
6805 * flag as we don't have to do anything else later */
6806 if (array_b[0] == 0) {
6809 complement_b = FALSE;
6813 /* But if the first element is not zero, we unshift a 0 before the
6814 * array. The data structure reserves a space for that 0 (which
6815 * should be a '1' right now), so physical shifting is unneeded,
6816 * but temporarily change that element to 0. Before exiting the
6817 * routine, we must restore the element to '1' */
6824 /* Size the union for the worst case: that the sets are completely
6826 u = _new_invlist(len_a + len_b);
6828 /* Will contain U+0000 if either component does */
6829 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6830 || (len_b > 0 && array_b[0] == 0));
6832 /* Go through each list item by item, stopping when exhausted one of
6834 while (i_a < len_a && i_b < len_b) {
6835 UV cp; /* The element to potentially add to the union's array */
6836 bool cp_in_set; /* is it in the the input list's set or not */
6838 /* We need to take one or the other of the two inputs for the union.
6839 * Since we are merging two sorted lists, we take the smaller of the
6840 * next items. In case of a tie, we take the one that is in its set
6841 * first. If we took one not in the set first, it would decrement the
6842 * count, possibly to 0 which would cause it to be output as ending the
6843 * range, and the next time through we would take the same number, and
6844 * output it again as beginning the next range. By doing it the
6845 * opposite way, there is no possibility that the count will be
6846 * momentarily decremented to 0, and thus the two adjoining ranges will
6847 * be seamlessly merged. (In a tie and both are in the set or both not
6848 * in the set, it doesn't matter which we take first.) */
6849 if (array_a[i_a] < array_b[i_b]
6850 || (array_a[i_a] == array_b[i_b]
6851 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6853 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6857 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6861 /* Here, have chosen which of the two inputs to look at. Only output
6862 * if the running count changes to/from 0, which marks the
6863 * beginning/end of a range in that's in the set */
6866 array_u[i_u++] = cp;
6873 array_u[i_u++] = cp;
6878 /* Here, we are finished going through at least one of the lists, which
6879 * means there is something remaining in at most one. We check if the list
6880 * that hasn't been exhausted is positioned such that we are in the middle
6881 * of a range in its set or not. (i_a and i_b point to the element beyond
6882 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6883 * is potentially more to output.
6884 * There are four cases:
6885 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6886 * in the union is entirely from the non-exhausted set.
6887 * 2) Both were in their sets, count is 2. Nothing further should
6888 * be output, as everything that remains will be in the exhausted
6889 * list's set, hence in the union; decrementing to 1 but not 0 insures
6891 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6892 * Nothing further should be output because the union includes
6893 * everything from the exhausted set. Not decrementing ensures that.
6894 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6895 * decrementing to 0 insures that we look at the remainder of the
6896 * non-exhausted set */
6897 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6898 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6903 /* The final length is what we've output so far, plus what else is about to
6904 * be output. (If 'count' is non-zero, then the input list we exhausted
6905 * has everything remaining up to the machine's limit in its set, and hence
6906 * in the union, so there will be no further output. */
6909 /* At most one of the subexpressions will be non-zero */
6910 len_u += (len_a - i_a) + (len_b - i_b);
6913 /* Set result to final length, which can change the pointer to array_u, so
6915 if (len_u != invlist_len(u)) {
6916 invlist_set_len(u, len_u);
6918 array_u = invlist_array(u);
6921 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6922 * the other) ended with everything above it not in its set. That means
6923 * that the remaining part of the union is precisely the same as the
6924 * non-exhausted list, so can just copy it unchanged. (If both list were
6925 * exhausted at the same time, then the operations below will be both 0.)
6928 IV copy_count; /* At most one will have a non-zero copy count */
6929 if ((copy_count = len_a - i_a) > 0) {
6930 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6932 else if ((copy_count = len_b - i_b) > 0) {
6933 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6937 /* We may be removing a reference to one of the inputs */
6938 if (a == *output || b == *output) {
6939 SvREFCNT_dec(*output);
6942 /* If we've changed b, restore it */
6952 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6954 /* Take the intersection of two inversion lists and point <i> to it. *i
6955 * should be defined upon input, and if it points to one of the two lists,
6956 * the reference count to that list will be decremented.
6957 * If <complement_b> is TRUE, the result will be the intersection of <a>
6958 * and the complement (or inversion) of <b> instead of <b> directly.
6960 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6961 * Richard Gillam, published by Addison-Wesley, and explained at some
6962 * length there. The preface says to incorporate its examples into your
6963 * code at your own risk. In fact, it had bugs
6965 * The algorithm is like a merge sort, and is essentially the same as the
6969 UV* array_a; /* a's array */
6971 UV len_a; /* length of a's array */
6974 SV* r; /* the resulting intersection */
6978 UV i_a = 0; /* current index into a's array */
6982 /* running count, as explained in the algorithm source book; items are
6983 * stopped accumulating and are output when the count changes to/from 2.
6984 * The count is incremented when we start a range that's in the set, and
6985 * decremented when we start a range that's not in the set. So its range
6986 * is 0 to 2. Only when the count is 2 is something in the intersection.
6990 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6993 /* Special case if either one is empty */
6994 len_a = invlist_len(a);
6995 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6997 if (len_a != 0 && complement_b) {
6999 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7000 * be empty. Here, also we are using 'b's complement, which hence
7001 * must be every possible code point. Thus the intersection is
7004 *i = invlist_clone(a);
7010 /* else *i is already 'a' */
7014 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7015 * intersection must be empty */
7022 *i = _new_invlist(0);
7026 /* Here both lists exist and are non-empty */
7027 array_a = invlist_array(a);
7028 array_b = invlist_array(b);
7030 /* If are to take the intersection of 'a' with the complement of b, set it
7031 * up so are looking at b's complement. */
7034 /* To complement, we invert: if the first element is 0, remove it. To
7035 * do this, we just pretend the array starts one later, and clear the
7036 * flag as we don't have to do anything else later */
7037 if (array_b[0] == 0) {
7040 complement_b = FALSE;
7044 /* But if the first element is not zero, we unshift a 0 before the
7045 * array. The data structure reserves a space for that 0 (which
7046 * should be a '1' right now), so physical shifting is unneeded,
7047 * but temporarily change that element to 0. Before exiting the
7048 * routine, we must restore the element to '1' */
7055 /* Size the intersection for the worst case: that the intersection ends up
7056 * fragmenting everything to be completely disjoint */
7057 r= _new_invlist(len_a + len_b);
7059 /* Will contain U+0000 iff both components do */
7060 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7061 && len_b > 0 && array_b[0] == 0);
7063 /* Go through each list item by item, stopping when exhausted one of
7065 while (i_a < len_a && i_b < len_b) {
7066 UV cp; /* The element to potentially add to the intersection's
7068 bool cp_in_set; /* Is it in the input list's set or not */
7070 /* We need to take one or the other of the two inputs for the
7071 * intersection. Since we are merging two sorted lists, we take the
7072 * smaller of the next items. In case of a tie, we take the one that
7073 * is not in its set first (a difference from the union algorithm). If
7074 * we took one in the set first, it would increment the count, possibly
7075 * to 2 which would cause it to be output as starting a range in the
7076 * intersection, and the next time through we would take that same
7077 * number, and output it again as ending the set. By doing it the
7078 * opposite of this, there is no possibility that the count will be
7079 * momentarily incremented to 2. (In a tie and both are in the set or
7080 * both not in the set, it doesn't matter which we take first.) */
7081 if (array_a[i_a] < array_b[i_b]
7082 || (array_a[i_a] == array_b[i_b]
7083 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7085 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7089 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7093 /* Here, have chosen which of the two inputs to look at. Only output
7094 * if the running count changes to/from 2, which marks the
7095 * beginning/end of a range that's in the intersection */
7099 array_r[i_r++] = cp;
7104 array_r[i_r++] = cp;
7110 /* Here, we are finished going through at least one of the lists, which
7111 * means there is something remaining in at most one. We check if the list
7112 * that has been exhausted is positioned such that we are in the middle
7113 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7114 * the ones we care about.) There are four cases:
7115 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7116 * nothing left in the intersection.
7117 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7118 * above 2. What should be output is exactly that which is in the
7119 * non-exhausted set, as everything it has is also in the intersection
7120 * set, and everything it doesn't have can't be in the intersection
7121 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7122 * gets incremented to 2. Like the previous case, the intersection is
7123 * everything that remains in the non-exhausted set.
7124 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7125 * remains 1. And the intersection has nothing more. */
7126 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7127 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7132 /* The final length is what we've output so far plus what else is in the
7133 * intersection. At most one of the subexpressions below will be non-zero */
7136 len_r += (len_a - i_a) + (len_b - i_b);
7139 /* Set result to final length, which can change the pointer to array_r, so
7141 if (len_r != invlist_len(r)) {
7142 invlist_set_len(r, len_r);
7144 array_r = invlist_array(r);
7147 /* Finish outputting any remaining */
7148 if (count >= 2) { /* At most one will have a non-zero copy count */
7150 if ((copy_count = len_a - i_a) > 0) {
7151 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7153 else if ((copy_count = len_b - i_b) > 0) {
7154 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7158 /* We may be removing a reference to one of the inputs */
7159 if (a == *i || b == *i) {
7163 /* If we've changed b, restore it */
7175 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7177 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7178 * set. A pointer to the inversion list is returned. This may actually be
7179 * a new list, in which case the passed in one has been destroyed. The
7180 * passed in inversion list can be NULL, in which case a new one is created
7181 * with just the one range in it */
7186 if (invlist == NULL) {
7187 invlist = _new_invlist(2);
7191 len = invlist_len(invlist);
7194 /* If comes after the final entry, can just append it to the end */
7196 || start >= invlist_array(invlist)
7197 [invlist_len(invlist) - 1])
7199 _append_range_to_invlist(invlist, start, end);
7203 /* Here, can't just append things, create and return a new inversion list
7204 * which is the union of this range and the existing inversion list */
7205 range_invlist = _new_invlist(2);
7206 _append_range_to_invlist(range_invlist, start, end);
7208 _invlist_union(invlist, range_invlist, &invlist);
7210 /* The temporary can be freed */
7211 SvREFCNT_dec(range_invlist);
7216 PERL_STATIC_INLINE SV*
7217 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7218 return add_range_to_invlist(invlist, cp, cp);
7221 #ifndef PERL_IN_XSUB_RE
7223 Perl__invlist_invert(pTHX_ SV* const invlist)
7225 /* Complement the input inversion list. This adds a 0 if the list didn't
7226 * have a zero; removes it otherwise. As described above, the data
7227 * structure is set up so that this is very efficient */
7229 UV* len_pos = get_invlist_len_addr(invlist);
7231 PERL_ARGS_ASSERT__INVLIST_INVERT;
7233 /* The inverse of matching nothing is matching everything */
7234 if (*len_pos == 0) {
7235 _append_range_to_invlist(invlist, 0, UV_MAX);
7239 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7240 * zero element was a 0, so it is being removed, so the length decrements
7241 * by 1; and vice-versa. SvCUR is unaffected */
7242 if (*get_invlist_zero_addr(invlist) ^= 1) {
7251 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7253 /* Complement the input inversion list (which must be a Unicode property,
7254 * all of which don't match above the Unicode maximum code point.) And
7255 * Perl has chosen to not have the inversion match above that either. This
7256 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7262 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7264 _invlist_invert(invlist);
7266 len = invlist_len(invlist);
7268 if (len != 0) { /* If empty do nothing */
7269 array = invlist_array(invlist);
7270 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7271 /* Add 0x110000. First, grow if necessary */
7273 if (invlist_max(invlist) < len) {
7274 invlist_extend(invlist, len);
7275 array = invlist_array(invlist);
7277 invlist_set_len(invlist, len);
7278 array[len - 1] = PERL_UNICODE_MAX + 1;
7280 else { /* Remove the 0x110000 */
7281 invlist_set_len(invlist, len - 1);
7289 PERL_STATIC_INLINE SV*
7290 S_invlist_clone(pTHX_ SV* const invlist)
7293 /* Return a new inversion list that is a copy of the input one, which is
7296 /* Need to allocate extra space to accommodate Perl's addition of a
7297 * trailing NUL to SvPV's, since it thinks they are always strings */
7298 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7299 STRLEN length = SvCUR(invlist);
7301 PERL_ARGS_ASSERT_INVLIST_CLONE;
7303 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7304 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7309 PERL_STATIC_INLINE UV*
7310 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7312 /* Return the address of the UV that contains the current iteration
7315 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7317 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7320 PERL_STATIC_INLINE UV*
7321 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7323 /* Return the address of the UV that contains the version id. */
7325 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7327 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7330 PERL_STATIC_INLINE void
7331 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7333 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7335 *get_invlist_iter_addr(invlist) = 0;
7339 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7341 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7342 * This call sets in <*start> and <*end>, the next range in <invlist>.
7343 * Returns <TRUE> if successful and the next call will return the next
7344 * range; <FALSE> if was already at the end of the list. If the latter,
7345 * <*start> and <*end> are unchanged, and the next call to this function
7346 * will start over at the beginning of the list */
7348 UV* pos = get_invlist_iter_addr(invlist);
7349 UV len = invlist_len(invlist);
7352 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7355 *pos = UV_MAX; /* Force iternit() to be required next time */
7359 array = invlist_array(invlist);
7361 *start = array[(*pos)++];
7367 *end = array[(*pos)++] - 1;
7373 #ifndef PERL_IN_XSUB_RE
7375 Perl__invlist_contents(pTHX_ SV* const invlist)
7377 /* Get the contents of an inversion list into a string SV so that they can
7378 * be printed out. It uses the format traditionally done for debug tracing
7382 SV* output = newSVpvs("\n");
7384 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7386 invlist_iterinit(invlist);
7387 while (invlist_iternext(invlist, &start, &end)) {
7388 if (end == UV_MAX) {
7389 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7391 else if (end != start) {
7392 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7396 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7406 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7408 /* Dumps out the ranges in an inversion list. The string 'header'
7409 * if present is output on a line before the first range */
7413 if (header && strlen(header)) {
7414 PerlIO_printf(Perl_debug_log, "%s\n", header);
7416 invlist_iterinit(invlist);
7417 while (invlist_iternext(invlist, &start, &end)) {
7418 if (end == UV_MAX) {
7419 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7422 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7428 #undef HEADER_LENGTH
7429 #undef INVLIST_INITIAL_LENGTH
7430 #undef TO_INTERNAL_SIZE
7431 #undef FROM_INTERNAL_SIZE
7432 #undef INVLIST_LEN_OFFSET
7433 #undef INVLIST_ZERO_OFFSET
7434 #undef INVLIST_ITER_OFFSET
7435 #undef INVLIST_VERSION_ID
7437 /* End of inversion list object */
7440 - reg - regular expression, i.e. main body or parenthesized thing
7442 * Caller must absorb opening parenthesis.
7444 * Combining parenthesis handling with the base level of regular expression
7445 * is a trifle forced, but the need to tie the tails of the branches to what
7446 * follows makes it hard to avoid.
7448 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7450 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7452 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7456 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7457 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7460 register regnode *ret; /* Will be the head of the group. */
7461 register regnode *br;
7462 register regnode *lastbr;
7463 register regnode *ender = NULL;
7464 register I32 parno = 0;
7466 U32 oregflags = RExC_flags;
7467 bool have_branch = 0;
7469 I32 freeze_paren = 0;
7470 I32 after_freeze = 0;
7472 /* for (?g), (?gc), and (?o) warnings; warning
7473 about (?c) will warn about (?g) -- japhy */
7475 #define WASTED_O 0x01
7476 #define WASTED_G 0x02
7477 #define WASTED_C 0x04
7478 #define WASTED_GC (0x02|0x04)
7479 I32 wastedflags = 0x00;
7481 char * parse_start = RExC_parse; /* MJD */
7482 char * const oregcomp_parse = RExC_parse;
7484 GET_RE_DEBUG_FLAGS_DECL;
7486 PERL_ARGS_ASSERT_REG;
7487 DEBUG_PARSE("reg ");
7489 *flagp = 0; /* Tentatively. */
7492 /* Make an OPEN node, if parenthesized. */
7494 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7495 char *start_verb = RExC_parse;
7496 STRLEN verb_len = 0;
7497 char *start_arg = NULL;
7498 unsigned char op = 0;
7500 int internal_argval = 0; /* internal_argval is only useful if !argok */
7501 while ( *RExC_parse && *RExC_parse != ')' ) {
7502 if ( *RExC_parse == ':' ) {
7503 start_arg = RExC_parse + 1;
7509 verb_len = RExC_parse - start_verb;
7512 while ( *RExC_parse && *RExC_parse != ')' )
7514 if ( *RExC_parse != ')' )
7515 vFAIL("Unterminated verb pattern argument");
7516 if ( RExC_parse == start_arg )
7519 if ( *RExC_parse != ')' )
7520 vFAIL("Unterminated verb pattern");
7523 switch ( *start_verb ) {
7524 case 'A': /* (*ACCEPT) */
7525 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7527 internal_argval = RExC_nestroot;
7530 case 'C': /* (*COMMIT) */
7531 if ( memEQs(start_verb,verb_len,"COMMIT") )
7534 case 'F': /* (*FAIL) */
7535 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7540 case ':': /* (*:NAME) */
7541 case 'M': /* (*MARK:NAME) */
7542 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7547 case 'P': /* (*PRUNE) */
7548 if ( memEQs(start_verb,verb_len,"PRUNE") )
7551 case 'S': /* (*SKIP) */
7552 if ( memEQs(start_verb,verb_len,"SKIP") )
7555 case 'T': /* (*THEN) */
7556 /* [19:06] <TimToady> :: is then */
7557 if ( memEQs(start_verb,verb_len,"THEN") ) {
7559 RExC_seen |= REG_SEEN_CUTGROUP;
7565 vFAIL3("Unknown verb pattern '%.*s'",
7566 verb_len, start_verb);
7569 if ( start_arg && internal_argval ) {
7570 vFAIL3("Verb pattern '%.*s' may not have an argument",
7571 verb_len, start_verb);
7572 } else if ( argok < 0 && !start_arg ) {
7573 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7574 verb_len, start_verb);
7576 ret = reganode(pRExC_state, op, internal_argval);
7577 if ( ! internal_argval && ! SIZE_ONLY ) {
7579 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7580 ARG(ret) = add_data( pRExC_state, 1, "S" );
7581 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7588 if (!internal_argval)
7589 RExC_seen |= REG_SEEN_VERBARG;
7590 } else if ( start_arg ) {
7591 vFAIL3("Verb pattern '%.*s' may not have an argument",
7592 verb_len, start_verb);
7594 ret = reg_node(pRExC_state, op);
7596 nextchar(pRExC_state);
7599 if (*RExC_parse == '?') { /* (?...) */
7600 bool is_logical = 0;
7601 const char * const seqstart = RExC_parse;
7602 bool has_use_defaults = FALSE;
7605 paren = *RExC_parse++;
7606 ret = NULL; /* For look-ahead/behind. */
7609 case 'P': /* (?P...) variants for those used to PCRE/Python */
7610 paren = *RExC_parse++;
7611 if ( paren == '<') /* (?P<...>) named capture */
7613 else if (paren == '>') { /* (?P>name) named recursion */
7614 goto named_recursion;
7616 else if (paren == '=') { /* (?P=...) named backref */
7617 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7618 you change this make sure you change that */
7619 char* name_start = RExC_parse;
7621 SV *sv_dat = reg_scan_name(pRExC_state,
7622 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7623 if (RExC_parse == name_start || *RExC_parse != ')')
7624 vFAIL2("Sequence %.3s... not terminated",parse_start);
7627 num = add_data( pRExC_state, 1, "S" );
7628 RExC_rxi->data->data[num]=(void*)sv_dat;
7629 SvREFCNT_inc_simple_void(sv_dat);
7632 ret = reganode(pRExC_state,
7635 : (MORE_ASCII_RESTRICTED)
7637 : (AT_LEAST_UNI_SEMANTICS)
7645 Set_Node_Offset(ret, parse_start+1);
7646 Set_Node_Cur_Length(ret); /* MJD */
7648 nextchar(pRExC_state);
7652 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7654 case '<': /* (?<...) */
7655 if (*RExC_parse == '!')
7657 else if (*RExC_parse != '=')
7663 case '\'': /* (?'...') */
7664 name_start= RExC_parse;
7665 svname = reg_scan_name(pRExC_state,
7666 SIZE_ONLY ? /* reverse test from the others */
7667 REG_RSN_RETURN_NAME :
7668 REG_RSN_RETURN_NULL);
7669 if (RExC_parse == name_start) {
7671 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7674 if (*RExC_parse != paren)
7675 vFAIL2("Sequence (?%c... not terminated",
7676 paren=='>' ? '<' : paren);
7680 if (!svname) /* shouldn't happen */
7682 "panic: reg_scan_name returned NULL");
7683 if (!RExC_paren_names) {
7684 RExC_paren_names= newHV();
7685 sv_2mortal(MUTABLE_SV(RExC_paren_names));
7687 RExC_paren_name_list= newAV();
7688 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7691 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7693 sv_dat = HeVAL(he_str);
7695 /* croak baby croak */
7697 "panic: paren_name hash element allocation failed");
7698 } else if ( SvPOK(sv_dat) ) {
7699 /* (?|...) can mean we have dupes so scan to check
7700 its already been stored. Maybe a flag indicating
7701 we are inside such a construct would be useful,
7702 but the arrays are likely to be quite small, so
7703 for now we punt -- dmq */
7704 IV count = SvIV(sv_dat);
7705 I32 *pv = (I32*)SvPVX(sv_dat);
7707 for ( i = 0 ; i < count ; i++ ) {
7708 if ( pv[i] == RExC_npar ) {
7714 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7715 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7716 pv[count] = RExC_npar;
7717 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7720 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7721 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7723 SvIV_set(sv_dat, 1);
7726 /* Yes this does cause a memory leak in debugging Perls */
7727 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7728 SvREFCNT_dec(svname);
7731 /*sv_dump(sv_dat);*/
7733 nextchar(pRExC_state);
7735 goto capturing_parens;
7737 RExC_seen |= REG_SEEN_LOOKBEHIND;
7738 RExC_in_lookbehind++;
7740 case '=': /* (?=...) */
7741 RExC_seen_zerolen++;
7743 case '!': /* (?!...) */
7744 RExC_seen_zerolen++;
7745 if (*RExC_parse == ')') {
7746 ret=reg_node(pRExC_state, OPFAIL);
7747 nextchar(pRExC_state);
7751 case '|': /* (?|...) */
7752 /* branch reset, behave like a (?:...) except that
7753 buffers in alternations share the same numbers */
7755 after_freeze = freeze_paren = RExC_npar;
7757 case ':': /* (?:...) */
7758 case '>': /* (?>...) */
7760 case '$': /* (?$...) */
7761 case '@': /* (?@...) */
7762 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7764 case '#': /* (?#...) */
7765 while (*RExC_parse && *RExC_parse != ')')
7767 if (*RExC_parse != ')')
7768 FAIL("Sequence (?#... not terminated");
7769 nextchar(pRExC_state);
7772 case '0' : /* (?0) */
7773 case 'R' : /* (?R) */
7774 if (*RExC_parse != ')')
7775 FAIL("Sequence (?R) not terminated");
7776 ret = reg_node(pRExC_state, GOSTART);
7777 *flagp |= POSTPONED;
7778 nextchar(pRExC_state);
7781 { /* named and numeric backreferences */
7783 case '&': /* (?&NAME) */
7784 parse_start = RExC_parse - 1;
7787 SV *sv_dat = reg_scan_name(pRExC_state,
7788 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7789 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7791 goto gen_recurse_regop;
7794 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7796 vFAIL("Illegal pattern");
7798 goto parse_recursion;
7800 case '-': /* (?-1) */
7801 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7802 RExC_parse--; /* rewind to let it be handled later */
7806 case '1': case '2': case '3': case '4': /* (?1) */
7807 case '5': case '6': case '7': case '8': case '9':
7810 num = atoi(RExC_parse);
7811 parse_start = RExC_parse - 1; /* MJD */
7812 if (*RExC_parse == '-')
7814 while (isDIGIT(*RExC_parse))
7816 if (*RExC_parse!=')')
7817 vFAIL("Expecting close bracket");
7820 if ( paren == '-' ) {
7822 Diagram of capture buffer numbering.
7823 Top line is the normal capture buffer numbers
7824 Bottom line is the negative indexing as from
7828 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7832 num = RExC_npar + num;
7835 vFAIL("Reference to nonexistent group");
7837 } else if ( paren == '+' ) {
7838 num = RExC_npar + num - 1;
7841 ret = reganode(pRExC_state, GOSUB, num);
7843 if (num > (I32)RExC_rx->nparens) {
7845 vFAIL("Reference to nonexistent group");
7847 ARG2L_SET( ret, RExC_recurse_count++);
7849 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7850 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7854 RExC_seen |= REG_SEEN_RECURSE;
7855 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7856 Set_Node_Offset(ret, parse_start); /* MJD */
7858 *flagp |= POSTPONED;
7859 nextchar(pRExC_state);
7861 } /* named and numeric backreferences */
7864 case '?': /* (??...) */
7866 if (*RExC_parse != '{') {
7868 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7871 *flagp |= POSTPONED;
7872 paren = *RExC_parse++;
7874 case '{': /* (?{...}) */
7879 char *s = RExC_parse;
7881 RExC_seen_zerolen++;
7882 RExC_seen |= REG_SEEN_EVAL;
7883 while (count && (c = *RExC_parse)) {
7894 if (*RExC_parse != ')') {
7896 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7900 OP_4tree *sop, *rop;
7901 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7904 Perl_save_re_context(aTHX);
7905 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7906 sop->op_private |= OPpREFCOUNTED;
7907 /* re_dup will OpREFCNT_inc */
7908 OpREFCNT_set(sop, 1);
7911 n = add_data(pRExC_state, 3, "nop");
7912 RExC_rxi->data->data[n] = (void*)rop;
7913 RExC_rxi->data->data[n+1] = (void*)sop;
7914 RExC_rxi->data->data[n+2] = (void*)pad;
7917 else { /* First pass */
7918 if (PL_reginterp_cnt < ++RExC_seen_evals
7920 /* No compiled RE interpolated, has runtime
7921 components ===> unsafe. */
7922 FAIL("Eval-group not allowed at runtime, use re 'eval'");
7923 if (PL_tainting && PL_tainted)
7924 FAIL("Eval-group in insecure regular expression");
7925 #if PERL_VERSION > 8
7926 if (IN_PERL_COMPILETIME)
7931 nextchar(pRExC_state);
7933 ret = reg_node(pRExC_state, LOGICAL);
7936 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7937 /* deal with the length of this later - MJD */
7940 ret = reganode(pRExC_state, EVAL, n);
7941 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7942 Set_Node_Offset(ret, parse_start);
7945 case '(': /* (?(?{...})...) and (?(?=...)...) */
7948 if (RExC_parse[0] == '?') { /* (?(?...)) */
7949 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7950 || RExC_parse[1] == '<'
7951 || RExC_parse[1] == '{') { /* Lookahead or eval. */
7954 ret = reg_node(pRExC_state, LOGICAL);
7957 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7961 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
7962 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7964 char ch = RExC_parse[0] == '<' ? '>' : '\'';
7965 char *name_start= RExC_parse++;
7967 SV *sv_dat=reg_scan_name(pRExC_state,
7968 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7969 if (RExC_parse == name_start || *RExC_parse != ch)
7970 vFAIL2("Sequence (?(%c... not terminated",
7971 (ch == '>' ? '<' : ch));
7974 num = add_data( pRExC_state, 1, "S" );
7975 RExC_rxi->data->data[num]=(void*)sv_dat;
7976 SvREFCNT_inc_simple_void(sv_dat);
7978 ret = reganode(pRExC_state,NGROUPP,num);
7979 goto insert_if_check_paren;
7981 else if (RExC_parse[0] == 'D' &&
7982 RExC_parse[1] == 'E' &&
7983 RExC_parse[2] == 'F' &&
7984 RExC_parse[3] == 'I' &&
7985 RExC_parse[4] == 'N' &&
7986 RExC_parse[5] == 'E')
7988 ret = reganode(pRExC_state,DEFINEP,0);
7991 goto insert_if_check_paren;
7993 else if (RExC_parse[0] == 'R') {
7996 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7997 parno = atoi(RExC_parse++);
7998 while (isDIGIT(*RExC_parse))
8000 } else if (RExC_parse[0] == '&') {
8003 sv_dat = reg_scan_name(pRExC_state,
8004 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8005 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8007 ret = reganode(pRExC_state,INSUBP,parno);
8008 goto insert_if_check_paren;
8010 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8013 parno = atoi(RExC_parse++);
8015 while (isDIGIT(*RExC_parse))
8017 ret = reganode(pRExC_state, GROUPP, parno);
8019 insert_if_check_paren:
8020 if ((c = *nextchar(pRExC_state)) != ')')
8021 vFAIL("Switch condition not recognized");
8023 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8024 br = regbranch(pRExC_state, &flags, 1,depth+1);
8026 br = reganode(pRExC_state, LONGJMP, 0);
8028 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8029 c = *nextchar(pRExC_state);
8034 vFAIL("(?(DEFINE)....) does not allow branches");
8035 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8036 regbranch(pRExC_state, &flags, 1,depth+1);
8037 REGTAIL(pRExC_state, ret, lastbr);
8040 c = *nextchar(pRExC_state);
8045 vFAIL("Switch (?(condition)... contains too many branches");
8046 ender = reg_node(pRExC_state, TAIL);
8047 REGTAIL(pRExC_state, br, ender);
8049 REGTAIL(pRExC_state, lastbr, ender);
8050 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8053 REGTAIL(pRExC_state, ret, ender);
8054 RExC_size++; /* XXX WHY do we need this?!!
8055 For large programs it seems to be required
8056 but I can't figure out why. -- dmq*/
8060 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8064 RExC_parse--; /* for vFAIL to print correctly */
8065 vFAIL("Sequence (? incomplete");
8067 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8069 has_use_defaults = TRUE;
8070 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8071 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8072 ? REGEX_UNICODE_CHARSET
8073 : REGEX_DEPENDS_CHARSET);
8077 parse_flags: /* (?i) */
8079 U32 posflags = 0, negflags = 0;
8080 U32 *flagsp = &posflags;
8081 char has_charset_modifier = '\0';
8082 regex_charset cs = get_regex_charset(RExC_flags);
8083 if (cs == REGEX_DEPENDS_CHARSET
8084 && (RExC_utf8 || RExC_uni_semantics))
8086 cs = REGEX_UNICODE_CHARSET;
8089 while (*RExC_parse) {
8090 /* && strchr("iogcmsx", *RExC_parse) */
8091 /* (?g), (?gc) and (?o) are useless here
8092 and must be globally applied -- japhy */
8093 switch (*RExC_parse) {
8094 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8095 case LOCALE_PAT_MOD:
8096 if (has_charset_modifier) {
8097 goto excess_modifier;
8099 else if (flagsp == &negflags) {
8102 cs = REGEX_LOCALE_CHARSET;
8103 has_charset_modifier = LOCALE_PAT_MOD;
8104 RExC_contains_locale = 1;
8106 case UNICODE_PAT_MOD:
8107 if (has_charset_modifier) {
8108 goto excess_modifier;
8110 else if (flagsp == &negflags) {
8113 cs = REGEX_UNICODE_CHARSET;
8114 has_charset_modifier = UNICODE_PAT_MOD;
8116 case ASCII_RESTRICT_PAT_MOD:
8117 if (flagsp == &negflags) {
8120 if (has_charset_modifier) {
8121 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8122 goto excess_modifier;
8124 /* Doubled modifier implies more restricted */
8125 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8128 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8130 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8132 case DEPENDS_PAT_MOD:
8133 if (has_use_defaults) {
8134 goto fail_modifiers;
8136 else if (flagsp == &negflags) {
8139 else if (has_charset_modifier) {
8140 goto excess_modifier;
8143 /* The dual charset means unicode semantics if the
8144 * pattern (or target, not known until runtime) are
8145 * utf8, or something in the pattern indicates unicode
8147 cs = (RExC_utf8 || RExC_uni_semantics)
8148 ? REGEX_UNICODE_CHARSET
8149 : REGEX_DEPENDS_CHARSET;
8150 has_charset_modifier = DEPENDS_PAT_MOD;
8154 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8155 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8157 else if (has_charset_modifier == *(RExC_parse - 1)) {
8158 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8161 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8166 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8168 case ONCE_PAT_MOD: /* 'o' */
8169 case GLOBAL_PAT_MOD: /* 'g' */
8170 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8171 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8172 if (! (wastedflags & wflagbit) ) {
8173 wastedflags |= wflagbit;
8176 "Useless (%s%c) - %suse /%c modifier",
8177 flagsp == &negflags ? "?-" : "?",
8179 flagsp == &negflags ? "don't " : "",
8186 case CONTINUE_PAT_MOD: /* 'c' */
8187 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8188 if (! (wastedflags & WASTED_C) ) {
8189 wastedflags |= WASTED_GC;
8192 "Useless (%sc) - %suse /gc modifier",
8193 flagsp == &negflags ? "?-" : "?",
8194 flagsp == &negflags ? "don't " : ""
8199 case KEEPCOPY_PAT_MOD: /* 'p' */
8200 if (flagsp == &negflags) {
8202 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8204 *flagsp |= RXf_PMf_KEEPCOPY;
8208 /* A flag is a default iff it is following a minus, so
8209 * if there is a minus, it means will be trying to
8210 * re-specify a default which is an error */
8211 if (has_use_defaults || flagsp == &negflags) {
8214 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8218 wastedflags = 0; /* reset so (?g-c) warns twice */
8224 RExC_flags |= posflags;
8225 RExC_flags &= ~negflags;
8226 set_regex_charset(&RExC_flags, cs);
8228 oregflags |= posflags;
8229 oregflags &= ~negflags;
8230 set_regex_charset(&oregflags, cs);
8232 nextchar(pRExC_state);
8243 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8248 }} /* one for the default block, one for the switch */
8255 ret = reganode(pRExC_state, OPEN, parno);
8258 RExC_nestroot = parno;
8259 if (RExC_seen & REG_SEEN_RECURSE
8260 && !RExC_open_parens[parno-1])
8262 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8263 "Setting open paren #%"IVdf" to %d\n",
8264 (IV)parno, REG_NODE_NUM(ret)));
8265 RExC_open_parens[parno-1]= ret;
8268 Set_Node_Length(ret, 1); /* MJD */
8269 Set_Node_Offset(ret, RExC_parse); /* MJD */
8277 /* Pick up the branches, linking them together. */
8278 parse_start = RExC_parse; /* MJD */
8279 br = regbranch(pRExC_state, &flags, 1,depth+1);
8281 /* branch_len = (paren != 0); */
8285 if (*RExC_parse == '|') {
8286 if (!SIZE_ONLY && RExC_extralen) {
8287 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8290 reginsert(pRExC_state, BRANCH, br, depth+1);
8291 Set_Node_Length(br, paren != 0);
8292 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8296 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8298 else if (paren == ':') {
8299 *flagp |= flags&SIMPLE;
8301 if (is_open) { /* Starts with OPEN. */
8302 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8304 else if (paren != '?') /* Not Conditional */
8306 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8308 while (*RExC_parse == '|') {
8309 if (!SIZE_ONLY && RExC_extralen) {
8310 ender = reganode(pRExC_state, LONGJMP,0);
8311 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8314 RExC_extralen += 2; /* Account for LONGJMP. */
8315 nextchar(pRExC_state);
8317 if (RExC_npar > after_freeze)
8318 after_freeze = RExC_npar;
8319 RExC_npar = freeze_paren;
8321 br = regbranch(pRExC_state, &flags, 0, depth+1);
8325 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8327 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8330 if (have_branch || paren != ':') {
8331 /* Make a closing node, and hook it on the end. */
8334 ender = reg_node(pRExC_state, TAIL);
8337 ender = reganode(pRExC_state, CLOSE, parno);
8338 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8339 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8340 "Setting close paren #%"IVdf" to %d\n",
8341 (IV)parno, REG_NODE_NUM(ender)));
8342 RExC_close_parens[parno-1]= ender;
8343 if (RExC_nestroot == parno)
8346 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8347 Set_Node_Length(ender,1); /* MJD */
8353 *flagp &= ~HASWIDTH;
8356 ender = reg_node(pRExC_state, SUCCEED);
8359 ender = reg_node(pRExC_state, END);
8361 assert(!RExC_opend); /* there can only be one! */
8366 REGTAIL(pRExC_state, lastbr, ender);
8368 if (have_branch && !SIZE_ONLY) {
8370 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8372 /* Hook the tails of the branches to the closing node. */
8373 for (br = ret; br; br = regnext(br)) {
8374 const U8 op = PL_regkind[OP(br)];
8376 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8378 else if (op == BRANCHJ) {
8379 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8387 static const char parens[] = "=!<,>";
8389 if (paren && (p = strchr(parens, paren))) {
8390 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8391 int flag = (p - parens) > 1;
8394 node = SUSPEND, flag = 0;
8395 reginsert(pRExC_state, node,ret, depth+1);
8396 Set_Node_Cur_Length(ret);
8397 Set_Node_Offset(ret, parse_start + 1);
8399 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8403 /* Check for proper termination. */
8405 RExC_flags = oregflags;
8406 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8407 RExC_parse = oregcomp_parse;
8408 vFAIL("Unmatched (");
8411 else if (!paren && RExC_parse < RExC_end) {
8412 if (*RExC_parse == ')') {
8414 vFAIL("Unmatched )");
8417 FAIL("Junk on end of regexp"); /* "Can't happen". */
8421 if (RExC_in_lookbehind) {
8422 RExC_in_lookbehind--;
8424 if (after_freeze > RExC_npar)
8425 RExC_npar = after_freeze;
8430 - regbranch - one alternative of an | operator
8432 * Implements the concatenation operator.
8435 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8438 register regnode *ret;
8439 register regnode *chain = NULL;
8440 register regnode *latest;
8441 I32 flags = 0, c = 0;
8442 GET_RE_DEBUG_FLAGS_DECL;
8444 PERL_ARGS_ASSERT_REGBRANCH;
8446 DEBUG_PARSE("brnc");
8451 if (!SIZE_ONLY && RExC_extralen)
8452 ret = reganode(pRExC_state, BRANCHJ,0);
8454 ret = reg_node(pRExC_state, BRANCH);
8455 Set_Node_Length(ret, 1);
8459 if (!first && SIZE_ONLY)
8460 RExC_extralen += 1; /* BRANCHJ */
8462 *flagp = WORST; /* Tentatively. */
8465 nextchar(pRExC_state);
8466 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8468 latest = regpiece(pRExC_state, &flags,depth+1);
8469 if (latest == NULL) {
8470 if (flags & TRYAGAIN)
8474 else if (ret == NULL)
8476 *flagp |= flags&(HASWIDTH|POSTPONED);
8477 if (chain == NULL) /* First piece. */
8478 *flagp |= flags&SPSTART;
8481 REGTAIL(pRExC_state, chain, latest);
8486 if (chain == NULL) { /* Loop ran zero times. */
8487 chain = reg_node(pRExC_state, NOTHING);
8492 *flagp |= flags&SIMPLE;
8499 - regpiece - something followed by possible [*+?]
8501 * Note that the branching code sequences used for ? and the general cases
8502 * of * and + are somewhat optimized: they use the same NOTHING node as
8503 * both the endmarker for their branch list and the body of the last branch.
8504 * It might seem that this node could be dispensed with entirely, but the
8505 * endmarker role is not redundant.
8508 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8511 register regnode *ret;
8513 register char *next;
8515 const char * const origparse = RExC_parse;
8517 I32 max = REG_INFTY;
8518 #ifdef RE_TRACK_PATTERN_OFFSETS
8521 const char *maxpos = NULL;
8522 GET_RE_DEBUG_FLAGS_DECL;
8524 PERL_ARGS_ASSERT_REGPIECE;
8526 DEBUG_PARSE("piec");
8528 ret = regatom(pRExC_state, &flags,depth+1);
8530 if (flags & TRYAGAIN)
8537 if (op == '{' && regcurly(RExC_parse)) {
8539 #ifdef RE_TRACK_PATTERN_OFFSETS
8540 parse_start = RExC_parse; /* MJD */
8542 next = RExC_parse + 1;
8543 while (isDIGIT(*next) || *next == ',') {
8552 if (*next == '}') { /* got one */
8556 min = atoi(RExC_parse);
8560 maxpos = RExC_parse;
8562 if (!max && *maxpos != '0')
8563 max = REG_INFTY; /* meaning "infinity" */
8564 else if (max >= REG_INFTY)
8565 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8567 nextchar(pRExC_state);
8570 if ((flags&SIMPLE)) {
8571 RExC_naughty += 2 + RExC_naughty / 2;
8572 reginsert(pRExC_state, CURLY, ret, depth+1);
8573 Set_Node_Offset(ret, parse_start+1); /* MJD */
8574 Set_Node_Cur_Length(ret);
8577 regnode * const w = reg_node(pRExC_state, WHILEM);
8580 REGTAIL(pRExC_state, ret, w);
8581 if (!SIZE_ONLY && RExC_extralen) {
8582 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8583 reginsert(pRExC_state, NOTHING,ret, depth+1);
8584 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8586 reginsert(pRExC_state, CURLYX,ret, depth+1);
8588 Set_Node_Offset(ret, parse_start+1);
8589 Set_Node_Length(ret,
8590 op == '{' ? (RExC_parse - parse_start) : 1);
8592 if (!SIZE_ONLY && RExC_extralen)
8593 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
8594 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8596 RExC_whilem_seen++, RExC_extralen += 3;
8597 RExC_naughty += 4 + RExC_naughty; /* compound interest */
8606 vFAIL("Can't do {n,m} with n > m");
8608 ARG1_SET(ret, (U16)min);
8609 ARG2_SET(ret, (U16)max);
8621 #if 0 /* Now runtime fix should be reliable. */
8623 /* if this is reinstated, don't forget to put this back into perldiag:
8625 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8627 (F) The part of the regexp subject to either the * or + quantifier
8628 could match an empty string. The {#} shows in the regular
8629 expression about where the problem was discovered.
8633 if (!(flags&HASWIDTH) && op != '?')
8634 vFAIL("Regexp *+ operand could be empty");
8637 #ifdef RE_TRACK_PATTERN_OFFSETS
8638 parse_start = RExC_parse;
8640 nextchar(pRExC_state);
8642 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8644 if (op == '*' && (flags&SIMPLE)) {
8645 reginsert(pRExC_state, STAR, ret, depth+1);
8649 else if (op == '*') {
8653 else if (op == '+' && (flags&SIMPLE)) {
8654 reginsert(pRExC_state, PLUS, ret, depth+1);
8658 else if (op == '+') {
8662 else if (op == '?') {
8667 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8668 ckWARN3reg(RExC_parse,
8669 "%.*s matches null string many times",
8670 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8674 if (RExC_parse < RExC_end && *RExC_parse == '?') {
8675 nextchar(pRExC_state);
8676 reginsert(pRExC_state, MINMOD, ret, depth+1);
8677 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8679 #ifndef REG_ALLOW_MINMOD_SUSPEND
8682 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8684 nextchar(pRExC_state);
8685 ender = reg_node(pRExC_state, SUCCEED);
8686 REGTAIL(pRExC_state, ret, ender);
8687 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8689 ender = reg_node(pRExC_state, TAIL);
8690 REGTAIL(pRExC_state, ret, ender);
8694 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8696 vFAIL("Nested quantifiers");
8703 /* reg_namedseq(pRExC_state,UVp, UV depth)
8705 This is expected to be called by a parser routine that has
8706 recognized '\N' and needs to handle the rest. RExC_parse is
8707 expected to point at the first char following the N at the time
8710 The \N may be inside (indicated by valuep not being NULL) or outside a
8713 \N may begin either a named sequence, or if outside a character class, mean
8714 to match a non-newline. For non single-quoted regexes, the tokenizer has
8715 attempted to decide which, and in the case of a named sequence converted it
8716 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8717 where c1... are the characters in the sequence. For single-quoted regexes,
8718 the tokenizer passes the \N sequence through unchanged; this code will not
8719 attempt to determine this nor expand those. The net effect is that if the
8720 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8721 signals that this \N occurrence means to match a non-newline.
8723 Only the \N{U+...} form should occur in a character class, for the same
8724 reason that '.' inside a character class means to just match a period: it
8725 just doesn't make sense.
8727 If valuep is non-null then it is assumed that we are parsing inside
8728 of a charclass definition and the first codepoint in the resolved
8729 string is returned via *valuep and the routine will return NULL.
8730 In this mode if a multichar string is returned from the charnames
8731 handler, a warning will be issued, and only the first char in the
8732 sequence will be examined. If the string returned is zero length
8733 then the value of *valuep is undefined and NON-NULL will
8734 be returned to indicate failure. (This will NOT be a valid pointer
8737 If valuep is null then it is assumed that we are parsing normal text and a
8738 new EXACT node is inserted into the program containing the resolved string,
8739 and a pointer to the new node is returned. But if the string is zero length
8740 a NOTHING node is emitted instead.
8742 On success RExC_parse is set to the char following the endbrace.
8743 Parsing failures will generate a fatal error via vFAIL(...)
8746 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8748 char * endbrace; /* '}' following the name */
8749 regnode *ret = NULL;
8752 GET_RE_DEBUG_FLAGS_DECL;
8754 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8758 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8759 * modifier. The other meaning does not */
8760 p = (RExC_flags & RXf_PMf_EXTENDED)
8761 ? regwhite( pRExC_state, RExC_parse )
8764 /* Disambiguate between \N meaning a named character versus \N meaning
8765 * [^\n]. The former is assumed when it can't be the latter. */
8766 if (*p != '{' || regcurly(p)) {
8769 /* no bare \N in a charclass */
8770 vFAIL("\\N in a character class must be a named character: \\N{...}");
8772 nextchar(pRExC_state);
8773 ret = reg_node(pRExC_state, REG_ANY);
8774 *flagp |= HASWIDTH|SIMPLE;
8777 Set_Node_Length(ret, 1); /* MJD */
8781 /* Here, we have decided it should be a named sequence */
8783 /* The test above made sure that the next real character is a '{', but
8784 * under the /x modifier, it could be separated by space (or a comment and
8785 * \n) and this is not allowed (for consistency with \x{...} and the
8786 * tokenizer handling of \N{NAME}). */
8787 if (*RExC_parse != '{') {
8788 vFAIL("Missing braces on \\N{}");
8791 RExC_parse++; /* Skip past the '{' */
8793 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8794 || ! (endbrace == RExC_parse /* nothing between the {} */
8795 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8796 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8798 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8799 vFAIL("\\N{NAME} must be resolved by the lexer");
8802 if (endbrace == RExC_parse) { /* empty: \N{} */
8804 RExC_parse = endbrace + 1;
8805 return reg_node(pRExC_state,NOTHING);
8809 ckWARNreg(RExC_parse,
8810 "Ignoring zero length \\N{} in character class"
8812 RExC_parse = endbrace + 1;
8815 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8818 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
8819 RExC_parse += 2; /* Skip past the 'U+' */
8821 if (valuep) { /* In a bracketed char class */
8822 /* We only pay attention to the first char of
8823 multichar strings being returned. I kinda wonder
8824 if this makes sense as it does change the behaviour
8825 from earlier versions, OTOH that behaviour was broken
8826 as well. XXX Solution is to recharacterize as
8827 [rest-of-class]|multi1|multi2... */
8829 STRLEN length_of_hex;
8830 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8831 | PERL_SCAN_DISALLOW_PREFIX
8832 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8834 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8835 if (endchar < endbrace) {
8836 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8839 length_of_hex = (STRLEN)(endchar - RExC_parse);
8840 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8842 /* The tokenizer should have guaranteed validity, but it's possible to
8843 * bypass it by using single quoting, so check */
8844 if (length_of_hex == 0
8845 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8847 RExC_parse += length_of_hex; /* Includes all the valid */
8848 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8849 ? UTF8SKIP(RExC_parse)
8851 /* Guard against malformed utf8 */
8852 if (RExC_parse >= endchar) RExC_parse = endchar;
8853 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8856 RExC_parse = endbrace + 1;
8857 if (endchar == endbrace) return NULL;
8859 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
8861 else { /* Not a char class */
8863 /* What is done here is to convert this to a sub-pattern of the form
8864 * (?:\x{char1}\x{char2}...)
8865 * and then call reg recursively. That way, it retains its atomicness,
8866 * while not having to worry about special handling that some code
8867 * points may have. toke.c has converted the original Unicode values
8868 * to native, so that we can just pass on the hex values unchanged. We
8869 * do have to set a flag to keep recoding from happening in the
8872 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8874 char *endchar; /* Points to '.' or '}' ending cur char in the input
8876 char *orig_end = RExC_end;
8878 while (RExC_parse < endbrace) {
8880 /* Code points are separated by dots. If none, there is only one
8881 * code point, and is terminated by the brace */
8882 endchar = RExC_parse + strcspn(RExC_parse, ".}");
8884 /* Convert to notation the rest of the code understands */
8885 sv_catpv(substitute_parse, "\\x{");
8886 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8887 sv_catpv(substitute_parse, "}");
8889 /* Point to the beginning of the next character in the sequence. */
8890 RExC_parse = endchar + 1;
8892 sv_catpv(substitute_parse, ")");
8894 RExC_parse = SvPV(substitute_parse, len);
8896 /* Don't allow empty number */
8898 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8900 RExC_end = RExC_parse + len;
8902 /* The values are Unicode, and therefore not subject to recoding */
8903 RExC_override_recoding = 1;
8905 ret = reg(pRExC_state, 1, flagp, depth+1);
8907 RExC_parse = endbrace;
8908 RExC_end = orig_end;
8909 RExC_override_recoding = 0;
8911 nextchar(pRExC_state);
8921 * It returns the code point in utf8 for the value in *encp.
8922 * value: a code value in the source encoding
8923 * encp: a pointer to an Encode object
8925 * If the result from Encode is not a single character,
8926 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8929 S_reg_recode(pTHX_ const char value, SV **encp)
8932 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8933 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8934 const STRLEN newlen = SvCUR(sv);
8935 UV uv = UNICODE_REPLACEMENT;
8937 PERL_ARGS_ASSERT_REG_RECODE;
8941 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8944 if (!newlen || numlen != newlen) {
8945 uv = UNICODE_REPLACEMENT;
8953 - regatom - the lowest level
8955 Try to identify anything special at the start of the pattern. If there
8956 is, then handle it as required. This may involve generating a single regop,
8957 such as for an assertion; or it may involve recursing, such as to
8958 handle a () structure.
8960 If the string doesn't start with something special then we gobble up
8961 as much literal text as we can.
8963 Once we have been able to handle whatever type of thing started the
8964 sequence, we return.
8966 Note: we have to be careful with escapes, as they can be both literal
8967 and special, and in the case of \10 and friends can either, depending
8968 on context. Specifically there are two separate switches for handling
8969 escape sequences, with the one for handling literal escapes requiring
8970 a dummy entry for all of the special escapes that are actually handled
8975 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8978 register regnode *ret = NULL;
8980 char *parse_start = RExC_parse;
8982 GET_RE_DEBUG_FLAGS_DECL;
8983 DEBUG_PARSE("atom");
8984 *flagp = WORST; /* Tentatively. */
8986 PERL_ARGS_ASSERT_REGATOM;
8989 switch ((U8)*RExC_parse) {
8991 RExC_seen_zerolen++;
8992 nextchar(pRExC_state);
8993 if (RExC_flags & RXf_PMf_MULTILINE)
8994 ret = reg_node(pRExC_state, MBOL);
8995 else if (RExC_flags & RXf_PMf_SINGLELINE)
8996 ret = reg_node(pRExC_state, SBOL);
8998 ret = reg_node(pRExC_state, BOL);
8999 Set_Node_Length(ret, 1); /* MJD */
9002 nextchar(pRExC_state);
9004 RExC_seen_zerolen++;
9005 if (RExC_flags & RXf_PMf_MULTILINE)
9006 ret = reg_node(pRExC_state, MEOL);
9007 else if (RExC_flags & RXf_PMf_SINGLELINE)
9008 ret = reg_node(pRExC_state, SEOL);
9010 ret = reg_node(pRExC_state, EOL);
9011 Set_Node_Length(ret, 1); /* MJD */
9014 nextchar(pRExC_state);
9015 if (RExC_flags & RXf_PMf_SINGLELINE)
9016 ret = reg_node(pRExC_state, SANY);
9018 ret = reg_node(pRExC_state, REG_ANY);
9019 *flagp |= HASWIDTH|SIMPLE;
9021 Set_Node_Length(ret, 1); /* MJD */
9025 char * const oregcomp_parse = ++RExC_parse;
9026 ret = regclass(pRExC_state,depth+1);
9027 if (*RExC_parse != ']') {
9028 RExC_parse = oregcomp_parse;
9029 vFAIL("Unmatched [");
9031 nextchar(pRExC_state);
9032 *flagp |= HASWIDTH|SIMPLE;
9033 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9037 nextchar(pRExC_state);
9038 ret = reg(pRExC_state, 1, &flags,depth+1);
9040 if (flags & TRYAGAIN) {
9041 if (RExC_parse == RExC_end) {
9042 /* Make parent create an empty node if needed. */
9050 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9054 if (flags & TRYAGAIN) {
9058 vFAIL("Internal urp");
9059 /* Supposed to be caught earlier. */
9062 if (!regcurly(RExC_parse)) {
9071 vFAIL("Quantifier follows nothing");
9076 This switch handles escape sequences that resolve to some kind
9077 of special regop and not to literal text. Escape sequnces that
9078 resolve to literal text are handled below in the switch marked
9081 Every entry in this switch *must* have a corresponding entry
9082 in the literal escape switch. However, the opposite is not
9083 required, as the default for this switch is to jump to the
9084 literal text handling code.
9086 switch ((U8)*++RExC_parse) {
9087 /* Special Escapes */
9089 RExC_seen_zerolen++;
9090 ret = reg_node(pRExC_state, SBOL);
9092 goto finish_meta_pat;
9094 ret = reg_node(pRExC_state, GPOS);
9095 RExC_seen |= REG_SEEN_GPOS;
9097 goto finish_meta_pat;
9099 RExC_seen_zerolen++;
9100 ret = reg_node(pRExC_state, KEEPS);
9102 /* XXX:dmq : disabling in-place substitution seems to
9103 * be necessary here to avoid cases of memory corruption, as
9104 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9106 RExC_seen |= REG_SEEN_LOOKBEHIND;
9107 goto finish_meta_pat;
9109 ret = reg_node(pRExC_state, SEOL);
9111 RExC_seen_zerolen++; /* Do not optimize RE away */
9112 goto finish_meta_pat;
9114 ret = reg_node(pRExC_state, EOS);
9116 RExC_seen_zerolen++; /* Do not optimize RE away */
9117 goto finish_meta_pat;
9119 ret = reg_node(pRExC_state, CANY);
9120 RExC_seen |= REG_SEEN_CANY;
9121 *flagp |= HASWIDTH|SIMPLE;
9122 goto finish_meta_pat;
9124 ret = reg_node(pRExC_state, CLUMP);
9126 goto finish_meta_pat;
9128 switch (get_regex_charset(RExC_flags)) {
9129 case REGEX_LOCALE_CHARSET:
9132 case REGEX_UNICODE_CHARSET:
9135 case REGEX_ASCII_RESTRICTED_CHARSET:
9136 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9139 case REGEX_DEPENDS_CHARSET:
9145 ret = reg_node(pRExC_state, op);
9146 *flagp |= HASWIDTH|SIMPLE;
9147 goto finish_meta_pat;
9149 switch (get_regex_charset(RExC_flags)) {
9150 case REGEX_LOCALE_CHARSET:
9153 case REGEX_UNICODE_CHARSET:
9156 case REGEX_ASCII_RESTRICTED_CHARSET:
9157 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9160 case REGEX_DEPENDS_CHARSET:
9166 ret = reg_node(pRExC_state, op);
9167 *flagp |= HASWIDTH|SIMPLE;
9168 goto finish_meta_pat;
9170 RExC_seen_zerolen++;
9171 RExC_seen |= REG_SEEN_LOOKBEHIND;
9172 switch (get_regex_charset(RExC_flags)) {
9173 case REGEX_LOCALE_CHARSET:
9176 case REGEX_UNICODE_CHARSET:
9179 case REGEX_ASCII_RESTRICTED_CHARSET:
9180 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9183 case REGEX_DEPENDS_CHARSET:
9189 ret = reg_node(pRExC_state, op);
9190 FLAGS(ret) = get_regex_charset(RExC_flags);
9192 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9193 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9195 goto finish_meta_pat;
9197 RExC_seen_zerolen++;
9198 RExC_seen |= REG_SEEN_LOOKBEHIND;
9199 switch (get_regex_charset(RExC_flags)) {
9200 case REGEX_LOCALE_CHARSET:
9203 case REGEX_UNICODE_CHARSET:
9206 case REGEX_ASCII_RESTRICTED_CHARSET:
9207 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9210 case REGEX_DEPENDS_CHARSET:
9216 ret = reg_node(pRExC_state, op);
9217 FLAGS(ret) = get_regex_charset(RExC_flags);
9219 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9220 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9222 goto finish_meta_pat;
9224 switch (get_regex_charset(RExC_flags)) {
9225 case REGEX_LOCALE_CHARSET:
9228 case REGEX_UNICODE_CHARSET:
9231 case REGEX_ASCII_RESTRICTED_CHARSET:
9232 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9235 case REGEX_DEPENDS_CHARSET:
9241 ret = reg_node(pRExC_state, op);
9242 *flagp |= HASWIDTH|SIMPLE;
9243 goto finish_meta_pat;
9245 switch (get_regex_charset(RExC_flags)) {
9246 case REGEX_LOCALE_CHARSET:
9249 case REGEX_UNICODE_CHARSET:
9252 case REGEX_ASCII_RESTRICTED_CHARSET:
9253 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9256 case REGEX_DEPENDS_CHARSET:
9262 ret = reg_node(pRExC_state, op);
9263 *flagp |= HASWIDTH|SIMPLE;
9264 goto finish_meta_pat;
9266 switch (get_regex_charset(RExC_flags)) {
9267 case REGEX_LOCALE_CHARSET:
9270 case REGEX_ASCII_RESTRICTED_CHARSET:
9271 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9274 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9275 case REGEX_UNICODE_CHARSET:
9281 ret = reg_node(pRExC_state, op);
9282 *flagp |= HASWIDTH|SIMPLE;
9283 goto finish_meta_pat;
9285 switch (get_regex_charset(RExC_flags)) {
9286 case REGEX_LOCALE_CHARSET:
9289 case REGEX_ASCII_RESTRICTED_CHARSET:
9290 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9293 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9294 case REGEX_UNICODE_CHARSET:
9300 ret = reg_node(pRExC_state, op);
9301 *flagp |= HASWIDTH|SIMPLE;
9302 goto finish_meta_pat;
9304 ret = reg_node(pRExC_state, LNBREAK);
9305 *flagp |= HASWIDTH|SIMPLE;
9306 goto finish_meta_pat;
9308 ret = reg_node(pRExC_state, HORIZWS);
9309 *flagp |= HASWIDTH|SIMPLE;
9310 goto finish_meta_pat;
9312 ret = reg_node(pRExC_state, NHORIZWS);
9313 *flagp |= HASWIDTH|SIMPLE;
9314 goto finish_meta_pat;
9316 ret = reg_node(pRExC_state, VERTWS);
9317 *flagp |= HASWIDTH|SIMPLE;
9318 goto finish_meta_pat;
9320 ret = reg_node(pRExC_state, NVERTWS);
9321 *flagp |= HASWIDTH|SIMPLE;
9323 nextchar(pRExC_state);
9324 Set_Node_Length(ret, 2); /* MJD */
9329 char* const oldregxend = RExC_end;
9331 char* parse_start = RExC_parse - 2;
9334 if (RExC_parse[1] == '{') {
9335 /* a lovely hack--pretend we saw [\pX] instead */
9336 RExC_end = strchr(RExC_parse, '}');
9338 const U8 c = (U8)*RExC_parse;
9340 RExC_end = oldregxend;
9341 vFAIL2("Missing right brace on \\%c{}", c);
9346 RExC_end = RExC_parse + 2;
9347 if (RExC_end > oldregxend)
9348 RExC_end = oldregxend;
9352 ret = regclass(pRExC_state,depth+1);
9354 RExC_end = oldregxend;
9357 Set_Node_Offset(ret, parse_start + 2);
9358 Set_Node_Cur_Length(ret);
9359 nextchar(pRExC_state);
9360 *flagp |= HASWIDTH|SIMPLE;
9364 /* Handle \N and \N{NAME} here and not below because it can be
9365 multicharacter. join_exact() will join them up later on.
9366 Also this makes sure that things like /\N{BLAH}+/ and
9367 \N{BLAH} being multi char Just Happen. dmq*/
9369 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9371 case 'k': /* Handle \k<NAME> and \k'NAME' */
9374 char ch= RExC_parse[1];
9375 if (ch != '<' && ch != '\'' && ch != '{') {
9377 vFAIL2("Sequence %.2s... not terminated",parse_start);
9379 /* this pretty much dupes the code for (?P=...) in reg(), if
9380 you change this make sure you change that */
9381 char* name_start = (RExC_parse += 2);
9383 SV *sv_dat = reg_scan_name(pRExC_state,
9384 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9385 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9386 if (RExC_parse == name_start || *RExC_parse != ch)
9387 vFAIL2("Sequence %.3s... not terminated",parse_start);
9390 num = add_data( pRExC_state, 1, "S" );
9391 RExC_rxi->data->data[num]=(void*)sv_dat;
9392 SvREFCNT_inc_simple_void(sv_dat);
9396 ret = reganode(pRExC_state,
9399 : (MORE_ASCII_RESTRICTED)
9401 : (AT_LEAST_UNI_SEMANTICS)
9409 /* override incorrect value set in reganode MJD */
9410 Set_Node_Offset(ret, parse_start+1);
9411 Set_Node_Cur_Length(ret); /* MJD */
9412 nextchar(pRExC_state);
9418 case '1': case '2': case '3': case '4':
9419 case '5': case '6': case '7': case '8': case '9':
9422 bool isg = *RExC_parse == 'g';
9427 if (*RExC_parse == '{') {
9431 if (*RExC_parse == '-') {
9435 if (hasbrace && !isDIGIT(*RExC_parse)) {
9436 if (isrel) RExC_parse--;
9438 goto parse_named_seq;
9440 num = atoi(RExC_parse);
9441 if (isg && num == 0)
9442 vFAIL("Reference to invalid group 0");
9444 num = RExC_npar - num;
9446 vFAIL("Reference to nonexistent or unclosed group");
9448 if (!isg && num > 9 && num >= RExC_npar)
9451 char * const parse_start = RExC_parse - 1; /* MJD */
9452 while (isDIGIT(*RExC_parse))
9454 if (parse_start == RExC_parse - 1)
9455 vFAIL("Unterminated \\g... pattern");
9457 if (*RExC_parse != '}')
9458 vFAIL("Unterminated \\g{...} pattern");
9462 if (num > (I32)RExC_rx->nparens)
9463 vFAIL("Reference to nonexistent group");
9466 ret = reganode(pRExC_state,
9469 : (MORE_ASCII_RESTRICTED)
9471 : (AT_LEAST_UNI_SEMANTICS)
9479 /* override incorrect value set in reganode MJD */
9480 Set_Node_Offset(ret, parse_start+1);
9481 Set_Node_Cur_Length(ret); /* MJD */
9483 nextchar(pRExC_state);
9488 if (RExC_parse >= RExC_end)
9489 FAIL("Trailing \\");
9492 /* Do not generate "unrecognized" warnings here, we fall
9493 back into the quick-grab loop below */
9500 if (RExC_flags & RXf_PMf_EXTENDED) {
9501 if ( reg_skipcomment( pRExC_state ) )
9508 parse_start = RExC_parse - 1;
9513 register STRLEN len;
9518 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9521 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9522 * it is folded to 'ss' even if not utf8 */
9523 bool is_exactfu_sharp_s;
9526 node_type = ((! FOLD) ? EXACT
9529 : (MORE_ASCII_RESTRICTED)
9531 : (AT_LEAST_UNI_SEMANTICS)
9534 ret = reg_node(pRExC_state, node_type);
9537 /* XXX The node can hold up to 255 bytes, yet this only goes to
9538 * 127. I (khw) do not know why. Keeping it somewhat less than
9539 * 255 allows us to not have to worry about overflow due to
9540 * converting to utf8 and fold expansion, but that value is
9541 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9542 * split up by this limit into a single one using the real max of
9543 * 255. Even at 127, this breaks under rare circumstances. If
9544 * folding, we do not want to split a node at a character that is a
9545 * non-final in a multi-char fold, as an input string could just
9546 * happen to want to match across the node boundary. The join
9547 * would solve that problem if the join actually happens. But a
9548 * series of more than two nodes in a row each of 127 would cause
9549 * the first join to succeed to get to 254, but then there wouldn't
9550 * be room for the next one, which could at be one of those split
9551 * multi-char folds. I don't know of any fool-proof solution. One
9552 * could back off to end with only a code point that isn't such a
9553 * non-final, but it is possible for there not to be any in the
9555 for (len = 0, p = RExC_parse - 1;
9556 len < 127 && p < RExC_end;
9559 char * const oldp = p;
9561 if (RExC_flags & RXf_PMf_EXTENDED)
9562 p = regwhite( pRExC_state, p );
9573 /* Literal Escapes Switch
9575 This switch is meant to handle escape sequences that
9576 resolve to a literal character.
9578 Every escape sequence that represents something
9579 else, like an assertion or a char class, is handled
9580 in the switch marked 'Special Escapes' above in this
9581 routine, but also has an entry here as anything that
9582 isn't explicitly mentioned here will be treated as
9583 an unescaped equivalent literal.
9587 /* These are all the special escapes. */
9588 case 'A': /* Start assertion */
9589 case 'b': case 'B': /* Word-boundary assertion*/
9590 case 'C': /* Single char !DANGEROUS! */
9591 case 'd': case 'D': /* digit class */
9592 case 'g': case 'G': /* generic-backref, pos assertion */
9593 case 'h': case 'H': /* HORIZWS */
9594 case 'k': case 'K': /* named backref, keep marker */
9595 case 'N': /* named char sequence */
9596 case 'p': case 'P': /* Unicode property */
9597 case 'R': /* LNBREAK */
9598 case 's': case 'S': /* space class */
9599 case 'v': case 'V': /* VERTWS */
9600 case 'w': case 'W': /* word class */
9601 case 'X': /* eXtended Unicode "combining character sequence" */
9602 case 'z': case 'Z': /* End of line/string assertion */
9606 /* Anything after here is an escape that resolves to a
9607 literal. (Except digits, which may or may not)
9626 ender = ASCII_TO_NATIVE('\033');
9630 ender = ASCII_TO_NATIVE('\007');
9635 STRLEN brace_len = len;
9637 const char* error_msg;
9639 bool valid = grok_bslash_o(p,
9646 RExC_parse = p; /* going to die anyway; point
9647 to exact spot of failure */
9654 if (PL_encoding && ender < 0x100) {
9655 goto recode_encoding;
9664 char* const e = strchr(p, '}');
9668 vFAIL("Missing right brace on \\x{}");
9671 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9672 | PERL_SCAN_DISALLOW_PREFIX;
9673 STRLEN numlen = e - p - 1;
9674 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9681 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9683 ender = grok_hex(p, &numlen, &flags, NULL);
9686 if (PL_encoding && ender < 0x100)
9687 goto recode_encoding;
9691 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9693 case '0': case '1': case '2': case '3':case '4':
9694 case '5': case '6': case '7': case '8':case '9':
9696 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9698 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9700 ender = grok_oct(p, &numlen, &flags, NULL);
9710 if (PL_encoding && ender < 0x100)
9711 goto recode_encoding;
9714 if (! RExC_override_recoding) {
9715 SV* enc = PL_encoding;
9716 ender = reg_recode((const char)(U8)ender, &enc);
9717 if (!enc && SIZE_ONLY)
9718 ckWARNreg(p, "Invalid escape in the specified encoding");
9724 FAIL("Trailing \\");
9727 if (!SIZE_ONLY&& isALPHA(*p)) {
9728 /* Include any { following the alpha to emphasize
9729 * that it could be part of an escape at some point
9731 int len = (*(p + 1) == '{') ? 2 : 1;
9732 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9734 goto normal_default;
9739 if (UTF8_IS_START(*p) && UTF) {
9741 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9742 &numlen, UTF8_ALLOW_DEFAULT);
9748 } /* End of switch on the literal */
9750 is_exactfu_sharp_s = (node_type == EXACTFU
9751 && ender == LATIN_SMALL_LETTER_SHARP_S);
9752 if ( RExC_flags & RXf_PMf_EXTENDED)
9753 p = regwhite( pRExC_state, p );
9754 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9755 /* Prime the casefolded buffer. Locale rules, which apply
9756 * only to code points < 256, aren't known until execution,
9757 * so for them, just output the original character using
9758 * utf8. If we start to fold non-UTF patterns, be sure to
9759 * update join_exact() */
9760 if (LOC && ender < 256) {
9761 if (UNI_IS_INVARIANT(ender)) {
9762 *tmpbuf = (U8) ender;
9765 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9766 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9770 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9772 ender = toLOWER(ender);
9773 *tmpbuf = (U8) ender;
9776 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9778 /* Locale and /aa require more selectivity about the
9779 * fold, so are handled below. Otherwise, here, just
9781 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9784 /* Under locale rules or /aa we are not to mix,
9785 * respectively, ords < 256 or ASCII with non-. So
9786 * reject folds that mix them, using only the
9787 * non-folded code point. So do the fold to a
9788 * temporary, and inspect each character in it. */
9789 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9791 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9792 U8* e = s + foldlen;
9793 bool fold_ok = TRUE;
9797 || (LOC && (UTF8_IS_INVARIANT(*s)
9798 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9806 Copy(trialbuf, tmpbuf, foldlen, U8);
9810 uvuni_to_utf8(tmpbuf, ender);
9811 foldlen = UNISKIP(ender);
9815 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9818 else if (UTF || is_exactfu_sharp_s) {
9820 /* Emit all the Unicode characters. */
9822 for (foldbuf = tmpbuf;
9824 foldlen -= numlen) {
9825 ender = utf8_to_uvchr(foldbuf, &numlen);
9827 const STRLEN unilen = reguni(pRExC_state, ender, s);
9830 /* In EBCDIC the numlen
9831 * and unilen can differ. */
9833 if (numlen >= foldlen)
9837 break; /* "Can't happen." */
9841 const STRLEN unilen = reguni(pRExC_state, ender, s);
9850 REGC((char)ender, s++);
9854 if (UTF || is_exactfu_sharp_s) {
9856 /* Emit all the Unicode characters. */
9858 for (foldbuf = tmpbuf;
9860 foldlen -= numlen) {
9861 ender = utf8_to_uvchr(foldbuf, &numlen);
9863 const STRLEN unilen = reguni(pRExC_state, ender, s);
9866 /* In EBCDIC the numlen
9867 * and unilen can differ. */
9869 if (numlen >= foldlen)
9877 const STRLEN unilen = reguni(pRExC_state, ender, s);
9886 REGC((char)ender, s++);
9889 loopdone: /* Jumped to when encounters something that shouldn't be in
9892 Set_Node_Cur_Length(ret); /* MJD */
9893 nextchar(pRExC_state);
9895 /* len is STRLEN which is unsigned, need to copy to signed */
9898 vFAIL("Internal disaster");
9902 if (len == 1 && UNI_IS_INVARIANT(ender))
9906 RExC_size += STR_SZ(len);
9909 RExC_emit += STR_SZ(len);
9917 /* Jumped to when an unrecognized character set is encountered */
9919 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9924 S_regwhite( RExC_state_t *pRExC_state, char *p )
9926 const char *e = RExC_end;
9928 PERL_ARGS_ASSERT_REGWHITE;
9933 else if (*p == '#') {
9942 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9950 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9951 Character classes ([:foo:]) can also be negated ([:^foo:]).
9952 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9953 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9954 but trigger failures because they are currently unimplemented. */
9956 #define POSIXCC_DONE(c) ((c) == ':')
9957 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9958 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9961 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9964 I32 namedclass = OOB_NAMEDCLASS;
9966 PERL_ARGS_ASSERT_REGPPOSIXCC;
9968 if (value == '[' && RExC_parse + 1 < RExC_end &&
9969 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9970 POSIXCC(UCHARAT(RExC_parse))) {
9971 const char c = UCHARAT(RExC_parse);
9972 char* const s = RExC_parse++;
9974 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9976 if (RExC_parse == RExC_end)
9977 /* Grandfather lone [:, [=, [. */
9980 const char* const t = RExC_parse++; /* skip over the c */
9983 if (UCHARAT(RExC_parse) == ']') {
9984 const char *posixcc = s + 1;
9985 RExC_parse++; /* skip over the ending ] */
9988 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9989 const I32 skip = t - posixcc;
9991 /* Initially switch on the length of the name. */
9994 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9995 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9998 /* Names all of length 5. */
9999 /* alnum alpha ascii blank cntrl digit graph lower
10000 print punct space upper */
10001 /* Offset 4 gives the best switch position. */
10002 switch (posixcc[4]) {
10004 if (memEQ(posixcc, "alph", 4)) /* alpha */
10005 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10008 if (memEQ(posixcc, "spac", 4)) /* space */
10009 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10012 if (memEQ(posixcc, "grap", 4)) /* graph */
10013 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10016 if (memEQ(posixcc, "asci", 4)) /* ascii */
10017 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10020 if (memEQ(posixcc, "blan", 4)) /* blank */
10021 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10024 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10025 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10028 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10029 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10032 if (memEQ(posixcc, "lowe", 4)) /* lower */
10033 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10034 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10035 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10038 if (memEQ(posixcc, "digi", 4)) /* digit */
10039 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10040 else if (memEQ(posixcc, "prin", 4)) /* print */
10041 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10042 else if (memEQ(posixcc, "punc", 4)) /* punct */
10043 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10048 if (memEQ(posixcc, "xdigit", 6))
10049 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10053 if (namedclass == OOB_NAMEDCLASS)
10054 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10056 assert (posixcc[skip] == ':');
10057 assert (posixcc[skip+1] == ']');
10058 } else if (!SIZE_ONLY) {
10059 /* [[=foo=]] and [[.foo.]] are still future. */
10061 /* adjust RExC_parse so the warning shows after
10062 the class closes */
10063 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10065 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10068 /* Maternal grandfather:
10069 * "[:" ending in ":" but not in ":]" */
10079 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10083 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10085 if (POSIXCC(UCHARAT(RExC_parse))) {
10086 const char *s = RExC_parse;
10087 const char c = *s++;
10089 while (isALNUM(*s))
10091 if (*s && c == *s && s[1] == ']') {
10093 "POSIX syntax [%c %c] belongs inside character classes",
10096 /* [[=foo=]] and [[.foo.]] are still future. */
10097 if (POSIXCC_NOTYET(c)) {
10098 /* adjust RExC_parse so the error shows after
10099 the class closes */
10100 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10102 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10108 /* Generate the code to add a full posix character <class> to the bracketed
10109 * character class given by <node>. (<node> is needed only under locale rules)
10110 * destlist is the inversion list for non-locale rules that this class is
10112 * sourcelist is the ASCII-range inversion list to add under /a rules
10113 * Xsourcelist is the full Unicode range list to use otherwise. */
10114 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10116 SV* scratch_list = NULL; \
10118 /* Set this class in the node for runtime matching */ \
10119 ANYOF_CLASS_SET(node, class); \
10121 /* For above Latin1 code points, we use the full Unicode range */ \
10122 _invlist_intersection(PL_AboveLatin1, \
10125 /* And set the output to it, adding instead if there already is an \
10126 * output. Checking if <destlist> is NULL first saves an extra \
10127 * clone. Its reference count will be decremented at the next \
10128 * union, etc, or if this is the only instance, at the end of the \
10130 if (! destlist) { \
10131 destlist = scratch_list; \
10134 _invlist_union(destlist, scratch_list, &destlist); \
10135 SvREFCNT_dec(scratch_list); \
10139 /* For non-locale, just add it to any existing list */ \
10140 _invlist_union(destlist, \
10141 (AT_LEAST_ASCII_RESTRICTED) \
10147 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10149 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10151 SV* scratch_list = NULL; \
10152 ANYOF_CLASS_SET(node, class); \
10153 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10154 if (! destlist) { \
10155 destlist = scratch_list; \
10158 _invlist_union(destlist, scratch_list, &destlist); \
10159 SvREFCNT_dec(scratch_list); \
10163 _invlist_union_complement_2nd(destlist, \
10164 (AT_LEAST_ASCII_RESTRICTED) \
10168 /* Under /d, everything in the upper half of the Latin1 range \
10169 * matches this complement */ \
10170 if (DEPENDS_SEMANTICS) { \
10171 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10175 /* Generate the code to add a posix character <class> to the bracketed
10176 * character class given by <node>. (<node> is needed only under locale rules)
10177 * destlist is the inversion list for non-locale rules that this class is
10179 * sourcelist is the ASCII-range inversion list to add under /a rules
10180 * l1_sourcelist is the Latin1 range list to use otherwise.
10181 * Xpropertyname is the name to add to <run_time_list> of the property to
10182 * specify the code points above Latin1 that will have to be
10183 * determined at run-time
10184 * run_time_list is a SV* that contains text names of properties that are to
10185 * be computed at run time. This concatenates <Xpropertyname>
10186 * to it, apppropriately
10187 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10189 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10190 l1_sourcelist, Xpropertyname, run_time_list) \
10191 /* If not /a matching, there are going to be code points we will have \
10192 * to defer to runtime to look-up */ \
10193 if (! AT_LEAST_ASCII_RESTRICTED) { \
10194 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10197 ANYOF_CLASS_SET(node, class); \
10200 _invlist_union(destlist, \
10201 (AT_LEAST_ASCII_RESTRICTED) \
10207 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10208 * this and DO_N_POSIX */
10209 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10210 l1_sourcelist, Xpropertyname, run_time_list) \
10211 if (AT_LEAST_ASCII_RESTRICTED) { \
10212 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10215 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10217 ANYOF_CLASS_SET(node, namedclass); \
10220 SV* scratch_list = NULL; \
10221 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10222 if (! destlist) { \
10223 destlist = scratch_list; \
10226 _invlist_union(destlist, scratch_list, &destlist); \
10227 SvREFCNT_dec(scratch_list); \
10229 if (DEPENDS_SEMANTICS) { \
10230 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10236 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10239 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10240 * Locale folding is done at run-time, so this function should not be
10241 * called for nodes that are for locales.
10243 * This function sets the bit corresponding to the fold of the input
10244 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10247 * It also knows about the characters that are in the bitmap that have
10248 * folds that are matchable only outside it, and sets the appropriate lists
10251 * It returns the number of bits that actually changed from 0 to 1 */
10256 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10258 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10261 /* It assumes the bit for 'value' has already been set */
10262 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10263 ANYOF_BITMAP_SET(node, fold);
10266 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10267 /* Certain Latin1 characters have matches outside the bitmap. To get
10268 * here, 'value' is one of those characters. None of these matches is
10269 * valid for ASCII characters under /aa, which have been excluded by
10270 * the 'if' above. The matches fall into three categories:
10271 * 1) They are singly folded-to or -from an above 255 character, as
10272 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10274 * 2) They are part of a multi-char fold with another character in the
10275 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10276 * 3) They are part of a multi-char fold with a character not in the
10277 * bitmap, such as various ligatures.
10278 * We aren't dealing fully with multi-char folds, except we do deal
10279 * with the pattern containing a character that has a multi-char fold
10280 * (not so much the inverse).
10281 * For types 1) and 3), the matches only happen when the target string
10282 * is utf8; that's not true for 2), and we set a flag for it.
10284 * The code below adds to the passed in inversion list the single fold
10285 * closures for 'value'. The values are hard-coded here so that an
10286 * innocent-looking character class, like /[ks]/i won't have to go out
10287 * to disk to find the possible matches. XXX It would be better to
10288 * generate these via regen, in case a new version of the Unicode
10289 * standard adds new mappings, though that is not really likely. */
10294 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10298 /* LATIN SMALL LETTER LONG S */
10299 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10302 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10303 GREEK_SMALL_LETTER_MU);
10304 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10305 GREEK_CAPITAL_LETTER_MU);
10307 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10308 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10309 /* ANGSTROM SIGN */
10310 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10311 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10312 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10313 PL_fold_latin1[value]);
10316 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10317 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10318 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10320 case LATIN_SMALL_LETTER_SHARP_S:
10321 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10322 LATIN_CAPITAL_LETTER_SHARP_S);
10324 /* Under /a, /d, and /u, this can match the two chars "ss" */
10325 if (! MORE_ASCII_RESTRICTED) {
10326 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10328 /* And under /u or /a, it can match even if the target is
10330 if (AT_LEAST_UNI_SEMANTICS) {
10331 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10335 case 'F': case 'f':
10336 case 'I': case 'i':
10337 case 'L': case 'l':
10338 case 'T': case 't':
10339 case 'A': case 'a':
10340 case 'H': case 'h':
10341 case 'J': case 'j':
10342 case 'N': case 'n':
10343 case 'W': case 'w':
10344 case 'Y': case 'y':
10345 /* These all are targets of multi-character folds from code
10346 * points that require UTF8 to express, so they can't match
10347 * unless the target string is in UTF-8, so no action here is
10348 * necessary, as regexec.c properly handles the general case
10349 * for UTF-8 matching */
10352 /* Use deprecated warning to increase the chances of this
10354 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10358 else if (DEPENDS_SEMANTICS
10359 && ! isASCII(value)
10360 && PL_fold_latin1[value] != value)
10362 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10363 * folds only when the target string is in UTF-8. We add the fold
10364 * here to the list of things to match outside the bitmap, which
10365 * won't be looked at unless it is UTF8 (or else if something else
10366 * says to look even if not utf8, but those things better not happen
10367 * under DEPENDS semantics. */
10368 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10375 PERL_STATIC_INLINE U8
10376 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10378 /* This inline function sets a bit in the bitmap if not already set, and if
10379 * appropriate, its fold, returning the number of bits that actually
10380 * changed from 0 to 1 */
10384 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10386 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10390 ANYOF_BITMAP_SET(node, value);
10393 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10394 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10401 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10403 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10404 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10405 * the multi-character folds of characters in the node */
10408 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10410 if (! *alternate_ptr) {
10411 *alternate_ptr = newAV();
10413 sv = newSVpvn_utf8((char*)string, len, TRUE);
10414 av_push(*alternate_ptr, sv);
10419 parse a class specification and produce either an ANYOF node that
10420 matches the pattern or perhaps will be optimized into an EXACTish node
10421 instead. The node contains a bit map for the first 256 characters, with the
10422 corresponding bit set if that character is in the list. For characters
10423 above 255, a range list is used */
10426 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10429 register UV nextvalue;
10430 register IV prevvalue = OOB_UNICODE;
10431 register IV range = 0;
10432 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10433 register regnode *ret;
10436 char *rangebegin = NULL;
10437 bool need_class = 0;
10438 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10440 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10441 than just initialized. */
10442 SV* properties = NULL; /* Code points that match \p{} \P{} */
10443 UV element_count = 0; /* Number of distinct elements in the class.
10444 Optimizations may be possible if this is tiny */
10447 /* Unicode properties are stored in a swash; this holds the current one
10448 * being parsed. If this swash is the only above-latin1 component of the
10449 * character class, an optimization is to pass it directly on to the
10450 * execution engine. Otherwise, it is set to NULL to indicate that there
10451 * are other things in the class that have to be dealt with at execution
10453 SV* swash = NULL; /* Code points that match \p{} \P{} */
10455 /* Set if a component of this character class is user-defined; just passed
10456 * on to the engine */
10457 UV has_user_defined_property = 0;
10459 /* code points this node matches that can't be stored in the bitmap */
10460 SV* nonbitmap = NULL;
10462 /* The items that are to match that aren't stored in the bitmap, but are a
10463 * result of things that are stored there. This is the fold closure of
10464 * such a character, either because it has DEPENDS semantics and shouldn't
10465 * be matched unless the target string is utf8, or is a code point that is
10466 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10467 * above 255. This all is solely for performance reasons. By having this
10468 * code know the outside-the-bitmap folds that the bitmapped characters are
10469 * involved with, we don't have to go out to disk to find the list of
10470 * matches, unless the character class includes code points that aren't
10471 * storable in the bit map. That means that a character class with an 's'
10472 * in it, for example, doesn't need to go out to disk to find everything
10473 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10474 * empty unless there is something whose fold we don't know about, and will
10475 * have to go out to the disk to find. */
10476 SV* l1_fold_invlist = NULL;
10478 /* List of multi-character folds that are matched by this node */
10479 AV* unicode_alternate = NULL;
10481 UV literal_endpoint = 0;
10483 UV stored = 0; /* how many chars stored in the bitmap */
10485 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10486 case we need to change the emitted regop to an EXACT. */
10487 const char * orig_parse = RExC_parse;
10488 GET_RE_DEBUG_FLAGS_DECL;
10490 PERL_ARGS_ASSERT_REGCLASS;
10492 PERL_UNUSED_ARG(depth);
10495 DEBUG_PARSE("clas");
10497 /* Assume we are going to generate an ANYOF node. */
10498 ret = reganode(pRExC_state, ANYOF, 0);
10502 ANYOF_FLAGS(ret) = 0;
10505 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
10509 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10511 /* We have decided to not allow multi-char folds in inverted character
10512 * classes, due to the confusion that can happen, especially with
10513 * classes that are designed for a non-Unicode world: You have the
10514 * peculiar case that:
10515 "s s" =~ /^[^\xDF]+$/i => Y
10516 "ss" =~ /^[^\xDF]+$/i => N
10518 * See [perl #89750] */
10519 allow_full_fold = FALSE;
10523 RExC_size += ANYOF_SKIP;
10524 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10527 RExC_emit += ANYOF_SKIP;
10529 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10531 ANYOF_BITMAP_ZERO(ret);
10532 listsv = newSVpvs("# comment\n");
10533 initial_listsv_len = SvCUR(listsv);
10536 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10538 if (!SIZE_ONLY && POSIXCC(nextvalue))
10539 checkposixcc(pRExC_state);
10541 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10542 if (UCHARAT(RExC_parse) == ']')
10543 goto charclassloop;
10546 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10550 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10553 rangebegin = RExC_parse;
10557 value = utf8n_to_uvchr((U8*)RExC_parse,
10558 RExC_end - RExC_parse,
10559 &numlen, UTF8_ALLOW_DEFAULT);
10560 RExC_parse += numlen;
10563 value = UCHARAT(RExC_parse++);
10565 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10566 if (value == '[' && POSIXCC(nextvalue))
10567 namedclass = regpposixcc(pRExC_state, value);
10568 else if (value == '\\') {
10570 value = utf8n_to_uvchr((U8*)RExC_parse,
10571 RExC_end - RExC_parse,
10572 &numlen, UTF8_ALLOW_DEFAULT);
10573 RExC_parse += numlen;
10576 value = UCHARAT(RExC_parse++);
10577 /* Some compilers cannot handle switching on 64-bit integer
10578 * values, therefore value cannot be an UV. Yes, this will
10579 * be a problem later if we want switch on Unicode.
10580 * A similar issue a little bit later when switching on
10581 * namedclass. --jhi */
10582 switch ((I32)value) {
10583 case 'w': namedclass = ANYOF_ALNUM; break;
10584 case 'W': namedclass = ANYOF_NALNUM; break;
10585 case 's': namedclass = ANYOF_SPACE; break;
10586 case 'S': namedclass = ANYOF_NSPACE; break;
10587 case 'd': namedclass = ANYOF_DIGIT; break;
10588 case 'D': namedclass = ANYOF_NDIGIT; break;
10589 case 'v': namedclass = ANYOF_VERTWS; break;
10590 case 'V': namedclass = ANYOF_NVERTWS; break;
10591 case 'h': namedclass = ANYOF_HORIZWS; break;
10592 case 'H': namedclass = ANYOF_NHORIZWS; break;
10593 case 'N': /* Handle \N{NAME} in class */
10595 /* We only pay attention to the first char of
10596 multichar strings being returned. I kinda wonder
10597 if this makes sense as it does change the behaviour
10598 from earlier versions, OTOH that behaviour was broken
10600 UV v; /* value is register so we cant & it /grrr */
10601 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10611 if (RExC_parse >= RExC_end)
10612 vFAIL2("Empty \\%c{}", (U8)value);
10613 if (*RExC_parse == '{') {
10614 const U8 c = (U8)value;
10615 e = strchr(RExC_parse++, '}');
10617 vFAIL2("Missing right brace on \\%c{}", c);
10618 while (isSPACE(UCHARAT(RExC_parse)))
10620 if (e == RExC_parse)
10621 vFAIL2("Empty \\%c{}", c);
10622 n = e - RExC_parse;
10623 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10634 if (UCHARAT(RExC_parse) == '^') {
10637 value = value == 'p' ? 'P' : 'p'; /* toggle */
10638 while (isSPACE(UCHARAT(RExC_parse))) {
10643 /* Try to get the definition of the property into
10644 * <invlist>. If /i is in effect, the effective property
10645 * will have its name be <__NAME_i>. The design is
10646 * discussed in commit
10647 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10648 Newx(name, n + sizeof("_i__\n"), char);
10650 sprintf(name, "%s%.*s%s\n",
10651 (FOLD) ? "__" : "",
10657 /* Look up the property name, and get its swash and
10658 * inversion list, if the property is found */
10660 SvREFCNT_dec(swash);
10662 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10665 TRUE, /* this routine will handle
10666 undefined properties */
10667 NULL, FALSE /* No inversion list */
10671 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10673 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10675 || ! (invlist = *invlistsvp))
10678 SvREFCNT_dec(swash);
10682 /* Here didn't find it. It could be a user-defined
10683 * property that will be available at run-time. Add it
10684 * to the list to look up then */
10685 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10686 (value == 'p' ? '+' : '!'),
10688 has_user_defined_property = 1;
10690 /* We don't know yet, so have to assume that the
10691 * property could match something in the Latin1 range,
10692 * hence something that isn't utf8 */
10693 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10697 /* Here, did get the swash and its inversion list. If
10698 * the swash is from a user-defined property, then this
10699 * whole character class should be regarded as such */
10700 SV** user_defined_svp =
10701 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10702 "USER_DEFINED", FALSE);
10703 if (user_defined_svp) {
10704 has_user_defined_property
10705 |= SvUV(*user_defined_svp);
10708 /* Invert if asking for the complement */
10709 if (value == 'P') {
10710 _invlist_union_complement_2nd(properties, invlist, &properties);
10712 /* The swash can't be used as-is, because we've
10713 * inverted things; delay removing it to here after
10714 * have copied its invlist above */
10715 SvREFCNT_dec(swash);
10719 _invlist_union(properties, invlist, &properties);
10724 RExC_parse = e + 1;
10725 namedclass = ANYOF_MAX; /* no official name, but it's named */
10727 /* \p means they want Unicode semantics */
10728 RExC_uni_semantics = 1;
10731 case 'n': value = '\n'; break;
10732 case 'r': value = '\r'; break;
10733 case 't': value = '\t'; break;
10734 case 'f': value = '\f'; break;
10735 case 'b': value = '\b'; break;
10736 case 'e': value = ASCII_TO_NATIVE('\033');break;
10737 case 'a': value = ASCII_TO_NATIVE('\007');break;
10739 RExC_parse--; /* function expects to be pointed at the 'o' */
10741 const char* error_msg;
10742 bool valid = grok_bslash_o(RExC_parse,
10747 RExC_parse += numlen;
10752 if (PL_encoding && value < 0x100) {
10753 goto recode_encoding;
10757 if (*RExC_parse == '{') {
10758 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10759 | PERL_SCAN_DISALLOW_PREFIX;
10760 char * const e = strchr(RExC_parse++, '}');
10762 vFAIL("Missing right brace on \\x{}");
10764 numlen = e - RExC_parse;
10765 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10766 RExC_parse = e + 1;
10769 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10771 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10772 RExC_parse += numlen;
10774 if (PL_encoding && value < 0x100)
10775 goto recode_encoding;
10778 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10780 case '0': case '1': case '2': case '3': case '4':
10781 case '5': case '6': case '7':
10783 /* Take 1-3 octal digits */
10784 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10786 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10787 RExC_parse += numlen;
10788 if (PL_encoding && value < 0x100)
10789 goto recode_encoding;
10793 if (! RExC_override_recoding) {
10794 SV* enc = PL_encoding;
10795 value = reg_recode((const char)(U8)value, &enc);
10796 if (!enc && SIZE_ONLY)
10797 ckWARNreg(RExC_parse,
10798 "Invalid escape in the specified encoding");
10802 /* Allow \_ to not give an error */
10803 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10804 ckWARN2reg(RExC_parse,
10805 "Unrecognized escape \\%c in character class passed through",
10810 } /* end of \blah */
10813 literal_endpoint++;
10816 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10818 /* What matches in a locale is not known until runtime, so need to
10819 * (one time per class) allocate extra space to pass to regexec.
10820 * The space will contain a bit for each named class that is to be
10821 * matched against. This isn't needed for \p{} and pseudo-classes,
10822 * as they are not affected by locale, and hence are dealt with
10824 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10827 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10830 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10831 ANYOF_CLASS_ZERO(ret);
10833 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10836 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10837 * literal, as is the character that began the false range, i.e.
10838 * the 'a' in the examples */
10842 RExC_parse >= rangebegin ?
10843 RExC_parse - rangebegin : 0;
10844 ckWARN4reg(RExC_parse,
10845 "False [] range \"%*.*s\"",
10849 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10850 if (prevvalue < 256) {
10852 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10855 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10859 range = 0; /* this was not a true range */
10864 /* Possible truncation here but in some 64-bit environments
10865 * the compiler gets heartburn about switch on 64-bit values.
10866 * A similar issue a little earlier when switching on value.
10868 switch ((I32)namedclass) {
10870 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10871 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10872 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10874 case ANYOF_NALNUMC:
10875 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10876 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10879 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10880 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10883 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10884 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10888 ANYOF_CLASS_SET(ret, namedclass);
10891 _invlist_union(properties, PL_ASCII, &properties);
10896 ANYOF_CLASS_SET(ret, namedclass);
10899 _invlist_union_complement_2nd(properties,
10900 PL_ASCII, &properties);
10901 if (DEPENDS_SEMANTICS) {
10902 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10907 DO_POSIX(ret, namedclass, properties,
10908 PL_PosixBlank, PL_XPosixBlank);
10911 DO_N_POSIX(ret, namedclass, properties,
10912 PL_PosixBlank, PL_XPosixBlank);
10915 DO_POSIX(ret, namedclass, properties,
10916 PL_PosixCntrl, PL_XPosixCntrl);
10919 DO_N_POSIX(ret, namedclass, properties,
10920 PL_PosixCntrl, PL_XPosixCntrl);
10923 /* Ignore the compiler warning for this macro, planned to
10924 * be eliminated later */
10925 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10926 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10929 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10930 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10933 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10934 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10937 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10938 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10940 case ANYOF_HORIZWS:
10941 /* For these, we use the nonbitmap, as /d doesn't make a
10942 * difference in what these match. There would be problems
10943 * if these characters had folds other than themselves, as
10944 * nonbitmap is subject to folding. It turns out that \h
10945 * is just a synonym for XPosixBlank */
10946 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10948 case ANYOF_NHORIZWS:
10949 _invlist_union_complement_2nd(nonbitmap,
10950 PL_XPosixBlank, &nonbitmap);
10954 { /* These require special handling, as they differ under
10955 folding, matching Cased there (which in the ASCII range
10956 is the same as Alpha */
10962 if (FOLD && ! LOC) {
10963 ascii_source = PL_PosixAlpha;
10964 l1_source = PL_L1Cased;
10968 ascii_source = PL_PosixLower;
10969 l1_source = PL_L1PosixLower;
10970 Xname = "XPosixLower";
10972 if (namedclass == ANYOF_LOWER) {
10973 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10974 ascii_source, l1_source, Xname, listsv);
10977 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10978 properties, ascii_source, l1_source, Xname, listsv);
10983 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10984 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10987 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10988 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10991 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10992 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10995 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10996 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10999 DO_POSIX(ret, namedclass, properties,
11000 PL_PosixSpace, PL_XPosixSpace);
11002 case ANYOF_NPSXSPC:
11003 DO_N_POSIX(ret, namedclass, properties,
11004 PL_PosixSpace, PL_XPosixSpace);
11007 DO_POSIX(ret, namedclass, properties,
11008 PL_PerlSpace, PL_XPerlSpace);
11011 DO_N_POSIX(ret, namedclass, properties,
11012 PL_PerlSpace, PL_XPerlSpace);
11014 case ANYOF_UPPER: /* Same as LOWER, above */
11021 if (FOLD && ! LOC) {
11022 ascii_source = PL_PosixAlpha;
11023 l1_source = PL_L1Cased;
11027 ascii_source = PL_PosixUpper;
11028 l1_source = PL_L1PosixUpper;
11029 Xname = "XPosixUpper";
11031 if (namedclass == ANYOF_UPPER) {
11032 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11033 ascii_source, l1_source, Xname, listsv);
11036 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11037 properties, ascii_source, l1_source, Xname, listsv);
11041 case ANYOF_ALNUM: /* Really is 'Word' */
11042 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11043 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11046 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11047 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11050 /* For these, we use the nonbitmap, as /d doesn't make a
11051 * difference in what these match. There would be problems
11052 * if these characters had folds other than themselves, as
11053 * nonbitmap is subject to folding */
11054 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11056 case ANYOF_NVERTWS:
11057 _invlist_union_complement_2nd(nonbitmap,
11058 PL_VertSpace, &nonbitmap);
11061 DO_POSIX(ret, namedclass, properties,
11062 PL_PosixXDigit, PL_XPosixXDigit);
11064 case ANYOF_NXDIGIT:
11065 DO_N_POSIX(ret, namedclass, properties,
11066 PL_PosixXDigit, PL_XPosixXDigit);
11069 /* this is to handle \p and \P */
11072 vFAIL("Invalid [::] class");
11078 } /* end of namedclass \blah */
11081 if (prevvalue > (IV)value) /* b-a */ {
11082 const int w = RExC_parse - rangebegin;
11083 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11084 range = 0; /* not a valid range */
11088 prevvalue = value; /* save the beginning of the range */
11089 if (RExC_parse+1 < RExC_end
11090 && *RExC_parse == '-'
11091 && RExC_parse[1] != ']')
11095 /* a bad range like \w-, [:word:]- ? */
11096 if (namedclass > OOB_NAMEDCLASS) {
11097 if (ckWARN(WARN_REGEXP)) {
11099 RExC_parse >= rangebegin ?
11100 RExC_parse - rangebegin : 0;
11102 "False [] range \"%*.*s\"",
11107 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11109 range = 1; /* yeah, it's a range! */
11110 continue; /* but do it the next time */
11114 /* non-Latin1 code point implies unicode semantics. Must be set in
11115 * pass1 so is there for the whole of pass 2 */
11117 RExC_uni_semantics = 1;
11120 /* now is the next time */
11122 if (prevvalue < 256) {
11123 const IV ceilvalue = value < 256 ? value : 255;
11126 /* In EBCDIC [\x89-\x91] should include
11127 * the \x8e but [i-j] should not. */
11128 if (literal_endpoint == 2 &&
11129 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11130 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11132 if (isLOWER(prevvalue)) {
11133 for (i = prevvalue; i <= ceilvalue; i++)
11134 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11136 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11139 for (i = prevvalue; i <= ceilvalue; i++)
11140 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11142 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11148 for (i = prevvalue; i <= ceilvalue; i++) {
11149 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11153 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11154 const UV natvalue = NATIVE_TO_UNI(value);
11155 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11158 literal_endpoint = 0;
11162 range = 0; /* this range (if it was one) is done now */
11169 /****** !SIZE_ONLY AFTER HERE *********/
11171 /* If folding and there are code points above 255, we calculate all
11172 * characters that could fold to or from the ones already on the list */
11173 if (FOLD && nonbitmap) {
11174 UV start, end; /* End points of code point ranges */
11176 SV* fold_intersection = NULL;
11178 /* This is a list of all the characters that participate in folds
11179 * (except marks, etc in multi-char folds */
11180 if (! PL_utf8_foldable) {
11181 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11182 PL_utf8_foldable = _swash_to_invlist(swash);
11183 SvREFCNT_dec(swash);
11186 /* This is a hash that for a particular fold gives all characters
11187 * that are involved in it */
11188 if (! PL_utf8_foldclosures) {
11190 /* If we were unable to find any folds, then we likely won't be
11191 * able to find the closures. So just create an empty list.
11192 * Folding will effectively be restricted to the non-Unicode rules
11193 * hard-coded into Perl. (This case happens legitimately during
11194 * compilation of Perl itself before the Unicode tables are
11196 if (invlist_len(PL_utf8_foldable) == 0) {
11197 PL_utf8_foldclosures = newHV();
11199 /* If the folds haven't been read in, call a fold function
11201 if (! PL_utf8_tofold) {
11202 U8 dummy[UTF8_MAXBYTES+1];
11205 /* This particular string is above \xff in both UTF-8 and
11207 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11208 assert(PL_utf8_tofold); /* Verify that worked */
11210 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11214 /* Only the characters in this class that participate in folds need be
11215 * checked. Get the intersection of this class and all the possible
11216 * characters that are foldable. This can quickly narrow down a large
11218 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11220 /* Now look at the foldable characters in this class individually */
11221 invlist_iterinit(fold_intersection);
11222 while (invlist_iternext(fold_intersection, &start, &end)) {
11225 /* Look at every character in the range */
11226 for (j = start; j <= end; j++) {
11229 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11232 _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11234 if (foldlen > (STRLEN)UNISKIP(f)) {
11236 /* Any multicharacter foldings (disallowed in lookbehind
11237 * patterns) require the following transform: [ABCDEF] ->
11238 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11239 * folds into "rst", all other characters fold to single
11240 * characters. We save away these multicharacter foldings,
11241 * to be later saved as part of the additional "s" data. */
11242 if (! RExC_in_lookbehind) {
11244 U8* e = foldbuf + foldlen;
11246 /* If any of the folded characters of this are in the
11247 * Latin1 range, tell the regex engine that this can
11248 * match a non-utf8 target string. The only multi-byte
11249 * fold whose source is in the Latin1 range (U+00DF)
11250 * applies only when the target string is utf8, or
11251 * under unicode rules */
11252 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11255 /* Can't mix ascii with non- under /aa */
11256 if (MORE_ASCII_RESTRICTED
11257 && (isASCII(*loc) != isASCII(j)))
11259 goto end_multi_fold;
11261 if (UTF8_IS_INVARIANT(*loc)
11262 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11264 /* Can't mix above and below 256 under LOC
11267 goto end_multi_fold;
11270 |= ANYOF_NONBITMAP_NON_UTF8;
11273 loc += UTF8SKIP(loc);
11277 add_alternate(&unicode_alternate, foldbuf, foldlen);
11281 /* This is special-cased, as it is the only letter which
11282 * has both a multi-fold and single-fold in Latin1. All
11283 * the other chars that have single and multi-folds are
11284 * always in utf8, and the utf8 folding algorithm catches
11286 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11287 stored += set_regclass_bit(pRExC_state,
11289 LATIN_SMALL_LETTER_SHARP_S,
11290 &l1_fold_invlist, &unicode_alternate);
11294 /* Single character fold. Add everything in its fold
11295 * closure to the list that this node should match */
11298 /* The fold closures data structure is a hash with the keys
11299 * being every character that is folded to, like 'k', and
11300 * the values each an array of everything that folds to its
11301 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11302 if ((listp = hv_fetch(PL_utf8_foldclosures,
11303 (char *) foldbuf, foldlen, FALSE)))
11305 AV* list = (AV*) *listp;
11307 for (k = 0; k <= av_len(list); k++) {
11308 SV** c_p = av_fetch(list, k, FALSE);
11311 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11315 /* /aa doesn't allow folds between ASCII and non-;
11316 * /l doesn't allow them between above and below
11318 if ((MORE_ASCII_RESTRICTED
11319 && (isASCII(c) != isASCII(j)))
11320 || (LOC && ((c < 256) != (j < 256))))
11325 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11326 stored += set_regclass_bit(pRExC_state,
11329 &l1_fold_invlist, &unicode_alternate);
11331 /* It may be that the code point is already in
11332 * this range or already in the bitmap, in
11333 * which case we need do nothing */
11334 else if ((c < start || c > end)
11336 || ! ANYOF_BITMAP_TEST(ret, c)))
11338 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11345 SvREFCNT_dec(fold_intersection);
11348 /* Combine the two lists into one. */
11349 if (l1_fold_invlist) {
11351 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11352 SvREFCNT_dec(l1_fold_invlist);
11355 nonbitmap = l1_fold_invlist;
11359 /* And combine the result (if any) with any inversion list from properties.
11360 * The lists are kept separate up to now because we don't want to fold the
11364 _invlist_union(nonbitmap, properties, &nonbitmap);
11365 SvREFCNT_dec(properties);
11368 nonbitmap = properties;
11372 /* Here, <nonbitmap> contains all the code points we can determine at
11373 * compile time that we haven't put into the bitmap. Go through it, and
11374 * for things that belong in the bitmap, put them there, and delete from
11378 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11379 * possibly only should match when the target string is UTF-8 */
11380 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11382 /* This gets set if we actually need to modify things */
11383 bool change_invlist = FALSE;
11387 /* Start looking through <nonbitmap> */
11388 invlist_iterinit(nonbitmap);
11389 while (invlist_iternext(nonbitmap, &start, &end)) {
11393 /* Quit if are above what we should change */
11394 if (start > max_cp_to_set) {
11398 change_invlist = TRUE;
11400 /* Set all the bits in the range, up to the max that we are doing */
11401 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11402 for (i = start; i <= (int) high; i++) {
11403 if (! ANYOF_BITMAP_TEST(ret, i)) {
11404 ANYOF_BITMAP_SET(ret, i);
11412 /* Done with loop; remove any code points that are in the bitmap from
11414 if (change_invlist) {
11415 _invlist_subtract(nonbitmap,
11416 (DEPENDS_SEMANTICS)
11422 /* If have completely emptied it, remove it completely */
11423 if (invlist_len(nonbitmap) == 0) {
11424 SvREFCNT_dec(nonbitmap);
11429 /* Here, we have calculated what code points should be in the character
11430 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11431 * case of DEPENDS rules.
11433 * Now we can see about various optimizations. Fold calculation (which we
11434 * did above) needs to take place before inversion. Otherwise /[^k]/i
11435 * would invert to include K, which under /i would match k, which it
11438 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11439 * set the FOLD flag yet, so this does optimize those. It doesn't
11440 * optimize locale. Doing so perhaps could be done as long as there is
11441 * nothing like \w in it; some thought also would have to be given to the
11442 * interaction with above 0x100 chars */
11443 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11445 && ! unicode_alternate
11446 /* In case of /d, there are some things that should match only when in
11447 * not in the bitmap, i.e., they require UTF8 to match. These are
11448 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11449 * case, they don't require UTF8, so can invert here */
11451 || ! DEPENDS_SEMANTICS
11452 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11453 && SvCUR(listsv) == initial_listsv_len)
11457 for (i = 0; i < 256; ++i) {
11458 if (ANYOF_BITMAP_TEST(ret, i)) {
11459 ANYOF_BITMAP_CLEAR(ret, i);
11462 ANYOF_BITMAP_SET(ret, i);
11467 /* The inversion means that everything above 255 is matched */
11468 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11471 /* Here, also has things outside the bitmap that may overlap with
11472 * the bitmap. We have to sync them up, so that they get inverted
11473 * in both places. Earlier, we removed all overlaps except in the
11474 * case of /d rules, so no syncing is needed except for this case
11476 SV *remove_list = NULL;
11478 if (DEPENDS_SEMANTICS) {
11481 /* Set the bits that correspond to the ones that aren't in the
11482 * bitmap. Otherwise, when we invert, we'll miss these.
11483 * Earlier, we removed from the nonbitmap all code points
11484 * < 128, so there is no extra work here */
11485 invlist_iterinit(nonbitmap);
11486 while (invlist_iternext(nonbitmap, &start, &end)) {
11487 if (start > 255) { /* The bit map goes to 255 */
11493 for (i = start; i <= (int) end; ++i) {
11494 ANYOF_BITMAP_SET(ret, i);
11501 /* Now invert both the bitmap and the nonbitmap. Anything in the
11502 * bitmap has to also be removed from the non-bitmap, but again,
11503 * there should not be overlap unless is /d rules. */
11504 _invlist_invert(nonbitmap);
11506 /* Any swash can't be used as-is, because we've inverted things */
11508 SvREFCNT_dec(swash);
11512 for (i = 0; i < 256; ++i) {
11513 if (ANYOF_BITMAP_TEST(ret, i)) {
11514 ANYOF_BITMAP_CLEAR(ret, i);
11515 if (DEPENDS_SEMANTICS) {
11516 if (! remove_list) {
11517 remove_list = _new_invlist(2);
11519 remove_list = add_cp_to_invlist(remove_list, i);
11523 ANYOF_BITMAP_SET(ret, i);
11529 /* And do the removal */
11530 if (DEPENDS_SEMANTICS) {
11532 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11533 SvREFCNT_dec(remove_list);
11537 /* There is no overlap for non-/d, so just delete anything
11539 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11543 stored = 256 - stored;
11545 /* Clear the invert flag since have just done it here */
11546 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11549 /* Folding in the bitmap is taken care of above, but not for locale (for
11550 * which we have to wait to see what folding is in effect at runtime), and
11551 * for some things not in the bitmap (only the upper latin folds in this
11552 * case, as all other single-char folding has been set above). Set
11553 * run-time fold flag for these */
11555 || (DEPENDS_SEMANTICS
11557 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11558 || unicode_alternate))
11560 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11563 /* A single character class can be "optimized" into an EXACTish node.
11564 * Note that since we don't currently count how many characters there are
11565 * outside the bitmap, we are XXX missing optimization possibilities for
11566 * them. This optimization can't happen unless this is a truly single
11567 * character class, which means that it can't be an inversion into a
11568 * many-character class, and there must be no possibility of there being
11569 * things outside the bitmap. 'stored' (only) for locales doesn't include
11570 * \w, etc, so have to make a special test that they aren't present
11572 * Similarly A 2-character class of the very special form like [bB] can be
11573 * optimized into an EXACTFish node, but only for non-locales, and for
11574 * characters which only have the two folds; so things like 'fF' and 'Ii'
11575 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11578 && ! unicode_alternate
11579 && SvCUR(listsv) == initial_listsv_len
11580 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11581 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11582 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11583 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11584 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11585 /* If the latest code point has a fold whose
11586 * bit is set, it must be the only other one */
11587 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11588 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11590 /* Note that the information needed to decide to do this optimization
11591 * is not currently available until the 2nd pass, and that the actually
11592 * used EXACTish node takes less space than the calculated ANYOF node,
11593 * and hence the amount of space calculated in the first pass is larger
11594 * than actually used, so this optimization doesn't gain us any space.
11595 * But an EXACT node is faster than an ANYOF node, and can be combined
11596 * with any adjacent EXACT nodes later by the optimizer for further
11597 * gains. The speed of executing an EXACTF is similar to an ANYOF
11598 * node, so the optimization advantage comes from the ability to join
11599 * it to adjacent EXACT nodes */
11601 const char * cur_parse= RExC_parse;
11603 RExC_emit = (regnode *)orig_emit;
11604 RExC_parse = (char *)orig_parse;
11608 /* A locale node with one point can be folded; all the other cases
11609 * with folding will have two points, since we calculate them above
11611 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11618 else { /* else 2 chars in the bit map: the folds of each other */
11620 /* Use the folded value, which for the cases where we get here,
11621 * is just the lower case of the current one (which may resolve to
11622 * itself, or to the other one */
11623 value = toLOWER_LATIN1(value);
11625 /* To join adjacent nodes, they must be the exact EXACTish type.
11626 * Try to use the most likely type, by using EXACTFA if possible,
11627 * then EXACTFU if the regex calls for it, or is required because
11628 * the character is non-ASCII. (If <value> is ASCII, its fold is
11629 * also ASCII for the cases where we get here.) */
11630 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11633 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11636 else { /* Otherwise, more likely to be EXACTF type */
11641 ret = reg_node(pRExC_state, op);
11642 RExC_parse = (char *)cur_parse;
11643 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11644 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11645 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11647 RExC_emit += STR_SZ(2);
11650 *STRING(ret)= (char)value;
11652 RExC_emit += STR_SZ(1);
11654 SvREFCNT_dec(listsv);
11658 /* If there is a swash and more than one element, we can't use the swash in
11659 * the optimization below. */
11660 if (swash && element_count > 1) {
11661 SvREFCNT_dec(swash);
11665 && SvCUR(listsv) == initial_listsv_len
11666 && ! unicode_alternate)
11668 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11669 SvREFCNT_dec(listsv);
11670 SvREFCNT_dec(unicode_alternate);
11673 /* av[0] stores the character class description in its textual form:
11674 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11675 * appropriate swash, and is also useful for dumping the regnode.
11676 * av[1] if NULL, is a placeholder to later contain the swash computed
11677 * from av[0]. But if no further computation need be done, the
11678 * swash is stored there now.
11679 * av[2] stores the multicharacter foldings, used later in
11680 * regexec.c:S_reginclass().
11681 * av[3] stores the nonbitmap inversion list for use in addition or
11682 * instead of av[0]; not used if av[1] isn't NULL
11683 * av[4] is set if any component of the class is from a user-defined
11684 * property; not used if av[1] isn't NULL */
11685 AV * const av = newAV();
11688 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11692 av_store(av, 1, swash);
11693 SvREFCNT_dec(nonbitmap);
11696 av_store(av, 1, NULL);
11698 av_store(av, 3, nonbitmap);
11699 av_store(av, 4, newSVuv(has_user_defined_property));
11703 /* Store any computed multi-char folds only if we are allowing
11705 if (allow_full_fold) {
11706 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11707 if (unicode_alternate) { /* This node is variable length */
11712 av_store(av, 2, NULL);
11714 rv = newRV_noinc(MUTABLE_SV(av));
11715 n = add_data(pRExC_state, 1, "s");
11716 RExC_rxi->data->data[n] = (void*)rv;
11723 /* reg_skipcomment()
11725 Absorbs an /x style # comments from the input stream.
11726 Returns true if there is more text remaining in the stream.
11727 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11728 terminates the pattern without including a newline.
11730 Note its the callers responsibility to ensure that we are
11731 actually in /x mode
11736 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11740 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11742 while (RExC_parse < RExC_end)
11743 if (*RExC_parse++ == '\n') {
11748 /* we ran off the end of the pattern without ending
11749 the comment, so we have to add an \n when wrapping */
11750 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11758 Advances the parse position, and optionally absorbs
11759 "whitespace" from the inputstream.
11761 Without /x "whitespace" means (?#...) style comments only,
11762 with /x this means (?#...) and # comments and whitespace proper.
11764 Returns the RExC_parse point from BEFORE the scan occurs.
11766 This is the /x friendly way of saying RExC_parse++.
11770 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11772 char* const retval = RExC_parse++;
11774 PERL_ARGS_ASSERT_NEXTCHAR;
11777 if (RExC_end - RExC_parse >= 3
11778 && *RExC_parse == '('
11779 && RExC_parse[1] == '?'
11780 && RExC_parse[2] == '#')
11782 while (*RExC_parse != ')') {
11783 if (RExC_parse == RExC_end)
11784 FAIL("Sequence (?#... not terminated");
11790 if (RExC_flags & RXf_PMf_EXTENDED) {
11791 if (isSPACE(*RExC_parse)) {
11795 else if (*RExC_parse == '#') {
11796 if ( reg_skipcomment( pRExC_state ) )
11805 - reg_node - emit a node
11807 STATIC regnode * /* Location. */
11808 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11811 register regnode *ptr;
11812 regnode * const ret = RExC_emit;
11813 GET_RE_DEBUG_FLAGS_DECL;
11815 PERL_ARGS_ASSERT_REG_NODE;
11818 SIZE_ALIGN(RExC_size);
11822 if (RExC_emit >= RExC_emit_bound)
11823 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11824 op, RExC_emit, RExC_emit_bound);
11826 NODE_ALIGN_FILL(ret);
11828 FILL_ADVANCE_NODE(ptr, op);
11829 #ifdef RE_TRACK_PATTERN_OFFSETS
11830 if (RExC_offsets) { /* MJD */
11831 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11832 "reg_node", __LINE__,
11834 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11835 ? "Overwriting end of array!\n" : "OK",
11836 (UV)(RExC_emit - RExC_emit_start),
11837 (UV)(RExC_parse - RExC_start),
11838 (UV)RExC_offsets[0]));
11839 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11847 - reganode - emit a node with an argument
11849 STATIC regnode * /* Location. */
11850 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11853 register regnode *ptr;
11854 regnode * const ret = RExC_emit;
11855 GET_RE_DEBUG_FLAGS_DECL;
11857 PERL_ARGS_ASSERT_REGANODE;
11860 SIZE_ALIGN(RExC_size);
11865 assert(2==regarglen[op]+1);
11867 Anything larger than this has to allocate the extra amount.
11868 If we changed this to be:
11870 RExC_size += (1 + regarglen[op]);
11872 then it wouldn't matter. Its not clear what side effect
11873 might come from that so its not done so far.
11878 if (RExC_emit >= RExC_emit_bound)
11879 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11880 op, RExC_emit, RExC_emit_bound);
11882 NODE_ALIGN_FILL(ret);
11884 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11885 #ifdef RE_TRACK_PATTERN_OFFSETS
11886 if (RExC_offsets) { /* MJD */
11887 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11891 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11892 "Overwriting end of array!\n" : "OK",
11893 (UV)(RExC_emit - RExC_emit_start),
11894 (UV)(RExC_parse - RExC_start),
11895 (UV)RExC_offsets[0]));
11896 Set_Cur_Node_Offset;
11904 - reguni - emit (if appropriate) a Unicode character
11907 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11911 PERL_ARGS_ASSERT_REGUNI;
11913 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11917 - reginsert - insert an operator in front of already-emitted operand
11919 * Means relocating the operand.
11922 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11925 register regnode *src;
11926 register regnode *dst;
11927 register regnode *place;
11928 const int offset = regarglen[(U8)op];
11929 const int size = NODE_STEP_REGNODE + offset;
11930 GET_RE_DEBUG_FLAGS_DECL;
11932 PERL_ARGS_ASSERT_REGINSERT;
11933 PERL_UNUSED_ARG(depth);
11934 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11935 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11944 if (RExC_open_parens) {
11946 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11947 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11948 if ( RExC_open_parens[paren] >= opnd ) {
11949 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11950 RExC_open_parens[paren] += size;
11952 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11954 if ( RExC_close_parens[paren] >= opnd ) {
11955 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11956 RExC_close_parens[paren] += size;
11958 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11963 while (src > opnd) {
11964 StructCopy(--src, --dst, regnode);
11965 #ifdef RE_TRACK_PATTERN_OFFSETS
11966 if (RExC_offsets) { /* MJD 20010112 */
11967 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11971 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11972 ? "Overwriting end of array!\n" : "OK",
11973 (UV)(src - RExC_emit_start),
11974 (UV)(dst - RExC_emit_start),
11975 (UV)RExC_offsets[0]));
11976 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11977 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11983 place = opnd; /* Op node, where operand used to be. */
11984 #ifdef RE_TRACK_PATTERN_OFFSETS
11985 if (RExC_offsets) { /* MJD */
11986 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11990 (UV)(place - RExC_emit_start) > RExC_offsets[0]
11991 ? "Overwriting end of array!\n" : "OK",
11992 (UV)(place - RExC_emit_start),
11993 (UV)(RExC_parse - RExC_start),
11994 (UV)RExC_offsets[0]));
11995 Set_Node_Offset(place, RExC_parse);
11996 Set_Node_Length(place, 1);
11999 src = NEXTOPER(place);
12000 FILL_ADVANCE_NODE(place, op);
12001 Zero(src, offset, regnode);
12005 - regtail - set the next-pointer at the end of a node chain of p to val.
12006 - SEE ALSO: regtail_study
12008 /* TODO: All three parms should be const */
12010 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12013 register regnode *scan;
12014 GET_RE_DEBUG_FLAGS_DECL;
12016 PERL_ARGS_ASSERT_REGTAIL;
12018 PERL_UNUSED_ARG(depth);
12024 /* Find last node. */
12027 regnode * const temp = regnext(scan);
12029 SV * const mysv=sv_newmortal();
12030 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12031 regprop(RExC_rx, mysv, scan);
12032 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12033 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12034 (temp == NULL ? "->" : ""),
12035 (temp == NULL ? PL_reg_name[OP(val)] : "")
12043 if (reg_off_by_arg[OP(scan)]) {
12044 ARG_SET(scan, val - scan);
12047 NEXT_OFF(scan) = val - scan;
12053 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12054 - Look for optimizable sequences at the same time.
12055 - currently only looks for EXACT chains.
12057 This is experimental code. The idea is to use this routine to perform
12058 in place optimizations on branches and groups as they are constructed,
12059 with the long term intention of removing optimization from study_chunk so
12060 that it is purely analytical.
12062 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12063 to control which is which.
12066 /* TODO: All four parms should be const */
12069 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12072 register regnode *scan;
12074 #ifdef EXPERIMENTAL_INPLACESCAN
12077 GET_RE_DEBUG_FLAGS_DECL;
12079 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12085 /* Find last node. */
12089 regnode * const temp = regnext(scan);
12090 #ifdef EXPERIMENTAL_INPLACESCAN
12091 if (PL_regkind[OP(scan)] == EXACT) {
12092 bool has_exactf_sharp_s; /* Unexamined in this routine */
12093 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12098 switch (OP(scan)) {
12104 case EXACTFU_TRICKYFOLD:
12106 if( exact == PSEUDO )
12108 else if ( exact != OP(scan) )
12117 SV * const mysv=sv_newmortal();
12118 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12119 regprop(RExC_rx, mysv, scan);
12120 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12121 SvPV_nolen_const(mysv),
12122 REG_NODE_NUM(scan),
12123 PL_reg_name[exact]);
12130 SV * const mysv_val=sv_newmortal();
12131 DEBUG_PARSE_MSG("");
12132 regprop(RExC_rx, mysv_val, val);
12133 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12134 SvPV_nolen_const(mysv_val),
12135 (IV)REG_NODE_NUM(val),
12139 if (reg_off_by_arg[OP(scan)]) {
12140 ARG_SET(scan, val - scan);
12143 NEXT_OFF(scan) = val - scan;
12151 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12155 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12161 for (bit=0; bit<32; bit++) {
12162 if (flags & (1<<bit)) {
12163 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12166 if (!set++ && lead)
12167 PerlIO_printf(Perl_debug_log, "%s",lead);
12168 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12171 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12172 if (!set++ && lead) {
12173 PerlIO_printf(Perl_debug_log, "%s",lead);
12176 case REGEX_UNICODE_CHARSET:
12177 PerlIO_printf(Perl_debug_log, "UNICODE");
12179 case REGEX_LOCALE_CHARSET:
12180 PerlIO_printf(Perl_debug_log, "LOCALE");
12182 case REGEX_ASCII_RESTRICTED_CHARSET:
12183 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12185 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12186 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12189 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12195 PerlIO_printf(Perl_debug_log, "\n");
12197 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12203 Perl_regdump(pTHX_ const regexp *r)
12207 SV * const sv = sv_newmortal();
12208 SV *dsv= sv_newmortal();
12209 RXi_GET_DECL(r,ri);
12210 GET_RE_DEBUG_FLAGS_DECL;
12212 PERL_ARGS_ASSERT_REGDUMP;
12214 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12216 /* Header fields of interest. */
12217 if (r->anchored_substr) {
12218 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12219 RE_SV_DUMPLEN(r->anchored_substr), 30);
12220 PerlIO_printf(Perl_debug_log,
12221 "anchored %s%s at %"IVdf" ",
12222 s, RE_SV_TAIL(r->anchored_substr),
12223 (IV)r->anchored_offset);
12224 } else if (r->anchored_utf8) {
12225 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12226 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12227 PerlIO_printf(Perl_debug_log,
12228 "anchored utf8 %s%s at %"IVdf" ",
12229 s, RE_SV_TAIL(r->anchored_utf8),
12230 (IV)r->anchored_offset);
12232 if (r->float_substr) {
12233 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12234 RE_SV_DUMPLEN(r->float_substr), 30);
12235 PerlIO_printf(Perl_debug_log,
12236 "floating %s%s at %"IVdf"..%"UVuf" ",
12237 s, RE_SV_TAIL(r->float_substr),
12238 (IV)r->float_min_offset, (UV)r->float_max_offset);
12239 } else if (r->float_utf8) {
12240 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12241 RE_SV_DUMPLEN(r->float_utf8), 30);
12242 PerlIO_printf(Perl_debug_log,
12243 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12244 s, RE_SV_TAIL(r->float_utf8),
12245 (IV)r->float_min_offset, (UV)r->float_max_offset);
12247 if (r->check_substr || r->check_utf8)
12248 PerlIO_printf(Perl_debug_log,
12250 (r->check_substr == r->float_substr
12251 && r->check_utf8 == r->float_utf8
12252 ? "(checking floating" : "(checking anchored"));
12253 if (r->extflags & RXf_NOSCAN)
12254 PerlIO_printf(Perl_debug_log, " noscan");
12255 if (r->extflags & RXf_CHECK_ALL)
12256 PerlIO_printf(Perl_debug_log, " isall");
12257 if (r->check_substr || r->check_utf8)
12258 PerlIO_printf(Perl_debug_log, ") ");
12260 if (ri->regstclass) {
12261 regprop(r, sv, ri->regstclass);
12262 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12264 if (r->extflags & RXf_ANCH) {
12265 PerlIO_printf(Perl_debug_log, "anchored");
12266 if (r->extflags & RXf_ANCH_BOL)
12267 PerlIO_printf(Perl_debug_log, "(BOL)");
12268 if (r->extflags & RXf_ANCH_MBOL)
12269 PerlIO_printf(Perl_debug_log, "(MBOL)");
12270 if (r->extflags & RXf_ANCH_SBOL)
12271 PerlIO_printf(Perl_debug_log, "(SBOL)");
12272 if (r->extflags & RXf_ANCH_GPOS)
12273 PerlIO_printf(Perl_debug_log, "(GPOS)");
12274 PerlIO_putc(Perl_debug_log, ' ');
12276 if (r->extflags & RXf_GPOS_SEEN)
12277 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12278 if (r->intflags & PREGf_SKIP)
12279 PerlIO_printf(Perl_debug_log, "plus ");
12280 if (r->intflags & PREGf_IMPLICIT)
12281 PerlIO_printf(Perl_debug_log, "implicit ");
12282 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12283 if (r->extflags & RXf_EVAL_SEEN)
12284 PerlIO_printf(Perl_debug_log, "with eval ");
12285 PerlIO_printf(Perl_debug_log, "\n");
12286 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12288 PERL_ARGS_ASSERT_REGDUMP;
12289 PERL_UNUSED_CONTEXT;
12290 PERL_UNUSED_ARG(r);
12291 #endif /* DEBUGGING */
12295 - regprop - printable representation of opcode
12297 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12300 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12301 if (flags & ANYOF_INVERT) \
12302 /*make sure the invert info is in each */ \
12303 sv_catpvs(sv, "^"); \
12309 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12314 RXi_GET_DECL(prog,progi);
12315 GET_RE_DEBUG_FLAGS_DECL;
12317 PERL_ARGS_ASSERT_REGPROP;
12321 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12322 /* It would be nice to FAIL() here, but this may be called from
12323 regexec.c, and it would be hard to supply pRExC_state. */
12324 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12325 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12327 k = PL_regkind[OP(o)];
12330 sv_catpvs(sv, " ");
12331 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12332 * is a crude hack but it may be the best for now since
12333 * we have no flag "this EXACTish node was UTF-8"
12335 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12336 PERL_PV_ESCAPE_UNI_DETECT |
12337 PERL_PV_ESCAPE_NONASCII |
12338 PERL_PV_PRETTY_ELLIPSES |
12339 PERL_PV_PRETTY_LTGT |
12340 PERL_PV_PRETTY_NOCLEAR
12342 } else if (k == TRIE) {
12343 /* print the details of the trie in dumpuntil instead, as
12344 * progi->data isn't available here */
12345 const char op = OP(o);
12346 const U32 n = ARG(o);
12347 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12348 (reg_ac_data *)progi->data->data[n] :
12350 const reg_trie_data * const trie
12351 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12353 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12354 DEBUG_TRIE_COMPILE_r(
12355 Perl_sv_catpvf(aTHX_ sv,
12356 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12357 (UV)trie->startstate,
12358 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12359 (UV)trie->wordcount,
12362 (UV)TRIE_CHARCOUNT(trie),
12363 (UV)trie->uniquecharcount
12366 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12368 int rangestart = -1;
12369 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12370 sv_catpvs(sv, "[");
12371 for (i = 0; i <= 256; i++) {
12372 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12373 if (rangestart == -1)
12375 } else if (rangestart != -1) {
12376 if (i <= rangestart + 3)
12377 for (; rangestart < i; rangestart++)
12378 put_byte(sv, rangestart);
12380 put_byte(sv, rangestart);
12381 sv_catpvs(sv, "-");
12382 put_byte(sv, i - 1);
12387 sv_catpvs(sv, "]");
12390 } else if (k == CURLY) {
12391 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12392 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12393 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12395 else if (k == WHILEM && o->flags) /* Ordinal/of */
12396 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12397 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12398 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12399 if ( RXp_PAREN_NAMES(prog) ) {
12400 if ( k != REF || (OP(o) < NREF)) {
12401 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12402 SV **name= av_fetch(list, ARG(o), 0 );
12404 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12407 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12408 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12409 I32 *nums=(I32*)SvPVX(sv_dat);
12410 SV **name= av_fetch(list, nums[0], 0 );
12413 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12414 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12415 (n ? "," : ""), (IV)nums[n]);
12417 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12421 } else if (k == GOSUB)
12422 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12423 else if (k == VERB) {
12425 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12426 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12427 } else if (k == LOGICAL)
12428 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12429 else if (k == ANYOF) {
12430 int i, rangestart = -1;
12431 const U8 flags = ANYOF_FLAGS(o);
12434 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12435 static const char * const anyofs[] = {
12468 if (flags & ANYOF_LOCALE)
12469 sv_catpvs(sv, "{loc}");
12470 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12471 sv_catpvs(sv, "{i}");
12472 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12473 if (flags & ANYOF_INVERT)
12474 sv_catpvs(sv, "^");
12476 /* output what the standard cp 0-255 bitmap matches */
12477 for (i = 0; i <= 256; i++) {
12478 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12479 if (rangestart == -1)
12481 } else if (rangestart != -1) {
12482 if (i <= rangestart + 3)
12483 for (; rangestart < i; rangestart++)
12484 put_byte(sv, rangestart);
12486 put_byte(sv, rangestart);
12487 sv_catpvs(sv, "-");
12488 put_byte(sv, i - 1);
12495 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12496 /* output any special charclass tests (used entirely under use locale) */
12497 if (ANYOF_CLASS_TEST_ANY_SET(o))
12498 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12499 if (ANYOF_CLASS_TEST(o,i)) {
12500 sv_catpv(sv, anyofs[i]);
12504 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12506 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12507 sv_catpvs(sv, "{non-utf8-latin1-all}");
12510 /* output information about the unicode matching */
12511 if (flags & ANYOF_UNICODE_ALL)
12512 sv_catpvs(sv, "{unicode_all}");
12513 else if (ANYOF_NONBITMAP(o))
12514 sv_catpvs(sv, "{unicode}");
12515 if (flags & ANYOF_NONBITMAP_NON_UTF8)
12516 sv_catpvs(sv, "{outside bitmap}");
12518 if (ANYOF_NONBITMAP(o)) {
12519 SV *lv; /* Set if there is something outside the bit map */
12520 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12521 bool byte_output = FALSE; /* If something in the bitmap has been
12524 if (lv && lv != &PL_sv_undef) {
12526 U8 s[UTF8_MAXBYTES_CASE+1];
12528 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12529 uvchr_to_utf8(s, i);
12532 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12536 && swash_fetch(sw, s, TRUE))
12538 if (rangestart == -1)
12540 } else if (rangestart != -1) {
12541 byte_output = TRUE;
12542 if (i <= rangestart + 3)
12543 for (; rangestart < i; rangestart++) {
12544 put_byte(sv, rangestart);
12547 put_byte(sv, rangestart);
12548 sv_catpvs(sv, "-");
12557 char *s = savesvpv(lv);
12558 char * const origs = s;
12560 while (*s && *s != '\n')
12564 const char * const t = ++s;
12567 sv_catpvs(sv, " ");
12573 /* Truncate very long output */
12574 if (s - origs > 256) {
12575 Perl_sv_catpvf(aTHX_ sv,
12577 (int) (s - origs - 1),
12583 else if (*s == '\t') {
12602 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12604 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12605 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12607 PERL_UNUSED_CONTEXT;
12608 PERL_UNUSED_ARG(sv);
12609 PERL_UNUSED_ARG(o);
12610 PERL_UNUSED_ARG(prog);
12611 #endif /* DEBUGGING */
12615 Perl_re_intuit_string(pTHX_ REGEXP * const r)
12616 { /* Assume that RE_INTUIT is set */
12618 struct regexp *const prog = (struct regexp *)SvANY(r);
12619 GET_RE_DEBUG_FLAGS_DECL;
12621 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12622 PERL_UNUSED_CONTEXT;
12626 const char * const s = SvPV_nolen_const(prog->check_substr
12627 ? prog->check_substr : prog->check_utf8);
12629 if (!PL_colorset) reginitcolors();
12630 PerlIO_printf(Perl_debug_log,
12631 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12633 prog->check_substr ? "" : "utf8 ",
12634 PL_colors[5],PL_colors[0],
12637 (strlen(s) > 60 ? "..." : ""));
12640 return prog->check_substr ? prog->check_substr : prog->check_utf8;
12646 handles refcounting and freeing the perl core regexp structure. When
12647 it is necessary to actually free the structure the first thing it
12648 does is call the 'free' method of the regexp_engine associated to
12649 the regexp, allowing the handling of the void *pprivate; member
12650 first. (This routine is not overridable by extensions, which is why
12651 the extensions free is called first.)
12653 See regdupe and regdupe_internal if you change anything here.
12655 #ifndef PERL_IN_XSUB_RE
12657 Perl_pregfree(pTHX_ REGEXP *r)
12663 Perl_pregfree2(pTHX_ REGEXP *rx)
12666 struct regexp *const r = (struct regexp *)SvANY(rx);
12667 GET_RE_DEBUG_FLAGS_DECL;
12669 PERL_ARGS_ASSERT_PREGFREE2;
12671 if (r->mother_re) {
12672 ReREFCNT_dec(r->mother_re);
12674 CALLREGFREE_PVT(rx); /* free the private data */
12675 SvREFCNT_dec(RXp_PAREN_NAMES(r));
12678 SvREFCNT_dec(r->anchored_substr);
12679 SvREFCNT_dec(r->anchored_utf8);
12680 SvREFCNT_dec(r->float_substr);
12681 SvREFCNT_dec(r->float_utf8);
12682 Safefree(r->substrs);
12684 RX_MATCH_COPY_FREE(rx);
12685 #ifdef PERL_OLD_COPY_ON_WRITE
12686 SvREFCNT_dec(r->saved_copy);
12693 This is a hacky workaround to the structural issue of match results
12694 being stored in the regexp structure which is in turn stored in
12695 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12696 could be PL_curpm in multiple contexts, and could require multiple
12697 result sets being associated with the pattern simultaneously, such
12698 as when doing a recursive match with (??{$qr})
12700 The solution is to make a lightweight copy of the regexp structure
12701 when a qr// is returned from the code executed by (??{$qr}) this
12702 lightweight copy doesn't actually own any of its data except for
12703 the starp/end and the actual regexp structure itself.
12709 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12711 struct regexp *ret;
12712 struct regexp *const r = (struct regexp *)SvANY(rx);
12713 register const I32 npar = r->nparens+1;
12715 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12718 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12719 ret = (struct regexp *)SvANY(ret_x);
12721 (void)ReREFCNT_inc(rx);
12722 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12723 by pointing directly at the buffer, but flagging that the allocated
12724 space in the copy is zero. As we've just done a struct copy, it's now
12725 a case of zero-ing that, rather than copying the current length. */
12726 SvPV_set(ret_x, RX_WRAPPED(rx));
12727 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12728 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12729 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12730 SvLEN_set(ret_x, 0);
12731 SvSTASH_set(ret_x, NULL);
12732 SvMAGIC_set(ret_x, NULL);
12733 Newx(ret->offs, npar, regexp_paren_pair);
12734 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12736 Newx(ret->substrs, 1, struct reg_substr_data);
12737 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12739 SvREFCNT_inc_void(ret->anchored_substr);
12740 SvREFCNT_inc_void(ret->anchored_utf8);
12741 SvREFCNT_inc_void(ret->float_substr);
12742 SvREFCNT_inc_void(ret->float_utf8);
12744 /* check_substr and check_utf8, if non-NULL, point to either their
12745 anchored or float namesakes, and don't hold a second reference. */
12747 RX_MATCH_COPIED_off(ret_x);
12748 #ifdef PERL_OLD_COPY_ON_WRITE
12749 ret->saved_copy = NULL;
12751 ret->mother_re = rx;
12757 /* regfree_internal()
12759 Free the private data in a regexp. This is overloadable by
12760 extensions. Perl takes care of the regexp structure in pregfree(),
12761 this covers the *pprivate pointer which technically perl doesn't
12762 know about, however of course we have to handle the
12763 regexp_internal structure when no extension is in use.
12765 Note this is called before freeing anything in the regexp
12770 Perl_regfree_internal(pTHX_ REGEXP * const rx)
12773 struct regexp *const r = (struct regexp *)SvANY(rx);
12774 RXi_GET_DECL(r,ri);
12775 GET_RE_DEBUG_FLAGS_DECL;
12777 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12783 SV *dsv= sv_newmortal();
12784 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12785 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12786 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12787 PL_colors[4],PL_colors[5],s);
12790 #ifdef RE_TRACK_PATTERN_OFFSETS
12792 Safefree(ri->u.offsets); /* 20010421 MJD */
12795 int n = ri->data->count;
12796 PAD* new_comppad = NULL;
12801 /* If you add a ->what type here, update the comment in regcomp.h */
12802 switch (ri->data->what[n]) {
12807 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12810 Safefree(ri->data->data[n]);
12813 new_comppad = MUTABLE_AV(ri->data->data[n]);
12816 if (new_comppad == NULL)
12817 Perl_croak(aTHX_ "panic: pregfree comppad");
12818 PAD_SAVE_LOCAL(old_comppad,
12819 /* Watch out for global destruction's random ordering. */
12820 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12823 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12826 op_free((OP_4tree*)ri->data->data[n]);
12828 PAD_RESTORE_LOCAL(old_comppad);
12829 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12830 new_comppad = NULL;
12835 { /* Aho Corasick add-on structure for a trie node.
12836 Used in stclass optimization only */
12838 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12840 refcount = --aho->refcount;
12843 PerlMemShared_free(aho->states);
12844 PerlMemShared_free(aho->fail);
12845 /* do this last!!!! */
12846 PerlMemShared_free(ri->data->data[n]);
12847 PerlMemShared_free(ri->regstclass);
12853 /* trie structure. */
12855 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12857 refcount = --trie->refcount;
12860 PerlMemShared_free(trie->charmap);
12861 PerlMemShared_free(trie->states);
12862 PerlMemShared_free(trie->trans);
12864 PerlMemShared_free(trie->bitmap);
12866 PerlMemShared_free(trie->jump);
12867 PerlMemShared_free(trie->wordinfo);
12868 /* do this last!!!! */
12869 PerlMemShared_free(ri->data->data[n]);
12874 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12877 Safefree(ri->data->what);
12878 Safefree(ri->data);
12884 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12885 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12886 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12889 re_dup - duplicate a regexp.
12891 This routine is expected to clone a given regexp structure. It is only
12892 compiled under USE_ITHREADS.
12894 After all of the core data stored in struct regexp is duplicated
12895 the regexp_engine.dupe method is used to copy any private data
12896 stored in the *pprivate pointer. This allows extensions to handle
12897 any duplication it needs to do.
12899 See pregfree() and regfree_internal() if you change anything here.
12901 #if defined(USE_ITHREADS)
12902 #ifndef PERL_IN_XSUB_RE
12904 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12908 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12909 struct regexp *ret = (struct regexp *)SvANY(dstr);
12911 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12913 npar = r->nparens+1;
12914 Newx(ret->offs, npar, regexp_paren_pair);
12915 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12917 /* no need to copy these */
12918 Newx(ret->swap, npar, regexp_paren_pair);
12921 if (ret->substrs) {
12922 /* Do it this way to avoid reading from *r after the StructCopy().
12923 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12924 cache, it doesn't matter. */
12925 const bool anchored = r->check_substr
12926 ? r->check_substr == r->anchored_substr
12927 : r->check_utf8 == r->anchored_utf8;
12928 Newx(ret->substrs, 1, struct reg_substr_data);
12929 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12931 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12932 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12933 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12934 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12936 /* check_substr and check_utf8, if non-NULL, point to either their
12937 anchored or float namesakes, and don't hold a second reference. */
12939 if (ret->check_substr) {
12941 assert(r->check_utf8 == r->anchored_utf8);
12942 ret->check_substr = ret->anchored_substr;
12943 ret->check_utf8 = ret->anchored_utf8;
12945 assert(r->check_substr == r->float_substr);
12946 assert(r->check_utf8 == r->float_utf8);
12947 ret->check_substr = ret->float_substr;
12948 ret->check_utf8 = ret->float_utf8;
12950 } else if (ret->check_utf8) {
12952 ret->check_utf8 = ret->anchored_utf8;
12954 ret->check_utf8 = ret->float_utf8;
12959 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12962 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12964 if (RX_MATCH_COPIED(dstr))
12965 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
12967 ret->subbeg = NULL;
12968 #ifdef PERL_OLD_COPY_ON_WRITE
12969 ret->saved_copy = NULL;
12972 if (ret->mother_re) {
12973 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12974 /* Our storage points directly to our mother regexp, but that's
12975 1: a buffer in a different thread
12976 2: something we no longer hold a reference on
12977 so we need to copy it locally. */
12978 /* Note we need to use SvCUR(), rather than
12979 SvLEN(), on our mother_re, because it, in
12980 turn, may well be pointing to its own mother_re. */
12981 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12982 SvCUR(ret->mother_re)+1));
12983 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12985 ret->mother_re = NULL;
12989 #endif /* PERL_IN_XSUB_RE */
12994 This is the internal complement to regdupe() which is used to copy
12995 the structure pointed to by the *pprivate pointer in the regexp.
12996 This is the core version of the extension overridable cloning hook.
12997 The regexp structure being duplicated will be copied by perl prior
12998 to this and will be provided as the regexp *r argument, however
12999 with the /old/ structures pprivate pointer value. Thus this routine
13000 may override any copying normally done by perl.
13002 It returns a pointer to the new regexp_internal structure.
13006 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13009 struct regexp *const r = (struct regexp *)SvANY(rx);
13010 regexp_internal *reti;
13012 RXi_GET_DECL(r,ri);
13014 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13018 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13019 Copy(ri->program, reti->program, len+1, regnode);
13022 reti->regstclass = NULL;
13025 struct reg_data *d;
13026 const int count = ri->data->count;
13029 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13030 char, struct reg_data);
13031 Newx(d->what, count, U8);
13034 for (i = 0; i < count; i++) {
13035 d->what[i] = ri->data->what[i];
13036 switch (d->what[i]) {
13037 /* legal options are one of: sSfpontTua
13038 see also regcomp.h and pregfree() */
13039 case 'a': /* actually an AV, but the dup function is identical. */
13042 case 'p': /* actually an AV, but the dup function is identical. */
13043 case 'u': /* actually an HV, but the dup function is identical. */
13044 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13047 /* This is cheating. */
13048 Newx(d->data[i], 1, struct regnode_charclass_class);
13049 StructCopy(ri->data->data[i], d->data[i],
13050 struct regnode_charclass_class);
13051 reti->regstclass = (regnode*)d->data[i];
13054 /* Compiled op trees are readonly and in shared memory,
13055 and can thus be shared without duplication. */
13057 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13061 /* Trie stclasses are readonly and can thus be shared
13062 * without duplication. We free the stclass in pregfree
13063 * when the corresponding reg_ac_data struct is freed.
13065 reti->regstclass= ri->regstclass;
13069 ((reg_trie_data*)ri->data->data[i])->refcount++;
13073 d->data[i] = ri->data->data[i];
13076 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13085 reti->name_list_idx = ri->name_list_idx;
13087 #ifdef RE_TRACK_PATTERN_OFFSETS
13088 if (ri->u.offsets) {
13089 Newx(reti->u.offsets, 2*len+1, U32);
13090 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13093 SetProgLen(reti,len);
13096 return (void*)reti;
13099 #endif /* USE_ITHREADS */
13101 #ifndef PERL_IN_XSUB_RE
13104 - regnext - dig the "next" pointer out of a node
13107 Perl_regnext(pTHX_ register regnode *p)
13110 register I32 offset;
13115 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13116 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13119 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13128 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13131 STRLEN l1 = strlen(pat1);
13132 STRLEN l2 = strlen(pat2);
13135 const char *message;
13137 PERL_ARGS_ASSERT_RE_CROAK2;
13143 Copy(pat1, buf, l1 , char);
13144 Copy(pat2, buf + l1, l2 , char);
13145 buf[l1 + l2] = '\n';
13146 buf[l1 + l2 + 1] = '\0';
13148 /* ANSI variant takes additional second argument */
13149 va_start(args, pat2);
13153 msv = vmess(buf, &args);
13155 message = SvPV_const(msv,l1);
13158 Copy(message, buf, l1 , char);
13159 buf[l1-1] = '\0'; /* Overwrite \n */
13160 Perl_croak(aTHX_ "%s", buf);
13163 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13165 #ifndef PERL_IN_XSUB_RE
13167 Perl_save_re_context(pTHX)
13171 struct re_save_state *state;
13173 SAVEVPTR(PL_curcop);
13174 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13176 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13177 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13178 SSPUSHUV(SAVEt_RE_STATE);
13180 Copy(&PL_reg_state, state, 1, struct re_save_state);
13182 PL_reg_start_tmp = 0;
13183 PL_reg_start_tmpl = 0;
13184 PL_reg_oldsaved = NULL;
13185 PL_reg_oldsavedlen = 0;
13186 PL_reg_maxiter = 0;
13187 PL_reg_leftiter = 0;
13188 PL_reg_poscache = NULL;
13189 PL_reg_poscache_size = 0;
13190 #ifdef PERL_OLD_COPY_ON_WRITE
13194 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13196 const REGEXP * const rx = PM_GETRE(PL_curpm);
13199 for (i = 1; i <= RX_NPARENS(rx); i++) {
13200 char digits[TYPE_CHARS(long)];
13201 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13202 GV *const *const gvp
13203 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13206 GV * const gv = *gvp;
13207 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13217 clear_re(pTHX_ void *r)
13220 ReREFCNT_dec((REGEXP *)r);
13226 S_put_byte(pTHX_ SV *sv, int c)
13228 PERL_ARGS_ASSERT_PUT_BYTE;
13230 /* Our definition of isPRINT() ignores locales, so only bytes that are
13231 not part of UTF-8 are considered printable. I assume that the same
13232 holds for UTF-EBCDIC.
13233 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13234 which Wikipedia says:
13236 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13237 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13238 identical, to the ASCII delete (DEL) or rubout control character.
13239 ) So the old condition can be simplified to !isPRINT(c) */
13242 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13245 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13249 const char string = c;
13250 if (c == '-' || c == ']' || c == '\\' || c == '^')
13251 sv_catpvs(sv, "\\");
13252 sv_catpvn(sv, &string, 1);
13257 #define CLEAR_OPTSTART \
13258 if (optstart) STMT_START { \
13259 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13263 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13265 STATIC const regnode *
13266 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13267 const regnode *last, const regnode *plast,
13268 SV* sv, I32 indent, U32 depth)
13271 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13272 register const regnode *next;
13273 const regnode *optstart= NULL;
13275 RXi_GET_DECL(r,ri);
13276 GET_RE_DEBUG_FLAGS_DECL;
13278 PERL_ARGS_ASSERT_DUMPUNTIL;
13280 #ifdef DEBUG_DUMPUNTIL
13281 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13282 last ? last-start : 0,plast ? plast-start : 0);
13285 if (plast && plast < last)
13288 while (PL_regkind[op] != END && (!last || node < last)) {
13289 /* While that wasn't END last time... */
13292 if (op == CLOSE || op == WHILEM)
13294 next = regnext((regnode *)node);
13297 if (OP(node) == OPTIMIZED) {
13298 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13305 regprop(r, sv, node);
13306 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13307 (int)(2*indent + 1), "", SvPVX_const(sv));
13309 if (OP(node) != OPTIMIZED) {
13310 if (next == NULL) /* Next ptr. */
13311 PerlIO_printf(Perl_debug_log, " (0)");
13312 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13313 PerlIO_printf(Perl_debug_log, " (FAIL)");
13315 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13316 (void)PerlIO_putc(Perl_debug_log, '\n');
13320 if (PL_regkind[(U8)op] == BRANCHJ) {
13323 register const regnode *nnode = (OP(next) == LONGJMP
13324 ? regnext((regnode *)next)
13326 if (last && nnode > last)
13328 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13331 else if (PL_regkind[(U8)op] == BRANCH) {
13333 DUMPUNTIL(NEXTOPER(node), next);
13335 else if ( PL_regkind[(U8)op] == TRIE ) {
13336 const regnode *this_trie = node;
13337 const char op = OP(node);
13338 const U32 n = ARG(node);
13339 const reg_ac_data * const ac = op>=AHOCORASICK ?
13340 (reg_ac_data *)ri->data->data[n] :
13342 const reg_trie_data * const trie =
13343 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13345 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13347 const regnode *nextbranch= NULL;
13350 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13351 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13353 PerlIO_printf(Perl_debug_log, "%*s%s ",
13354 (int)(2*(indent+3)), "",
13355 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13356 PL_colors[0], PL_colors[1],
13357 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13358 PERL_PV_PRETTY_ELLIPSES |
13359 PERL_PV_PRETTY_LTGT
13364 U16 dist= trie->jump[word_idx+1];
13365 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13366 (UV)((dist ? this_trie + dist : next) - start));
13369 nextbranch= this_trie + trie->jump[0];
13370 DUMPUNTIL(this_trie + dist, nextbranch);
13372 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13373 nextbranch= regnext((regnode *)nextbranch);
13375 PerlIO_printf(Perl_debug_log, "\n");
13378 if (last && next > last)
13383 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13384 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13385 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13387 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13389 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13391 else if ( op == PLUS || op == STAR) {
13392 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13394 else if (PL_regkind[(U8)op] == ANYOF) {
13395 /* arglen 1 + class block */
13396 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13397 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13398 node = NEXTOPER(node);
13400 else if (PL_regkind[(U8)op] == EXACT) {
13401 /* Literal string, where present. */
13402 node += NODE_SZ_STR(node) - 1;
13403 node = NEXTOPER(node);
13406 node = NEXTOPER(node);
13407 node += regarglen[(U8)op];
13409 if (op == CURLYX || op == OPEN)
13413 #ifdef DEBUG_DUMPUNTIL
13414 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13419 #endif /* DEBUGGING */
13423 * c-indentation-style: bsd
13424 * c-basic-offset: 4
13425 * indent-tabs-mode: t
13428 * ex: set ts=8 sts=4 sw=4 noet: