This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Allow trie use for /iaa matching
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84extern const struct regexp_engine my_reg_engine;
85#else
86# include "regcomp.h"
87#endif
88
89#include "dquote_static.c"
90#include "charclass_invlists.h"
91#include "inline_invlist.c"
92#include "unicode_constants.h"
93
94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97
98#ifdef op
99#undef op
100#endif /* op */
101
102#ifdef MSDOS
103# if defined(BUGGY_MSC6)
104 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
105# pragma optimize("a",off)
106 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
107# pragma optimize("w",on )
108# endif /* BUGGY_MSC6 */
109#endif /* MSDOS */
110
111#ifndef STATIC
112#define STATIC static
113#endif
114
115
116typedef struct RExC_state_t {
117 U32 flags; /* RXf_* are we folding, multilining? */
118 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
119 char *precomp; /* uncompiled string. */
120 REGEXP *rx_sv; /* The SV that is the regexp. */
121 regexp *rx; /* perl core regexp structure */
122 regexp_internal *rxi; /* internal data for regexp object pprivate field */
123 char *start; /* Start of input for compile */
124 char *end; /* End of input for compile */
125 char *parse; /* Input-scan pointer. */
126 SSize_t whilem_seen; /* number of WHILEM in this expr */
127 regnode *emit_start; /* Start of emitted-code area */
128 regnode *emit_bound; /* First regnode outside of the allocated space */
129 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
130 implies compiling, so don't emit */
131 regnode emit_dummy; /* placeholder for emit to point to */
132 I32 naughty; /* How bad is this pattern? */
133 I32 sawback; /* Did we see \1, ...? */
134 U32 seen;
135 SSize_t size; /* Code size. */
136 I32 npar; /* Capture buffer count, (OPEN). */
137 I32 cpar; /* Capture buffer count, (CLOSE). */
138 I32 nestroot; /* root parens we are in - used by accept */
139 I32 extralen;
140 I32 seen_zerolen;
141 regnode **open_parens; /* pointers to open parens */
142 regnode **close_parens; /* pointers to close parens */
143 regnode *opend; /* END node in program */
144 I32 utf8; /* whether the pattern is utf8 or not */
145 I32 orig_utf8; /* whether the pattern was originally in utf8 */
146 /* XXX use this for future optimisation of case
147 * where pattern must be upgraded to utf8. */
148 I32 uni_semantics; /* If a d charset modifier should use unicode
149 rules, even if the pattern is not in
150 utf8 */
151 HV *paren_names; /* Paren names */
152
153 regnode **recurse; /* Recurse regops */
154 I32 recurse_count; /* Number of recurse regops */
155 I32 in_lookbehind;
156 I32 contains_locale;
157 I32 override_recoding;
158 I32 in_multi_char_class;
159 struct reg_code_block *code_blocks; /* positions of literal (?{})
160 within pattern */
161 int num_code_blocks; /* size of code_blocks[] */
162 int code_index; /* next code_blocks[] slot */
163#if ADD_TO_REGEXEC
164 char *starttry; /* -Dr: where regtry was called. */
165#define RExC_starttry (pRExC_state->starttry)
166#endif
167 SV *runtime_code_qr; /* qr with the runtime code blocks */
168#ifdef DEBUGGING
169 const char *lastparse;
170 I32 lastnum;
171 AV *paren_name_list; /* idx -> name */
172#define RExC_lastparse (pRExC_state->lastparse)
173#define RExC_lastnum (pRExC_state->lastnum)
174#define RExC_paren_name_list (pRExC_state->paren_name_list)
175#endif
176} RExC_state_t;
177
178#define RExC_flags (pRExC_state->flags)
179#define RExC_pm_flags (pRExC_state->pm_flags)
180#define RExC_precomp (pRExC_state->precomp)
181#define RExC_rx_sv (pRExC_state->rx_sv)
182#define RExC_rx (pRExC_state->rx)
183#define RExC_rxi (pRExC_state->rxi)
184#define RExC_start (pRExC_state->start)
185#define RExC_end (pRExC_state->end)
186#define RExC_parse (pRExC_state->parse)
187#define RExC_whilem_seen (pRExC_state->whilem_seen)
188#ifdef RE_TRACK_PATTERN_OFFSETS
189#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
190#endif
191#define RExC_emit (pRExC_state->emit)
192#define RExC_emit_dummy (pRExC_state->emit_dummy)
193#define RExC_emit_start (pRExC_state->emit_start)
194#define RExC_emit_bound (pRExC_state->emit_bound)
195#define RExC_naughty (pRExC_state->naughty)
196#define RExC_sawback (pRExC_state->sawback)
197#define RExC_seen (pRExC_state->seen)
198#define RExC_size (pRExC_state->size)
199#define RExC_npar (pRExC_state->npar)
200#define RExC_nestroot (pRExC_state->nestroot)
201#define RExC_extralen (pRExC_state->extralen)
202#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
203#define RExC_utf8 (pRExC_state->utf8)
204#define RExC_uni_semantics (pRExC_state->uni_semantics)
205#define RExC_orig_utf8 (pRExC_state->orig_utf8)
206#define RExC_open_parens (pRExC_state->open_parens)
207#define RExC_close_parens (pRExC_state->close_parens)
208#define RExC_opend (pRExC_state->opend)
209#define RExC_paren_names (pRExC_state->paren_names)
210#define RExC_recurse (pRExC_state->recurse)
211#define RExC_recurse_count (pRExC_state->recurse_count)
212#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
213#define RExC_contains_locale (pRExC_state->contains_locale)
214#define RExC_override_recoding (pRExC_state->override_recoding)
215#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
216
217
218#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
219#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
220 ((*s) == '{' && regcurly(s, FALSE)))
221
222#ifdef SPSTART
223#undef SPSTART /* dratted cpp namespace... */
224#endif
225/*
226 * Flags to be passed up and down.
227 */
228#define WORST 0 /* Worst case. */
229#define HASWIDTH 0x01 /* Known to match non-null strings. */
230
231/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
232 * character. (There needs to be a case: in the switch statement in regexec.c
233 * for any node marked SIMPLE.) Note that this is not the same thing as
234 * REGNODE_SIMPLE */
235#define SIMPLE 0x02
236#define SPSTART 0x04 /* Starts with * or + */
237#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
238#define TRYAGAIN 0x10 /* Weeded out a declaration. */
239#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
240
241#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
242
243/* whether trie related optimizations are enabled */
244#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245#define TRIE_STUDY_OPT
246#define FULL_TRIE_STUDY
247#define TRIE_STCLASS
248#endif
249
250
251
252#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253#define PBITVAL(paren) (1 << ((paren) & 7))
254#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
257
258#define REQUIRE_UTF8 STMT_START { \
259 if (!UTF) { \
260 *flagp = RESTART_UTF8; \
261 return NULL; \
262 } \
263 } STMT_END
264
265/* This converts the named class defined in regcomp.h to its equivalent class
266 * number defined in handy.h. */
267#define namedclass_to_classnum(class) ((int) ((class) / 2))
268#define classnum_to_namedclass(classnum) ((classnum) * 2)
269
270/* About scan_data_t.
271
272 During optimisation we recurse through the regexp program performing
273 various inplace (keyhole style) optimisations. In addition study_chunk
274 and scan_commit populate this data structure with information about
275 what strings MUST appear in the pattern. We look for the longest
276 string that must appear at a fixed location, and we look for the
277 longest string that may appear at a floating location. So for instance
278 in the pattern:
279
280 /FOO[xX]A.*B[xX]BAR/
281
282 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
283 strings (because they follow a .* construct). study_chunk will identify
284 both FOO and BAR as being the longest fixed and floating strings respectively.
285
286 The strings can be composites, for instance
287
288 /(f)(o)(o)/
289
290 will result in a composite fixed substring 'foo'.
291
292 For each string some basic information is maintained:
293
294 - offset or min_offset
295 This is the position the string must appear at, or not before.
296 It also implicitly (when combined with minlenp) tells us how many
297 characters must match before the string we are searching for.
298 Likewise when combined with minlenp and the length of the string it
299 tells us how many characters must appear after the string we have
300 found.
301
302 - max_offset
303 Only used for floating strings. This is the rightmost point that
304 the string can appear at. If set to SSize_t_MAX it indicates that the
305 string can occur infinitely far to the right.
306
307 - minlenp
308 A pointer to the minimum number of characters of the pattern that the
309 string was found inside. This is important as in the case of positive
310 lookahead or positive lookbehind we can have multiple patterns
311 involved. Consider
312
313 /(?=FOO).*F/
314
315 The minimum length of the pattern overall is 3, the minimum length
316 of the lookahead part is 3, but the minimum length of the part that
317 will actually match is 1. So 'FOO's minimum length is 3, but the
318 minimum length for the F is 1. This is important as the minimum length
319 is used to determine offsets in front of and behind the string being
320 looked for. Since strings can be composites this is the length of the
321 pattern at the time it was committed with a scan_commit. Note that
322 the length is calculated by study_chunk, so that the minimum lengths
323 are not known until the full pattern has been compiled, thus the
324 pointer to the value.
325
326 - lookbehind
327
328 In the case of lookbehind the string being searched for can be
329 offset past the start point of the final matching string.
330 If this value was just blithely removed from the min_offset it would
331 invalidate some of the calculations for how many chars must match
332 before or after (as they are derived from min_offset and minlen and
333 the length of the string being searched for).
334 When the final pattern is compiled and the data is moved from the
335 scan_data_t structure into the regexp structure the information
336 about lookbehind is factored in, with the information that would
337 have been lost precalculated in the end_shift field for the
338 associated string.
339
340 The fields pos_min and pos_delta are used to store the minimum offset
341 and the delta to the maximum offset at the current point in the pattern.
342
343*/
344
345typedef struct scan_data_t {
346 /*I32 len_min; unused */
347 /*I32 len_delta; unused */
348 SSize_t pos_min;
349 SSize_t pos_delta;
350 SV *last_found;
351 SSize_t last_end; /* min value, <0 unless valid. */
352 SSize_t last_start_min;
353 SSize_t last_start_max;
354 SV **longest; /* Either &l_fixed, or &l_float. */
355 SV *longest_fixed; /* longest fixed string found in pattern */
356 SSize_t offset_fixed; /* offset where it starts */
357 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
358 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
359 SV *longest_float; /* longest floating string found in pattern */
360 SSize_t offset_float_min; /* earliest point in string it can appear */
361 SSize_t offset_float_max; /* latest point in string it can appear */
362 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
363 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
364 I32 flags;
365 I32 whilem_c;
366 SSize_t *last_closep;
367 struct regnode_charclass_class *start_class;
368} scan_data_t;
369
370/* The below is perhaps overboard, but this allows us to save a test at the
371 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
372 * and 'a' differ by a single bit; the same with the upper and lower case of
373 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
374 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
375 * then inverts it to form a mask, with just a single 0, in the bit position
376 * where the upper- and lowercase differ. XXX There are about 40 other
377 * instances in the Perl core where this micro-optimization could be used.
378 * Should decide if maintenance cost is worse, before changing those
379 *
380 * Returns a boolean as to whether or not 'v' is either a lowercase or
381 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
382 * compile-time constant, the generated code is better than some optimizing
383 * compilers figure out, amounting to a mask and test. The results are
384 * meaningless if 'c' is not one of [A-Za-z] */
385#define isARG2_lower_or_UPPER_ARG1(c, v) \
386 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
387
388/*
389 * Forward declarations for pregcomp()'s friends.
390 */
391
392static const scan_data_t zero_scan_data =
393 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
394
395#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
396#define SF_BEFORE_SEOL 0x0001
397#define SF_BEFORE_MEOL 0x0002
398#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
399#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
400
401#ifdef NO_UNARY_PLUS
402# define SF_FIX_SHIFT_EOL (0+2)
403# define SF_FL_SHIFT_EOL (0+4)
404#else
405# define SF_FIX_SHIFT_EOL (+2)
406# define SF_FL_SHIFT_EOL (+4)
407#endif
408
409#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
410#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
411
412#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
413#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
414#define SF_IS_INF 0x0040
415#define SF_HAS_PAR 0x0080
416#define SF_IN_PAR 0x0100
417#define SF_HAS_EVAL 0x0200
418#define SCF_DO_SUBSTR 0x0400
419#define SCF_DO_STCLASS_AND 0x0800
420#define SCF_DO_STCLASS_OR 0x1000
421#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
422#define SCF_WHILEM_VISITED_POS 0x2000
423
424#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
425#define SCF_SEEN_ACCEPT 0x8000
426#define SCF_TRIE_DOING_RESTUDY 0x10000
427
428#define UTF cBOOL(RExC_utf8)
429
430/* The enums for all these are ordered so things work out correctly */
431#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
432#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
433#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
434#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
435#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
436#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
437#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
438
439#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
440
441#define OOB_NAMEDCLASS -1
442
443/* There is no code point that is out-of-bounds, so this is problematic. But
444 * its only current use is to initialize a variable that is always set before
445 * looked at. */
446#define OOB_UNICODE 0xDEADBEEF
447
448#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
449#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
450
451
452/* length of regex to show in messages that don't mark a position within */
453#define RegexLengthToShowInErrorMessages 127
454
455/*
456 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
457 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
458 * op/pragma/warn/regcomp.
459 */
460#define MARKER1 "<-- HERE" /* marker as it appears in the description */
461#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
462
463#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
464
465/*
466 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
467 * arg. Show regex, up to a maximum length. If it's too long, chop and add
468 * "...".
469 */
470#define _FAIL(code) STMT_START { \
471 const char *ellipses = ""; \
472 IV len = RExC_end - RExC_precomp; \
473 \
474 if (!SIZE_ONLY) \
475 SAVEFREESV(RExC_rx_sv); \
476 if (len > RegexLengthToShowInErrorMessages) { \
477 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
478 len = RegexLengthToShowInErrorMessages - 10; \
479 ellipses = "..."; \
480 } \
481 code; \
482} STMT_END
483
484#define FAIL(msg) _FAIL( \
485 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
486 msg, (int)len, RExC_precomp, ellipses))
487
488#define FAIL2(msg,arg) _FAIL( \
489 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
490 arg, (int)len, RExC_precomp, ellipses))
491
492/*
493 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
494 */
495#define Simple_vFAIL(m) STMT_START { \
496 const IV offset = RExC_parse - RExC_precomp; \
497 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
498 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
501/*
502 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
503 */
504#define vFAIL(m) STMT_START { \
505 if (!SIZE_ONLY) \
506 SAVEFREESV(RExC_rx_sv); \
507 Simple_vFAIL(m); \
508} STMT_END
509
510/*
511 * Like Simple_vFAIL(), but accepts two arguments.
512 */
513#define Simple_vFAIL2(m,a1) STMT_START { \
514 const IV offset = RExC_parse - RExC_precomp; \
515 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
516 (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
519/*
520 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
521 */
522#define vFAIL2(m,a1) STMT_START { \
523 if (!SIZE_ONLY) \
524 SAVEFREESV(RExC_rx_sv); \
525 Simple_vFAIL2(m, a1); \
526} STMT_END
527
528
529/*
530 * Like Simple_vFAIL(), but accepts three arguments.
531 */
532#define Simple_vFAIL3(m, a1, a2) STMT_START { \
533 const IV offset = RExC_parse - RExC_precomp; \
534 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
535 (int)offset, RExC_precomp, RExC_precomp + offset); \
536} STMT_END
537
538/*
539 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
540 */
541#define vFAIL3(m,a1,a2) STMT_START { \
542 if (!SIZE_ONLY) \
543 SAVEFREESV(RExC_rx_sv); \
544 Simple_vFAIL3(m, a1, a2); \
545} STMT_END
546
547/*
548 * Like Simple_vFAIL(), but accepts four arguments.
549 */
550#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
551 const IV offset = RExC_parse - RExC_precomp; \
552 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
553 (int)offset, RExC_precomp, RExC_precomp + offset); \
554} STMT_END
555
556#define vFAIL4(m,a1,a2,a3) STMT_START { \
557 if (!SIZE_ONLY) \
558 SAVEFREESV(RExC_rx_sv); \
559 Simple_vFAIL4(m, a1, a2, a3); \
560} STMT_END
561
562/* m is not necessarily a "literal string", in this macro */
563#define reg_warn_non_literal_string(loc, m) STMT_START { \
564 const IV offset = loc - RExC_precomp; \
565 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
566 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
567} STMT_END
568
569#define ckWARNreg(loc,m) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572 (int)offset, RExC_precomp, RExC_precomp + offset); \
573} STMT_END
574
575#define vWARN_dep(loc, m) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
578 (int)offset, RExC_precomp, RExC_precomp + offset); \
579} STMT_END
580
581#define ckWARNdep(loc,m) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
584 m REPORT_LOCATION, \
585 (int)offset, RExC_precomp, RExC_precomp + offset); \
586} STMT_END
587
588#define ckWARNregdep(loc,m) STMT_START { \
589 const IV offset = loc - RExC_precomp; \
590 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
591 m REPORT_LOCATION, \
592 (int)offset, RExC_precomp, RExC_precomp + offset); \
593} STMT_END
594
595#define ckWARN2reg_d(loc,m, a1) STMT_START { \
596 const IV offset = loc - RExC_precomp; \
597 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
598 m REPORT_LOCATION, \
599 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
600} STMT_END
601
602#define ckWARN2reg(loc, m, a1) STMT_START { \
603 const IV offset = loc - RExC_precomp; \
604 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
605 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
606} STMT_END
607
608#define vWARN3(loc, m, a1, a2) STMT_START { \
609 const IV offset = loc - RExC_precomp; \
610 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
611 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
612} STMT_END
613
614#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
615 const IV offset = loc - RExC_precomp; \
616 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
617 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
618} STMT_END
619
620#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
621 const IV offset = loc - RExC_precomp; \
622 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
623 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
624} STMT_END
625
626#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
627 const IV offset = loc - RExC_precomp; \
628 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
629 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
630} STMT_END
631
632#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
636} STMT_END
637
638
639/* Allow for side effects in s */
640#define REGC(c,s) STMT_START { \
641 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
642} STMT_END
643
644/* Macros for recording node offsets. 20001227 mjd@plover.com
645 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
646 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
647 * Element 0 holds the number n.
648 * Position is 1 indexed.
649 */
650#ifndef RE_TRACK_PATTERN_OFFSETS
651#define Set_Node_Offset_To_R(node,byte)
652#define Set_Node_Offset(node,byte)
653#define Set_Cur_Node_Offset
654#define Set_Node_Length_To_R(node,len)
655#define Set_Node_Length(node,len)
656#define Set_Node_Cur_Length(node,start)
657#define Node_Offset(n)
658#define Node_Length(n)
659#define Set_Node_Offset_Length(node,offset,len)
660#define ProgLen(ri) ri->u.proglen
661#define SetProgLen(ri,x) ri->u.proglen = x
662#else
663#define ProgLen(ri) ri->u.offsets[0]
664#define SetProgLen(ri,x) ri->u.offsets[0] = x
665#define Set_Node_Offset_To_R(node,byte) STMT_START { \
666 if (! SIZE_ONLY) { \
667 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
668 __LINE__, (int)(node), (int)(byte))); \
669 if((node) < 0) { \
670 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
671 } else { \
672 RExC_offsets[2*(node)-1] = (byte); \
673 } \
674 } \
675} STMT_END
676
677#define Set_Node_Offset(node,byte) \
678 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
679#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
680
681#define Set_Node_Length_To_R(node,len) STMT_START { \
682 if (! SIZE_ONLY) { \
683 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
684 __LINE__, (int)(node), (int)(len))); \
685 if((node) < 0) { \
686 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
687 } else { \
688 RExC_offsets[2*(node)] = (len); \
689 } \
690 } \
691} STMT_END
692
693#define Set_Node_Length(node,len) \
694 Set_Node_Length_To_R((node)-RExC_emit_start, len)
695#define Set_Node_Cur_Length(node, start) \
696 Set_Node_Length(node, RExC_parse - start)
697
698/* Get offsets and lengths */
699#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
700#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
701
702#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
703 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
704 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
705} STMT_END
706#endif
707
708#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
709#define EXPERIMENTAL_INPLACESCAN
710#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
711
712#define DEBUG_STUDYDATA(str,data,depth) \
713DEBUG_OPTIMISE_MORE_r(if(data){ \
714 PerlIO_printf(Perl_debug_log, \
715 "%*s" str "Pos:%"IVdf"/%"IVdf \
716 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
717 (int)(depth)*2, "", \
718 (IV)((data)->pos_min), \
719 (IV)((data)->pos_delta), \
720 (UV)((data)->flags), \
721 (IV)((data)->whilem_c), \
722 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
723 is_inf ? "INF " : "" \
724 ); \
725 if ((data)->last_found) \
726 PerlIO_printf(Perl_debug_log, \
727 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
728 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
729 SvPVX_const((data)->last_found), \
730 (IV)((data)->last_end), \
731 (IV)((data)->last_start_min), \
732 (IV)((data)->last_start_max), \
733 ((data)->longest && \
734 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
735 SvPVX_const((data)->longest_fixed), \
736 (IV)((data)->offset_fixed), \
737 ((data)->longest && \
738 (data)->longest==&((data)->longest_float)) ? "*" : "", \
739 SvPVX_const((data)->longest_float), \
740 (IV)((data)->offset_float_min), \
741 (IV)((data)->offset_float_max) \
742 ); \
743 PerlIO_printf(Perl_debug_log,"\n"); \
744});
745
746/* Mark that we cannot extend a found fixed substring at this point.
747 Update the longest found anchored substring and the longest found
748 floating substrings if needed. */
749
750STATIC void
751S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
752 SSize_t *minlenp, int is_inf)
753{
754 const STRLEN l = CHR_SVLEN(data->last_found);
755 const STRLEN old_l = CHR_SVLEN(*data->longest);
756 GET_RE_DEBUG_FLAGS_DECL;
757
758 PERL_ARGS_ASSERT_SCAN_COMMIT;
759
760 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
761 SvSetMagicSV(*data->longest, data->last_found);
762 if (*data->longest == data->longest_fixed) {
763 data->offset_fixed = l ? data->last_start_min : data->pos_min;
764 if (data->flags & SF_BEFORE_EOL)
765 data->flags
766 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
767 else
768 data->flags &= ~SF_FIX_BEFORE_EOL;
769 data->minlen_fixed=minlenp;
770 data->lookbehind_fixed=0;
771 }
772 else { /* *data->longest == data->longest_float */
773 data->offset_float_min = l ? data->last_start_min : data->pos_min;
774 data->offset_float_max = (l
775 ? data->last_start_max
776 : (data->pos_delta == SSize_t_MAX
777 ? SSize_t_MAX
778 : data->pos_min + data->pos_delta));
779 if (is_inf
780 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
781 data->offset_float_max = SSize_t_MAX;
782 if (data->flags & SF_BEFORE_EOL)
783 data->flags
784 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
785 else
786 data->flags &= ~SF_FL_BEFORE_EOL;
787 data->minlen_float=minlenp;
788 data->lookbehind_float=0;
789 }
790 }
791 SvCUR_set(data->last_found, 0);
792 {
793 SV * const sv = data->last_found;
794 if (SvUTF8(sv) && SvMAGICAL(sv)) {
795 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
796 if (mg)
797 mg->mg_len = 0;
798 }
799 }
800 data->last_end = -1;
801 data->flags &= ~SF_BEFORE_EOL;
802 DEBUG_STUDYDATA("commit: ",data,0);
803}
804
805/* These macros set, clear and test whether the synthetic start class ('ssc',
806 * given by the parameter) matches an empty string (EOS). This uses the
807 * 'next_off' field in the node, to save a bit in the flags field. The ssc
808 * stands alone, so there is never a next_off, so this field is otherwise
809 * unused. The EOS information is used only for compilation, but theoretically
810 * it could be passed on to the execution code. This could be used to store
811 * more than one bit of information, but only this one is currently used. */
812#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
813#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
814#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
815
816/* Can match anything (initialization) */
817STATIC void
818S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
819{
820 PERL_ARGS_ASSERT_CL_ANYTHING;
821
822 ANYOF_BITMAP_SETALL(cl);
823 cl->flags = ANYOF_UNICODE_ALL;
824 SET_SSC_EOS(cl);
825
826 /* If any portion of the regex is to operate under locale rules,
827 * initialization includes it. The reason this isn't done for all regexes
828 * is that the optimizer was written under the assumption that locale was
829 * all-or-nothing. Given the complexity and lack of documentation in the
830 * optimizer, and that there are inadequate test cases for locale, so many
831 * parts of it may not work properly, it is safest to avoid locale unless
832 * necessary. */
833 if (RExC_contains_locale) {
834 ANYOF_CLASS_SETALL(cl); /* /l uses class */
835 cl->flags |= ANYOF_LOCALE|ANYOF_CLASS|ANYOF_LOC_FOLD;
836 }
837 else {
838 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
839 }
840}
841
842/* Can match anything (initialization) */
843STATIC int
844S_cl_is_anything(const struct regnode_charclass_class *cl)
845{
846 int value;
847
848 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
849
850 for (value = 0; value < ANYOF_MAX; value += 2)
851 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
852 return 1;
853 if (!(cl->flags & ANYOF_UNICODE_ALL))
854 return 0;
855 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
856 return 0;
857 return 1;
858}
859
860/* Can match anything (initialization) */
861STATIC void
862S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
863{
864 PERL_ARGS_ASSERT_CL_INIT;
865
866 Zero(cl, 1, struct regnode_charclass_class);
867 cl->type = ANYOF;
868 cl_anything(pRExC_state, cl);
869 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
870}
871
872/* These two functions currently do the exact same thing */
873#define cl_init_zero cl_init
874
875/* 'AND' a given class with another one. Can create false positives. 'cl'
876 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
877 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
878STATIC void
879S_cl_and(struct regnode_charclass_class *cl,
880 const struct regnode_charclass_class *and_with)
881{
882 PERL_ARGS_ASSERT_CL_AND;
883
884 assert(PL_regkind[and_with->type] == ANYOF);
885
886 /* I (khw) am not sure all these restrictions are necessary XXX */
887 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
888 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
889 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
890 && !(and_with->flags & ANYOF_LOC_FOLD)
891 && !(cl->flags & ANYOF_LOC_FOLD)) {
892 int i;
893
894 if (and_with->flags & ANYOF_INVERT)
895 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
896 cl->bitmap[i] &= ~and_with->bitmap[i];
897 else
898 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
899 cl->bitmap[i] &= and_with->bitmap[i];
900 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
901
902 if (and_with->flags & ANYOF_INVERT) {
903
904 /* Here, the and'ed node is inverted. Get the AND of the flags that
905 * aren't affected by the inversion. Those that are affected are
906 * handled individually below */
907 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
908 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
909 cl->flags |= affected_flags;
910
911 /* We currently don't know how to deal with things that aren't in the
912 * bitmap, but we know that the intersection is no greater than what
913 * is already in cl, so let there be false positives that get sorted
914 * out after the synthetic start class succeeds, and the node is
915 * matched for real. */
916
917 /* The inversion of these two flags indicate that the resulting
918 * intersection doesn't have them */
919 if (and_with->flags & ANYOF_UNICODE_ALL) {
920 cl->flags &= ~ANYOF_UNICODE_ALL;
921 }
922 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
923 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
924 }
925 }
926 else { /* and'd node is not inverted */
927 U8 outside_bitmap_but_not_utf8; /* Temp variable */
928
929 if (! ANYOF_NONBITMAP(and_with)) {
930
931 /* Here 'and_with' doesn't match anything outside the bitmap
932 * (except possibly ANYOF_UNICODE_ALL), which means the
933 * intersection can't either, except for ANYOF_UNICODE_ALL, in
934 * which case we don't know what the intersection is, but it's no
935 * greater than what cl already has, so can just leave it alone,
936 * with possible false positives */
937 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
938 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
939 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
940 }
941 }
942 else if (! ANYOF_NONBITMAP(cl)) {
943
944 /* Here, 'and_with' does match something outside the bitmap, and cl
945 * doesn't have a list of things to match outside the bitmap. If
946 * cl can match all code points above 255, the intersection will
947 * be those above-255 code points that 'and_with' matches. If cl
948 * can't match all Unicode code points, it means that it can't
949 * match anything outside the bitmap (since the 'if' that got us
950 * into this block tested for that), so we leave the bitmap empty.
951 */
952 if (cl->flags & ANYOF_UNICODE_ALL) {
953 ARG_SET(cl, ARG(and_with));
954
955 /* and_with's ARG may match things that don't require UTF8.
956 * And now cl's will too, in spite of this being an 'and'. See
957 * the comments below about the kludge */
958 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
959 }
960 }
961 else {
962 /* Here, both 'and_with' and cl match something outside the
963 * bitmap. Currently we do not do the intersection, so just match
964 * whatever cl had at the beginning. */
965 }
966
967
968 /* Take the intersection of the two sets of flags. However, the
969 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
970 * kludge around the fact that this flag is not treated like the others
971 * which are initialized in cl_anything(). The way the optimizer works
972 * is that the synthetic start class (SSC) is initialized to match
973 * anything, and then the first time a real node is encountered, its
974 * values are AND'd with the SSC's with the result being the values of
975 * the real node. However, there are paths through the optimizer where
976 * the AND never gets called, so those initialized bits are set
977 * inappropriately, which is not usually a big deal, as they just cause
978 * false positives in the SSC, which will just mean a probably
979 * imperceptible slow down in execution. However this bit has a
980 * higher false positive consequence in that it can cause utf8.pm,
981 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
982 * bigger slowdown and also causes significant extra memory to be used.
983 * In order to prevent this, the code now takes a different tack. The
984 * bit isn't set unless some part of the regular expression needs it,
985 * but once set it won't get cleared. This means that these extra
986 * modules won't get loaded unless there was some path through the
987 * pattern that would have required them anyway, and so any false
988 * positives that occur by not ANDing them out when they could be
989 * aren't as severe as they would be if we treated this bit like all
990 * the others */
991 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
992 & ANYOF_NONBITMAP_NON_UTF8;
993 cl->flags &= and_with->flags;
994 cl->flags |= outside_bitmap_but_not_utf8;
995 }
996}
997
998/* 'OR' a given class with another one. Can create false positives. 'cl'
999 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
1000 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
1001STATIC void
1002S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
1003{
1004 PERL_ARGS_ASSERT_CL_OR;
1005
1006 if (or_with->flags & ANYOF_INVERT) {
1007
1008 /* Here, the or'd node is to be inverted. This means we take the
1009 * complement of everything not in the bitmap, but currently we don't
1010 * know what that is, so give up and match anything */
1011 if (ANYOF_NONBITMAP(or_with)) {
1012 cl_anything(pRExC_state, cl);
1013 }
1014 /* We do not use
1015 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1016 * <= (B1 | !B2) | (CL1 | !CL2)
1017 * which is wasteful if CL2 is small, but we ignore CL2:
1018 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1019 * XXXX Can we handle case-fold? Unclear:
1020 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1021 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1022 */
1023 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1024 && !(or_with->flags & ANYOF_LOC_FOLD)
1025 && !(cl->flags & ANYOF_LOC_FOLD) ) {
1026 int i;
1027
1028 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1029 cl->bitmap[i] |= ~or_with->bitmap[i];
1030 } /* XXXX: logic is complicated otherwise */
1031 else {
1032 cl_anything(pRExC_state, cl);
1033 }
1034
1035 /* And, we can just take the union of the flags that aren't affected
1036 * by the inversion */
1037 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
1038
1039 /* For the remaining flags:
1040 ANYOF_UNICODE_ALL and inverted means to not match anything above
1041 255, which means that the union with cl should just be
1042 what cl has in it, so can ignore this flag
1043 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
1044 is (ASCII) 127-255 to match them, but then invert that, so
1045 the union with cl should just be what cl has in it, so can
1046 ignore this flag
1047 */
1048 } else { /* 'or_with' is not inverted */
1049 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1050 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
1051 && (!(or_with->flags & ANYOF_LOC_FOLD)
1052 || (cl->flags & ANYOF_LOC_FOLD)) ) {
1053 int i;
1054
1055 /* OR char bitmap and class bitmap separately */
1056 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
1057 cl->bitmap[i] |= or_with->bitmap[i];
1058 if (or_with->flags & ANYOF_CLASS) {
1059 ANYOF_CLASS_OR(or_with, cl);
1060 }
1061 }
1062 else { /* XXXX: logic is complicated, leave it along for a moment. */
1063 cl_anything(pRExC_state, cl);
1064 }
1065
1066 if (ANYOF_NONBITMAP(or_with)) {
1067
1068 /* Use the added node's outside-the-bit-map match if there isn't a
1069 * conflict. If there is a conflict (both nodes match something
1070 * outside the bitmap, but what they match outside is not the same
1071 * pointer, and hence not easily compared until XXX we extend
1072 * inversion lists this far), give up and allow the start class to
1073 * match everything outside the bitmap. If that stuff is all above
1074 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1075 if (! ANYOF_NONBITMAP(cl)) {
1076 ARG_SET(cl, ARG(or_with));
1077 }
1078 else if (ARG(cl) != ARG(or_with)) {
1079
1080 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1081 cl_anything(pRExC_state, cl);
1082 }
1083 else {
1084 cl->flags |= ANYOF_UNICODE_ALL;
1085 }
1086 }
1087 }
1088
1089 /* Take the union */
1090 cl->flags |= or_with->flags;
1091 }
1092}
1093
1094#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1095#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1096#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1097#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1098
1099
1100#ifdef DEBUGGING
1101/*
1102 dump_trie(trie,widecharmap,revcharmap)
1103 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1104 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1105
1106 These routines dump out a trie in a somewhat readable format.
1107 The _interim_ variants are used for debugging the interim
1108 tables that are used to generate the final compressed
1109 representation which is what dump_trie expects.
1110
1111 Part of the reason for their existence is to provide a form
1112 of documentation as to how the different representations function.
1113
1114*/
1115
1116/*
1117 Dumps the final compressed table form of the trie to Perl_debug_log.
1118 Used for debugging make_trie().
1119*/
1120
1121STATIC void
1122S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1123 AV *revcharmap, U32 depth)
1124{
1125 U32 state;
1126 SV *sv=sv_newmortal();
1127 int colwidth= widecharmap ? 6 : 4;
1128 U16 word;
1129 GET_RE_DEBUG_FLAGS_DECL;
1130
1131 PERL_ARGS_ASSERT_DUMP_TRIE;
1132
1133 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1134 (int)depth * 2 + 2,"",
1135 "Match","Base","Ofs" );
1136
1137 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1138 SV ** const tmp = av_fetch( revcharmap, state, 0);
1139 if ( tmp ) {
1140 PerlIO_printf( Perl_debug_log, "%*s",
1141 colwidth,
1142 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1143 PL_colors[0], PL_colors[1],
1144 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1145 PERL_PV_ESCAPE_FIRSTCHAR
1146 )
1147 );
1148 }
1149 }
1150 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1151 (int)depth * 2 + 2,"");
1152
1153 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1154 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1155 PerlIO_printf( Perl_debug_log, "\n");
1156
1157 for( state = 1 ; state < trie->statecount ; state++ ) {
1158 const U32 base = trie->states[ state ].trans.base;
1159
1160 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1161
1162 if ( trie->states[ state ].wordnum ) {
1163 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1164 } else {
1165 PerlIO_printf( Perl_debug_log, "%6s", "" );
1166 }
1167
1168 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1169
1170 if ( base ) {
1171 U32 ofs = 0;
1172
1173 while( ( base + ofs < trie->uniquecharcount ) ||
1174 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1175 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1176 ofs++;
1177
1178 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1179
1180 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1181 if ( ( base + ofs >= trie->uniquecharcount ) &&
1182 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1183 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1184 {
1185 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1186 colwidth,
1187 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1188 } else {
1189 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1190 }
1191 }
1192
1193 PerlIO_printf( Perl_debug_log, "]");
1194
1195 }
1196 PerlIO_printf( Perl_debug_log, "\n" );
1197 }
1198 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1199 for (word=1; word <= trie->wordcount; word++) {
1200 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1201 (int)word, (int)(trie->wordinfo[word].prev),
1202 (int)(trie->wordinfo[word].len));
1203 }
1204 PerlIO_printf(Perl_debug_log, "\n" );
1205}
1206/*
1207 Dumps a fully constructed but uncompressed trie in list form.
1208 List tries normally only are used for construction when the number of
1209 possible chars (trie->uniquecharcount) is very high.
1210 Used for debugging make_trie().
1211*/
1212STATIC void
1213S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1214 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215 U32 depth)
1216{
1217 U32 state;
1218 SV *sv=sv_newmortal();
1219 int colwidth= widecharmap ? 6 : 4;
1220 GET_RE_DEBUG_FLAGS_DECL;
1221
1222 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1223
1224 /* print out the table precompression. */
1225 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1226 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1227 "------:-----+-----------------\n" );
1228
1229 for( state=1 ; state < next_alloc ; state ++ ) {
1230 U16 charid;
1231
1232 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1233 (int)depth * 2 + 2,"", (UV)state );
1234 if ( ! trie->states[ state ].wordnum ) {
1235 PerlIO_printf( Perl_debug_log, "%5s| ","");
1236 } else {
1237 PerlIO_printf( Perl_debug_log, "W%4x| ",
1238 trie->states[ state ].wordnum
1239 );
1240 }
1241 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1242 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1243 if ( tmp ) {
1244 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1245 colwidth,
1246 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1247 PL_colors[0], PL_colors[1],
1248 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1249 PERL_PV_ESCAPE_FIRSTCHAR
1250 ) ,
1251 TRIE_LIST_ITEM(state,charid).forid,
1252 (UV)TRIE_LIST_ITEM(state,charid).newstate
1253 );
1254 if (!(charid % 10))
1255 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1256 (int)((depth * 2) + 14), "");
1257 }
1258 }
1259 PerlIO_printf( Perl_debug_log, "\n");
1260 }
1261}
1262
1263/*
1264 Dumps a fully constructed but uncompressed trie in table form.
1265 This is the normal DFA style state transition table, with a few
1266 twists to facilitate compression later.
1267 Used for debugging make_trie().
1268*/
1269STATIC void
1270S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1271 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1272 U32 depth)
1273{
1274 U32 state;
1275 U16 charid;
1276 SV *sv=sv_newmortal();
1277 int colwidth= widecharmap ? 6 : 4;
1278 GET_RE_DEBUG_FLAGS_DECL;
1279
1280 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1281
1282 /*
1283 print out the table precompression so that we can do a visual check
1284 that they are identical.
1285 */
1286
1287 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1288
1289 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1291 if ( tmp ) {
1292 PerlIO_printf( Perl_debug_log, "%*s",
1293 colwidth,
1294 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1295 PL_colors[0], PL_colors[1],
1296 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1297 PERL_PV_ESCAPE_FIRSTCHAR
1298 )
1299 );
1300 }
1301 }
1302
1303 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1304
1305 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1306 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1307 }
1308
1309 PerlIO_printf( Perl_debug_log, "\n" );
1310
1311 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1312
1313 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1314 (int)depth * 2 + 2,"",
1315 (UV)TRIE_NODENUM( state ) );
1316
1317 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1318 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1319 if (v)
1320 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1321 else
1322 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1323 }
1324 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1325 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1326 } else {
1327 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1328 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1329 }
1330 }
1331}
1332
1333#endif
1334
1335
1336/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1337 startbranch: the first branch in the whole branch sequence
1338 first : start branch of sequence of branch-exact nodes.
1339 May be the same as startbranch
1340 last : Thing following the last branch.
1341 May be the same as tail.
1342 tail : item following the branch sequence
1343 count : words in the sequence
1344 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1345 depth : indent depth
1346
1347Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1348
1349A trie is an N'ary tree where the branches are determined by digital
1350decomposition of the key. IE, at the root node you look up the 1st character and
1351follow that branch repeat until you find the end of the branches. Nodes can be
1352marked as "accepting" meaning they represent a complete word. Eg:
1353
1354 /he|she|his|hers/
1355
1356would convert into the following structure. Numbers represent states, letters
1357following numbers represent valid transitions on the letter from that state, if
1358the number is in square brackets it represents an accepting state, otherwise it
1359will be in parenthesis.
1360
1361 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1362 | |
1363 | (2)
1364 | |
1365 (1) +-i->(6)-+-s->[7]
1366 |
1367 +-s->(3)-+-h->(4)-+-e->[5]
1368
1369 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1370
1371This shows that when matching against the string 'hers' we will begin at state 1
1372read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1373then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1374is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1375single traverse. We store a mapping from accepting to state to which word was
1376matched, and then when we have multiple possibilities we try to complete the
1377rest of the regex in the order in which they occured in the alternation.
1378
1379The only prior NFA like behaviour that would be changed by the TRIE support is
1380the silent ignoring of duplicate alternations which are of the form:
1381
1382 / (DUPE|DUPE) X? (?{ ... }) Y /x
1383
1384Thus EVAL blocks following a trie may be called a different number of times with
1385and without the optimisation. With the optimisations dupes will be silently
1386ignored. This inconsistent behaviour of EVAL type nodes is well established as
1387the following demonstrates:
1388
1389 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1390
1391which prints out 'word' three times, but
1392
1393 'words'=~/(word|word|word)(?{ print $1 })S/
1394
1395which doesnt print it out at all. This is due to other optimisations kicking in.
1396
1397Example of what happens on a structural level:
1398
1399The regexp /(ac|ad|ab)+/ will produce the following debug output:
1400
1401 1: CURLYM[1] {1,32767}(18)
1402 5: BRANCH(8)
1403 6: EXACT <ac>(16)
1404 8: BRANCH(11)
1405 9: EXACT <ad>(16)
1406 11: BRANCH(14)
1407 12: EXACT <ab>(16)
1408 16: SUCCEED(0)
1409 17: NOTHING(18)
1410 18: END(0)
1411
1412This would be optimizable with startbranch=5, first=5, last=16, tail=16
1413and should turn into:
1414
1415 1: CURLYM[1] {1,32767}(18)
1416 5: TRIE(16)
1417 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1418 <ac>
1419 <ad>
1420 <ab>
1421 16: SUCCEED(0)
1422 17: NOTHING(18)
1423 18: END(0)
1424
1425Cases where tail != last would be like /(?foo|bar)baz/:
1426
1427 1: BRANCH(4)
1428 2: EXACT <foo>(8)
1429 4: BRANCH(7)
1430 5: EXACT <bar>(8)
1431 7: TAIL(8)
1432 8: EXACT <baz>(10)
1433 10: END(0)
1434
1435which would be optimizable with startbranch=1, first=1, last=7, tail=8
1436and would end up looking like:
1437
1438 1: TRIE(8)
1439 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1440 <foo>
1441 <bar>
1442 7: TAIL(8)
1443 8: EXACT <baz>(10)
1444 10: END(0)
1445
1446 d = uvchr_to_utf8_flags(d, uv, 0);
1447
1448is the recommended Unicode-aware way of saying
1449
1450 *(d++) = uv;
1451*/
1452
1453#define TRIE_STORE_REVCHAR(val) \
1454 STMT_START { \
1455 if (UTF) { \
1456 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1457 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1458 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1459 SvCUR_set(zlopp, kapow - flrbbbbb); \
1460 SvPOK_on(zlopp); \
1461 SvUTF8_on(zlopp); \
1462 av_push(revcharmap, zlopp); \
1463 } else { \
1464 char ooooff = (char)val; \
1465 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1466 } \
1467 } STMT_END
1468
1469/* This gets the next character from the input, folding it if not already
1470 * folded. */
1471#define TRIE_READ_CHAR STMT_START { \
1472 wordlen++; \
1473 if ( UTF ) { \
1474 /* if it is UTF then it is either already folded, or does not need \
1475 * folding */ \
1476 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1477 } \
1478 else if (folder == PL_fold_latin1) { \
1479 /* This folder implies Unicode rules, which in the range expressible \
1480 * by not UTF is the lower case, with the two exceptions, one of \
1481 * which should have been taken care of before calling this */ \
1482 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1483 uvc = toLOWER_L1(*uc); \
1484 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1485 len = 1; \
1486 } else { \
1487 /* raw data, will be folded later if needed */ \
1488 uvc = (U32)*uc; \
1489 len = 1; \
1490 } \
1491} STMT_END
1492
1493
1494
1495#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1496 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1497 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1498 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1499 } \
1500 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1501 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1502 TRIE_LIST_CUR( state )++; \
1503} STMT_END
1504
1505#define TRIE_LIST_NEW(state) STMT_START { \
1506 Newxz( trie->states[ state ].trans.list, \
1507 4, reg_trie_trans_le ); \
1508 TRIE_LIST_CUR( state ) = 1; \
1509 TRIE_LIST_LEN( state ) = 4; \
1510} STMT_END
1511
1512#define TRIE_HANDLE_WORD(state) STMT_START { \
1513 U16 dupe= trie->states[ state ].wordnum; \
1514 regnode * const noper_next = regnext( noper ); \
1515 \
1516 DEBUG_r({ \
1517 /* store the word for dumping */ \
1518 SV* tmp; \
1519 if (OP(noper) != NOTHING) \
1520 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1521 else \
1522 tmp = newSVpvn_utf8( "", 0, UTF ); \
1523 av_push( trie_words, tmp ); \
1524 }); \
1525 \
1526 curword++; \
1527 trie->wordinfo[curword].prev = 0; \
1528 trie->wordinfo[curword].len = wordlen; \
1529 trie->wordinfo[curword].accept = state; \
1530 \
1531 if ( noper_next < tail ) { \
1532 if (!trie->jump) \
1533 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1534 trie->jump[curword] = (U16)(noper_next - convert); \
1535 if (!jumper) \
1536 jumper = noper_next; \
1537 if (!nextbranch) \
1538 nextbranch= regnext(cur); \
1539 } \
1540 \
1541 if ( dupe ) { \
1542 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1543 /* chain, so that when the bits of chain are later */\
1544 /* linked together, the dups appear in the chain */\
1545 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1546 trie->wordinfo[dupe].prev = curword; \
1547 } else { \
1548 /* we haven't inserted this word yet. */ \
1549 trie->states[ state ].wordnum = curword; \
1550 } \
1551} STMT_END
1552
1553
1554#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1555 ( ( base + charid >= ucharcount \
1556 && base + charid < ubound \
1557 && state == trie->trans[ base - ucharcount + charid ].check \
1558 && trie->trans[ base - ucharcount + charid ].next ) \
1559 ? trie->trans[ base - ucharcount + charid ].next \
1560 : ( state==1 ? special : 0 ) \
1561 )
1562
1563#define MADE_TRIE 1
1564#define MADE_JUMP_TRIE 2
1565#define MADE_EXACT_TRIE 4
1566
1567STATIC I32
1568S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1569{
1570 dVAR;
1571 /* first pass, loop through and scan words */
1572 reg_trie_data *trie;
1573 HV *widecharmap = NULL;
1574 AV *revcharmap = newAV();
1575 regnode *cur;
1576 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1577 STRLEN len = 0;
1578 UV uvc = 0;
1579 U16 curword = 0;
1580 U32 next_alloc = 0;
1581 regnode *jumper = NULL;
1582 regnode *nextbranch = NULL;
1583 regnode *convert = NULL;
1584 U32 *prev_states; /* temp array mapping each state to previous one */
1585 /* we just use folder as a flag in utf8 */
1586 const U8 * folder = NULL;
1587
1588#ifdef DEBUGGING
1589 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1590 AV *trie_words = NULL;
1591 /* along with revcharmap, this only used during construction but both are
1592 * useful during debugging so we store them in the struct when debugging.
1593 */
1594#else
1595 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1596 STRLEN trie_charcount=0;
1597#endif
1598 SV *re_trie_maxbuff;
1599 GET_RE_DEBUG_FLAGS_DECL;
1600
1601 PERL_ARGS_ASSERT_MAKE_TRIE;
1602#ifndef DEBUGGING
1603 PERL_UNUSED_ARG(depth);
1604#endif
1605
1606 switch (flags) {
1607 case EXACT: break;
1608 case EXACTFA:
1609 case EXACTFU_SS:
1610 case EXACTFU: folder = PL_fold_latin1; break;
1611 case EXACTF: folder = PL_fold; break;
1612 case EXACTFL: folder = PL_fold_locale; break;
1613 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1614 }
1615
1616 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1617 trie->refcount = 1;
1618 trie->startstate = 1;
1619 trie->wordcount = word_count;
1620 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1621 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1622 if (flags == EXACT)
1623 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1624 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626
1627 DEBUG_r({
1628 trie_words = newAV();
1629 });
1630
1631 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1632 if (!SvIOK(re_trie_maxbuff)) {
1633 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1634 }
1635 DEBUG_TRIE_COMPILE_r({
1636 PerlIO_printf( Perl_debug_log,
1637 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1638 (int)depth * 2 + 2, "",
1639 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1640 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1641 (int)depth);
1642 });
1643
1644 /* Find the node we are going to overwrite */
1645 if ( first == startbranch && OP( last ) != BRANCH ) {
1646 /* whole branch chain */
1647 convert = first;
1648 } else {
1649 /* branch sub-chain */
1650 convert = NEXTOPER( first );
1651 }
1652
1653 /* -- First loop and Setup --
1654
1655 We first traverse the branches and scan each word to determine if it
1656 contains widechars, and how many unique chars there are, this is
1657 important as we have to build a table with at least as many columns as we
1658 have unique chars.
1659
1660 We use an array of integers to represent the character codes 0..255
1661 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1662 native representation of the character value as the key and IV's for the
1663 coded index.
1664
1665 *TODO* If we keep track of how many times each character is used we can
1666 remap the columns so that the table compression later on is more
1667 efficient in terms of memory by ensuring the most common value is in the
1668 middle and the least common are on the outside. IMO this would be better
1669 than a most to least common mapping as theres a decent chance the most
1670 common letter will share a node with the least common, meaning the node
1671 will not be compressible. With a middle is most common approach the worst
1672 case is when we have the least common nodes twice.
1673
1674 */
1675
1676 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1677 regnode *noper = NEXTOPER( cur );
1678 const U8 *uc = (U8*)STRING( noper );
1679 const U8 *e = uc + STR_LEN( noper );
1680 STRLEN foldlen = 0;
1681 U32 wordlen = 0; /* required init */
1682 STRLEN minbytes = 0;
1683 STRLEN maxbytes = 0;
1684 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1685
1686 if (OP(noper) == NOTHING) {
1687 regnode *noper_next= regnext(noper);
1688 if (noper_next != tail && OP(noper_next) == flags) {
1689 noper = noper_next;
1690 uc= (U8*)STRING(noper);
1691 e= uc + STR_LEN(noper);
1692 trie->minlen= STR_LEN(noper);
1693 } else {
1694 trie->minlen= 0;
1695 continue;
1696 }
1697 }
1698
1699 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1700 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1701 regardless of encoding */
1702 if (OP( noper ) == EXACTFU_SS) {
1703 /* false positives are ok, so just set this */
1704 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
1705 }
1706 }
1707 for ( ; uc < e ; uc += len ) {
1708 TRIE_CHARCOUNT(trie)++;
1709 TRIE_READ_CHAR;
1710
1711 /* Acummulate to the current values, the range in the number of
1712 * bytes that this character could match. The max is presumed to
1713 * be the same as the folded input (which TRIE_READ_CHAR returns),
1714 * except that when this is not in UTF-8, it could be matched
1715 * against a string which is UTF-8, and the variant characters
1716 * could be 2 bytes instead of the 1 here. Likewise, for the
1717 * minimum number of bytes when not folded. When folding, the min
1718 * is assumed to be 1 byte could fold to match the single character
1719 * here, or in the case of a multi-char fold, 1 byte can fold to
1720 * the whole sequence. 'foldlen' is used to denote whether we are
1721 * in such a sequence, skipping the min setting if so. XXX TODO
1722 * Use the exact list of what folds to each character, from
1723 * PL_utf8_foldclosures */
1724 if (UTF) {
1725 maxbytes += UTF8SKIP(uc);
1726 if (! folder) {
1727 /* A non-UTF-8 string could be 1 byte to match our 2 */
1728 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1729 ? 1
1730 : UTF8SKIP(uc);
1731 }
1732 else {
1733 if (foldlen) {
1734 foldlen -= UTF8SKIP(uc);
1735 }
1736 else {
1737 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1738 minbytes++;
1739 }
1740 }
1741 }
1742 else {
1743 maxbytes += (UNI_IS_INVARIANT(*uc))
1744 ? 1
1745 : 2;
1746 if (! folder) {
1747 minbytes++;
1748 }
1749 else {
1750 if (foldlen) {
1751 foldlen--;
1752 }
1753 else {
1754 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1755 minbytes++;
1756 }
1757 }
1758 }
1759 if ( uvc < 256 ) {
1760 if ( folder ) {
1761 U8 folded= folder[ (U8) uvc ];
1762 if ( !trie->charmap[ folded ] ) {
1763 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1764 TRIE_STORE_REVCHAR( folded );
1765 }
1766 }
1767 if ( !trie->charmap[ uvc ] ) {
1768 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1769 TRIE_STORE_REVCHAR( uvc );
1770 }
1771 if ( set_bit ) {
1772 /* store the codepoint in the bitmap, and its folded
1773 * equivalent. */
1774 TRIE_BITMAP_SET(trie, uvc);
1775
1776 /* store the folded codepoint */
1777 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1778
1779 if ( !UTF ) {
1780 /* store first byte of utf8 representation of
1781 variant codepoints */
1782 if (! NATIVE_IS_INVARIANT(uvc)) {
1783 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1784 }
1785 }
1786 set_bit = 0; /* We've done our bit :-) */
1787 }
1788 } else {
1789 SV** svpp;
1790 if ( !widecharmap )
1791 widecharmap = newHV();
1792
1793 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1794
1795 if ( !svpp )
1796 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1797
1798 if ( !SvTRUE( *svpp ) ) {
1799 sv_setiv( *svpp, ++trie->uniquecharcount );
1800 TRIE_STORE_REVCHAR(uvc);
1801 }
1802 }
1803 }
1804 if( cur == first ) {
1805 trie->minlen = minbytes;
1806 trie->maxlen = maxbytes;
1807 } else if (minbytes < trie->minlen) {
1808 trie->minlen = minbytes;
1809 } else if (maxbytes > trie->maxlen) {
1810 trie->maxlen = maxbytes;
1811 }
1812 } /* end first pass */
1813 DEBUG_TRIE_COMPILE_r(
1814 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1815 (int)depth * 2 + 2,"",
1816 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1817 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1818 (int)trie->minlen, (int)trie->maxlen )
1819 );
1820
1821 /*
1822 We now know what we are dealing with in terms of unique chars and
1823 string sizes so we can calculate how much memory a naive
1824 representation using a flat table will take. If it's over a reasonable
1825 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1826 conservative but potentially much slower representation using an array
1827 of lists.
1828
1829 At the end we convert both representations into the same compressed
1830 form that will be used in regexec.c for matching with. The latter
1831 is a form that cannot be used to construct with but has memory
1832 properties similar to the list form and access properties similar
1833 to the table form making it both suitable for fast searches and
1834 small enough that its feasable to store for the duration of a program.
1835
1836 See the comment in the code where the compressed table is produced
1837 inplace from the flat tabe representation for an explanation of how
1838 the compression works.
1839
1840 */
1841
1842
1843 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1844 prev_states[1] = 0;
1845
1846 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1847 /*
1848 Second Pass -- Array Of Lists Representation
1849
1850 Each state will be represented by a list of charid:state records
1851 (reg_trie_trans_le) the first such element holds the CUR and LEN
1852 points of the allocated array. (See defines above).
1853
1854 We build the initial structure using the lists, and then convert
1855 it into the compressed table form which allows faster lookups
1856 (but cant be modified once converted).
1857 */
1858
1859 STRLEN transcount = 1;
1860
1861 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1862 "%*sCompiling trie using list compiler\n",
1863 (int)depth * 2 + 2, ""));
1864
1865 trie->states = (reg_trie_state *)
1866 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1867 sizeof(reg_trie_state) );
1868 TRIE_LIST_NEW(1);
1869 next_alloc = 2;
1870
1871 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1872
1873 regnode *noper = NEXTOPER( cur );
1874 U8 *uc = (U8*)STRING( noper );
1875 const U8 *e = uc + STR_LEN( noper );
1876 U32 state = 1; /* required init */
1877 U16 charid = 0; /* sanity init */
1878 U32 wordlen = 0; /* required init */
1879
1880 if (OP(noper) == NOTHING) {
1881 regnode *noper_next= regnext(noper);
1882 if (noper_next != tail && OP(noper_next) == flags) {
1883 noper = noper_next;
1884 uc= (U8*)STRING(noper);
1885 e= uc + STR_LEN(noper);
1886 }
1887 }
1888
1889 if (OP(noper) != NOTHING) {
1890 for ( ; uc < e ; uc += len ) {
1891
1892 TRIE_READ_CHAR;
1893
1894 if ( uvc < 256 ) {
1895 charid = trie->charmap[ uvc ];
1896 } else {
1897 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1898 if ( !svpp ) {
1899 charid = 0;
1900 } else {
1901 charid=(U16)SvIV( *svpp );
1902 }
1903 }
1904 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1905 if ( charid ) {
1906
1907 U16 check;
1908 U32 newstate = 0;
1909
1910 charid--;
1911 if ( !trie->states[ state ].trans.list ) {
1912 TRIE_LIST_NEW( state );
1913 }
1914 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1915 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1916 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1917 break;
1918 }
1919 }
1920 if ( ! newstate ) {
1921 newstate = next_alloc++;
1922 prev_states[newstate] = state;
1923 TRIE_LIST_PUSH( state, charid, newstate );
1924 transcount++;
1925 }
1926 state = newstate;
1927 } else {
1928 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1929 }
1930 }
1931 }
1932 TRIE_HANDLE_WORD(state);
1933
1934 } /* end second pass */
1935
1936 /* next alloc is the NEXT state to be allocated */
1937 trie->statecount = next_alloc;
1938 trie->states = (reg_trie_state *)
1939 PerlMemShared_realloc( trie->states,
1940 next_alloc
1941 * sizeof(reg_trie_state) );
1942
1943 /* and now dump it out before we compress it */
1944 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1945 revcharmap, next_alloc,
1946 depth+1)
1947 );
1948
1949 trie->trans = (reg_trie_trans *)
1950 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1951 {
1952 U32 state;
1953 U32 tp = 0;
1954 U32 zp = 0;
1955
1956
1957 for( state=1 ; state < next_alloc ; state ++ ) {
1958 U32 base=0;
1959
1960 /*
1961 DEBUG_TRIE_COMPILE_MORE_r(
1962 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1963 );
1964 */
1965
1966 if (trie->states[state].trans.list) {
1967 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1968 U16 maxid=minid;
1969 U16 idx;
1970
1971 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1972 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1973 if ( forid < minid ) {
1974 minid=forid;
1975 } else if ( forid > maxid ) {
1976 maxid=forid;
1977 }
1978 }
1979 if ( transcount < tp + maxid - minid + 1) {
1980 transcount *= 2;
1981 trie->trans = (reg_trie_trans *)
1982 PerlMemShared_realloc( trie->trans,
1983 transcount
1984 * sizeof(reg_trie_trans) );
1985 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1986 }
1987 base = trie->uniquecharcount + tp - minid;
1988 if ( maxid == minid ) {
1989 U32 set = 0;
1990 for ( ; zp < tp ; zp++ ) {
1991 if ( ! trie->trans[ zp ].next ) {
1992 base = trie->uniquecharcount + zp - minid;
1993 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1994 trie->trans[ zp ].check = state;
1995 set = 1;
1996 break;
1997 }
1998 }
1999 if ( !set ) {
2000 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2001 trie->trans[ tp ].check = state;
2002 tp++;
2003 zp = tp;
2004 }
2005 } else {
2006 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2007 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
2008 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2009 trie->trans[ tid ].check = state;
2010 }
2011 tp += ( maxid - minid + 1 );
2012 }
2013 Safefree(trie->states[ state ].trans.list);
2014 }
2015 /*
2016 DEBUG_TRIE_COMPILE_MORE_r(
2017 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2018 );
2019 */
2020 trie->states[ state ].trans.base=base;
2021 }
2022 trie->lasttrans = tp + 1;
2023 }
2024 } else {
2025 /*
2026 Second Pass -- Flat Table Representation.
2027
2028 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2029 We know that we will need Charcount+1 trans at most to store the data
2030 (one row per char at worst case) So we preallocate both structures
2031 assuming worst case.
2032
2033 We then construct the trie using only the .next slots of the entry
2034 structs.
2035
2036 We use the .check field of the first entry of the node temporarily to
2037 make compression both faster and easier by keeping track of how many non
2038 zero fields are in the node.
2039
2040 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2041 transition.
2042
2043 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2044 number representing the first entry of the node, and state as a
2045 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2046 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2047 are 2 entrys per node. eg:
2048
2049 A B A B
2050 1. 2 4 1. 3 7
2051 2. 0 3 3. 0 5
2052 3. 0 0 5. 0 0
2053 4. 0 0 7. 0 0
2054
2055 The table is internally in the right hand, idx form. However as we also
2056 have to deal with the states array which is indexed by nodenum we have to
2057 use TRIE_NODENUM() to convert.
2058
2059 */
2060 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2061 "%*sCompiling trie using table compiler\n",
2062 (int)depth * 2 + 2, ""));
2063
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2066 * trie->uniquecharcount + 1,
2067 sizeof(reg_trie_trans) );
2068 trie->states = (reg_trie_state *)
2069 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2070 sizeof(reg_trie_state) );
2071 next_alloc = trie->uniquecharcount + 1;
2072
2073
2074 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2075
2076 regnode *noper = NEXTOPER( cur );
2077 const U8 *uc = (U8*)STRING( noper );
2078 const U8 *e = uc + STR_LEN( noper );
2079
2080 U32 state = 1; /* required init */
2081
2082 U16 charid = 0; /* sanity init */
2083 U32 accept_state = 0; /* sanity init */
2084
2085 U32 wordlen = 0; /* required init */
2086
2087 if (OP(noper) == NOTHING) {
2088 regnode *noper_next= regnext(noper);
2089 if (noper_next != tail && OP(noper_next) == flags) {
2090 noper = noper_next;
2091 uc= (U8*)STRING(noper);
2092 e= uc + STR_LEN(noper);
2093 }
2094 }
2095
2096 if ( OP(noper) != NOTHING ) {
2097 for ( ; uc < e ; uc += len ) {
2098
2099 TRIE_READ_CHAR;
2100
2101 if ( uvc < 256 ) {
2102 charid = trie->charmap[ uvc ];
2103 } else {
2104 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2105 charid = svpp ? (U16)SvIV(*svpp) : 0;
2106 }
2107 if ( charid ) {
2108 charid--;
2109 if ( !trie->trans[ state + charid ].next ) {
2110 trie->trans[ state + charid ].next = next_alloc;
2111 trie->trans[ state ].check++;
2112 prev_states[TRIE_NODENUM(next_alloc)]
2113 = TRIE_NODENUM(state);
2114 next_alloc += trie->uniquecharcount;
2115 }
2116 state = trie->trans[ state + charid ].next;
2117 } else {
2118 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2119 }
2120 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2121 }
2122 }
2123 accept_state = TRIE_NODENUM( state );
2124 TRIE_HANDLE_WORD(accept_state);
2125
2126 } /* end second pass */
2127
2128 /* and now dump it out before we compress it */
2129 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2130 revcharmap,
2131 next_alloc, depth+1));
2132
2133 {
2134 /*
2135 * Inplace compress the table.*
2136
2137 For sparse data sets the table constructed by the trie algorithm will
2138 be mostly 0/FAIL transitions or to put it another way mostly empty.
2139 (Note that leaf nodes will not contain any transitions.)
2140
2141 This algorithm compresses the tables by eliminating most such
2142 transitions, at the cost of a modest bit of extra work during lookup:
2143
2144 - Each states[] entry contains a .base field which indicates the
2145 index in the state[] array wheres its transition data is stored.
2146
2147 - If .base is 0 there are no valid transitions from that node.
2148
2149 - If .base is nonzero then charid is added to it to find an entry in
2150 the trans array.
2151
2152 -If trans[states[state].base+charid].check!=state then the
2153 transition is taken to be a 0/Fail transition. Thus if there are fail
2154 transitions at the front of the node then the .base offset will point
2155 somewhere inside the previous nodes data (or maybe even into a node
2156 even earlier), but the .check field determines if the transition is
2157 valid.
2158
2159 XXX - wrong maybe?
2160 The following process inplace converts the table to the compressed
2161 table: We first do not compress the root node 1,and mark all its
2162 .check pointers as 1 and set its .base pointer as 1 as well. This
2163 allows us to do a DFA construction from the compressed table later,
2164 and ensures that any .base pointers we calculate later are greater
2165 than 0.
2166
2167 - We set 'pos' to indicate the first entry of the second node.
2168
2169 - We then iterate over the columns of the node, finding the first and
2170 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2171 and set the .check pointers accordingly, and advance pos
2172 appropriately and repreat for the next node. Note that when we copy
2173 the next pointers we have to convert them from the original
2174 NODEIDX form to NODENUM form as the former is not valid post
2175 compression.
2176
2177 - If a node has no transitions used we mark its base as 0 and do not
2178 advance the pos pointer.
2179
2180 - If a node only has one transition we use a second pointer into the
2181 structure to fill in allocated fail transitions from other states.
2182 This pointer is independent of the main pointer and scans forward
2183 looking for null transitions that are allocated to a state. When it
2184 finds one it writes the single transition into the "hole". If the
2185 pointer doesnt find one the single transition is appended as normal.
2186
2187 - Once compressed we can Renew/realloc the structures to release the
2188 excess space.
2189
2190 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2191 specifically Fig 3.47 and the associated pseudocode.
2192
2193 demq
2194 */
2195 const U32 laststate = TRIE_NODENUM( next_alloc );
2196 U32 state, charid;
2197 U32 pos = 0, zp=0;
2198 trie->statecount = laststate;
2199
2200 for ( state = 1 ; state < laststate ; state++ ) {
2201 U8 flag = 0;
2202 const U32 stateidx = TRIE_NODEIDX( state );
2203 const U32 o_used = trie->trans[ stateidx ].check;
2204 U32 used = trie->trans[ stateidx ].check;
2205 trie->trans[ stateidx ].check = 0;
2206
2207 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2208 if ( flag || trie->trans[ stateidx + charid ].next ) {
2209 if ( trie->trans[ stateidx + charid ].next ) {
2210 if (o_used == 1) {
2211 for ( ; zp < pos ; zp++ ) {
2212 if ( ! trie->trans[ zp ].next ) {
2213 break;
2214 }
2215 }
2216 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2217 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2218 trie->trans[ zp ].check = state;
2219 if ( ++zp > pos ) pos = zp;
2220 break;
2221 }
2222 used--;
2223 }
2224 if ( !flag ) {
2225 flag = 1;
2226 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2227 }
2228 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2229 trie->trans[ pos ].check = state;
2230 pos++;
2231 }
2232 }
2233 }
2234 trie->lasttrans = pos + 1;
2235 trie->states = (reg_trie_state *)
2236 PerlMemShared_realloc( trie->states, laststate
2237 * sizeof(reg_trie_state) );
2238 DEBUG_TRIE_COMPILE_MORE_r(
2239 PerlIO_printf( Perl_debug_log,
2240 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2241 (int)depth * 2 + 2,"",
2242 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2243 (IV)next_alloc,
2244 (IV)pos,
2245 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2246 );
2247
2248 } /* end table compress */
2249 }
2250 DEBUG_TRIE_COMPILE_MORE_r(
2251 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2252 (int)depth * 2 + 2, "",
2253 (UV)trie->statecount,
2254 (UV)trie->lasttrans)
2255 );
2256 /* resize the trans array to remove unused space */
2257 trie->trans = (reg_trie_trans *)
2258 PerlMemShared_realloc( trie->trans, trie->lasttrans
2259 * sizeof(reg_trie_trans) );
2260
2261 { /* Modify the program and insert the new TRIE node */
2262 U8 nodetype =(U8)(flags & 0xFF);
2263 char *str=NULL;
2264
2265#ifdef DEBUGGING
2266 regnode *optimize = NULL;
2267#ifdef RE_TRACK_PATTERN_OFFSETS
2268
2269 U32 mjd_offset = 0;
2270 U32 mjd_nodelen = 0;
2271#endif /* RE_TRACK_PATTERN_OFFSETS */
2272#endif /* DEBUGGING */
2273 /*
2274 This means we convert either the first branch or the first Exact,
2275 depending on whether the thing following (in 'last') is a branch
2276 or not and whther first is the startbranch (ie is it a sub part of
2277 the alternation or is it the whole thing.)
2278 Assuming its a sub part we convert the EXACT otherwise we convert
2279 the whole branch sequence, including the first.
2280 */
2281 /* Find the node we are going to overwrite */
2282 if ( first != startbranch || OP( last ) == BRANCH ) {
2283 /* branch sub-chain */
2284 NEXT_OFF( first ) = (U16)(last - first);
2285#ifdef RE_TRACK_PATTERN_OFFSETS
2286 DEBUG_r({
2287 mjd_offset= Node_Offset((convert));
2288 mjd_nodelen= Node_Length((convert));
2289 });
2290#endif
2291 /* whole branch chain */
2292 }
2293#ifdef RE_TRACK_PATTERN_OFFSETS
2294 else {
2295 DEBUG_r({
2296 const regnode *nop = NEXTOPER( convert );
2297 mjd_offset= Node_Offset((nop));
2298 mjd_nodelen= Node_Length((nop));
2299 });
2300 }
2301 DEBUG_OPTIMISE_r(
2302 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2303 (int)depth * 2 + 2, "",
2304 (UV)mjd_offset, (UV)mjd_nodelen)
2305 );
2306#endif
2307 /* But first we check to see if there is a common prefix we can
2308 split out as an EXACT and put in front of the TRIE node. */
2309 trie->startstate= 1;
2310 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2311 U32 state;
2312 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2313 U32 ofs = 0;
2314 I32 idx = -1;
2315 U32 count = 0;
2316 const U32 base = trie->states[ state ].trans.base;
2317
2318 if ( trie->states[state].wordnum )
2319 count = 1;
2320
2321 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2322 if ( ( base + ofs >= trie->uniquecharcount ) &&
2323 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2324 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2325 {
2326 if ( ++count > 1 ) {
2327 SV **tmp = av_fetch( revcharmap, ofs, 0);
2328 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2329 if ( state == 1 ) break;
2330 if ( count == 2 ) {
2331 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2332 DEBUG_OPTIMISE_r(
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sNew Start State=%"UVuf" Class: [",
2335 (int)depth * 2 + 2, "",
2336 (UV)state));
2337 if (idx >= 0) {
2338 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2339 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2340
2341 TRIE_BITMAP_SET(trie,*ch);
2342 if ( folder )
2343 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2344 DEBUG_OPTIMISE_r(
2345 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2346 );
2347 }
2348 }
2349 TRIE_BITMAP_SET(trie,*ch);
2350 if ( folder )
2351 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2352 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2353 }
2354 idx = ofs;
2355 }
2356 }
2357 if ( count == 1 ) {
2358 SV **tmp = av_fetch( revcharmap, idx, 0);
2359 STRLEN len;
2360 char *ch = SvPV( *tmp, len );
2361 DEBUG_OPTIMISE_r({
2362 SV *sv=sv_newmortal();
2363 PerlIO_printf( Perl_debug_log,
2364 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2365 (int)depth * 2 + 2, "",
2366 (UV)state, (UV)idx,
2367 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2368 PL_colors[0], PL_colors[1],
2369 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370 PERL_PV_ESCAPE_FIRSTCHAR
2371 )
2372 );
2373 });
2374 if ( state==1 ) {
2375 OP( convert ) = nodetype;
2376 str=STRING(convert);
2377 STR_LEN(convert)=0;
2378 }
2379 STR_LEN(convert) += len;
2380 while (len--)
2381 *str++ = *ch++;
2382 } else {
2383#ifdef DEBUGGING
2384 if (state>1)
2385 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2386#endif
2387 break;
2388 }
2389 }
2390 trie->prefixlen = (state-1);
2391 if (str) {
2392 regnode *n = convert+NODE_SZ_STR(convert);
2393 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2394 trie->startstate = state;
2395 trie->minlen -= (state - 1);
2396 trie->maxlen -= (state - 1);
2397#ifdef DEBUGGING
2398 /* At least the UNICOS C compiler choked on this
2399 * being argument to DEBUG_r(), so let's just have
2400 * it right here. */
2401 if (
2402#ifdef PERL_EXT_RE_BUILD
2403 1
2404#else
2405 DEBUG_r_TEST
2406#endif
2407 ) {
2408 regnode *fix = convert;
2409 U32 word = trie->wordcount;
2410 mjd_nodelen++;
2411 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2412 while( ++fix < n ) {
2413 Set_Node_Offset_Length(fix, 0, 0);
2414 }
2415 while (word--) {
2416 SV ** const tmp = av_fetch( trie_words, word, 0 );
2417 if (tmp) {
2418 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2419 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2420 else
2421 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2422 }
2423 }
2424 }
2425#endif
2426 if (trie->maxlen) {
2427 convert = n;
2428 } else {
2429 NEXT_OFF(convert) = (U16)(tail - convert);
2430 DEBUG_r(optimize= n);
2431 }
2432 }
2433 }
2434 if (!jumper)
2435 jumper = last;
2436 if ( trie->maxlen ) {
2437 NEXT_OFF( convert ) = (U16)(tail - convert);
2438 ARG_SET( convert, data_slot );
2439 /* Store the offset to the first unabsorbed branch in
2440 jump[0], which is otherwise unused by the jump logic.
2441 We use this when dumping a trie and during optimisation. */
2442 if (trie->jump)
2443 trie->jump[0] = (U16)(nextbranch - convert);
2444
2445 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2446 * and there is a bitmap
2447 * and the first "jump target" node we found leaves enough room
2448 * then convert the TRIE node into a TRIEC node, with the bitmap
2449 * embedded inline in the opcode - this is hypothetically faster.
2450 */
2451 if ( !trie->states[trie->startstate].wordnum
2452 && trie->bitmap
2453 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2454 {
2455 OP( convert ) = TRIEC;
2456 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2457 PerlMemShared_free(trie->bitmap);
2458 trie->bitmap= NULL;
2459 } else
2460 OP( convert ) = TRIE;
2461
2462 /* store the type in the flags */
2463 convert->flags = nodetype;
2464 DEBUG_r({
2465 optimize = convert
2466 + NODE_STEP_REGNODE
2467 + regarglen[ OP( convert ) ];
2468 });
2469 /* XXX We really should free up the resource in trie now,
2470 as we won't use them - (which resources?) dmq */
2471 }
2472 /* needed for dumping*/
2473 DEBUG_r(if (optimize) {
2474 regnode *opt = convert;
2475
2476 while ( ++opt < optimize) {
2477 Set_Node_Offset_Length(opt,0,0);
2478 }
2479 /*
2480 Try to clean up some of the debris left after the
2481 optimisation.
2482 */
2483 while( optimize < jumper ) {
2484 mjd_nodelen += Node_Length((optimize));
2485 OP( optimize ) = OPTIMIZED;
2486 Set_Node_Offset_Length(optimize,0,0);
2487 optimize++;
2488 }
2489 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2490 });
2491 } /* end node insert */
2492
2493 /* Finish populating the prev field of the wordinfo array. Walk back
2494 * from each accept state until we find another accept state, and if
2495 * so, point the first word's .prev field at the second word. If the
2496 * second already has a .prev field set, stop now. This will be the
2497 * case either if we've already processed that word's accept state,
2498 * or that state had multiple words, and the overspill words were
2499 * already linked up earlier.
2500 */
2501 {
2502 U16 word;
2503 U32 state;
2504 U16 prev;
2505
2506 for (word=1; word <= trie->wordcount; word++) {
2507 prev = 0;
2508 if (trie->wordinfo[word].prev)
2509 continue;
2510 state = trie->wordinfo[word].accept;
2511 while (state) {
2512 state = prev_states[state];
2513 if (!state)
2514 break;
2515 prev = trie->states[state].wordnum;
2516 if (prev)
2517 break;
2518 }
2519 trie->wordinfo[word].prev = prev;
2520 }
2521 Safefree(prev_states);
2522 }
2523
2524
2525 /* and now dump out the compressed format */
2526 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2527
2528 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2529#ifdef DEBUGGING
2530 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2531 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2532#else
2533 SvREFCNT_dec_NN(revcharmap);
2534#endif
2535 return trie->jump
2536 ? MADE_JUMP_TRIE
2537 : trie->startstate>1
2538 ? MADE_EXACT_TRIE
2539 : MADE_TRIE;
2540}
2541
2542STATIC void
2543S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2544{
2545/* The Trie is constructed and compressed now so we can build a fail array if it's needed
2546
2547 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2548 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2549 ISBN 0-201-10088-6
2550
2551 We find the fail state for each state in the trie, this state is the longest proper
2552 suffix of the current state's 'word' that is also a proper prefix of another word in our
2553 trie. State 1 represents the word '' and is thus the default fail state. This allows
2554 the DFA not to have to restart after its tried and failed a word at a given point, it
2555 simply continues as though it had been matching the other word in the first place.
2556 Consider
2557 'abcdgu'=~/abcdefg|cdgu/
2558 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2559 fail, which would bring us to the state representing 'd' in the second word where we would
2560 try 'g' and succeed, proceeding to match 'cdgu'.
2561 */
2562 /* add a fail transition */
2563 const U32 trie_offset = ARG(source);
2564 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2565 U32 *q;
2566 const U32 ucharcount = trie->uniquecharcount;
2567 const U32 numstates = trie->statecount;
2568 const U32 ubound = trie->lasttrans + ucharcount;
2569 U32 q_read = 0;
2570 U32 q_write = 0;
2571 U32 charid;
2572 U32 base = trie->states[ 1 ].trans.base;
2573 U32 *fail;
2574 reg_ac_data *aho;
2575 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2576 GET_RE_DEBUG_FLAGS_DECL;
2577
2578 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2579#ifndef DEBUGGING
2580 PERL_UNUSED_ARG(depth);
2581#endif
2582
2583
2584 ARG_SET( stclass, data_slot );
2585 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2586 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2587 aho->trie=trie_offset;
2588 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2589 Copy( trie->states, aho->states, numstates, reg_trie_state );
2590 Newxz( q, numstates, U32);
2591 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2592 aho->refcount = 1;
2593 fail = aho->fail;
2594 /* initialize fail[0..1] to be 1 so that we always have
2595 a valid final fail state */
2596 fail[ 0 ] = fail[ 1 ] = 1;
2597
2598 for ( charid = 0; charid < ucharcount ; charid++ ) {
2599 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2600 if ( newstate ) {
2601 q[ q_write ] = newstate;
2602 /* set to point at the root */
2603 fail[ q[ q_write++ ] ]=1;
2604 }
2605 }
2606 while ( q_read < q_write) {
2607 const U32 cur = q[ q_read++ % numstates ];
2608 base = trie->states[ cur ].trans.base;
2609
2610 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2611 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2612 if (ch_state) {
2613 U32 fail_state = cur;
2614 U32 fail_base;
2615 do {
2616 fail_state = fail[ fail_state ];
2617 fail_base = aho->states[ fail_state ].trans.base;
2618 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2619
2620 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2621 fail[ ch_state ] = fail_state;
2622 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2623 {
2624 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2625 }
2626 q[ q_write++ % numstates] = ch_state;
2627 }
2628 }
2629 }
2630 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2631 when we fail in state 1, this allows us to use the
2632 charclass scan to find a valid start char. This is based on the principle
2633 that theres a good chance the string being searched contains lots of stuff
2634 that cant be a start char.
2635 */
2636 fail[ 0 ] = fail[ 1 ] = 0;
2637 DEBUG_TRIE_COMPILE_r({
2638 PerlIO_printf(Perl_debug_log,
2639 "%*sStclass Failtable (%"UVuf" states): 0",
2640 (int)(depth * 2), "", (UV)numstates
2641 );
2642 for( q_read=1; q_read<numstates; q_read++ ) {
2643 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2644 }
2645 PerlIO_printf(Perl_debug_log, "\n");
2646 });
2647 Safefree(q);
2648 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2649}
2650
2651
2652/*
2653 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2654 * These need to be revisited when a newer toolchain becomes available.
2655 */
2656#if defined(__sparc64__) && defined(__GNUC__)
2657# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2658# undef SPARC64_GCC_WORKAROUND
2659# define SPARC64_GCC_WORKAROUND 1
2660# endif
2661#endif
2662
2663#define DEBUG_PEEP(str,scan,depth) \
2664 DEBUG_OPTIMISE_r({if (scan){ \
2665 SV * const mysv=sv_newmortal(); \
2666 regnode *Next = regnext(scan); \
2667 regprop(RExC_rx, mysv, scan); \
2668 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2669 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2670 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2671 }});
2672
2673
2674/* The below joins as many adjacent EXACTish nodes as possible into a single
2675 * one. The regop may be changed if the node(s) contain certain sequences that
2676 * require special handling. The joining is only done if:
2677 * 1) there is room in the current conglomerated node to entirely contain the
2678 * next one.
2679 * 2) they are the exact same node type
2680 *
2681 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2682 * these get optimized out
2683 *
2684 * If a node is to match under /i (folded), the number of characters it matches
2685 * can be different than its character length if it contains a multi-character
2686 * fold. *min_subtract is set to the total delta of the input nodes.
2687 *
2688 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2689 * and contains LATIN SMALL LETTER SHARP S
2690 *
2691 * This is as good a place as any to discuss the design of handling these
2692 * multi-character fold sequences. It's been wrong in Perl for a very long
2693 * time. There are three code points in Unicode whose multi-character folds
2694 * were long ago discovered to mess things up. The previous designs for
2695 * dealing with these involved assigning a special node for them. This
2696 * approach doesn't work, as evidenced by this example:
2697 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2698 * Both these fold to "sss", but if the pattern is parsed to create a node that
2699 * would match just the \xDF, it won't be able to handle the case where a
2700 * successful match would have to cross the node's boundary. The new approach
2701 * that hopefully generally solves the problem generates an EXACTFU_SS node
2702 * that is "sss".
2703 *
2704 * It turns out that there are problems with all multi-character folds, and not
2705 * just these three. Now the code is general, for all such cases. The
2706 * approach taken is:
2707 * 1) This routine examines each EXACTFish node that could contain multi-
2708 * character fold sequences. It returns in *min_subtract how much to
2709 * subtract from the the actual length of the string to get a real minimum
2710 * match length; it is 0 if there are no multi-char folds. This delta is
2711 * used by the caller to adjust the min length of the match, and the delta
2712 * between min and max, so that the optimizer doesn't reject these
2713 * possibilities based on size constraints.
2714 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2715 * is used for an EXACTFU node that contains at least one "ss" sequence in
2716 * it. For non-UTF-8 patterns and strings, this is the only case where
2717 * there is a possible fold length change. That means that a regular
2718 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2719 * with length changes, and so can be processed faster. regexec.c takes
2720 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2721 * pre-folded by regcomp.c. This saves effort in regex matching.
2722 * However, the pre-folding isn't done for non-UTF8 patterns because the
2723 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2724 * down by forcing the pattern into UTF8 unless necessary. Also what
2725 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2726 * possibilities for the non-UTF8 patterns are quite simple, except for
2727 * the sharp s. All the ones that don't involve a UTF-8 target string are
2728 * members of a fold-pair, and arrays are set up for all of them so that
2729 * the other member of the pair can be found quickly. Code elsewhere in
2730 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2731 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2732 * described in the next item.
2733 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
2734 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2735 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2736 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2737 * character in the pattern corresponds to at most a single character in
2738 * the target string. (And I do mean character, and not byte here, unlike
2739 * other parts of the documentation that have never been updated to
2740 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2741 * two character string 'ss'; in EXACTFA nodes it can match
2742 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2743 * instances where it is violated. I'm reluctant to try to change the
2744 * assumption, as the code involved is impenetrable to me (khw), so
2745 * instead the code here punts. This routine examines (when the pattern
2746 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2747 * boolean indicating whether or not the node contains a sharp s. When it
2748 * is true, the caller sets a flag that later causes the optimizer in this
2749 * file to not set values for the floating and fixed string lengths, and
2750 * thus avoids the optimizer code in regexec.c that makes the invalid
2751 * assumption. Thus, there is no optimization based on string lengths for
2752 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2753 * (The reason the assumption is wrong only in these two cases is that all
2754 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2755 * other folds to their expanded versions. We can't prefold sharp s to
2756 * 'ss' in EXACTF nodes because we don't know at compile time if it
2757 * actually matches 'ss' or not. It will match iff the target string is
2758 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2759 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2760 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2761 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2762 * require the pattern to be forced into UTF-8, the overhead of which we
2763 * want to avoid.)
2764 *
2765 * Similarly, the code that generates tries doesn't currently handle
2766 * not-already-folded multi-char folds, and it looks like a pain to change
2767 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2768 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2769 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2770 * using /iaa matching will be doing so almost entirely with ASCII
2771 * strings, so this should rarely be encountered in practice */
2772
2773#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2774 if (PL_regkind[OP(scan)] == EXACT) \
2775 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2776
2777STATIC U32
2778S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2779 /* Merge several consecutive EXACTish nodes into one. */
2780 regnode *n = regnext(scan);
2781 U32 stringok = 1;
2782 regnode *next = scan + NODE_SZ_STR(scan);
2783 U32 merged = 0;
2784 U32 stopnow = 0;
2785#ifdef DEBUGGING
2786 regnode *stop = scan;
2787 GET_RE_DEBUG_FLAGS_DECL;
2788#else
2789 PERL_UNUSED_ARG(depth);
2790#endif
2791
2792 PERL_ARGS_ASSERT_JOIN_EXACT;
2793#ifndef EXPERIMENTAL_INPLACESCAN
2794 PERL_UNUSED_ARG(flags);
2795 PERL_UNUSED_ARG(val);
2796#endif
2797 DEBUG_PEEP("join",scan,depth);
2798
2799 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2800 * EXACT ones that are mergeable to the current one. */
2801 while (n
2802 && (PL_regkind[OP(n)] == NOTHING
2803 || (stringok && OP(n) == OP(scan)))
2804 && NEXT_OFF(n)
2805 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2806 {
2807
2808 if (OP(n) == TAIL || n > next)
2809 stringok = 0;
2810 if (PL_regkind[OP(n)] == NOTHING) {
2811 DEBUG_PEEP("skip:",n,depth);
2812 NEXT_OFF(scan) += NEXT_OFF(n);
2813 next = n + NODE_STEP_REGNODE;
2814#ifdef DEBUGGING
2815 if (stringok)
2816 stop = n;
2817#endif
2818 n = regnext(n);
2819 }
2820 else if (stringok) {
2821 const unsigned int oldl = STR_LEN(scan);
2822 regnode * const nnext = regnext(n);
2823
2824 /* XXX I (khw) kind of doubt that this works on platforms where
2825 * U8_MAX is above 255 because of lots of other assumptions */
2826 /* Don't join if the sum can't fit into a single node */
2827 if (oldl + STR_LEN(n) > U8_MAX)
2828 break;
2829
2830 DEBUG_PEEP("merg",n,depth);
2831 merged++;
2832
2833 NEXT_OFF(scan) += NEXT_OFF(n);
2834 STR_LEN(scan) += STR_LEN(n);
2835 next = n + NODE_SZ_STR(n);
2836 /* Now we can overwrite *n : */
2837 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2838#ifdef DEBUGGING
2839 stop = next - 1;
2840#endif
2841 n = nnext;
2842 if (stopnow) break;
2843 }
2844
2845#ifdef EXPERIMENTAL_INPLACESCAN
2846 if (flags && !NEXT_OFF(n)) {
2847 DEBUG_PEEP("atch", val, depth);
2848 if (reg_off_by_arg[OP(n)]) {
2849 ARG_SET(n, val - n);
2850 }
2851 else {
2852 NEXT_OFF(n) = val - n;
2853 }
2854 stopnow = 1;
2855 }
2856#endif
2857 }
2858
2859 *min_subtract = 0;
2860 *has_exactf_sharp_s = FALSE;
2861
2862 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2863 * can now analyze for sequences of problematic code points. (Prior to
2864 * this final joining, sequences could have been split over boundaries, and
2865 * hence missed). The sequences only happen in folding, hence for any
2866 * non-EXACT EXACTish node */
2867 if (OP(scan) != EXACT) {
2868 const U8 * const s0 = (U8*) STRING(scan);
2869 const U8 * s = s0;
2870 const U8 * const s_end = s0 + STR_LEN(scan);
2871
2872 /* One pass is made over the node's string looking for all the
2873 * possibilities. to avoid some tests in the loop, there are two main
2874 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2875 * non-UTF-8 */
2876 if (UTF) {
2877
2878 /* Examine the string for a multi-character fold sequence. UTF-8
2879 * patterns have all characters pre-folded by the time this code is
2880 * executed */
2881 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2882 length sequence we are looking for is 2 */
2883 {
2884 int count = 0;
2885 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2886 if (! len) { /* Not a multi-char fold: get next char */
2887 s += UTF8SKIP(s);
2888 continue;
2889 }
2890
2891 /* Nodes with 'ss' require special handling, except for EXACTFL
2892 * and EXACTFA-ish for which there is no multi-char fold to
2893 * this */
2894 if (len == 2 && *s == 's' && *(s+1) == 's'
2895 && OP(scan) != EXACTFL
2896 && OP(scan) != EXACTFA
2897 && OP(scan) != EXACTFA_NO_TRIE)
2898 {
2899 count = 2;
2900 OP(scan) = EXACTFU_SS;
2901 s += 2;
2902 }
2903 else { /* Here is a generic multi-char fold. */
2904 const U8* multi_end = s + len;
2905
2906 /* Count how many characters in it. In the case of /l and
2907 * /aa, no folds which contain ASCII code points are
2908 * allowed, so check for those, and skip if found. (In
2909 * EXACTFL, no folds are allowed to any Latin1 code point,
2910 * not just ASCII. But there aren't any of these
2911 * currently, nor ever likely, so don't take the time to
2912 * test for them. The code that generates the
2913 * is_MULTI_foo() macros croaks should one actually get put
2914 * into Unicode .) */
2915 if (OP(scan) != EXACTFL
2916 && OP(scan) != EXACTFA
2917 && OP(scan) != EXACTFA_NO_TRIE)
2918 {
2919 count = utf8_length(s, multi_end);
2920 s = multi_end;
2921 }
2922 else {
2923 while (s < multi_end) {
2924 if (isASCII(*s)) {
2925 s++;
2926 goto next_iteration;
2927 }
2928 else {
2929 s += UTF8SKIP(s);
2930 }
2931 count++;
2932 }
2933 }
2934 }
2935
2936 /* The delta is how long the sequence is minus 1 (1 is how long
2937 * the character that folds to the sequence is) */
2938 *min_subtract += count - 1;
2939 next_iteration: ;
2940 }
2941 }
2942 else if (OP(scan) == EXACTFA) {
2943
2944 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2945 * fold to the ASCII range (and there are no existing ones in the
2946 * upper latin1 range). But, as outlined in the comments preceding
2947 * this function, we need to flag any occurrences of the sharp s.
2948 * This character forbids trie formation (because of added
2949 * complexity) */
2950 while (s < s_end) {
2951 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
2952 OP(scan) = EXACTFA_NO_TRIE;
2953 *has_exactf_sharp_s = TRUE;
2954 break;
2955 }
2956 s++;
2957 continue;
2958 }
2959 }
2960 else if (OP(scan) != EXACTFL) {
2961
2962 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2963 * multi-char folds that are all Latin1. (This code knows that
2964 * there are no current multi-char folds possible with EXACTFL,
2965 * relying on fold_grind.t to catch any errors if the very unlikely
2966 * event happens that some get added in future Unicode versions.)
2967 * As explained in the comments preceding this function, we look
2968 * also for the sharp s in EXACTF nodes; it can be in the final
2969 * position. Otherwise we can stop looking 1 byte earlier because
2970 * have to find at least two characters for a multi-fold */
2971 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2972
2973 while (s < upper) {
2974 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2975 if (! len) { /* Not a multi-char fold. */
2976 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2977 {
2978 *has_exactf_sharp_s = TRUE;
2979 }
2980 s++;
2981 continue;
2982 }
2983
2984 if (len == 2
2985 && isARG2_lower_or_UPPER_ARG1('s', *s)
2986 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
2987 {
2988
2989 /* EXACTF nodes need to know that the minimum length
2990 * changed so that a sharp s in the string can match this
2991 * ss in the pattern, but they remain EXACTF nodes, as they
2992 * won't match this unless the target string is is UTF-8,
2993 * which we don't know until runtime */
2994 if (OP(scan) != EXACTF) {
2995 OP(scan) = EXACTFU_SS;
2996 }
2997 }
2998
2999 *min_subtract += len - 1;
3000 s += len;
3001 }
3002 }
3003 }
3004
3005#ifdef DEBUGGING
3006 /* Allow dumping but overwriting the collection of skipped
3007 * ops and/or strings with fake optimized ops */
3008 n = scan + NODE_SZ_STR(scan);
3009 while (n <= stop) {
3010 OP(n) = OPTIMIZED;
3011 FLAGS(n) = 0;
3012 NEXT_OFF(n) = 0;
3013 n++;
3014 }
3015#endif
3016 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3017 return stopnow;
3018}
3019
3020/* REx optimizer. Converts nodes into quicker variants "in place".
3021 Finds fixed substrings. */
3022
3023/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3024 to the position after last scanned or to NULL. */
3025
3026#define INIT_AND_WITHP \
3027 assert(!and_withp); \
3028 Newx(and_withp,1,struct regnode_charclass_class); \
3029 SAVEFREEPV(and_withp)
3030
3031/* this is a chain of data about sub patterns we are processing that
3032 need to be handled separately/specially in study_chunk. Its so
3033 we can simulate recursion without losing state. */
3034struct scan_frame;
3035typedef struct scan_frame {
3036 regnode *last; /* last node to process in this frame */
3037 regnode *next; /* next node to process when last is reached */
3038 struct scan_frame *prev; /*previous frame*/
3039 I32 stop; /* what stopparen do we use */
3040} scan_frame;
3041
3042
3043#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3044
3045STATIC SSize_t
3046S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3047 SSize_t *minlenp, SSize_t *deltap,
3048 regnode *last,
3049 scan_data_t *data,
3050 I32 stopparen,
3051 U8* recursed,
3052 struct regnode_charclass_class *and_withp,
3053 U32 flags, U32 depth)
3054 /* scanp: Start here (read-write). */
3055 /* deltap: Write maxlen-minlen here. */
3056 /* last: Stop before this one. */
3057 /* data: string data about the pattern */
3058 /* stopparen: treat close N as END */
3059 /* recursed: which subroutines have we recursed into */
3060 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3061{
3062 dVAR;
3063 /* There must be at least this number of characters to match */
3064 SSize_t min = 0;
3065 I32 pars = 0, code;
3066 regnode *scan = *scanp, *next;
3067 SSize_t delta = 0;
3068 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3069 int is_inf_internal = 0; /* The studied chunk is infinite */
3070 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3071 scan_data_t data_fake;
3072 SV *re_trie_maxbuff = NULL;
3073 regnode *first_non_open = scan;
3074 SSize_t stopmin = SSize_t_MAX;
3075 scan_frame *frame = NULL;
3076 GET_RE_DEBUG_FLAGS_DECL;
3077
3078 PERL_ARGS_ASSERT_STUDY_CHUNK;
3079
3080#ifdef DEBUGGING
3081 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3082#endif
3083
3084 if ( depth == 0 ) {
3085 while (first_non_open && OP(first_non_open) == OPEN)
3086 first_non_open=regnext(first_non_open);
3087 }
3088
3089
3090 fake_study_recurse:
3091 while ( scan && OP(scan) != END && scan < last ){
3092 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3093 node length to get a real minimum (because
3094 the folded version may be shorter) */
3095 bool has_exactf_sharp_s = FALSE;
3096 /* Peephole optimizer: */
3097 DEBUG_STUDYDATA("Peep:", data,depth);
3098 DEBUG_PEEP("Peep",scan,depth);
3099
3100 /* Its not clear to khw or hv why this is done here, and not in the
3101 * clauses that deal with EXACT nodes. khw's guess is that it's
3102 * because of a previous design */
3103 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3104
3105 /* Follow the next-chain of the current node and optimize
3106 away all the NOTHINGs from it. */
3107 if (OP(scan) != CURLYX) {
3108 const int max = (reg_off_by_arg[OP(scan)]
3109 ? I32_MAX
3110 /* I32 may be smaller than U16 on CRAYs! */
3111 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3112 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3113 int noff;
3114 regnode *n = scan;
3115
3116 /* Skip NOTHING and LONGJMP. */
3117 while ((n = regnext(n))
3118 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3119 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3120 && off + noff < max)
3121 off += noff;
3122 if (reg_off_by_arg[OP(scan)])
3123 ARG(scan) = off;
3124 else
3125 NEXT_OFF(scan) = off;
3126 }
3127
3128
3129
3130 /* The principal pseudo-switch. Cannot be a switch, since we
3131 look into several different things. */
3132 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3133 || OP(scan) == IFTHEN) {
3134 next = regnext(scan);
3135 code = OP(scan);
3136 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3137
3138 if (OP(next) == code || code == IFTHEN) {
3139 /* NOTE - There is similar code to this block below for handling
3140 TRIE nodes on a re-study. If you change stuff here check there
3141 too. */
3142 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3143 struct regnode_charclass_class accum;
3144 regnode * const startbranch=scan;
3145
3146 if (flags & SCF_DO_SUBSTR)
3147 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3148 if (flags & SCF_DO_STCLASS)
3149 cl_init_zero(pRExC_state, &accum);
3150
3151 while (OP(scan) == code) {
3152 SSize_t deltanext, minnext, fake;
3153 I32 f = 0;
3154 struct regnode_charclass_class this_class;
3155
3156 num++;
3157 data_fake.flags = 0;
3158 if (data) {
3159 data_fake.whilem_c = data->whilem_c;
3160 data_fake.last_closep = data->last_closep;
3161 }
3162 else
3163 data_fake.last_closep = &fake;
3164
3165 data_fake.pos_delta = delta;
3166 next = regnext(scan);
3167 scan = NEXTOPER(scan);
3168 if (code != BRANCH)
3169 scan = NEXTOPER(scan);
3170 if (flags & SCF_DO_STCLASS) {
3171 cl_init(pRExC_state, &this_class);
3172 data_fake.start_class = &this_class;
3173 f = SCF_DO_STCLASS_AND;
3174 }
3175 if (flags & SCF_WHILEM_VISITED_POS)
3176 f |= SCF_WHILEM_VISITED_POS;
3177
3178 /* we suppose the run is continuous, last=next...*/
3179 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3180 next, &data_fake,
3181 stopparen, recursed, NULL, f,depth+1);
3182 if (min1 > minnext)
3183 min1 = minnext;
3184 if (deltanext == SSize_t_MAX) {
3185 is_inf = is_inf_internal = 1;
3186 max1 = SSize_t_MAX;
3187 } else if (max1 < minnext + deltanext)
3188 max1 = minnext + deltanext;
3189 scan = next;
3190 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3191 pars++;
3192 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3193 if ( stopmin > minnext)
3194 stopmin = min + min1;
3195 flags &= ~SCF_DO_SUBSTR;
3196 if (data)
3197 data->flags |= SCF_SEEN_ACCEPT;
3198 }
3199 if (data) {
3200 if (data_fake.flags & SF_HAS_EVAL)
3201 data->flags |= SF_HAS_EVAL;
3202 data->whilem_c = data_fake.whilem_c;
3203 }
3204 if (flags & SCF_DO_STCLASS)
3205 cl_or(pRExC_state, &accum, &this_class);
3206 }
3207 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3208 min1 = 0;
3209 if (flags & SCF_DO_SUBSTR) {
3210 data->pos_min += min1;
3211 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3212 data->pos_delta = SSize_t_MAX;
3213 else
3214 data->pos_delta += max1 - min1;
3215 if (max1 != min1 || is_inf)
3216 data->longest = &(data->longest_float);
3217 }
3218 min += min1;
3219 if (delta == SSize_t_MAX
3220 || SSize_t_MAX - delta - (max1 - min1) < 0)
3221 delta = SSize_t_MAX;
3222 else
3223 delta += max1 - min1;
3224 if (flags & SCF_DO_STCLASS_OR) {
3225 cl_or(pRExC_state, data->start_class, &accum);
3226 if (min1) {
3227 cl_and(data->start_class, and_withp);
3228 flags &= ~SCF_DO_STCLASS;
3229 }
3230 }
3231 else if (flags & SCF_DO_STCLASS_AND) {
3232 if (min1) {
3233 cl_and(data->start_class, &accum);
3234 flags &= ~SCF_DO_STCLASS;
3235 }
3236 else {
3237 /* Switch to OR mode: cache the old value of
3238 * data->start_class */
3239 INIT_AND_WITHP;
3240 StructCopy(data->start_class, and_withp,
3241 struct regnode_charclass_class);
3242 flags &= ~SCF_DO_STCLASS_AND;
3243 StructCopy(&accum, data->start_class,
3244 struct regnode_charclass_class);
3245 flags |= SCF_DO_STCLASS_OR;
3246 SET_SSC_EOS(data->start_class);
3247 }
3248 }
3249
3250 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3251 /* demq.
3252
3253 Assuming this was/is a branch we are dealing with: 'scan' now
3254 points at the item that follows the branch sequence, whatever
3255 it is. We now start at the beginning of the sequence and look
3256 for subsequences of
3257
3258 BRANCH->EXACT=>x1
3259 BRANCH->EXACT=>x2
3260 tail
3261
3262 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3263
3264 If we can find such a subsequence we need to turn the first
3265 element into a trie and then add the subsequent branch exact
3266 strings to the trie.
3267
3268 We have two cases
3269
3270 1. patterns where the whole set of branches can be converted.
3271
3272 2. patterns where only a subset can be converted.
3273
3274 In case 1 we can replace the whole set with a single regop
3275 for the trie. In case 2 we need to keep the start and end
3276 branches so
3277
3278 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3279 becomes BRANCH TRIE; BRANCH X;
3280
3281 There is an additional case, that being where there is a
3282 common prefix, which gets split out into an EXACT like node
3283 preceding the TRIE node.
3284
3285 If x(1..n)==tail then we can do a simple trie, if not we make
3286 a "jump" trie, such that when we match the appropriate word
3287 we "jump" to the appropriate tail node. Essentially we turn
3288 a nested if into a case structure of sorts.
3289
3290 */
3291
3292 int made=0;
3293 if (!re_trie_maxbuff) {
3294 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3295 if (!SvIOK(re_trie_maxbuff))
3296 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3297 }
3298 if ( SvIV(re_trie_maxbuff)>=0 ) {
3299 regnode *cur;
3300 regnode *first = (regnode *)NULL;
3301 regnode *last = (regnode *)NULL;
3302 regnode *tail = scan;
3303 U8 trietype = 0;
3304 U32 count=0;
3305
3306#ifdef DEBUGGING
3307 SV * const mysv = sv_newmortal(); /* for dumping */
3308#endif
3309 /* var tail is used because there may be a TAIL
3310 regop in the way. Ie, the exacts will point to the
3311 thing following the TAIL, but the last branch will
3312 point at the TAIL. So we advance tail. If we
3313 have nested (?:) we may have to move through several
3314 tails.
3315 */
3316
3317 while ( OP( tail ) == TAIL ) {
3318 /* this is the TAIL generated by (?:) */
3319 tail = regnext( tail );
3320 }
3321
3322
3323 DEBUG_TRIE_COMPILE_r({
3324 regprop(RExC_rx, mysv, tail );
3325 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3326 (int)depth * 2 + 2, "",
3327 "Looking for TRIE'able sequences. Tail node is: ",
3328 SvPV_nolen_const( mysv )
3329 );
3330 });
3331
3332 /*
3333
3334 Step through the branches
3335 cur represents each branch,
3336 noper is the first thing to be matched as part of that branch
3337 noper_next is the regnext() of that node.
3338
3339 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3340 via a "jump trie" but we also support building with NOJUMPTRIE,
3341 which restricts the trie logic to structures like /FOO|BAR/.
3342
3343 If noper is a trieable nodetype then the branch is a possible optimization
3344 target. If we are building under NOJUMPTRIE then we require that noper_next
3345 is the same as scan (our current position in the regex program).
3346
3347 Once we have two or more consecutive such branches we can create a
3348 trie of the EXACT's contents and stitch it in place into the program.
3349
3350 If the sequence represents all of the branches in the alternation we
3351 replace the entire thing with a single TRIE node.
3352
3353 Otherwise when it is a subsequence we need to stitch it in place and
3354 replace only the relevant branches. This means the first branch has
3355 to remain as it is used by the alternation logic, and its next pointer,
3356 and needs to be repointed at the item on the branch chain following
3357 the last branch we have optimized away.
3358
3359 This could be either a BRANCH, in which case the subsequence is internal,
3360 or it could be the item following the branch sequence in which case the
3361 subsequence is at the end (which does not necessarily mean the first node
3362 is the start of the alternation).
3363
3364 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3365
3366 optype | trietype
3367 ----------------+-----------
3368 NOTHING | NOTHING
3369 EXACT | EXACT
3370 EXACTFU | EXACTFU
3371 EXACTFU_SS | EXACTFU
3372 EXACTFA | EXACTFA
3373
3374
3375 */
3376#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3377 ( EXACT == (X) ) ? EXACT : \
3378 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3379 ( EXACTFA == (X) ) ? EXACTFA : \
3380 0 )
3381
3382 /* dont use tail as the end marker for this traverse */
3383 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3384 regnode * const noper = NEXTOPER( cur );
3385 U8 noper_type = OP( noper );
3386 U8 noper_trietype = TRIE_TYPE( noper_type );
3387#if defined(DEBUGGING) || defined(NOJUMPTRIE)
3388 regnode * const noper_next = regnext( noper );
3389 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3390 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3391#endif
3392
3393 DEBUG_TRIE_COMPILE_r({
3394 regprop(RExC_rx, mysv, cur);
3395 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3396 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3397
3398 regprop(RExC_rx, mysv, noper);
3399 PerlIO_printf( Perl_debug_log, " -> %s",
3400 SvPV_nolen_const(mysv));
3401
3402 if ( noper_next ) {
3403 regprop(RExC_rx, mysv, noper_next );
3404 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3405 SvPV_nolen_const(mysv));
3406 }
3407 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3408 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3409 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3410 );
3411 });
3412
3413 /* Is noper a trieable nodetype that can be merged with the
3414 * current trie (if there is one)? */
3415 if ( noper_trietype
3416 &&
3417 (
3418 ( noper_trietype == NOTHING)
3419 || ( trietype == NOTHING )
3420 || ( trietype == noper_trietype )
3421 )
3422#ifdef NOJUMPTRIE
3423 && noper_next == tail
3424#endif
3425 && count < U16_MAX)
3426 {
3427 /* Handle mergable triable node
3428 * Either we are the first node in a new trieable sequence,
3429 * in which case we do some bookkeeping, otherwise we update
3430 * the end pointer. */
3431 if ( !first ) {
3432 first = cur;
3433 if ( noper_trietype == NOTHING ) {
3434#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3435 regnode * const noper_next = regnext( noper );
3436 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3437 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3438#endif
3439
3440 if ( noper_next_trietype ) {
3441 trietype = noper_next_trietype;
3442 } else if (noper_next_type) {
3443 /* a NOTHING regop is 1 regop wide. We need at least two
3444 * for a trie so we can't merge this in */
3445 first = NULL;
3446 }
3447 } else {
3448 trietype = noper_trietype;
3449 }
3450 } else {
3451 if ( trietype == NOTHING )
3452 trietype = noper_trietype;
3453 last = cur;
3454 }
3455 if (first)
3456 count++;
3457 } /* end handle mergable triable node */
3458 else {
3459 /* handle unmergable node -
3460 * noper may either be a triable node which can not be tried
3461 * together with the current trie, or a non triable node */
3462 if ( last ) {
3463 /* If last is set and trietype is not NOTHING then we have found
3464 * at least two triable branch sequences in a row of a similar
3465 * trietype so we can turn them into a trie. If/when we
3466 * allow NOTHING to start a trie sequence this condition will be
3467 * required, and it isn't expensive so we leave it in for now. */
3468 if ( trietype && trietype != NOTHING )
3469 make_trie( pRExC_state,
3470 startbranch, first, cur, tail, count,
3471 trietype, depth+1 );
3472 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3473 }
3474 if ( noper_trietype
3475#ifdef NOJUMPTRIE
3476 && noper_next == tail
3477#endif
3478 ){
3479 /* noper is triable, so we can start a new trie sequence */
3480 count = 1;
3481 first = cur;
3482 trietype = noper_trietype;
3483 } else if (first) {
3484 /* if we already saw a first but the current node is not triable then we have
3485 * to reset the first information. */
3486 count = 0;
3487 first = NULL;
3488 trietype = 0;
3489 }
3490 } /* end handle unmergable node */
3491 } /* loop over branches */
3492 DEBUG_TRIE_COMPILE_r({
3493 regprop(RExC_rx, mysv, cur);
3494 PerlIO_printf( Perl_debug_log,
3495 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3496 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3497
3498 });
3499 if ( last && trietype ) {
3500 if ( trietype != NOTHING ) {
3501 /* the last branch of the sequence was part of a trie,
3502 * so we have to construct it here outside of the loop
3503 */
3504 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3505#ifdef TRIE_STUDY_OPT
3506 if ( ((made == MADE_EXACT_TRIE &&
3507 startbranch == first)
3508 || ( first_non_open == first )) &&
3509 depth==0 ) {
3510 flags |= SCF_TRIE_RESTUDY;
3511 if ( startbranch == first
3512 && scan == tail )
3513 {
3514 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3515 }
3516 }
3517#endif
3518 } else {
3519 /* at this point we know whatever we have is a NOTHING sequence/branch
3520 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3521 */
3522 if ( startbranch == first ) {
3523 regnode *opt;
3524 /* the entire thing is a NOTHING sequence, something like this:
3525 * (?:|) So we can turn it into a plain NOTHING op. */
3526 DEBUG_TRIE_COMPILE_r({
3527 regprop(RExC_rx, mysv, cur);
3528 PerlIO_printf( Perl_debug_log,
3529 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3530 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3531
3532 });
3533 OP(startbranch)= NOTHING;
3534 NEXT_OFF(startbranch)= tail - startbranch;
3535 for ( opt= startbranch + 1; opt < tail ; opt++ )
3536 OP(opt)= OPTIMIZED;
3537 }
3538 }
3539 } /* end if ( last) */
3540 } /* TRIE_MAXBUF is non zero */
3541
3542 } /* do trie */
3543
3544 }
3545 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3546 scan = NEXTOPER(NEXTOPER(scan));
3547 } else /* single branch is optimized. */
3548 scan = NEXTOPER(scan);
3549 continue;
3550 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3551 scan_frame *newframe = NULL;
3552 I32 paren;
3553 regnode *start;
3554 regnode *end;
3555
3556 if (OP(scan) != SUSPEND) {
3557 /* set the pointer */
3558 if (OP(scan) == GOSUB) {
3559 paren = ARG(scan);
3560 RExC_recurse[ARG2L(scan)] = scan;
3561 start = RExC_open_parens[paren-1];
3562 end = RExC_close_parens[paren-1];
3563 } else {
3564 paren = 0;
3565 start = RExC_rxi->program + 1;
3566 end = RExC_opend;
3567 }
3568 if (!recursed) {
3569 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3570 SAVEFREEPV(recursed);
3571 }
3572 if (!PAREN_TEST(recursed,paren+1)) {
3573 PAREN_SET(recursed,paren+1);
3574 Newx(newframe,1,scan_frame);
3575 } else {
3576 if (flags & SCF_DO_SUBSTR) {
3577 SCAN_COMMIT(pRExC_state,data,minlenp);
3578 data->longest = &(data->longest_float);
3579 }
3580 is_inf = is_inf_internal = 1;
3581 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3582 cl_anything(pRExC_state, data->start_class);
3583 flags &= ~SCF_DO_STCLASS;
3584 }
3585 } else {
3586 Newx(newframe,1,scan_frame);
3587 paren = stopparen;
3588 start = scan+2;
3589 end = regnext(scan);
3590 }
3591 if (newframe) {
3592 assert(start);
3593 assert(end);
3594 SAVEFREEPV(newframe);
3595 newframe->next = regnext(scan);
3596 newframe->last = last;
3597 newframe->stop = stopparen;
3598 newframe->prev = frame;
3599
3600 frame = newframe;
3601 scan = start;
3602 stopparen = paren;
3603 last = end;
3604
3605 continue;
3606 }
3607 }
3608 else if (OP(scan) == EXACT) {
3609 SSize_t l = STR_LEN(scan);
3610 UV uc;
3611 if (UTF) {
3612 const U8 * const s = (U8*)STRING(scan);
3613 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3614 l = utf8_length(s, s + l);
3615 } else {
3616 uc = *((U8*)STRING(scan));
3617 }
3618 min += l;
3619 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3620 /* The code below prefers earlier match for fixed
3621 offset, later match for variable offset. */
3622 if (data->last_end == -1) { /* Update the start info. */
3623 data->last_start_min = data->pos_min;
3624 data->last_start_max = is_inf
3625 ? SSize_t_MAX : data->pos_min + data->pos_delta;
3626 }
3627 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3628 if (UTF)
3629 SvUTF8_on(data->last_found);
3630 {
3631 SV * const sv = data->last_found;
3632 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3633 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3634 if (mg && mg->mg_len >= 0)
3635 mg->mg_len += utf8_length((U8*)STRING(scan),
3636 (U8*)STRING(scan)+STR_LEN(scan));
3637 }
3638 data->last_end = data->pos_min + l;
3639 data->pos_min += l; /* As in the first entry. */
3640 data->flags &= ~SF_BEFORE_EOL;
3641 }
3642 if (flags & SCF_DO_STCLASS_AND) {
3643 /* Check whether it is compatible with what we know already! */
3644 int compat = 1;
3645
3646
3647 /* If compatible, we or it in below. It is compatible if is
3648 * in the bitmp and either 1) its bit or its fold is set, or 2)
3649 * it's for a locale. Even if there isn't unicode semantics
3650 * here, at runtime there may be because of matching against a
3651 * utf8 string, so accept a possible false positive for
3652 * latin1-range folds */
3653 if (uc >= 0x100 ||
3654 (!(data->start_class->flags & ANYOF_LOCALE)
3655 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3656 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3657 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3658 )
3659 {
3660 compat = 0;
3661 }
3662 ANYOF_CLASS_ZERO(data->start_class);
3663 ANYOF_BITMAP_ZERO(data->start_class);
3664 if (compat)
3665 ANYOF_BITMAP_SET(data->start_class, uc);
3666 else if (uc >= 0x100) {
3667 int i;
3668
3669 /* Some Unicode code points fold to the Latin1 range; as
3670 * XXX temporary code, instead of figuring out if this is
3671 * one, just assume it is and set all the start class bits
3672 * that could be some such above 255 code point's fold
3673 * which will generate fals positives. As the code
3674 * elsewhere that does compute the fold settles down, it
3675 * can be extracted out and re-used here */
3676 for (i = 0; i < 256; i++){
3677 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3678 ANYOF_BITMAP_SET(data->start_class, i);
3679 }
3680 }
3681 }
3682 CLEAR_SSC_EOS(data->start_class);
3683 if (uc < 0x100)
3684 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3685 }
3686 else if (flags & SCF_DO_STCLASS_OR) {
3687 /* false positive possible if the class is case-folded */
3688 if (uc < 0x100)
3689 ANYOF_BITMAP_SET(data->start_class, uc);
3690 else
3691 data->start_class->flags |= ANYOF_UNICODE_ALL;
3692 CLEAR_SSC_EOS(data->start_class);
3693 cl_and(data->start_class, and_withp);
3694 }
3695 flags &= ~SCF_DO_STCLASS;
3696 }
3697 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3698 SSize_t l = STR_LEN(scan);
3699 UV uc = *((U8*)STRING(scan));
3700
3701 /* Search for fixed substrings supports EXACT only. */
3702 if (flags & SCF_DO_SUBSTR) {
3703 assert(data);
3704 SCAN_COMMIT(pRExC_state, data, minlenp);
3705 }
3706 if (UTF) {
3707 const U8 * const s = (U8 *)STRING(scan);
3708 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3709 l = utf8_length(s, s + l);
3710 }
3711 if (has_exactf_sharp_s) {
3712 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3713 }
3714 min += l - min_subtract;
3715 assert (min >= 0);
3716 delta += min_subtract;
3717 if (flags & SCF_DO_SUBSTR) {
3718 data->pos_min += l - min_subtract;
3719 if (data->pos_min < 0) {
3720 data->pos_min = 0;
3721 }
3722 data->pos_delta += min_subtract;
3723 if (min_subtract) {
3724 data->longest = &(data->longest_float);
3725 }
3726 }
3727 if (flags & SCF_DO_STCLASS_AND) {
3728 /* Check whether it is compatible with what we know already! */
3729 int compat = 1;
3730 if (uc >= 0x100 ||
3731 (!(data->start_class->flags & ANYOF_LOCALE)
3732 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3733 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3734 {
3735 compat = 0;
3736 }
3737 ANYOF_CLASS_ZERO(data->start_class);
3738 ANYOF_BITMAP_ZERO(data->start_class);
3739 if (compat) {
3740 ANYOF_BITMAP_SET(data->start_class, uc);
3741 CLEAR_SSC_EOS(data->start_class);
3742 if (OP(scan) == EXACTFL) {
3743 /* XXX This set is probably no longer necessary, and
3744 * probably wrong as LOCALE now is on in the initial
3745 * state */
3746 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3747 }
3748 else {
3749
3750 /* Also set the other member of the fold pair. In case
3751 * that unicode semantics is called for at runtime, use
3752 * the full latin1 fold. (Can't do this for locale,
3753 * because not known until runtime) */
3754 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3755
3756 /* All other (EXACTFL handled above) folds except under
3757 * /iaa that include s, S, and sharp_s also may include
3758 * the others */
3759 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
3760 {
3761 if (uc == 's' || uc == 'S') {
3762 ANYOF_BITMAP_SET(data->start_class,
3763 LATIN_SMALL_LETTER_SHARP_S);
3764 }
3765 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3766 ANYOF_BITMAP_SET(data->start_class, 's');
3767 ANYOF_BITMAP_SET(data->start_class, 'S');
3768 }
3769 }
3770 }
3771 }
3772 else if (uc >= 0x100) {
3773 int i;
3774 for (i = 0; i < 256; i++){
3775 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3776 ANYOF_BITMAP_SET(data->start_class, i);
3777 }
3778 }
3779 }
3780 }
3781 else if (flags & SCF_DO_STCLASS_OR) {
3782 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3783 /* false positive possible if the class is case-folded.
3784 Assume that the locale settings are the same... */
3785 if (uc < 0x100) {
3786 ANYOF_BITMAP_SET(data->start_class, uc);
3787 if (OP(scan) != EXACTFL) {
3788
3789 /* And set the other member of the fold pair, but
3790 * can't do that in locale because not known until
3791 * run-time */
3792 ANYOF_BITMAP_SET(data->start_class,
3793 PL_fold_latin1[uc]);
3794
3795 /* All folds except under /iaa that include s, S,
3796 * and sharp_s also may include the others */
3797 if (OP(scan) != EXACTFA
3798 && OP(scan) != EXACTFA_NO_TRIE)
3799 {
3800 if (uc == 's' || uc == 'S') {
3801 ANYOF_BITMAP_SET(data->start_class,
3802 LATIN_SMALL_LETTER_SHARP_S);
3803 }
3804 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3805 ANYOF_BITMAP_SET(data->start_class, 's');
3806 ANYOF_BITMAP_SET(data->start_class, 'S');
3807 }
3808 }
3809 }
3810 }
3811 CLEAR_SSC_EOS(data->start_class);
3812 }
3813 cl_and(data->start_class, and_withp);
3814 }
3815 flags &= ~SCF_DO_STCLASS;
3816 }
3817 else if (REGNODE_VARIES(OP(scan))) {
3818 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
3819 I32 fl = 0, f = flags;
3820 regnode * const oscan = scan;
3821 struct regnode_charclass_class this_class;
3822 struct regnode_charclass_class *oclass = NULL;
3823 I32 next_is_eval = 0;
3824
3825 switch (PL_regkind[OP(scan)]) {
3826 case WHILEM: /* End of (?:...)* . */
3827 scan = NEXTOPER(scan);
3828 goto finish;
3829 case PLUS:
3830 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3831 next = NEXTOPER(scan);
3832 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3833 mincount = 1;
3834 maxcount = REG_INFTY;
3835 next = regnext(scan);
3836 scan = NEXTOPER(scan);
3837 goto do_curly;
3838 }
3839 }
3840 if (flags & SCF_DO_SUBSTR)
3841 data->pos_min++;
3842 min++;
3843 /* Fall through. */
3844 case STAR:
3845 if (flags & SCF_DO_STCLASS) {
3846 mincount = 0;
3847 maxcount = REG_INFTY;
3848 next = regnext(scan);
3849 scan = NEXTOPER(scan);
3850 goto do_curly;
3851 }
3852 is_inf = is_inf_internal = 1;
3853 scan = regnext(scan);
3854 if (flags & SCF_DO_SUBSTR) {
3855 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3856 data->longest = &(data->longest_float);
3857 }
3858 goto optimize_curly_tail;
3859 case CURLY:
3860 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3861 && (scan->flags == stopparen))
3862 {
3863 mincount = 1;
3864 maxcount = 1;
3865 } else {
3866 mincount = ARG1(scan);
3867 maxcount = ARG2(scan);
3868 }
3869 next = regnext(scan);
3870 if (OP(scan) == CURLYX) {
3871 I32 lp = (data ? *(data->last_closep) : 0);
3872 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3873 }
3874 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3875 next_is_eval = (OP(scan) == EVAL);
3876 do_curly:
3877 if (flags & SCF_DO_SUBSTR) {
3878 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3879 pos_before = data->pos_min;
3880 }
3881 if (data) {
3882 fl = data->flags;
3883 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3884 if (is_inf)
3885 data->flags |= SF_IS_INF;
3886 }
3887 if (flags & SCF_DO_STCLASS) {
3888 cl_init(pRExC_state, &this_class);
3889 oclass = data->start_class;
3890 data->start_class = &this_class;
3891 f |= SCF_DO_STCLASS_AND;
3892 f &= ~SCF_DO_STCLASS_OR;
3893 }
3894 /* Exclude from super-linear cache processing any {n,m}
3895 regops for which the combination of input pos and regex
3896 pos is not enough information to determine if a match
3897 will be possible.
3898
3899 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3900 regex pos at the \s*, the prospects for a match depend not
3901 only on the input position but also on how many (bar\s*)
3902 repeats into the {4,8} we are. */
3903 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3904 f &= ~SCF_WHILEM_VISITED_POS;
3905
3906 /* This will finish on WHILEM, setting scan, or on NULL: */
3907 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3908 last, data, stopparen, recursed, NULL,
3909 (mincount == 0
3910 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3911
3912 if (flags & SCF_DO_STCLASS)
3913 data->start_class = oclass;
3914 if (mincount == 0 || minnext == 0) {
3915 if (flags & SCF_DO_STCLASS_OR) {
3916 cl_or(pRExC_state, data->start_class, &this_class);
3917 }
3918 else if (flags & SCF_DO_STCLASS_AND) {
3919 /* Switch to OR mode: cache the old value of
3920 * data->start_class */
3921 INIT_AND_WITHP;
3922 StructCopy(data->start_class, and_withp,
3923 struct regnode_charclass_class);
3924 flags &= ~SCF_DO_STCLASS_AND;
3925 StructCopy(&this_class, data->start_class,
3926 struct regnode_charclass_class);
3927 flags |= SCF_DO_STCLASS_OR;
3928 SET_SSC_EOS(data->start_class);
3929 }
3930 } else { /* Non-zero len */
3931 if (flags & SCF_DO_STCLASS_OR) {
3932 cl_or(pRExC_state, data->start_class, &this_class);
3933 cl_and(data->start_class, and_withp);
3934 }
3935 else if (flags & SCF_DO_STCLASS_AND)
3936 cl_and(data->start_class, &this_class);
3937 flags &= ~SCF_DO_STCLASS;
3938 }
3939 if (!scan) /* It was not CURLYX, but CURLY. */
3940 scan = next;
3941 if (!(flags & SCF_TRIE_DOING_RESTUDY)
3942 /* ? quantifier ok, except for (?{ ... }) */
3943 && (next_is_eval || !(mincount == 0 && maxcount == 1))
3944 && (minnext == 0) && (deltanext == 0)
3945 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3946 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3947 {
3948 /* Fatal warnings may leak the regexp without this: */
3949 SAVEFREESV(RExC_rx_sv);
3950 ckWARNreg(RExC_parse,
3951 "Quantifier unexpected on zero-length expression");
3952 (void)ReREFCNT_inc(RExC_rx_sv);
3953 }
3954
3955 min += minnext * mincount;
3956 is_inf_internal |= deltanext == SSize_t_MAX
3957 || (maxcount == REG_INFTY && minnext + deltanext > 0);
3958 is_inf |= is_inf_internal;
3959 if (is_inf)
3960 delta = SSize_t_MAX;
3961 else
3962 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3963
3964 /* Try powerful optimization CURLYX => CURLYN. */
3965 if ( OP(oscan) == CURLYX && data
3966 && data->flags & SF_IN_PAR
3967 && !(data->flags & SF_HAS_EVAL)
3968 && !deltanext && minnext == 1 ) {
3969 /* Try to optimize to CURLYN. */
3970 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3971 regnode * const nxt1 = nxt;
3972#ifdef DEBUGGING
3973 regnode *nxt2;
3974#endif
3975
3976 /* Skip open. */
3977 nxt = regnext(nxt);
3978 if (!REGNODE_SIMPLE(OP(nxt))
3979 && !(PL_regkind[OP(nxt)] == EXACT
3980 && STR_LEN(nxt) == 1))
3981 goto nogo;
3982#ifdef DEBUGGING
3983 nxt2 = nxt;
3984#endif
3985 nxt = regnext(nxt);
3986 if (OP(nxt) != CLOSE)
3987 goto nogo;
3988 if (RExC_open_parens) {
3989 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3990 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3991 }
3992 /* Now we know that nxt2 is the only contents: */
3993 oscan->flags = (U8)ARG(nxt);
3994 OP(oscan) = CURLYN;
3995 OP(nxt1) = NOTHING; /* was OPEN. */
3996
3997#ifdef DEBUGGING
3998 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3999 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4000 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4001 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4002 OP(nxt + 1) = OPTIMIZED; /* was count. */
4003 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4004#endif
4005 }
4006 nogo:
4007
4008 /* Try optimization CURLYX => CURLYM. */
4009 if ( OP(oscan) == CURLYX && data
4010 && !(data->flags & SF_HAS_PAR)
4011 && !(data->flags & SF_HAS_EVAL)
4012 && !deltanext /* atom is fixed width */
4013 && minnext != 0 /* CURLYM can't handle zero width */
4014 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
4015 ) {
4016 /* XXXX How to optimize if data == 0? */
4017 /* Optimize to a simpler form. */
4018 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4019 regnode *nxt2;
4020
4021 OP(oscan) = CURLYM;
4022 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4023 && (OP(nxt2) != WHILEM))
4024 nxt = nxt2;
4025 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4026 /* Need to optimize away parenths. */
4027 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4028 /* Set the parenth number. */
4029 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4030
4031 oscan->flags = (U8)ARG(nxt);
4032 if (RExC_open_parens) {
4033 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4034 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4035 }
4036 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4037 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4038
4039#ifdef DEBUGGING
4040 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4041 OP(nxt + 1) = OPTIMIZED; /* was count. */
4042 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4043 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4044#endif
4045#if 0
4046 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4047 regnode *nnxt = regnext(nxt1);
4048 if (nnxt == nxt) {
4049 if (reg_off_by_arg[OP(nxt1)])
4050 ARG_SET(nxt1, nxt2 - nxt1);
4051 else if (nxt2 - nxt1 < U16_MAX)
4052 NEXT_OFF(nxt1) = nxt2 - nxt1;
4053 else
4054 OP(nxt) = NOTHING; /* Cannot beautify */
4055 }
4056 nxt1 = nnxt;
4057 }
4058#endif
4059 /* Optimize again: */
4060 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4061 NULL, stopparen, recursed, NULL, 0,depth+1);
4062 }
4063 else
4064 oscan->flags = 0;
4065 }
4066 else if ((OP(oscan) == CURLYX)
4067 && (flags & SCF_WHILEM_VISITED_POS)
4068 /* See the comment on a similar expression above.
4069 However, this time it's not a subexpression
4070 we care about, but the expression itself. */
4071 && (maxcount == REG_INFTY)
4072 && data && ++data->whilem_c < 16) {
4073 /* This stays as CURLYX, we can put the count/of pair. */
4074 /* Find WHILEM (as in regexec.c) */
4075 regnode *nxt = oscan + NEXT_OFF(oscan);
4076
4077 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4078 nxt += ARG(nxt);
4079 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4080 | (RExC_whilem_seen << 4)); /* On WHILEM */
4081 }
4082 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4083 pars++;
4084 if (flags & SCF_DO_SUBSTR) {
4085 SV *last_str = NULL;
4086 int counted = mincount != 0;
4087
4088 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4089#if defined(SPARC64_GCC_WORKAROUND)
4090 SSize_t b = 0;
4091 STRLEN l = 0;
4092 const char *s = NULL;
4093 SSize_t old = 0;
4094
4095 if (pos_before >= data->last_start_min)
4096 b = pos_before;
4097 else
4098 b = data->last_start_min;
4099
4100 l = 0;
4101 s = SvPV_const(data->last_found, l);
4102 old = b - data->last_start_min;
4103
4104#else
4105 SSize_t b = pos_before >= data->last_start_min
4106 ? pos_before : data->last_start_min;
4107 STRLEN l;
4108 const char * const s = SvPV_const(data->last_found, l);
4109 SSize_t old = b - data->last_start_min;
4110#endif
4111
4112 if (UTF)
4113 old = utf8_hop((U8*)s, old) - (U8*)s;
4114 l -= old;
4115 /* Get the added string: */
4116 last_str = newSVpvn_utf8(s + old, l, UTF);
4117 if (deltanext == 0 && pos_before == b) {
4118 /* What was added is a constant string */
4119 if (mincount > 1) {
4120 SvGROW(last_str, (mincount * l) + 1);
4121 repeatcpy(SvPVX(last_str) + l,
4122 SvPVX_const(last_str), l, mincount - 1);
4123 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4124 /* Add additional parts. */
4125 SvCUR_set(data->last_found,
4126 SvCUR(data->last_found) - l);
4127 sv_catsv(data->last_found, last_str);
4128 {
4129 SV * sv = data->last_found;
4130 MAGIC *mg =
4131 SvUTF8(sv) && SvMAGICAL(sv) ?
4132 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4133 if (mg && mg->mg_len >= 0)
4134 mg->mg_len += CHR_SVLEN(last_str) - l;
4135 }
4136 data->last_end += l * (mincount - 1);
4137 }
4138 } else {
4139 /* start offset must point into the last copy */
4140 data->last_start_min += minnext * (mincount - 1);
4141 data->last_start_max += is_inf ? SSize_t_MAX
4142 : (maxcount - 1) * (minnext + data->pos_delta);
4143 }
4144 }
4145 /* It is counted once already... */
4146 data->pos_min += minnext * (mincount - counted);
4147#if 0
4148PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4149 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4150 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4151 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4152 (UV)mincount);
4153if (deltanext != SSize_t_MAX)
4154PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4155 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4156 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4157#endif
4158 if (deltanext == SSize_t_MAX ||
4159 -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4160 data->pos_delta = SSize_t_MAX;
4161 else
4162 data->pos_delta += - counted * deltanext +
4163 (minnext + deltanext) * maxcount - minnext * mincount;
4164 if (mincount != maxcount) {
4165 /* Cannot extend fixed substrings found inside
4166 the group. */
4167 SCAN_COMMIT(pRExC_state,data,minlenp);
4168 if (mincount && last_str) {
4169 SV * const sv = data->last_found;
4170 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4171 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4172
4173 if (mg)
4174 mg->mg_len = -1;
4175 sv_setsv(sv, last_str);
4176 data->last_end = data->pos_min;
4177 data->last_start_min =
4178 data->pos_min - CHR_SVLEN(last_str);
4179 data->last_start_max = is_inf
4180 ? SSize_t_MAX
4181 : data->pos_min + data->pos_delta
4182 - CHR_SVLEN(last_str);
4183 }
4184 data->longest = &(data->longest_float);
4185 }
4186 SvREFCNT_dec(last_str);
4187 }
4188 if (data && (fl & SF_HAS_EVAL))
4189 data->flags |= SF_HAS_EVAL;
4190 optimize_curly_tail:
4191 if (OP(oscan) != CURLYX) {
4192 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4193 && NEXT_OFF(next))
4194 NEXT_OFF(oscan) += NEXT_OFF(next);
4195 }
4196 continue;
4197 default: /* REF, and CLUMP only? */
4198 if (flags & SCF_DO_SUBSTR) {
4199 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4200 data->longest = &(data->longest_float);
4201 }
4202 is_inf = is_inf_internal = 1;
4203 if (flags & SCF_DO_STCLASS_OR)
4204 cl_anything(pRExC_state, data->start_class);
4205 flags &= ~SCF_DO_STCLASS;
4206 break;
4207 }
4208 }
4209 else if (OP(scan) == LNBREAK) {
4210 if (flags & SCF_DO_STCLASS) {
4211 int value = 0;
4212 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4213 if (flags & SCF_DO_STCLASS_AND) {
4214 for (value = 0; value < 256; value++)
4215 if (!is_VERTWS_cp(value))
4216 ANYOF_BITMAP_CLEAR(data->start_class, value);
4217 }
4218 else {
4219 for (value = 0; value < 256; value++)
4220 if (is_VERTWS_cp(value))
4221 ANYOF_BITMAP_SET(data->start_class, value);
4222 }
4223 if (flags & SCF_DO_STCLASS_OR)
4224 cl_and(data->start_class, and_withp);
4225 flags &= ~SCF_DO_STCLASS;
4226 }
4227 min++;
4228 delta++; /* Because of the 2 char string cr-lf */
4229 if (flags & SCF_DO_SUBSTR) {
4230 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4231 data->pos_min += 1;
4232 data->pos_delta += 1;
4233 data->longest = &(data->longest_float);
4234 }
4235 }
4236 else if (REGNODE_SIMPLE(OP(scan))) {
4237 int value = 0;
4238
4239 if (flags & SCF_DO_SUBSTR) {
4240 SCAN_COMMIT(pRExC_state,data,minlenp);
4241 data->pos_min++;
4242 }
4243 min++;
4244 if (flags & SCF_DO_STCLASS) {
4245 int loop_max = 256;
4246 CLEAR_SSC_EOS(data->start_class); /* No match on empty */
4247
4248 /* Some of the logic below assumes that switching
4249 locale on will only add false positives. */
4250 switch (PL_regkind[OP(scan)]) {
4251 U8 classnum;
4252
4253 case SANY:
4254 default:
4255#ifdef DEBUGGING
4256 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan));
4257#endif
4258 do_default:
4259 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4260 cl_anything(pRExC_state, data->start_class);
4261 break;
4262 case REG_ANY:
4263 if (OP(scan) == SANY)
4264 goto do_default;
4265 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4266 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4267 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4268 cl_anything(pRExC_state, data->start_class);
4269 }
4270 if (flags & SCF_DO_STCLASS_AND || !value)
4271 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4272 break;
4273 case ANYOF:
4274 if (flags & SCF_DO_STCLASS_AND)
4275 cl_and(data->start_class,
4276 (struct regnode_charclass_class*)scan);
4277 else
4278 cl_or(pRExC_state, data->start_class,
4279 (struct regnode_charclass_class*)scan);
4280 break;
4281 case POSIXA:
4282 loop_max = 128;
4283 /* FALL THROUGH */
4284 case POSIXL:
4285 case POSIXD:
4286 case POSIXU:
4287 classnum = FLAGS(scan);
4288 if (flags & SCF_DO_STCLASS_AND) {
4289 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4290 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
4291 for (value = 0; value < loop_max; value++) {
4292 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4293 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4294 }
4295 }
4296 }
4297 }
4298 else {
4299 if (data->start_class->flags & ANYOF_LOCALE) {
4300 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum));
4301 }
4302 else {
4303
4304 /* Even if under locale, set the bits for non-locale
4305 * in case it isn't a true locale-node. This will
4306 * create false positives if it truly is locale */
4307 for (value = 0; value < loop_max; value++) {
4308 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4309 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4310 }
4311 }
4312 }
4313 }
4314 break;
4315 case NPOSIXA:
4316 loop_max = 128;
4317 /* FALL THROUGH */
4318 case NPOSIXL:
4319 case NPOSIXU:
4320 case NPOSIXD:
4321 classnum = FLAGS(scan);
4322 if (flags & SCF_DO_STCLASS_AND) {
4323 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4324 ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
4325 for (value = 0; value < loop_max; value++) {
4326 if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4327 ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
4328 }
4329 }
4330 }
4331 }
4332 else {
4333 if (data->start_class->flags & ANYOF_LOCALE) {
4334 ANYOF_CLASS_SET(data->start_class, classnum_to_namedclass(classnum) + 1);
4335 }
4336 else {
4337
4338 /* Even if under locale, set the bits for non-locale in
4339 * case it isn't a true locale-node. This will create
4340 * false positives if it truly is locale */
4341 for (value = 0; value < loop_max; value++) {
4342 if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
4343 ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
4344 }
4345 }
4346 if (PL_regkind[OP(scan)] == NPOSIXD) {
4347 data->start_class->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
4348 }
4349 }
4350 }
4351 break;
4352 }
4353 if (flags & SCF_DO_STCLASS_OR)
4354 cl_and(data->start_class, and_withp);
4355 flags &= ~SCF_DO_STCLASS;
4356 }
4357 }
4358 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4359 data->flags |= (OP(scan) == MEOL
4360 ? SF_BEFORE_MEOL
4361 : SF_BEFORE_SEOL);
4362 SCAN_COMMIT(pRExC_state, data, minlenp);
4363
4364 }
4365 else if ( PL_regkind[OP(scan)] == BRANCHJ
4366 /* Lookbehind, or need to calculate parens/evals/stclass: */
4367 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4368 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4369 if ( OP(scan) == UNLESSM &&
4370 scan->flags == 0 &&
4371 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4372 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4373 ) {
4374 regnode *opt;
4375 regnode *upto= regnext(scan);
4376 DEBUG_PARSE_r({
4377 SV * const mysv_val=sv_newmortal();
4378 DEBUG_STUDYDATA("OPFAIL",data,depth);
4379
4380 /*DEBUG_PARSE_MSG("opfail");*/
4381 regprop(RExC_rx, mysv_val, upto);
4382 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4383 SvPV_nolen_const(mysv_val),
4384 (IV)REG_NODE_NUM(upto),
4385 (IV)(upto - scan)
4386 );
4387 });
4388 OP(scan) = OPFAIL;
4389 NEXT_OFF(scan) = upto - scan;
4390 for (opt= scan + 1; opt < upto ; opt++)
4391 OP(opt) = OPTIMIZED;
4392 scan= upto;
4393 continue;
4394 }
4395 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4396 || OP(scan) == UNLESSM )
4397 {
4398 /* Negative Lookahead/lookbehind
4399 In this case we can't do fixed string optimisation.
4400 */
4401
4402 SSize_t deltanext, minnext, fake = 0;
4403 regnode *nscan;
4404 struct regnode_charclass_class intrnl;
4405 int f = 0;
4406
4407 data_fake.flags = 0;
4408 if (data) {
4409 data_fake.whilem_c = data->whilem_c;
4410 data_fake.last_closep = data->last_closep;
4411 }
4412 else
4413 data_fake.last_closep = &fake;
4414 data_fake.pos_delta = delta;
4415 if ( flags & SCF_DO_STCLASS && !scan->flags
4416 && OP(scan) == IFMATCH ) { /* Lookahead */
4417 cl_init(pRExC_state, &intrnl);
4418 data_fake.start_class = &intrnl;
4419 f |= SCF_DO_STCLASS_AND;
4420 }
4421 if (flags & SCF_WHILEM_VISITED_POS)
4422 f |= SCF_WHILEM_VISITED_POS;
4423 next = regnext(scan);
4424 nscan = NEXTOPER(NEXTOPER(scan));
4425 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4426 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4427 if (scan->flags) {
4428 if (deltanext) {
4429 FAIL("Variable length lookbehind not implemented");
4430 }
4431 else if (minnext > (I32)U8_MAX) {
4432 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4433 }
4434 scan->flags = (U8)minnext;
4435 }
4436 if (data) {
4437 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4438 pars++;
4439 if (data_fake.flags & SF_HAS_EVAL)
4440 data->flags |= SF_HAS_EVAL;
4441 data->whilem_c = data_fake.whilem_c;
4442 }
4443 if (f & SCF_DO_STCLASS_AND) {
4444 if (flags & SCF_DO_STCLASS_OR) {
4445 /* OR before, AND after: ideally we would recurse with
4446 * data_fake to get the AND applied by study of the
4447 * remainder of the pattern, and then derecurse;
4448 * *** HACK *** for now just treat as "no information".
4449 * See [perl #56690].
4450 */
4451 cl_init(pRExC_state, data->start_class);
4452 } else {
4453 /* AND before and after: combine and continue */
4454 const int was = TEST_SSC_EOS(data->start_class);
4455
4456 cl_and(data->start_class, &intrnl);
4457 if (was)
4458 SET_SSC_EOS(data->start_class);
4459 }
4460 }
4461 }
4462#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4463 else {
4464 /* Positive Lookahead/lookbehind
4465 In this case we can do fixed string optimisation,
4466 but we must be careful about it. Note in the case of
4467 lookbehind the positions will be offset by the minimum
4468 length of the pattern, something we won't know about
4469 until after the recurse.
4470 */
4471 SSize_t deltanext;
4472 I32 fake = 0;
4473 regnode *nscan;
4474 struct regnode_charclass_class intrnl;
4475 int f = 0;
4476 /* We use SAVEFREEPV so that when the full compile
4477 is finished perl will clean up the allocated
4478 minlens when it's all done. This way we don't
4479 have to worry about freeing them when we know
4480 they wont be used, which would be a pain.
4481 */
4482 SSize_t *minnextp;
4483 Newx( minnextp, 1, SSize_t );
4484 SAVEFREEPV(minnextp);
4485
4486 if (data) {
4487 StructCopy(data, &data_fake, scan_data_t);
4488 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4489 f |= SCF_DO_SUBSTR;
4490 if (scan->flags)
4491 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4492 data_fake.last_found=newSVsv(data->last_found);
4493 }
4494 }
4495 else
4496 data_fake.last_closep = &fake;
4497 data_fake.flags = 0;
4498 data_fake.pos_delta = delta;
4499 if (is_inf)
4500 data_fake.flags |= SF_IS_INF;
4501 if ( flags & SCF_DO_STCLASS && !scan->flags
4502 && OP(scan) == IFMATCH ) { /* Lookahead */
4503 cl_init(pRExC_state, &intrnl);
4504 data_fake.start_class = &intrnl;
4505 f |= SCF_DO_STCLASS_AND;
4506 }
4507 if (flags & SCF_WHILEM_VISITED_POS)
4508 f |= SCF_WHILEM_VISITED_POS;
4509 next = regnext(scan);
4510 nscan = NEXTOPER(NEXTOPER(scan));
4511
4512 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4513 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4514 if (scan->flags) {
4515 if (deltanext) {
4516 FAIL("Variable length lookbehind not implemented");
4517 }
4518 else if (*minnextp > (I32)U8_MAX) {
4519 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4520 }
4521 scan->flags = (U8)*minnextp;
4522 }
4523
4524 *minnextp += min;
4525
4526 if (f & SCF_DO_STCLASS_AND) {
4527 const int was = TEST_SSC_EOS(data.start_class);
4528
4529 cl_and(data->start_class, &intrnl);
4530 if (was)
4531 SET_SSC_EOS(data->start_class);
4532 }
4533 if (data) {
4534 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4535 pars++;
4536 if (data_fake.flags & SF_HAS_EVAL)
4537 data->flags |= SF_HAS_EVAL;
4538 data->whilem_c = data_fake.whilem_c;
4539 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4540 if (RExC_rx->minlen<*minnextp)
4541 RExC_rx->minlen=*minnextp;
4542 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4543 SvREFCNT_dec_NN(data_fake.last_found);
4544
4545 if ( data_fake.minlen_fixed != minlenp )
4546 {
4547 data->offset_fixed= data_fake.offset_fixed;
4548 data->minlen_fixed= data_fake.minlen_fixed;
4549 data->lookbehind_fixed+= scan->flags;
4550 }
4551 if ( data_fake.minlen_float != minlenp )
4552 {
4553 data->minlen_float= data_fake.minlen_float;
4554 data->offset_float_min=data_fake.offset_float_min;
4555 data->offset_float_max=data_fake.offset_float_max;
4556 data->lookbehind_float+= scan->flags;
4557 }
4558 }
4559 }
4560 }
4561#endif
4562 }
4563 else if (OP(scan) == OPEN) {
4564 if (stopparen != (I32)ARG(scan))
4565 pars++;
4566 }
4567 else if (OP(scan) == CLOSE) {
4568 if (stopparen == (I32)ARG(scan)) {
4569 break;
4570 }
4571 if ((I32)ARG(scan) == is_par) {
4572 next = regnext(scan);
4573
4574 if ( next && (OP(next) != WHILEM) && next < last)
4575 is_par = 0; /* Disable optimization */
4576 }
4577 if (data)
4578 *(data->last_closep) = ARG(scan);
4579 }
4580 else if (OP(scan) == EVAL) {
4581 if (data)
4582 data->flags |= SF_HAS_EVAL;
4583 }
4584 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4585 if (flags & SCF_DO_SUBSTR) {
4586 SCAN_COMMIT(pRExC_state,data,minlenp);
4587 flags &= ~SCF_DO_SUBSTR;
4588 }
4589 if (data && OP(scan)==ACCEPT) {
4590 data->flags |= SCF_SEEN_ACCEPT;
4591 if (stopmin > min)
4592 stopmin = min;
4593 }
4594 }
4595 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4596 {
4597 if (flags & SCF_DO_SUBSTR) {
4598 SCAN_COMMIT(pRExC_state,data,minlenp);
4599 data->longest = &(data->longest_float);
4600 }
4601 is_inf = is_inf_internal = 1;
4602 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4603 cl_anything(pRExC_state, data->start_class);
4604 flags &= ~SCF_DO_STCLASS;
4605 }
4606 else if (OP(scan) == GPOS) {
4607 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4608 !(delta || is_inf || (data && data->pos_delta)))
4609 {
4610 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4611 RExC_rx->extflags |= RXf_ANCH_GPOS;
4612 if (RExC_rx->gofs < (STRLEN)min)
4613 RExC_rx->gofs = min;
4614 } else {
4615 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4616 RExC_rx->gofs = 0;
4617 }
4618 }
4619#ifdef TRIE_STUDY_OPT
4620#ifdef FULL_TRIE_STUDY
4621 else if (PL_regkind[OP(scan)] == TRIE) {
4622 /* NOTE - There is similar code to this block above for handling
4623 BRANCH nodes on the initial study. If you change stuff here
4624 check there too. */
4625 regnode *trie_node= scan;
4626 regnode *tail= regnext(scan);
4627 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4628 SSize_t max1 = 0, min1 = SSize_t_MAX;
4629 struct regnode_charclass_class accum;
4630
4631 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4632 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4633 if (flags & SCF_DO_STCLASS)
4634 cl_init_zero(pRExC_state, &accum);
4635
4636 if (!trie->jump) {
4637 min1= trie->minlen;
4638 max1= trie->maxlen;
4639 } else {
4640 const regnode *nextbranch= NULL;
4641 U32 word;
4642
4643 for ( word=1 ; word <= trie->wordcount ; word++)
4644 {
4645 SSize_t deltanext=0, minnext=0, f = 0, fake;
4646 struct regnode_charclass_class this_class;
4647
4648 data_fake.flags = 0;
4649 if (data) {
4650 data_fake.whilem_c = data->whilem_c;
4651 data_fake.last_closep = data->last_closep;
4652 }
4653 else
4654 data_fake.last_closep = &fake;
4655 data_fake.pos_delta = delta;
4656 if (flags & SCF_DO_STCLASS) {
4657 cl_init(pRExC_state, &this_class);
4658 data_fake.start_class = &this_class;
4659 f = SCF_DO_STCLASS_AND;
4660 }
4661 if (flags & SCF_WHILEM_VISITED_POS)
4662 f |= SCF_WHILEM_VISITED_POS;
4663
4664 if (trie->jump[word]) {
4665 if (!nextbranch)
4666 nextbranch = trie_node + trie->jump[0];
4667 scan= trie_node + trie->jump[word];
4668 /* We go from the jump point to the branch that follows
4669 it. Note this means we need the vestigal unused branches
4670 even though they arent otherwise used.
4671 */
4672 minnext = study_chunk(pRExC_state, &scan, minlenp,
4673 &deltanext, (regnode *)nextbranch, &data_fake,
4674 stopparen, recursed, NULL, f,depth+1);
4675 }
4676 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4677 nextbranch= regnext((regnode*)nextbranch);
4678
4679 if (min1 > (SSize_t)(minnext + trie->minlen))
4680 min1 = minnext + trie->minlen;
4681 if (deltanext == SSize_t_MAX) {
4682 is_inf = is_inf_internal = 1;
4683 max1 = SSize_t_MAX;
4684 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
4685 max1 = minnext + deltanext + trie->maxlen;
4686
4687 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4688 pars++;
4689 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4690 if ( stopmin > min + min1)
4691 stopmin = min + min1;
4692 flags &= ~SCF_DO_SUBSTR;
4693 if (data)
4694 data->flags |= SCF_SEEN_ACCEPT;
4695 }
4696 if (data) {
4697 if (data_fake.flags & SF_HAS_EVAL)
4698 data->flags |= SF_HAS_EVAL;
4699 data->whilem_c = data_fake.whilem_c;
4700 }
4701 if (flags & SCF_DO_STCLASS)
4702 cl_or(pRExC_state, &accum, &this_class);
4703 }
4704 }
4705 if (flags & SCF_DO_SUBSTR) {
4706 data->pos_min += min1;
4707 data->pos_delta += max1 - min1;
4708 if (max1 != min1 || is_inf)
4709 data->longest = &(data->longest_float);
4710 }
4711 min += min1;
4712 delta += max1 - min1;
4713 if (flags & SCF_DO_STCLASS_OR) {
4714 cl_or(pRExC_state, data->start_class, &accum);
4715 if (min1) {
4716 cl_and(data->start_class, and_withp);
4717 flags &= ~SCF_DO_STCLASS;
4718 }
4719 }
4720 else if (flags & SCF_DO_STCLASS_AND) {
4721 if (min1) {
4722 cl_and(data->start_class, &accum);
4723 flags &= ~SCF_DO_STCLASS;
4724 }
4725 else {
4726 /* Switch to OR mode: cache the old value of
4727 * data->start_class */
4728 INIT_AND_WITHP;
4729 StructCopy(data->start_class, and_withp,
4730 struct regnode_charclass_class);
4731 flags &= ~SCF_DO_STCLASS_AND;
4732 StructCopy(&accum, data->start_class,
4733 struct regnode_charclass_class);
4734 flags |= SCF_DO_STCLASS_OR;
4735 SET_SSC_EOS(data->start_class);
4736 }
4737 }
4738 scan= tail;
4739 continue;
4740 }
4741#else
4742 else if (PL_regkind[OP(scan)] == TRIE) {
4743 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4744 U8*bang=NULL;
4745
4746 min += trie->minlen;
4747 delta += (trie->maxlen - trie->minlen);
4748 flags &= ~SCF_DO_STCLASS; /* xxx */
4749 if (flags & SCF_DO_SUBSTR) {
4750 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4751 data->pos_min += trie->minlen;
4752 data->pos_delta += (trie->maxlen - trie->minlen);
4753 if (trie->maxlen != trie->minlen)
4754 data->longest = &(data->longest_float);
4755 }
4756 if (trie->jump) /* no more substrings -- for now /grr*/
4757 flags &= ~SCF_DO_SUBSTR;
4758 }
4759#endif /* old or new */
4760#endif /* TRIE_STUDY_OPT */
4761
4762 /* Else: zero-length, ignore. */
4763 scan = regnext(scan);
4764 }
4765 if (frame) {
4766 last = frame->last;
4767 scan = frame->next;
4768 stopparen = frame->stop;
4769 frame = frame->prev;
4770 goto fake_study_recurse;
4771 }
4772
4773 finish:
4774 assert(!frame);
4775 DEBUG_STUDYDATA("pre-fin:",data,depth);
4776
4777 *scanp = scan;
4778 *deltap = is_inf_internal ? SSize_t_MAX : delta;
4779 if (flags & SCF_DO_SUBSTR && is_inf)
4780 data->pos_delta = SSize_t_MAX - data->pos_min;
4781 if (is_par > (I32)U8_MAX)
4782 is_par = 0;
4783 if (is_par && pars==1 && data) {
4784 data->flags |= SF_IN_PAR;
4785 data->flags &= ~SF_HAS_PAR;
4786 }
4787 else if (pars && data) {
4788 data->flags |= SF_HAS_PAR;
4789 data->flags &= ~SF_IN_PAR;
4790 }
4791 if (flags & SCF_DO_STCLASS_OR)
4792 cl_and(data->start_class, and_withp);
4793 if (flags & SCF_TRIE_RESTUDY)
4794 data->flags |= SCF_TRIE_RESTUDY;
4795
4796 DEBUG_STUDYDATA("post-fin:",data,depth);
4797
4798 return min < stopmin ? min : stopmin;
4799}
4800
4801STATIC U32
4802S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4803{
4804 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4805
4806 PERL_ARGS_ASSERT_ADD_DATA;
4807
4808 Renewc(RExC_rxi->data,
4809 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4810 char, struct reg_data);
4811 if(count)
4812 Renew(RExC_rxi->data->what, count + n, U8);
4813 else
4814 Newx(RExC_rxi->data->what, n, U8);
4815 RExC_rxi->data->count = count + n;
4816 Copy(s, RExC_rxi->data->what + count, n, U8);
4817 return count;
4818}
4819
4820/*XXX: todo make this not included in a non debugging perl */
4821#ifndef PERL_IN_XSUB_RE
4822void
4823Perl_reginitcolors(pTHX)
4824{
4825 dVAR;
4826 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4827 if (s) {
4828 char *t = savepv(s);
4829 int i = 0;
4830 PL_colors[0] = t;
4831 while (++i < 6) {
4832 t = strchr(t, '\t');
4833 if (t) {
4834 *t = '\0';
4835 PL_colors[i] = ++t;
4836 }
4837 else
4838 PL_colors[i] = t = (char *)"";
4839 }
4840 } else {
4841 int i = 0;
4842 while (i < 6)
4843 PL_colors[i++] = (char *)"";
4844 }
4845 PL_colorset = 1;
4846}
4847#endif
4848
4849
4850#ifdef TRIE_STUDY_OPT
4851#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
4852 STMT_START { \
4853 if ( \
4854 (data.flags & SCF_TRIE_RESTUDY) \
4855 && ! restudied++ \
4856 ) { \
4857 dOsomething; \
4858 goto reStudy; \
4859 } \
4860 } STMT_END
4861#else
4862#define CHECK_RESTUDY_GOTO_butfirst
4863#endif
4864
4865/*
4866 * pregcomp - compile a regular expression into internal code
4867 *
4868 * Decides which engine's compiler to call based on the hint currently in
4869 * scope
4870 */
4871
4872#ifndef PERL_IN_XSUB_RE
4873
4874/* return the currently in-scope regex engine (or the default if none) */
4875
4876regexp_engine const *
4877Perl_current_re_engine(pTHX)
4878{
4879 dVAR;
4880
4881 if (IN_PERL_COMPILETIME) {
4882 HV * const table = GvHV(PL_hintgv);
4883 SV **ptr;
4884
4885 if (!table)
4886 return &PL_core_reg_engine;
4887 ptr = hv_fetchs(table, "regcomp", FALSE);
4888 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4889 return &PL_core_reg_engine;
4890 return INT2PTR(regexp_engine*,SvIV(*ptr));
4891 }
4892 else {
4893 SV *ptr;
4894 if (!PL_curcop->cop_hints_hash)
4895 return &PL_core_reg_engine;
4896 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4897 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4898 return &PL_core_reg_engine;
4899 return INT2PTR(regexp_engine*,SvIV(ptr));
4900 }
4901}
4902
4903
4904REGEXP *
4905Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4906{
4907 dVAR;
4908 regexp_engine const *eng = current_re_engine();
4909 GET_RE_DEBUG_FLAGS_DECL;
4910
4911 PERL_ARGS_ASSERT_PREGCOMP;
4912
4913 /* Dispatch a request to compile a regexp to correct regexp engine. */
4914 DEBUG_COMPILE_r({
4915 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4916 PTR2UV(eng));
4917 });
4918 return CALLREGCOMP_ENG(eng, pattern, flags);
4919}
4920#endif
4921
4922/* public(ish) entry point for the perl core's own regex compiling code.
4923 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4924 * pattern rather than a list of OPs, and uses the internal engine rather
4925 * than the current one */
4926
4927REGEXP *
4928Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4929{
4930 SV *pat = pattern; /* defeat constness! */
4931 PERL_ARGS_ASSERT_RE_COMPILE;
4932 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4933#ifdef PERL_IN_XSUB_RE
4934 &my_reg_engine,
4935#else
4936 &PL_core_reg_engine,
4937#endif
4938 NULL, NULL, rx_flags, 0);
4939}
4940
4941
4942/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
4943 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
4944 * point to the realloced string and length.
4945 *
4946 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
4947 * stuff added */
4948
4949static void
4950S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
4951 char **pat_p, STRLEN *plen_p, int num_code_blocks)
4952{
4953 U8 *const src = (U8*)*pat_p;
4954 U8 *dst;
4955 int n=0;
4956 STRLEN s = 0, d = 0;
4957 bool do_end = 0;
4958 GET_RE_DEBUG_FLAGS_DECL;
4959
4960 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4961 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4962
4963 Newx(dst, *plen_p * 2 + 1, U8);
4964
4965 while (s < *plen_p) {
4966 if (NATIVE_IS_INVARIANT(src[s]))
4967 dst[d] = src[s];
4968 else {
4969 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
4970 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
4971 }
4972 if (n < num_code_blocks) {
4973 if (!do_end && pRExC_state->code_blocks[n].start == s) {
4974 pRExC_state->code_blocks[n].start = d;
4975 assert(dst[d] == '(');
4976 do_end = 1;
4977 }
4978 else if (do_end && pRExC_state->code_blocks[n].end == s) {
4979 pRExC_state->code_blocks[n].end = d;
4980 assert(dst[d] == ')');
4981 do_end = 0;
4982 n++;
4983 }
4984 }
4985 s++;
4986 d++;
4987 }
4988 dst[d] = '\0';
4989 *plen_p = d;
4990 *pat_p = (char*) dst;
4991 SAVEFREEPV(*pat_p);
4992 RExC_orig_utf8 = RExC_utf8 = 1;
4993}
4994
4995
4996
4997/* S_concat_pat(): concatenate a list of args to the pattern string pat,
4998 * while recording any code block indices, and handling overloading,
4999 * nested qr// objects etc. If pat is null, it will allocate a new
5000 * string, or just return the first arg, if there's only one.
5001 *
5002 * Returns the malloced/updated pat.
5003 * patternp and pat_count is the array of SVs to be concatted;
5004 * oplist is the optional list of ops that generated the SVs;
5005 * recompile_p is a pointer to a boolean that will be set if
5006 * the regex will need to be recompiled.
5007 * delim, if non-null is an SV that will be inserted between each element
5008 */
5009
5010static SV*
5011S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5012 SV *pat, SV ** const patternp, int pat_count,
5013 OP *oplist, bool *recompile_p, SV *delim)
5014{
5015 SV **svp;
5016 int n = 0;
5017 bool use_delim = FALSE;
5018 bool alloced = FALSE;
5019
5020 /* if we know we have at least two args, create an empty string,
5021 * then concatenate args to that. For no args, return an empty string */
5022 if (!pat && pat_count != 1) {
5023 pat = newSVpvn("", 0);
5024 SAVEFREESV(pat);
5025 alloced = TRUE;
5026 }
5027
5028 for (svp = patternp; svp < patternp + pat_count; svp++) {
5029 SV *sv;
5030 SV *rx = NULL;
5031 STRLEN orig_patlen = 0;
5032 bool code = 0;
5033 SV *msv = use_delim ? delim : *svp;
5034
5035 /* if we've got a delimiter, we go round the loop twice for each
5036 * svp slot (except the last), using the delimiter the second
5037 * time round */
5038 if (use_delim) {
5039 svp--;
5040 use_delim = FALSE;
5041 }
5042 else if (delim)
5043 use_delim = TRUE;
5044
5045 if (SvTYPE(msv) == SVt_PVAV) {
5046 /* we've encountered an interpolated array within
5047 * the pattern, e.g. /...@a..../. Expand the list of elements,
5048 * then recursively append elements.
5049 * The code in this block is based on S_pushav() */
5050
5051 AV *const av = (AV*)msv;
5052 const I32 maxarg = AvFILL(av) + 1;
5053 SV **array;
5054
5055 if (oplist) {
5056 assert(oplist->op_type == OP_PADAV
5057 || oplist->op_type == OP_RV2AV);
5058 oplist = oplist->op_sibling;;
5059 }
5060
5061 if (SvRMAGICAL(av)) {
5062 U32 i;
5063
5064 Newx(array, maxarg, SV*);
5065 SAVEFREEPV(array);
5066 for (i=0; i < (U32)maxarg; i++) {
5067 SV ** const svp = av_fetch(av, i, FALSE);
5068 array[i] = svp ? *svp : &PL_sv_undef;
5069 }
5070 }
5071 else
5072 array = AvARRAY(av);
5073
5074 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5075 array, maxarg, NULL, recompile_p,
5076 /* $" */
5077 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5078
5079 continue;
5080 }
5081
5082
5083 /* we make the assumption here that each op in the list of
5084 * op_siblings maps to one SV pushed onto the stack,
5085 * except for code blocks, with have both an OP_NULL and
5086 * and OP_CONST.
5087 * This allows us to match up the list of SVs against the
5088 * list of OPs to find the next code block.
5089 *
5090 * Note that PUSHMARK PADSV PADSV ..
5091 * is optimised to
5092 * PADRANGE PADSV PADSV ..
5093 * so the alignment still works. */
5094
5095 if (oplist) {
5096 if (oplist->op_type == OP_NULL
5097 && (oplist->op_flags & OPf_SPECIAL))
5098 {
5099 assert(n < pRExC_state->num_code_blocks);
5100 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5101 pRExC_state->code_blocks[n].block = oplist;
5102 pRExC_state->code_blocks[n].src_regex = NULL;
5103 n++;
5104 code = 1;
5105 oplist = oplist->op_sibling; /* skip CONST */
5106 assert(oplist);
5107 }
5108 oplist = oplist->op_sibling;;
5109 }
5110
5111 /* apply magic and QR overloading to arg */
5112
5113 SvGETMAGIC(msv);
5114 if (SvROK(msv) && SvAMAGIC(msv)) {
5115 SV *sv = AMG_CALLunary(msv, regexp_amg);
5116 if (sv) {
5117 if (SvROK(sv))
5118 sv = SvRV(sv);
5119 if (SvTYPE(sv) != SVt_REGEXP)
5120 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5121 msv = sv;
5122 }
5123 }
5124
5125 /* try concatenation overload ... */
5126 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5127 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5128 {
5129 sv_setsv(pat, sv);
5130 /* overloading involved: all bets are off over literal
5131 * code. Pretend we haven't seen it */
5132 pRExC_state->num_code_blocks -= n;
5133 n = 0;
5134 }
5135 else {
5136 /* ... or failing that, try "" overload */
5137 while (SvAMAGIC(msv)
5138 && (sv = AMG_CALLunary(msv, string_amg))
5139 && sv != msv
5140 && !( SvROK(msv)
5141 && SvROK(sv)
5142 && SvRV(msv) == SvRV(sv))
5143 ) {
5144 msv = sv;
5145 SvGETMAGIC(msv);
5146 }
5147 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5148 msv = SvRV(msv);
5149
5150 if (pat) {
5151 /* this is a partially unrolled
5152 * sv_catsv_nomg(pat, msv);
5153 * that allows us to adjust code block indices if
5154 * needed */
5155 STRLEN dlen;
5156 char *dst = SvPV_force_nomg(pat, dlen);
5157 orig_patlen = dlen;
5158 if (SvUTF8(msv) && !SvUTF8(pat)) {
5159 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5160 sv_setpvn(pat, dst, dlen);
5161 SvUTF8_on(pat);
5162 }
5163 sv_catsv_nomg(pat, msv);
5164 rx = msv;
5165 }
5166 else
5167 pat = msv;
5168
5169 if (code)
5170 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5171 }
5172
5173 /* extract any code blocks within any embedded qr//'s */
5174 if (rx && SvTYPE(rx) == SVt_REGEXP
5175 && RX_ENGINE((REGEXP*)rx)->op_comp)
5176 {
5177
5178 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5179 if (ri->num_code_blocks) {
5180 int i;
5181 /* the presence of an embedded qr// with code means
5182 * we should always recompile: the text of the
5183 * qr// may not have changed, but it may be a
5184 * different closure than last time */
5185 *recompile_p = 1;
5186 Renew(pRExC_state->code_blocks,
5187 pRExC_state->num_code_blocks + ri->num_code_blocks,
5188 struct reg_code_block);
5189 pRExC_state->num_code_blocks += ri->num_code_blocks;
5190
5191 for (i=0; i < ri->num_code_blocks; i++) {
5192 struct reg_code_block *src, *dst;
5193 STRLEN offset = orig_patlen
5194 + ReANY((REGEXP *)rx)->pre_prefix;
5195 assert(n < pRExC_state->num_code_blocks);
5196 src = &ri->code_blocks[i];
5197 dst = &pRExC_state->code_blocks[n];
5198 dst->start = src->start + offset;
5199 dst->end = src->end + offset;
5200 dst->block = src->block;
5201 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5202 src->src_regex
5203 ? src->src_regex
5204 : (REGEXP*)rx);
5205 n++;
5206 }
5207 }
5208 }
5209 }
5210 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5211 if (alloced)
5212 SvSETMAGIC(pat);
5213
5214 return pat;
5215}
5216
5217
5218
5219/* see if there are any run-time code blocks in the pattern.
5220 * False positives are allowed */
5221
5222static bool
5223S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5224 char *pat, STRLEN plen)
5225{
5226 int n = 0;
5227 STRLEN s;
5228
5229 for (s = 0; s < plen; s++) {
5230 if (n < pRExC_state->num_code_blocks
5231 && s == pRExC_state->code_blocks[n].start)
5232 {
5233 s = pRExC_state->code_blocks[n].end;
5234 n++;
5235 continue;
5236 }
5237 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5238 * positives here */
5239 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5240 (pat[s+2] == '{'
5241 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5242 )
5243 return 1;
5244 }
5245 return 0;
5246}
5247
5248/* Handle run-time code blocks. We will already have compiled any direct
5249 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5250 * copy of it, but with any literal code blocks blanked out and
5251 * appropriate chars escaped; then feed it into
5252 *
5253 * eval "qr'modified_pattern'"
5254 *
5255 * For example,
5256 *
5257 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5258 *
5259 * becomes
5260 *
5261 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5262 *
5263 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5264 * and merge them with any code blocks of the original regexp.
5265 *
5266 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5267 * instead, just save the qr and return FALSE; this tells our caller that
5268 * the original pattern needs upgrading to utf8.
5269 */
5270
5271static bool
5272S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5273 char *pat, STRLEN plen)
5274{
5275 SV *qr;
5276
5277 GET_RE_DEBUG_FLAGS_DECL;
5278
5279 if (pRExC_state->runtime_code_qr) {
5280 /* this is the second time we've been called; this should
5281 * only happen if the main pattern got upgraded to utf8
5282 * during compilation; re-use the qr we compiled first time
5283 * round (which should be utf8 too)
5284 */
5285 qr = pRExC_state->runtime_code_qr;
5286 pRExC_state->runtime_code_qr = NULL;
5287 assert(RExC_utf8 && SvUTF8(qr));
5288 }
5289 else {
5290 int n = 0;
5291 STRLEN s;
5292 char *p, *newpat;
5293 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5294 SV *sv, *qr_ref;
5295 dSP;
5296
5297 /* determine how many extra chars we need for ' and \ escaping */
5298 for (s = 0; s < plen; s++) {
5299 if (pat[s] == '\'' || pat[s] == '\\')
5300 newlen++;
5301 }
5302
5303 Newx(newpat, newlen, char);
5304 p = newpat;
5305 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5306
5307 for (s = 0; s < plen; s++) {
5308 if (n < pRExC_state->num_code_blocks
5309 && s == pRExC_state->code_blocks[n].start)
5310 {
5311 /* blank out literal code block */
5312 assert(pat[s] == '(');
5313 while (s <= pRExC_state->code_blocks[n].end) {
5314 *p++ = '_';
5315 s++;
5316 }
5317 s--;
5318 n++;
5319 continue;
5320 }
5321 if (pat[s] == '\'' || pat[s] == '\\')
5322 *p++ = '\\';
5323 *p++ = pat[s];
5324 }
5325 *p++ = '\'';
5326 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5327 *p++ = 'x';
5328 *p++ = '\0';
5329 DEBUG_COMPILE_r({
5330 PerlIO_printf(Perl_debug_log,
5331 "%sre-parsing pattern for runtime code:%s %s\n",
5332 PL_colors[4],PL_colors[5],newpat);
5333 });
5334
5335 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5336 Safefree(newpat);
5337
5338 ENTER;
5339 SAVETMPS;
5340 save_re_context();
5341 PUSHSTACKi(PERLSI_REQUIRE);
5342 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5343 * parsing qr''; normally only q'' does this. It also alters
5344 * hints handling */
5345 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5346 SvREFCNT_dec_NN(sv);
5347 SPAGAIN;
5348 qr_ref = POPs;
5349 PUTBACK;
5350 {
5351 SV * const errsv = ERRSV;
5352 if (SvTRUE_NN(errsv))
5353 {
5354 Safefree(pRExC_state->code_blocks);
5355 /* use croak_sv ? */
5356 Perl_croak_nocontext("%s", SvPV_nolen_const(errsv));
5357 }
5358 }
5359 assert(SvROK(qr_ref));
5360 qr = SvRV(qr_ref);
5361 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5362 /* the leaving below frees the tmp qr_ref.
5363 * Give qr a life of its own */
5364 SvREFCNT_inc(qr);
5365 POPSTACK;
5366 FREETMPS;
5367 LEAVE;
5368
5369 }
5370
5371 if (!RExC_utf8 && SvUTF8(qr)) {
5372 /* first time through; the pattern got upgraded; save the
5373 * qr for the next time through */
5374 assert(!pRExC_state->runtime_code_qr);
5375 pRExC_state->runtime_code_qr = qr;
5376 return 0;
5377 }
5378
5379
5380 /* extract any code blocks within the returned qr// */
5381
5382
5383 /* merge the main (r1) and run-time (r2) code blocks into one */
5384 {
5385 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
5386 struct reg_code_block *new_block, *dst;
5387 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5388 int i1 = 0, i2 = 0;
5389
5390 if (!r2->num_code_blocks) /* we guessed wrong */
5391 {
5392 SvREFCNT_dec_NN(qr);
5393 return 1;
5394 }
5395
5396 Newx(new_block,
5397 r1->num_code_blocks + r2->num_code_blocks,
5398 struct reg_code_block);
5399 dst = new_block;
5400
5401 while ( i1 < r1->num_code_blocks
5402 || i2 < r2->num_code_blocks)
5403 {
5404 struct reg_code_block *src;
5405 bool is_qr = 0;
5406
5407 if (i1 == r1->num_code_blocks) {
5408 src = &r2->code_blocks[i2++];
5409 is_qr = 1;
5410 }
5411 else if (i2 == r2->num_code_blocks)
5412 src = &r1->code_blocks[i1++];
5413 else if ( r1->code_blocks[i1].start
5414 < r2->code_blocks[i2].start)
5415 {
5416 src = &r1->code_blocks[i1++];
5417 assert(src->end < r2->code_blocks[i2].start);
5418 }
5419 else {
5420 assert( r1->code_blocks[i1].start
5421 > r2->code_blocks[i2].start);
5422 src = &r2->code_blocks[i2++];
5423 is_qr = 1;
5424 assert(src->end < r1->code_blocks[i1].start);
5425 }
5426
5427 assert(pat[src->start] == '(');
5428 assert(pat[src->end] == ')');
5429 dst->start = src->start;
5430 dst->end = src->end;
5431 dst->block = src->block;
5432 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5433 : src->src_regex;
5434 dst++;
5435 }
5436 r1->num_code_blocks += r2->num_code_blocks;
5437 Safefree(r1->code_blocks);
5438 r1->code_blocks = new_block;
5439 }
5440
5441 SvREFCNT_dec_NN(qr);
5442 return 1;
5443}
5444
5445
5446STATIC bool
5447S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
5448 SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
5449{
5450 /* This is the common code for setting up the floating and fixed length
5451 * string data extracted from Perl_re_op_compile() below. Returns a boolean
5452 * as to whether succeeded or not */
5453
5454 I32 t;
5455 SSize_t ml;
5456
5457 if (! (longest_length
5458 || (eol /* Can't have SEOL and MULTI */
5459 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5460 )
5461 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5462 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5463 {
5464 return FALSE;
5465 }
5466
5467 /* copy the information about the longest from the reg_scan_data
5468 over to the program. */
5469 if (SvUTF8(sv_longest)) {
5470 *rx_utf8 = sv_longest;
5471 *rx_substr = NULL;
5472 } else {
5473 *rx_substr = sv_longest;
5474 *rx_utf8 = NULL;
5475 }
5476 /* end_shift is how many chars that must be matched that
5477 follow this item. We calculate it ahead of time as once the
5478 lookbehind offset is added in we lose the ability to correctly
5479 calculate it.*/
5480 ml = minlen ? *(minlen) : (SSize_t)longest_length;
5481 *rx_end_shift = ml - offset
5482 - longest_length + (SvTAIL(sv_longest) != 0)
5483 + lookbehind;
5484
5485 t = (eol/* Can't have SEOL and MULTI */
5486 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5487 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5488
5489 return TRUE;
5490}
5491
5492/*
5493 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5494 * regular expression into internal code.
5495 * The pattern may be passed either as:
5496 * a list of SVs (patternp plus pat_count)
5497 * a list of OPs (expr)
5498 * If both are passed, the SV list is used, but the OP list indicates
5499 * which SVs are actually pre-compiled code blocks
5500 *
5501 * The SVs in the list have magic and qr overloading applied to them (and
5502 * the list may be modified in-place with replacement SVs in the latter
5503 * case).
5504 *
5505 * If the pattern hasn't changed from old_re, then old_re will be
5506 * returned.
5507 *
5508 * eng is the current engine. If that engine has an op_comp method, then
5509 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5510 * do the initial concatenation of arguments and pass on to the external
5511 * engine.
5512 *
5513 * If is_bare_re is not null, set it to a boolean indicating whether the
5514 * arg list reduced (after overloading) to a single bare regex which has
5515 * been returned (i.e. /$qr/).
5516 *
5517 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5518 *
5519 * pm_flags contains the PMf_* flags, typically based on those from the
5520 * pm_flags field of the related PMOP. Currently we're only interested in
5521 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5522 *
5523 * We can't allocate space until we know how big the compiled form will be,
5524 * but we can't compile it (and thus know how big it is) until we've got a
5525 * place to put the code. So we cheat: we compile it twice, once with code
5526 * generation turned off and size counting turned on, and once "for real".
5527 * This also means that we don't allocate space until we are sure that the
5528 * thing really will compile successfully, and we never have to move the
5529 * code and thus invalidate pointers into it. (Note that it has to be in
5530 * one piece because free() must be able to free it all.) [NB: not true in perl]
5531 *
5532 * Beware that the optimization-preparation code in here knows about some
5533 * of the structure of the compiled regexp. [I'll say.]
5534 */
5535
5536REGEXP *
5537Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5538 OP *expr, const regexp_engine* eng, REGEXP *old_re,
5539 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5540{
5541 dVAR;
5542 REGEXP *rx;
5543 struct regexp *r;
5544 regexp_internal *ri;
5545 STRLEN plen;
5546 char *exp;
5547 regnode *scan;
5548 I32 flags;
5549 SSize_t minlen = 0;
5550 U32 rx_flags;
5551 SV *pat;
5552 SV *code_blocksv = NULL;
5553 SV** new_patternp = patternp;
5554
5555 /* these are all flags - maybe they should be turned
5556 * into a single int with different bit masks */
5557 I32 sawlookahead = 0;
5558 I32 sawplus = 0;
5559 I32 sawopen = 0;
5560 I32 sawminmod = 0;
5561
5562 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5563 bool recompile = 0;
5564 bool runtime_code = 0;
5565 scan_data_t data;
5566 RExC_state_t RExC_state;
5567 RExC_state_t * const pRExC_state = &RExC_state;
5568#ifdef TRIE_STUDY_OPT
5569 int restudied = 0;
5570 RExC_state_t copyRExC_state;
5571#endif
5572 GET_RE_DEBUG_FLAGS_DECL;
5573
5574 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5575
5576 DEBUG_r(if (!PL_colorset) reginitcolors());
5577
5578#ifndef PERL_IN_XSUB_RE
5579 /* Initialize these here instead of as-needed, as is quick and avoids
5580 * having to test them each time otherwise */
5581 if (! PL_AboveLatin1) {
5582 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5583 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5584 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5585
5586 PL_L1Posix_ptrs[_CC_ALPHANUMERIC]
5587 = _new_invlist_C_array(L1PosixAlnum_invlist);
5588 PL_Posix_ptrs[_CC_ALPHANUMERIC]
5589 = _new_invlist_C_array(PosixAlnum_invlist);
5590
5591 PL_L1Posix_ptrs[_CC_ALPHA]
5592 = _new_invlist_C_array(L1PosixAlpha_invlist);
5593 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(PosixAlpha_invlist);
5594
5595 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(PosixBlank_invlist);
5596 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(XPosixBlank_invlist);
5597
5598 /* Cased is the same as Alpha in the ASCII range */
5599 PL_L1Posix_ptrs[_CC_CASED] = _new_invlist_C_array(L1Cased_invlist);
5600 PL_Posix_ptrs[_CC_CASED] = _new_invlist_C_array(PosixAlpha_invlist);
5601
5602 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(PosixCntrl_invlist);
5603 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(XPosixCntrl_invlist);
5604
5605 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5606 PL_L1Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(PosixDigit_invlist);
5607
5608 PL_L1Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(L1PosixGraph_invlist);
5609 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(PosixGraph_invlist);
5610
5611 PL_L1Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(L1PosixLower_invlist);
5612 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(PosixLower_invlist);
5613
5614 PL_L1Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(L1PosixPrint_invlist);
5615 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(PosixPrint_invlist);
5616
5617 PL_L1Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(L1PosixPunct_invlist);
5618 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(PosixPunct_invlist);
5619
5620 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(PerlSpace_invlist);
5621 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(XPerlSpace_invlist);
5622 PL_Posix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(PosixSpace_invlist);
5623 PL_XPosix_ptrs[_CC_PSXSPC] = _new_invlist_C_array(XPosixSpace_invlist);
5624
5625 PL_L1Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(L1PosixUpper_invlist);
5626 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(PosixUpper_invlist);
5627
5628 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(VertSpace_invlist);
5629
5630 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(PosixWord_invlist);
5631 PL_L1Posix_ptrs[_CC_WORDCHAR]
5632 = _new_invlist_C_array(L1PosixWord_invlist);
5633
5634 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(PosixXDigit_invlist);
5635 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(XPosixXDigit_invlist);
5636
5637 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5638 }
5639#endif
5640
5641 pRExC_state->code_blocks = NULL;
5642 pRExC_state->num_code_blocks = 0;
5643
5644 if (is_bare_re)
5645 *is_bare_re = FALSE;
5646
5647 if (expr && (expr->op_type == OP_LIST ||
5648 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5649 /* allocate code_blocks if needed */
5650 OP *o;
5651 int ncode = 0;
5652
5653 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
5654 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5655 ncode++; /* count of DO blocks */
5656 if (ncode) {
5657 pRExC_state->num_code_blocks = ncode;
5658 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5659 }
5660 }
5661
5662 if (!pat_count) {
5663 /* compile-time pattern with just OP_CONSTs and DO blocks */
5664
5665 int n;
5666 OP *o;
5667
5668 /* find how many CONSTs there are */
5669 assert(expr);
5670 n = 0;
5671 if (expr->op_type == OP_CONST)
5672 n = 1;
5673 else
5674 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5675 if (o->op_type == OP_CONST)
5676 n++;
5677 }
5678
5679 /* fake up an SV array */
5680
5681 assert(!new_patternp);
5682 Newx(new_patternp, n, SV*);
5683 SAVEFREEPV(new_patternp);
5684 pat_count = n;
5685
5686 n = 0;
5687 if (expr->op_type == OP_CONST)
5688 new_patternp[n] = cSVOPx_sv(expr);
5689 else
5690 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5691 if (o->op_type == OP_CONST)
5692 new_patternp[n++] = cSVOPo_sv;
5693 }
5694
5695 }
5696
5697 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5698 "Assembling pattern from %d elements%s\n", pat_count,
5699 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5700
5701 /* set expr to the first arg op */
5702
5703 if (pRExC_state->num_code_blocks
5704 && expr->op_type != OP_CONST)
5705 {
5706 expr = cLISTOPx(expr)->op_first;
5707 assert( expr->op_type == OP_PUSHMARK
5708 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
5709 || expr->op_type == OP_PADRANGE);
5710 expr = expr->op_sibling;
5711 }
5712
5713 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
5714 expr, &recompile, NULL);
5715
5716 /* handle bare (possibly after overloading) regex: foo =~ $re */
5717 {
5718 SV *re = pat;
5719 if (SvROK(re))
5720 re = SvRV(re);
5721 if (SvTYPE(re) == SVt_REGEXP) {
5722 if (is_bare_re)
5723 *is_bare_re = TRUE;
5724 SvREFCNT_inc(re);
5725 Safefree(pRExC_state->code_blocks);
5726 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5727 "Precompiled pattern%s\n",
5728 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
5729
5730 return (REGEXP*)re;
5731 }
5732 }
5733
5734 exp = SvPV_nomg(pat, plen);
5735
5736 if (!eng->op_comp) {
5737 if ((SvUTF8(pat) && IN_BYTES)
5738 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5739 {
5740 /* make a temporary copy; either to convert to bytes,
5741 * or to avoid repeating get-magic / overloaded stringify */
5742 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5743 (IN_BYTES ? 0 : SvUTF8(pat)));
5744 }
5745 Safefree(pRExC_state->code_blocks);
5746 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5747 }
5748
5749 /* ignore the utf8ness if the pattern is 0 length */
5750 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5751 RExC_uni_semantics = 0;
5752 RExC_contains_locale = 0;
5753 pRExC_state->runtime_code_qr = NULL;
5754
5755 DEBUG_COMPILE_r({
5756 SV *dsv= sv_newmortal();
5757 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
5758 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5759 PL_colors[4],PL_colors[5],s);
5760 });
5761
5762 redo_first_pass:
5763 /* we jump here if we upgrade the pattern to utf8 and have to
5764 * recompile */
5765
5766 if ((pm_flags & PMf_USE_RE_EVAL)
5767 /* this second condition covers the non-regex literal case,
5768 * i.e. $foo =~ '(?{})'. */
5769 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
5770 )
5771 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
5772
5773 /* return old regex if pattern hasn't changed */
5774 /* XXX: note in the below we have to check the flags as well as the pattern.
5775 *
5776 * Things get a touch tricky as we have to compare the utf8 flag independently
5777 * from the compile flags.
5778 */
5779
5780 if ( old_re
5781 && !recompile
5782 && !!RX_UTF8(old_re) == !!RExC_utf8
5783 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
5784 && RX_PRECOMP(old_re)
5785 && RX_PRELEN(old_re) == plen
5786 && memEQ(RX_PRECOMP(old_re), exp, plen)
5787 && !runtime_code /* with runtime code, always recompile */ )
5788 {
5789 Safefree(pRExC_state->code_blocks);
5790 return old_re;
5791 }
5792
5793 rx_flags = orig_rx_flags;
5794
5795 if (initial_charset == REGEX_LOCALE_CHARSET) {
5796 RExC_contains_locale = 1;
5797 }
5798 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5799
5800 /* Set to use unicode semantics if the pattern is in utf8 and has the
5801 * 'depends' charset specified, as it means unicode when utf8 */
5802 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5803 }
5804
5805 RExC_precomp = exp;
5806 RExC_flags = rx_flags;
5807 RExC_pm_flags = pm_flags;
5808
5809 if (runtime_code) {
5810 if (TAINTING_get && TAINT_get)
5811 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5812
5813 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5814 /* whoops, we have a non-utf8 pattern, whilst run-time code
5815 * got compiled as utf8. Try again with a utf8 pattern */
5816 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5817 pRExC_state->num_code_blocks);
5818 goto redo_first_pass;
5819 }
5820 }
5821 assert(!pRExC_state->runtime_code_qr);
5822
5823 RExC_sawback = 0;
5824
5825 RExC_seen = 0;
5826 RExC_in_lookbehind = 0;
5827 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5828 RExC_extralen = 0;
5829 RExC_override_recoding = 0;
5830 RExC_in_multi_char_class = 0;
5831
5832 /* First pass: determine size, legality. */
5833 RExC_parse = exp;
5834 RExC_start = exp;
5835 RExC_end = exp + plen;
5836 RExC_naughty = 0;
5837 RExC_npar = 1;
5838 RExC_nestroot = 0;
5839 RExC_size = 0L;
5840 RExC_emit = &RExC_emit_dummy;
5841 RExC_whilem_seen = 0;
5842 RExC_open_parens = NULL;
5843 RExC_close_parens = NULL;
5844 RExC_opend = NULL;
5845 RExC_paren_names = NULL;
5846#ifdef DEBUGGING
5847 RExC_paren_name_list = NULL;
5848#endif
5849 RExC_recurse = NULL;
5850 RExC_recurse_count = 0;
5851 pRExC_state->code_index = 0;
5852
5853#if 0 /* REGC() is (currently) a NOP at the first pass.
5854 * Clever compilers notice this and complain. --jhi */
5855 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5856#endif
5857 DEBUG_PARSE_r(
5858 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5859 RExC_lastnum=0;
5860 RExC_lastparse=NULL;
5861 );
5862 /* reg may croak on us, not giving us a chance to free
5863 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
5864 need it to survive as long as the regexp (qr/(?{})/).
5865 We must check that code_blocksv is not already set, because we may
5866 have jumped back to restart the sizing pass. */
5867 if (pRExC_state->code_blocks && !code_blocksv) {
5868 code_blocksv = newSV_type(SVt_PV);
5869 SAVEFREESV(code_blocksv);
5870 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
5871 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
5872 }
5873 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5874 /* It's possible to write a regexp in ascii that represents Unicode
5875 codepoints outside of the byte range, such as via \x{100}. If we
5876 detect such a sequence we have to convert the entire pattern to utf8
5877 and then recompile, as our sizing calculation will have been based
5878 on 1 byte == 1 character, but we will need to use utf8 to encode
5879 at least some part of the pattern, and therefore must convert the whole
5880 thing.
5881 -- dmq */
5882 if (flags & RESTART_UTF8) {
5883 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
5884 pRExC_state->num_code_blocks);
5885 goto redo_first_pass;
5886 }
5887 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
5888 }
5889 if (code_blocksv)
5890 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
5891
5892 DEBUG_PARSE_r({
5893 PerlIO_printf(Perl_debug_log,
5894 "Required size %"IVdf" nodes\n"
5895 "Starting second pass (creation)\n",
5896 (IV)RExC_size);
5897 RExC_lastnum=0;
5898 RExC_lastparse=NULL;
5899 });
5900
5901 /* The first pass could have found things that force Unicode semantics */
5902 if ((RExC_utf8 || RExC_uni_semantics)
5903 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5904 {
5905 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5906 }
5907
5908 /* Small enough for pointer-storage convention?
5909 If extralen==0, this means that we will not need long jumps. */
5910 if (RExC_size >= 0x10000L && RExC_extralen)
5911 RExC_size += RExC_extralen;
5912 else
5913 RExC_extralen = 0;
5914 if (RExC_whilem_seen > 15)
5915 RExC_whilem_seen = 15;
5916
5917 /* Allocate space and zero-initialize. Note, the two step process
5918 of zeroing when in debug mode, thus anything assigned has to
5919 happen after that */
5920 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5921 r = ReANY(rx);
5922 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5923 char, regexp_internal);
5924 if ( r == NULL || ri == NULL )
5925 FAIL("Regexp out of space");
5926#ifdef DEBUGGING
5927 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5928 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5929#else
5930 /* bulk initialize base fields with 0. */
5931 Zero(ri, sizeof(regexp_internal), char);
5932#endif
5933
5934 /* non-zero initialization begins here */
5935 RXi_SET( r, ri );
5936 r->engine= eng;
5937 r->extflags = rx_flags;
5938 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
5939
5940 if (pm_flags & PMf_IS_QR) {
5941 ri->code_blocks = pRExC_state->code_blocks;
5942 ri->num_code_blocks = pRExC_state->num_code_blocks;
5943 }
5944 else
5945 {
5946 int n;
5947 for (n = 0; n < pRExC_state->num_code_blocks; n++)
5948 if (pRExC_state->code_blocks[n].src_regex)
5949 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
5950 SAVEFREEPV(pRExC_state->code_blocks);
5951 }
5952
5953 {
5954 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5955 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5956
5957 /* The caret is output if there are any defaults: if not all the STD
5958 * flags are set, or if no character set specifier is needed */
5959 bool has_default =
5960 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5961 || ! has_charset);
5962 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5963 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5964 >> RXf_PMf_STD_PMMOD_SHIFT);
5965 const char *fptr = STD_PAT_MODS; /*"msix"*/
5966 char *p;
5967 /* Allocate for the worst case, which is all the std flags are turned
5968 * on. If more precision is desired, we could do a population count of
5969 * the flags set. This could be done with a small lookup table, or by
5970 * shifting, masking and adding, or even, when available, assembly
5971 * language for a machine-language population count.
5972 * We never output a minus, as all those are defaults, so are
5973 * covered by the caret */
5974 const STRLEN wraplen = plen + has_p + has_runon
5975 + has_default /* If needs a caret */
5976
5977 /* If needs a character set specifier */
5978 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5979 + (sizeof(STD_PAT_MODS) - 1)
5980 + (sizeof("(?:)") - 1);
5981
5982 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
5983 r->xpv_len_u.xpvlenu_pv = p;
5984 if (RExC_utf8)
5985 SvFLAGS(rx) |= SVf_UTF8;
5986 *p++='('; *p++='?';
5987
5988 /* If a default, cover it using the caret */
5989 if (has_default) {
5990 *p++= DEFAULT_PAT_MOD;
5991 }
5992 if (has_charset) {
5993 STRLEN len;
5994 const char* const name = get_regex_charset_name(r->extflags, &len);
5995 Copy(name, p, len, char);
5996 p += len;
5997 }
5998 if (has_p)
5999 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6000 {
6001 char ch;
6002 while((ch = *fptr++)) {
6003 if(reganch & 1)
6004 *p++ = ch;
6005 reganch >>= 1;
6006 }
6007 }
6008
6009 *p++ = ':';
6010 Copy(RExC_precomp, p, plen, char);
6011 assert ((RX_WRAPPED(rx) - p) < 16);
6012 r->pre_prefix = p - RX_WRAPPED(rx);
6013 p += plen;
6014 if (has_runon)
6015 *p++ = '\n';
6016 *p++ = ')';
6017 *p = 0;
6018 SvCUR_set(rx, p - RX_WRAPPED(rx));
6019 }
6020
6021 r->intflags = 0;
6022 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6023
6024 if (RExC_seen & REG_SEEN_RECURSE) {
6025 Newxz(RExC_open_parens, RExC_npar,regnode *);
6026 SAVEFREEPV(RExC_open_parens);
6027 Newxz(RExC_close_parens,RExC_npar,regnode *);
6028 SAVEFREEPV(RExC_close_parens);
6029 }
6030
6031 /* Useful during FAIL. */
6032#ifdef RE_TRACK_PATTERN_OFFSETS
6033 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6034 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6035 "%s %"UVuf" bytes for offset annotations.\n",
6036 ri->u.offsets ? "Got" : "Couldn't get",
6037 (UV)((2*RExC_size+1) * sizeof(U32))));
6038#endif
6039 SetProgLen(ri,RExC_size);
6040 RExC_rx_sv = rx;
6041 RExC_rx = r;
6042 RExC_rxi = ri;
6043
6044 /* Second pass: emit code. */
6045 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6046 RExC_pm_flags = pm_flags;
6047 RExC_parse = exp;
6048 RExC_end = exp + plen;
6049 RExC_naughty = 0;
6050 RExC_npar = 1;
6051 RExC_emit_start = ri->program;
6052 RExC_emit = ri->program;
6053 RExC_emit_bound = ri->program + RExC_size + 1;
6054 pRExC_state->code_index = 0;
6055
6056 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6057 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6058 ReREFCNT_dec(rx);
6059 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6060 }
6061 /* XXXX To minimize changes to RE engine we always allocate
6062 3-units-long substrs field. */
6063 Newx(r->substrs, 1, struct reg_substr_data);
6064 if (RExC_recurse_count) {
6065 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6066 SAVEFREEPV(RExC_recurse);
6067 }
6068
6069reStudy:
6070 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6071 Zero(r->substrs, 1, struct reg_substr_data);
6072
6073#ifdef TRIE_STUDY_OPT
6074 if (!restudied) {
6075 StructCopy(&zero_scan_data, &data, scan_data_t);
6076 copyRExC_state = RExC_state;
6077 } else {
6078 U32 seen=RExC_seen;
6079 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6080
6081 RExC_state = copyRExC_state;
6082 if (seen & REG_TOP_LEVEL_BRANCHES)
6083 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6084 else
6085 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6086 StructCopy(&zero_scan_data, &data, scan_data_t);
6087 }
6088#else
6089 StructCopy(&zero_scan_data, &data, scan_data_t);
6090#endif
6091
6092 /* Dig out information for optimizations. */
6093 r->extflags = RExC_flags; /* was pm_op */
6094 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6095
6096 if (UTF)
6097 SvUTF8_on(rx); /* Unicode in it? */
6098 ri->regstclass = NULL;
6099 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6100 r->intflags |= PREGf_NAUGHTY;
6101 scan = ri->program + 1; /* First BRANCH. */
6102
6103 /* testing for BRANCH here tells us whether there is "must appear"
6104 data in the pattern. If there is then we can use it for optimisations */
6105 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6106 SSize_t fake;
6107 STRLEN longest_float_length, longest_fixed_length;
6108 struct regnode_charclass_class ch_class; /* pointed to by data */
6109 int stclass_flag;
6110 SSize_t last_close = 0; /* pointed to by data */
6111 regnode *first= scan;
6112 regnode *first_next= regnext(first);
6113 /*
6114 * Skip introductions and multiplicators >= 1
6115 * so that we can extract the 'meat' of the pattern that must
6116 * match in the large if() sequence following.
6117 * NOTE that EXACT is NOT covered here, as it is normally
6118 * picked up by the optimiser separately.
6119 *
6120 * This is unfortunate as the optimiser isnt handling lookahead
6121 * properly currently.
6122 *
6123 */
6124 while ((OP(first) == OPEN && (sawopen = 1)) ||
6125 /* An OR of *one* alternative - should not happen now. */
6126 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6127 /* for now we can't handle lookbehind IFMATCH*/
6128 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6129 (OP(first) == PLUS) ||
6130 (OP(first) == MINMOD) ||
6131 /* An {n,m} with n>0 */
6132 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6133 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6134 {
6135 /*
6136 * the only op that could be a regnode is PLUS, all the rest
6137 * will be regnode_1 or regnode_2.
6138 *
6139 * (yves doesn't think this is true)
6140 */
6141 if (OP(first) == PLUS)
6142 sawplus = 1;
6143 else {
6144 if (OP(first) == MINMOD)
6145 sawminmod = 1;
6146 first += regarglen[OP(first)];
6147 }
6148 first = NEXTOPER(first);
6149 first_next= regnext(first);
6150 }
6151
6152 /* Starting-point info. */
6153 again:
6154 DEBUG_PEEP("first:",first,0);
6155 /* Ignore EXACT as we deal with it later. */
6156 if (PL_regkind[OP(first)] == EXACT) {
6157 if (OP(first) == EXACT)
6158 NOOP; /* Empty, get anchored substr later. */
6159 else
6160 ri->regstclass = first;
6161 }
6162#ifdef TRIE_STCLASS
6163 else if (PL_regkind[OP(first)] == TRIE &&
6164 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6165 {
6166 regnode *trie_op;
6167 /* this can happen only on restudy */
6168 if ( OP(first) == TRIE ) {
6169 struct regnode_1 *trieop = (struct regnode_1 *)
6170 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6171 StructCopy(first,trieop,struct regnode_1);
6172 trie_op=(regnode *)trieop;
6173 } else {
6174 struct regnode_charclass *trieop = (struct regnode_charclass *)
6175 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6176 StructCopy(first,trieop,struct regnode_charclass);
6177 trie_op=(regnode *)trieop;
6178 }
6179 OP(trie_op)+=2;
6180 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6181 ri->regstclass = trie_op;
6182 }
6183#endif
6184 else if (REGNODE_SIMPLE(OP(first)))
6185 ri->regstclass = first;
6186 else if (PL_regkind[OP(first)] == BOUND ||
6187 PL_regkind[OP(first)] == NBOUND)
6188 ri->regstclass = first;
6189 else if (PL_regkind[OP(first)] == BOL) {
6190 r->extflags |= (OP(first) == MBOL
6191 ? RXf_ANCH_MBOL
6192 : (OP(first) == SBOL
6193 ? RXf_ANCH_SBOL
6194 : RXf_ANCH_BOL));
6195 first = NEXTOPER(first);
6196 goto again;
6197 }
6198 else if (OP(first) == GPOS) {
6199 r->extflags |= RXf_ANCH_GPOS;
6200 first = NEXTOPER(first);
6201 goto again;
6202 }
6203 else if ((!sawopen || !RExC_sawback) &&
6204 (OP(first) == STAR &&
6205 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6206 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6207 {
6208 /* turn .* into ^.* with an implied $*=1 */
6209 const int type =
6210 (OP(NEXTOPER(first)) == REG_ANY)
6211 ? RXf_ANCH_MBOL
6212 : RXf_ANCH_SBOL;
6213 r->extflags |= type;
6214 r->intflags |= PREGf_IMPLICIT;
6215 first = NEXTOPER(first);
6216 goto again;
6217 }
6218 if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback)
6219 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6220 /* x+ must match at the 1st pos of run of x's */
6221 r->intflags |= PREGf_SKIP;
6222
6223 /* Scan is after the zeroth branch, first is atomic matcher. */
6224#ifdef TRIE_STUDY_OPT
6225 DEBUG_PARSE_r(
6226 if (!restudied)
6227 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6228 (IV)(first - scan + 1))
6229 );
6230#else
6231 DEBUG_PARSE_r(
6232 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6233 (IV)(first - scan + 1))
6234 );
6235#endif
6236
6237
6238 /*
6239 * If there's something expensive in the r.e., find the
6240 * longest literal string that must appear and make it the
6241 * regmust. Resolve ties in favor of later strings, since
6242 * the regstart check works with the beginning of the r.e.
6243 * and avoiding duplication strengthens checking. Not a
6244 * strong reason, but sufficient in the absence of others.
6245 * [Now we resolve ties in favor of the earlier string if
6246 * it happens that c_offset_min has been invalidated, since the
6247 * earlier string may buy us something the later one won't.]
6248 */
6249
6250 data.longest_fixed = newSVpvs("");
6251 data.longest_float = newSVpvs("");
6252 data.last_found = newSVpvs("");
6253 data.longest = &(data.longest_fixed);
6254 ENTER_with_name("study_chunk");
6255 SAVEFREESV(data.longest_fixed);
6256 SAVEFREESV(data.longest_float);
6257 SAVEFREESV(data.last_found);
6258 first = scan;
6259 if (!ri->regstclass) {
6260 cl_init(pRExC_state, &ch_class);
6261 data.start_class = &ch_class;
6262 stclass_flag = SCF_DO_STCLASS_AND;
6263 } else /* XXXX Check for BOUND? */
6264 stclass_flag = 0;
6265 data.last_closep = &last_close;
6266
6267 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6268 &data, -1, NULL, NULL,
6269 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6270 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6271 0);
6272
6273
6274 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6275
6276
6277 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6278 && data.last_start_min == 0 && data.last_end > 0
6279 && !RExC_seen_zerolen
6280 && !(RExC_seen & REG_SEEN_VERBARG)
6281 && !((RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6282 r->extflags |= RXf_CHECK_ALL;
6283 scan_commit(pRExC_state, &data,&minlen,0);
6284
6285 longest_float_length = CHR_SVLEN(data.longest_float);
6286
6287 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6288 && data.offset_fixed == data.offset_float_min
6289 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6290 && S_setup_longest (aTHX_ pRExC_state,
6291 data.longest_float,
6292 &(r->float_utf8),
6293 &(r->float_substr),
6294 &(r->float_end_shift),
6295 data.lookbehind_float,
6296 data.offset_float_min,
6297 data.minlen_float,
6298 longest_float_length,
6299 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6300 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6301 {
6302 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6303 r->float_max_offset = data.offset_float_max;
6304 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6305 r->float_max_offset -= data.lookbehind_float;
6306 SvREFCNT_inc_simple_void_NN(data.longest_float);
6307 }
6308 else {
6309 r->float_substr = r->float_utf8 = NULL;
6310 longest_float_length = 0;
6311 }
6312
6313 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6314
6315 if (S_setup_longest (aTHX_ pRExC_state,
6316 data.longest_fixed,
6317 &(r->anchored_utf8),
6318 &(r->anchored_substr),
6319 &(r->anchored_end_shift),
6320 data.lookbehind_fixed,
6321 data.offset_fixed,
6322 data.minlen_fixed,
6323 longest_fixed_length,
6324 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6325 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6326 {
6327 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6328 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6329 }
6330 else {
6331 r->anchored_substr = r->anchored_utf8 = NULL;
6332 longest_fixed_length = 0;
6333 }
6334 LEAVE_with_name("study_chunk");
6335
6336 if (ri->regstclass
6337 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6338 ri->regstclass = NULL;
6339
6340 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6341 && stclass_flag
6342 && ! TEST_SSC_EOS(data.start_class)
6343 && !cl_is_anything(data.start_class))
6344 {
6345 const U32 n = add_data(pRExC_state, 1, "f");
6346 OP(data.start_class) = ANYOF_SYNTHETIC;
6347
6348 Newx(RExC_rxi->data->data[n], 1,
6349 struct regnode_charclass_class);
6350 StructCopy(data.start_class,
6351 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6352 struct regnode_charclass_class);
6353 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6354 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6355 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6356 regprop(r, sv, (regnode*)data.start_class);
6357 PerlIO_printf(Perl_debug_log,
6358 "synthetic stclass \"%s\".\n",
6359 SvPVX_const(sv));});
6360 }
6361
6362 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6363 if (longest_fixed_length > longest_float_length) {
6364 r->check_end_shift = r->anchored_end_shift;
6365 r->check_substr = r->anchored_substr;
6366 r->check_utf8 = r->anchored_utf8;
6367 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6368 if (r->extflags & RXf_ANCH_SINGLE)
6369 r->extflags |= RXf_NOSCAN;
6370 }
6371 else {
6372 r->check_end_shift = r->float_end_shift;
6373 r->check_substr = r->float_substr;
6374 r->check_utf8 = r->float_utf8;
6375 r->check_offset_min = r->float_min_offset;
6376 r->check_offset_max = r->float_max_offset;
6377 }
6378 if ((r->check_substr || r->check_utf8) ) {
6379 r->extflags |= RXf_USE_INTUIT;
6380 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6381 r->extflags |= RXf_INTUIT_TAIL;
6382 }
6383 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6384 if ( (STRLEN)minlen < longest_float_length )
6385 minlen= longest_float_length;
6386 if ( (STRLEN)minlen < longest_fixed_length )
6387 minlen= longest_fixed_length;
6388 */
6389 }
6390 else {
6391 /* Several toplevels. Best we can is to set minlen. */
6392 SSize_t fake;
6393 struct regnode_charclass_class ch_class;
6394 SSize_t last_close = 0;
6395
6396 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6397
6398 scan = ri->program + 1;
6399 cl_init(pRExC_state, &ch_class);
6400 data.start_class = &ch_class;
6401 data.last_closep = &last_close;
6402
6403
6404 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6405 &data, -1, NULL, NULL,
6406 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS
6407 |(restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6408 0);
6409
6410 CHECK_RESTUDY_GOTO_butfirst(NOOP);
6411
6412 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6413 = r->float_substr = r->float_utf8 = NULL;
6414
6415 if (! TEST_SSC_EOS(data.start_class)
6416 && !cl_is_anything(data.start_class))
6417 {
6418 const U32 n = add_data(pRExC_state, 1, "f");
6419 OP(data.start_class) = ANYOF_SYNTHETIC;
6420
6421 Newx(RExC_rxi->data->data[n], 1,
6422 struct regnode_charclass_class);
6423 StructCopy(data.start_class,
6424 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6425 struct regnode_charclass_class);
6426 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6427 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6428 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6429 regprop(r, sv, (regnode*)data.start_class);
6430 PerlIO_printf(Perl_debug_log,
6431 "synthetic stclass \"%s\".\n",
6432 SvPVX_const(sv));});
6433 }
6434 }
6435
6436 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6437 the "real" pattern. */
6438 DEBUG_OPTIMISE_r({
6439 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6440 (IV)minlen, (IV)r->minlen);
6441 });
6442 r->minlenret = minlen;
6443 if (r->minlen < minlen)
6444 r->minlen = minlen;
6445
6446 if (RExC_seen & REG_SEEN_GPOS)
6447 r->extflags |= RXf_GPOS_SEEN;
6448 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6449 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
6450 if (pRExC_state->num_code_blocks)
6451 r->extflags |= RXf_EVAL_SEEN;
6452 if (RExC_seen & REG_SEEN_CANY)
6453 r->extflags |= RXf_CANY_SEEN;
6454 if (RExC_seen & REG_SEEN_VERBARG)
6455 {
6456 r->intflags |= PREGf_VERBARG_SEEN;
6457 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
6458 }
6459 if (RExC_seen & REG_SEEN_CUTGROUP)
6460 r->intflags |= PREGf_CUTGROUP_SEEN;
6461 if (pm_flags & PMf_USE_RE_EVAL)
6462 r->intflags |= PREGf_USE_RE_EVAL;
6463 if (RExC_paren_names)
6464 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6465 else
6466 RXp_PAREN_NAMES(r) = NULL;
6467
6468 {
6469 regnode *first = ri->program + 1;
6470 U8 fop = OP(first);
6471 regnode *next = NEXTOPER(first);
6472 U8 nop = OP(next);
6473
6474 if (PL_regkind[fop] == NOTHING && nop == END)
6475 r->extflags |= RXf_NULL;
6476 else if (PL_regkind[fop] == BOL && nop == END)
6477 r->extflags |= RXf_START_ONLY;
6478 else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
6479 r->extflags |= RXf_WHITE;
6480 else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
6481 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6482
6483 }
6484#ifdef DEBUGGING
6485 if (RExC_paren_names) {
6486 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6487 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6488 } else
6489#endif
6490 ri->name_list_idx = 0;
6491
6492 if (RExC_recurse_count) {
6493 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6494 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6495 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6496 }
6497 }
6498 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6499 /* assume we don't need to swap parens around before we match */
6500
6501 DEBUG_DUMP_r({
6502 PerlIO_printf(Perl_debug_log,"Final program:\n");
6503 regdump(r);
6504 });
6505#ifdef RE_TRACK_PATTERN_OFFSETS
6506 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6507 const STRLEN len = ri->u.offsets[0];
6508 STRLEN i;
6509 GET_RE_DEBUG_FLAGS_DECL;
6510 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6511 for (i = 1; i <= len; i++) {
6512 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6513 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6514 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6515 }
6516 PerlIO_printf(Perl_debug_log, "\n");
6517 });
6518#endif
6519
6520#ifdef USE_ITHREADS
6521 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
6522 * by setting the regexp SV to readonly-only instead. If the
6523 * pattern's been recompiled, the USEDness should remain. */
6524 if (old_re && SvREADONLY(old_re))
6525 SvREADONLY_on(rx);
6526#endif
6527 return rx;
6528}
6529
6530
6531SV*
6532Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6533 const U32 flags)
6534{
6535 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6536
6537 PERL_UNUSED_ARG(value);
6538
6539 if (flags & RXapif_FETCH) {
6540 return reg_named_buff_fetch(rx, key, flags);
6541 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6542 Perl_croak_no_modify();
6543 return NULL;
6544 } else if (flags & RXapif_EXISTS) {
6545 return reg_named_buff_exists(rx, key, flags)
6546 ? &PL_sv_yes
6547 : &PL_sv_no;
6548 } else if (flags & RXapif_REGNAMES) {
6549 return reg_named_buff_all(rx, flags);
6550 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6551 return reg_named_buff_scalar(rx, flags);
6552 } else {
6553 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6554 return NULL;
6555 }
6556}
6557
6558SV*
6559Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6560 const U32 flags)
6561{
6562 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6563 PERL_UNUSED_ARG(lastkey);
6564
6565 if (flags & RXapif_FIRSTKEY)
6566 return reg_named_buff_firstkey(rx, flags);
6567 else if (flags & RXapif_NEXTKEY)
6568 return reg_named_buff_nextkey(rx, flags);
6569 else {
6570 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6571 return NULL;
6572 }
6573}
6574
6575SV*
6576Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6577 const U32 flags)
6578{
6579 AV *retarray = NULL;
6580 SV *ret;
6581 struct regexp *const rx = ReANY(r);
6582
6583 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6584
6585 if (flags & RXapif_ALL)
6586 retarray=newAV();
6587
6588 if (rx && RXp_PAREN_NAMES(rx)) {
6589 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6590 if (he_str) {
6591 IV i;
6592 SV* sv_dat=HeVAL(he_str);
6593 I32 *nums=(I32*)SvPVX(sv_dat);
6594 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6595 if ((I32)(rx->nparens) >= nums[i]
6596 && rx->offs[nums[i]].start != -1
6597 && rx->offs[nums[i]].end != -1)
6598 {
6599 ret = newSVpvs("");
6600 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6601 if (!retarray)
6602 return ret;
6603 } else {
6604 if (retarray)
6605 ret = newSVsv(&PL_sv_undef);
6606 }
6607 if (retarray)
6608 av_push(retarray, ret);
6609 }
6610 if (retarray)
6611 return newRV_noinc(MUTABLE_SV(retarray));
6612 }
6613 }
6614 return NULL;
6615}
6616
6617bool
6618Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6619 const U32 flags)
6620{
6621 struct regexp *const rx = ReANY(r);
6622
6623 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6624
6625 if (rx && RXp_PAREN_NAMES(rx)) {
6626 if (flags & RXapif_ALL) {
6627 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6628 } else {
6629 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6630 if (sv) {
6631 SvREFCNT_dec_NN(sv);
6632 return TRUE;
6633 } else {
6634 return FALSE;
6635 }
6636 }
6637 } else {
6638 return FALSE;
6639 }
6640}
6641
6642SV*
6643Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6644{
6645 struct regexp *const rx = ReANY(r);
6646
6647 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6648
6649 if ( rx && RXp_PAREN_NAMES(rx) ) {
6650 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6651
6652 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6653 } else {
6654 return FALSE;
6655 }
6656}
6657
6658SV*
6659Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6660{
6661 struct regexp *const rx = ReANY(r);
6662 GET_RE_DEBUG_FLAGS_DECL;
6663
6664 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6665
6666 if (rx && RXp_PAREN_NAMES(rx)) {
6667 HV *hv = RXp_PAREN_NAMES(rx);
6668 HE *temphe;
6669 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6670 IV i;
6671 IV parno = 0;
6672 SV* sv_dat = HeVAL(temphe);
6673 I32 *nums = (I32*)SvPVX(sv_dat);
6674 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6675 if ((I32)(rx->lastparen) >= nums[i] &&
6676 rx->offs[nums[i]].start != -1 &&
6677 rx->offs[nums[i]].end != -1)
6678 {
6679 parno = nums[i];
6680 break;
6681 }
6682 }
6683 if (parno || flags & RXapif_ALL) {
6684 return newSVhek(HeKEY_hek(temphe));
6685 }
6686 }
6687 }
6688 return NULL;
6689}
6690
6691SV*
6692Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6693{
6694 SV *ret;
6695 AV *av;
6696 SSize_t length;
6697 struct regexp *const rx = ReANY(r);
6698
6699 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6700
6701 if (rx && RXp_PAREN_NAMES(rx)) {
6702 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6703 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6704 } else if (flags & RXapif_ONE) {
6705 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6706 av = MUTABLE_AV(SvRV(ret));
6707 length = av_len(av);
6708 SvREFCNT_dec_NN(ret);
6709 return newSViv(length + 1);
6710 } else {
6711 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6712 return NULL;
6713 }
6714 }
6715 return &PL_sv_undef;
6716}
6717
6718SV*
6719Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6720{
6721 struct regexp *const rx = ReANY(r);
6722 AV *av = newAV();
6723
6724 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6725
6726 if (rx && RXp_PAREN_NAMES(rx)) {
6727 HV *hv= RXp_PAREN_NAMES(rx);
6728 HE *temphe;
6729 (void)hv_iterinit(hv);
6730 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6731 IV i;
6732 IV parno = 0;
6733 SV* sv_dat = HeVAL(temphe);
6734 I32 *nums = (I32*)SvPVX(sv_dat);
6735 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6736 if ((I32)(rx->lastparen) >= nums[i] &&
6737 rx->offs[nums[i]].start != -1 &&
6738 rx->offs[nums[i]].end != -1)
6739 {
6740 parno = nums[i];
6741 break;
6742 }
6743 }
6744 if (parno || flags & RXapif_ALL) {
6745 av_push(av, newSVhek(HeKEY_hek(temphe)));
6746 }
6747 }
6748 }
6749
6750 return newRV_noinc(MUTABLE_SV(av));
6751}
6752
6753void
6754Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6755 SV * const sv)
6756{
6757 struct regexp *const rx = ReANY(r);
6758 char *s = NULL;
6759 SSize_t i = 0;
6760 SSize_t s1, t1;
6761 I32 n = paren;
6762
6763 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6764
6765 if ( n == RX_BUFF_IDX_CARET_PREMATCH
6766 || n == RX_BUFF_IDX_CARET_FULLMATCH
6767 || n == RX_BUFF_IDX_CARET_POSTMATCH
6768 )
6769 {
6770 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6771 if (!keepcopy) {
6772 /* on something like
6773 * $r = qr/.../;
6774 * /$qr/p;
6775 * the KEEPCOPY is set on the PMOP rather than the regex */
6776 if (PL_curpm && r == PM_GETRE(PL_curpm))
6777 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6778 }
6779 if (!keepcopy)
6780 goto ret_undef;
6781 }
6782
6783 if (!rx->subbeg)
6784 goto ret_undef;
6785
6786 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6787 /* no need to distinguish between them any more */
6788 n = RX_BUFF_IDX_FULLMATCH;
6789
6790 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6791 && rx->offs[0].start != -1)
6792 {
6793 /* $`, ${^PREMATCH} */
6794 i = rx->offs[0].start;
6795 s = rx->subbeg;
6796 }
6797 else
6798 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6799 && rx->offs[0].end != -1)
6800 {
6801 /* $', ${^POSTMATCH} */
6802 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6803 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6804 }
6805 else
6806 if ( 0 <= n && n <= (I32)rx->nparens &&
6807 (s1 = rx->offs[n].start) != -1 &&
6808 (t1 = rx->offs[n].end) != -1)
6809 {
6810 /* $&, ${^MATCH}, $1 ... */
6811 i = t1 - s1;
6812 s = rx->subbeg + s1 - rx->suboffset;
6813 } else {
6814 goto ret_undef;
6815 }
6816
6817 assert(s >= rx->subbeg);
6818 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
6819 if (i >= 0) {
6820#if NO_TAINT_SUPPORT
6821 sv_setpvn(sv, s, i);
6822#else
6823 const int oldtainted = TAINT_get;
6824 TAINT_NOT;
6825 sv_setpvn(sv, s, i);
6826 TAINT_set(oldtainted);
6827#endif
6828 if ( (rx->extflags & RXf_CANY_SEEN)
6829 ? (RXp_MATCH_UTF8(rx)
6830 && (!i || is_utf8_string((U8*)s, i)))
6831 : (RXp_MATCH_UTF8(rx)) )
6832 {
6833 SvUTF8_on(sv);
6834 }
6835 else
6836 SvUTF8_off(sv);
6837 if (TAINTING_get) {
6838 if (RXp_MATCH_TAINTED(rx)) {
6839 if (SvTYPE(sv) >= SVt_PVMG) {
6840 MAGIC* const mg = SvMAGIC(sv);
6841 MAGIC* mgt;
6842 TAINT;
6843 SvMAGIC_set(sv, mg->mg_moremagic);
6844 SvTAINT(sv);
6845 if ((mgt = SvMAGIC(sv))) {
6846 mg->mg_moremagic = mgt;
6847 SvMAGIC_set(sv, mg);
6848 }
6849 } else {
6850 TAINT;
6851 SvTAINT(sv);
6852 }
6853 } else
6854 SvTAINTED_off(sv);
6855 }
6856 } else {
6857 ret_undef:
6858 sv_setsv(sv,&PL_sv_undef);
6859 return;
6860 }
6861}
6862
6863void
6864Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6865 SV const * const value)
6866{
6867 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6868
6869 PERL_UNUSED_ARG(rx);
6870 PERL_UNUSED_ARG(paren);
6871 PERL_UNUSED_ARG(value);
6872
6873 if (!PL_localizing)
6874 Perl_croak_no_modify();
6875}
6876
6877I32
6878Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6879 const I32 paren)
6880{
6881 struct regexp *const rx = ReANY(r);
6882 I32 i;
6883 I32 s1, t1;
6884
6885 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6886
6887 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
6888 || paren == RX_BUFF_IDX_CARET_FULLMATCH
6889 || paren == RX_BUFF_IDX_CARET_POSTMATCH
6890 )
6891 {
6892 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
6893 if (!keepcopy) {
6894 /* on something like
6895 * $r = qr/.../;
6896 * /$qr/p;
6897 * the KEEPCOPY is set on the PMOP rather than the regex */
6898 if (PL_curpm && r == PM_GETRE(PL_curpm))
6899 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
6900 }
6901 if (!keepcopy)
6902 goto warn_undef;
6903 }
6904
6905 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6906 switch (paren) {
6907 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6908 case RX_BUFF_IDX_PREMATCH: /* $` */
6909 if (rx->offs[0].start != -1) {
6910 i = rx->offs[0].start;
6911 if (i > 0) {
6912 s1 = 0;
6913 t1 = i;
6914 goto getlen;
6915 }
6916 }
6917 return 0;
6918
6919 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6920 case RX_BUFF_IDX_POSTMATCH: /* $' */
6921 if (rx->offs[0].end != -1) {
6922 i = rx->sublen - rx->offs[0].end;
6923 if (i > 0) {
6924 s1 = rx->offs[0].end;
6925 t1 = rx->sublen;
6926 goto getlen;
6927 }
6928 }
6929 return 0;
6930
6931 default: /* $& / ${^MATCH}, $1, $2, ... */
6932 if (paren <= (I32)rx->nparens &&
6933 (s1 = rx->offs[paren].start) != -1 &&
6934 (t1 = rx->offs[paren].end) != -1)
6935 {
6936 i = t1 - s1;
6937 goto getlen;
6938 } else {
6939 warn_undef:
6940 if (ckWARN(WARN_UNINITIALIZED))
6941 report_uninit((const SV *)sv);
6942 return 0;
6943 }
6944 }
6945 getlen:
6946 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6947 const char * const s = rx->subbeg - rx->suboffset + s1;
6948 const U8 *ep;
6949 STRLEN el;
6950
6951 i = t1 - s1;
6952 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6953 i = el;
6954 }
6955 return i;
6956}
6957
6958SV*
6959Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6960{
6961 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6962 PERL_UNUSED_ARG(rx);
6963 if (0)
6964 return NULL;
6965 else
6966 return newSVpvs("Regexp");
6967}
6968
6969/* Scans the name of a named buffer from the pattern.
6970 * If flags is REG_RSN_RETURN_NULL returns null.
6971 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6972 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6973 * to the parsed name as looked up in the RExC_paren_names hash.
6974 * If there is an error throws a vFAIL().. type exception.
6975 */
6976
6977#define REG_RSN_RETURN_NULL 0
6978#define REG_RSN_RETURN_NAME 1
6979#define REG_RSN_RETURN_DATA 2
6980
6981STATIC SV*
6982S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6983{
6984 char *name_start = RExC_parse;
6985
6986 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6987
6988 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6989 /* skip IDFIRST by using do...while */
6990 if (UTF)
6991 do {
6992 RExC_parse += UTF8SKIP(RExC_parse);
6993 } while (isWORDCHAR_utf8((U8*)RExC_parse));
6994 else
6995 do {
6996 RExC_parse++;
6997 } while (isWORDCHAR(*RExC_parse));
6998 } else {
6999 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
7000 vFAIL("Group name must start with a non-digit word character");
7001 }
7002 if ( flags ) {
7003 SV* sv_name
7004 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7005 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7006 if ( flags == REG_RSN_RETURN_NAME)
7007 return sv_name;
7008 else if (flags==REG_RSN_RETURN_DATA) {
7009 HE *he_str = NULL;
7010 SV *sv_dat = NULL;
7011 if ( ! sv_name ) /* should not happen*/
7012 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7013 if (RExC_paren_names)
7014 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7015 if ( he_str )
7016 sv_dat = HeVAL(he_str);
7017 if ( ! sv_dat )
7018 vFAIL("Reference to nonexistent named group");
7019 return sv_dat;
7020 }
7021 else {
7022 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7023 (unsigned long) flags);
7024 }
7025 assert(0); /* NOT REACHED */
7026 }
7027 return NULL;
7028}
7029
7030#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7031 int rem=(int)(RExC_end - RExC_parse); \
7032 int cut; \
7033 int num; \
7034 int iscut=0; \
7035 if (rem>10) { \
7036 rem=10; \
7037 iscut=1; \
7038 } \
7039 cut=10-rem; \
7040 if (RExC_lastparse!=RExC_parse) \
7041 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7042 rem, RExC_parse, \
7043 cut + 4, \
7044 iscut ? "..." : "<" \
7045 ); \
7046 else \
7047 PerlIO_printf(Perl_debug_log,"%16s",""); \
7048 \
7049 if (SIZE_ONLY) \
7050 num = RExC_size + 1; \
7051 else \
7052 num=REG_NODE_NUM(RExC_emit); \
7053 if (RExC_lastnum!=num) \
7054 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7055 else \
7056 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7057 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7058 (int)((depth*2)), "", \
7059 (funcname) \
7060 ); \
7061 RExC_lastnum=num; \
7062 RExC_lastparse=RExC_parse; \
7063})
7064
7065
7066
7067#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7068 DEBUG_PARSE_MSG((funcname)); \
7069 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7070})
7071#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7072 DEBUG_PARSE_MSG((funcname)); \
7073 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7074})
7075
7076/* This section of code defines the inversion list object and its methods. The
7077 * interfaces are highly subject to change, so as much as possible is static to
7078 * this file. An inversion list is here implemented as a malloc'd C UV array
7079 * as an SVt_INVLIST scalar.
7080 *
7081 * An inversion list for Unicode is an array of code points, sorted by ordinal
7082 * number. The zeroth element is the first code point in the list. The 1th
7083 * element is the first element beyond that not in the list. In other words,
7084 * the first range is
7085 * invlist[0]..(invlist[1]-1)
7086 * The other ranges follow. Thus every element whose index is divisible by two
7087 * marks the beginning of a range that is in the list, and every element not
7088 * divisible by two marks the beginning of a range not in the list. A single
7089 * element inversion list that contains the single code point N generally
7090 * consists of two elements
7091 * invlist[0] == N
7092 * invlist[1] == N+1
7093 * (The exception is when N is the highest representable value on the
7094 * machine, in which case the list containing just it would be a single
7095 * element, itself. By extension, if the last range in the list extends to
7096 * infinity, then the first element of that range will be in the inversion list
7097 * at a position that is divisible by two, and is the final element in the
7098 * list.)
7099 * Taking the complement (inverting) an inversion list is quite simple, if the
7100 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7101 * This implementation reserves an element at the beginning of each inversion
7102 * list to always contain 0; there is an additional flag in the header which
7103 * indicates if the list begins at the 0, or is offset to begin at the next
7104 * element.
7105 *
7106 * More about inversion lists can be found in "Unicode Demystified"
7107 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7108 * More will be coming when functionality is added later.
7109 *
7110 * The inversion list data structure is currently implemented as an SV pointing
7111 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7112 * array of UV whose memory management is automatically handled by the existing
7113 * facilities for SV's.
7114 *
7115 * Some of the methods should always be private to the implementation, and some
7116 * should eventually be made public */
7117
7118/* The header definitions are in F<inline_invlist.c> */
7119
7120PERL_STATIC_INLINE UV*
7121S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7122{
7123 /* Returns a pointer to the first element in the inversion list's array.
7124 * This is called upon initialization of an inversion list. Where the
7125 * array begins depends on whether the list has the code point U+0000 in it
7126 * or not. The other parameter tells it whether the code that follows this
7127 * call is about to put a 0 in the inversion list or not. The first
7128 * element is either the element reserved for 0, if TRUE, or the element
7129 * after it, if FALSE */
7130
7131 bool* offset = get_invlist_offset_addr(invlist);
7132 UV* zero_addr = (UV *) SvPVX(invlist);
7133
7134 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7135
7136 /* Must be empty */
7137 assert(! _invlist_len(invlist));
7138
7139 *zero_addr = 0;
7140
7141 /* 1^1 = 0; 1^0 = 1 */
7142 *offset = 1 ^ will_have_0;
7143 return zero_addr + *offset;
7144}
7145
7146PERL_STATIC_INLINE UV*
7147S_invlist_array(pTHX_ SV* const invlist)
7148{
7149 /* Returns the pointer to the inversion list's array. Every time the
7150 * length changes, this needs to be called in case malloc or realloc moved
7151 * it */
7152
7153 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7154
7155 /* Must not be empty. If these fail, you probably didn't check for <len>
7156 * being non-zero before trying to get the array */
7157 assert(_invlist_len(invlist));
7158
7159 /* The very first element always contains zero, The array begins either
7160 * there, or if the inversion list is offset, at the element after it.
7161 * The offset header field determines which; it contains 0 or 1 to indicate
7162 * how much additionally to add */
7163 assert(0 == *(SvPVX(invlist)));
7164 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7165}
7166
7167PERL_STATIC_INLINE void
7168S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7169{
7170 /* Sets the current number of elements stored in the inversion list.
7171 * Updates SvCUR correspondingly */
7172
7173 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7174
7175 assert(SvTYPE(invlist) == SVt_INVLIST);
7176
7177 SvCUR_set(invlist,
7178 (len == 0)
7179 ? 0
7180 : TO_INTERNAL_SIZE(len + offset));
7181 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7182}
7183
7184PERL_STATIC_INLINE IV*
7185S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7186{
7187 /* Return the address of the IV that is reserved to hold the cached index
7188 * */
7189
7190 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7191
7192 assert(SvTYPE(invlist) == SVt_INVLIST);
7193
7194 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7195}
7196
7197PERL_STATIC_INLINE IV
7198S_invlist_previous_index(pTHX_ SV* const invlist)
7199{
7200 /* Returns cached index of previous search */
7201
7202 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7203
7204 return *get_invlist_previous_index_addr(invlist);
7205}
7206
7207PERL_STATIC_INLINE void
7208S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7209{
7210 /* Caches <index> for later retrieval */
7211
7212 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7213
7214 assert(index == 0 || index < (int) _invlist_len(invlist));
7215
7216 *get_invlist_previous_index_addr(invlist) = index;
7217}
7218
7219PERL_STATIC_INLINE UV
7220S_invlist_max(pTHX_ SV* const invlist)
7221{
7222 /* Returns the maximum number of elements storable in the inversion list's
7223 * array, without having to realloc() */
7224
7225 PERL_ARGS_ASSERT_INVLIST_MAX;
7226
7227 assert(SvTYPE(invlist) == SVt_INVLIST);
7228
7229 /* Assumes worst case, in which the 0 element is not counted in the
7230 * inversion list, so subtracts 1 for that */
7231 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7232 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7233 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7234}
7235
7236#ifndef PERL_IN_XSUB_RE
7237SV*
7238Perl__new_invlist(pTHX_ IV initial_size)
7239{
7240
7241 /* Return a pointer to a newly constructed inversion list, with enough
7242 * space to store 'initial_size' elements. If that number is negative, a
7243 * system default is used instead */
7244
7245 SV* new_list;
7246
7247 if (initial_size < 0) {
7248 initial_size = 10;
7249 }
7250
7251 /* Allocate the initial space */
7252 new_list = newSV_type(SVt_INVLIST);
7253
7254 /* First 1 is in case the zero element isn't in the list; second 1 is for
7255 * trailing NUL */
7256 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7257 invlist_set_len(new_list, 0, 0);
7258
7259 /* Force iterinit() to be used to get iteration to work */
7260 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7261
7262 *get_invlist_previous_index_addr(new_list) = 0;
7263
7264 return new_list;
7265}
7266#endif
7267
7268STATIC SV*
7269S__new_invlist_C_array(pTHX_ const UV* const list)
7270{
7271 /* Return a pointer to a newly constructed inversion list, initialized to
7272 * point to <list>, which has to be in the exact correct inversion list
7273 * form, including internal fields. Thus this is a dangerous routine that
7274 * should not be used in the wrong hands. The passed in 'list' contains
7275 * several header fields at the beginning that are not part of the
7276 * inversion list body proper */
7277
7278 const STRLEN length = (STRLEN) list[0];
7279 const UV version_id = list[1];
7280 const bool offset = cBOOL(list[2]);
7281#define HEADER_LENGTH 3
7282 /* If any of the above changes in any way, you must change HEADER_LENGTH
7283 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7284 * perl -E 'say int(rand 2**31-1)'
7285 */
7286#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7287 data structure type, so that one being
7288 passed in can be validated to be an
7289 inversion list of the correct vintage.
7290 */
7291
7292 SV* invlist = newSV_type(SVt_INVLIST);
7293
7294 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7295
7296 if (version_id != INVLIST_VERSION_ID) {
7297 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7298 }
7299
7300 /* The generated array passed in includes header elements that aren't part
7301 * of the list proper, so start it just after them */
7302 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7303
7304 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7305 shouldn't touch it */
7306
7307 *(get_invlist_offset_addr(invlist)) = offset;
7308
7309 /* The 'length' passed to us is the physical number of elements in the
7310 * inversion list. But if there is an offset the logical number is one
7311 * less than that */
7312 invlist_set_len(invlist, length - offset, offset);
7313
7314 invlist_set_previous_index(invlist, 0);
7315
7316 /* Initialize the iteration pointer. */
7317 invlist_iterfinish(invlist);
7318
7319 return invlist;
7320}
7321
7322STATIC void
7323S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7324{
7325 /* Grow the maximum size of an inversion list */
7326
7327 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7328
7329 assert(SvTYPE(invlist) == SVt_INVLIST);
7330
7331 /* Add one to account for the zero element at the beginning which may not
7332 * be counted by the calling parameters */
7333 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7334}
7335
7336PERL_STATIC_INLINE void
7337S_invlist_trim(pTHX_ SV* const invlist)
7338{
7339 PERL_ARGS_ASSERT_INVLIST_TRIM;
7340
7341 assert(SvTYPE(invlist) == SVt_INVLIST);
7342
7343 /* Change the length of the inversion list to how many entries it currently
7344 * has */
7345 SvPV_shrink_to_cur((SV *) invlist);
7346}
7347
7348#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7349
7350STATIC void
7351S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7352{
7353 /* Subject to change or removal. Append the range from 'start' to 'end' at
7354 * the end of the inversion list. The range must be above any existing
7355 * ones. */
7356
7357 UV* array;
7358 UV max = invlist_max(invlist);
7359 UV len = _invlist_len(invlist);
7360 bool offset;
7361
7362 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7363
7364 if (len == 0) { /* Empty lists must be initialized */
7365 offset = start != 0;
7366 array = _invlist_array_init(invlist, ! offset);
7367 }
7368 else {
7369 /* Here, the existing list is non-empty. The current max entry in the
7370 * list is generally the first value not in the set, except when the
7371 * set extends to the end of permissible values, in which case it is
7372 * the first entry in that final set, and so this call is an attempt to
7373 * append out-of-order */
7374
7375 UV final_element = len - 1;
7376 array = invlist_array(invlist);
7377 if (array[final_element] > start
7378 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7379 {
7380 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7381 array[final_element], start,
7382 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7383 }
7384
7385 /* Here, it is a legal append. If the new range begins with the first
7386 * value not in the set, it is extending the set, so the new first
7387 * value not in the set is one greater than the newly extended range.
7388 * */
7389 offset = *get_invlist_offset_addr(invlist);
7390 if (array[final_element] == start) {
7391 if (end != UV_MAX) {
7392 array[final_element] = end + 1;
7393 }
7394 else {
7395 /* But if the end is the maximum representable on the machine,
7396 * just let the range that this would extend to have no end */
7397 invlist_set_len(invlist, len - 1, offset);
7398 }
7399 return;
7400 }
7401 }
7402
7403 /* Here the new range doesn't extend any existing set. Add it */
7404
7405 len += 2; /* Includes an element each for the start and end of range */
7406
7407 /* If wll overflow the existing space, extend, which may cause the array to
7408 * be moved */
7409 if (max < len) {
7410 invlist_extend(invlist, len);
7411
7412 /* Have to set len here to avoid assert failure in invlist_array() */
7413 invlist_set_len(invlist, len, offset);
7414
7415 array = invlist_array(invlist);
7416 }
7417 else {
7418 invlist_set_len(invlist, len, offset);
7419 }
7420
7421 /* The next item on the list starts the range, the one after that is
7422 * one past the new range. */
7423 array[len - 2] = start;
7424 if (end != UV_MAX) {
7425 array[len - 1] = end + 1;
7426 }
7427 else {
7428 /* But if the end is the maximum representable on the machine, just let
7429 * the range have no end */
7430 invlist_set_len(invlist, len - 1, offset);
7431 }
7432}
7433
7434#ifndef PERL_IN_XSUB_RE
7435
7436IV
7437Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7438{
7439 /* Searches the inversion list for the entry that contains the input code
7440 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7441 * return value is the index into the list's array of the range that
7442 * contains <cp> */
7443
7444 IV low = 0;
7445 IV mid;
7446 IV high = _invlist_len(invlist);
7447 const IV highest_element = high - 1;
7448 const UV* array;
7449
7450 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7451
7452 /* If list is empty, return failure. */
7453 if (high == 0) {
7454 return -1;
7455 }
7456
7457 /* (We can't get the array unless we know the list is non-empty) */
7458 array = invlist_array(invlist);
7459
7460 mid = invlist_previous_index(invlist);
7461 assert(mid >=0 && mid <= highest_element);
7462
7463 /* <mid> contains the cache of the result of the previous call to this
7464 * function (0 the first time). See if this call is for the same result,
7465 * or if it is for mid-1. This is under the theory that calls to this
7466 * function will often be for related code points that are near each other.
7467 * And benchmarks show that caching gives better results. We also test
7468 * here if the code point is within the bounds of the list. These tests
7469 * replace others that would have had to be made anyway to make sure that
7470 * the array bounds were not exceeded, and these give us extra information
7471 * at the same time */
7472 if (cp >= array[mid]) {
7473 if (cp >= array[highest_element]) {
7474 return highest_element;
7475 }
7476
7477 /* Here, array[mid] <= cp < array[highest_element]. This means that
7478 * the final element is not the answer, so can exclude it; it also
7479 * means that <mid> is not the final element, so can refer to 'mid + 1'
7480 * safely */
7481 if (cp < array[mid + 1]) {
7482 return mid;
7483 }
7484 high--;
7485 low = mid + 1;
7486 }
7487 else { /* cp < aray[mid] */
7488 if (cp < array[0]) { /* Fail if outside the array */
7489 return -1;
7490 }
7491 high = mid;
7492 if (cp >= array[mid - 1]) {
7493 goto found_entry;
7494 }
7495 }
7496
7497 /* Binary search. What we are looking for is <i> such that
7498 * array[i] <= cp < array[i+1]
7499 * The loop below converges on the i+1. Note that there may not be an
7500 * (i+1)th element in the array, and things work nonetheless */
7501 while (low < high) {
7502 mid = (low + high) / 2;
7503 assert(mid <= highest_element);
7504 if (array[mid] <= cp) { /* cp >= array[mid] */
7505 low = mid + 1;
7506
7507 /* We could do this extra test to exit the loop early.
7508 if (cp < array[low]) {
7509 return mid;
7510 }
7511 */
7512 }
7513 else { /* cp < array[mid] */
7514 high = mid;
7515 }
7516 }
7517
7518 found_entry:
7519 high--;
7520 invlist_set_previous_index(invlist, high);
7521 return high;
7522}
7523
7524void
7525Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7526{
7527 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7528 * but is used when the swash has an inversion list. This makes this much
7529 * faster, as it uses a binary search instead of a linear one. This is
7530 * intimately tied to that function, and perhaps should be in utf8.c,
7531 * except it is intimately tied to inversion lists as well. It assumes
7532 * that <swatch> is all 0's on input */
7533
7534 UV current = start;
7535 const IV len = _invlist_len(invlist);
7536 IV i;
7537 const UV * array;
7538
7539 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7540
7541 if (len == 0) { /* Empty inversion list */
7542 return;
7543 }
7544
7545 array = invlist_array(invlist);
7546
7547 /* Find which element it is */
7548 i = _invlist_search(invlist, start);
7549
7550 /* We populate from <start> to <end> */
7551 while (current < end) {
7552 UV upper;
7553
7554 /* The inversion list gives the results for every possible code point
7555 * after the first one in the list. Only those ranges whose index is
7556 * even are ones that the inversion list matches. For the odd ones,
7557 * and if the initial code point is not in the list, we have to skip
7558 * forward to the next element */
7559 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7560 i++;
7561 if (i >= len) { /* Finished if beyond the end of the array */
7562 return;
7563 }
7564 current = array[i];
7565 if (current >= end) { /* Finished if beyond the end of what we
7566 are populating */
7567 if (LIKELY(end < UV_MAX)) {
7568 return;
7569 }
7570
7571 /* We get here when the upper bound is the maximum
7572 * representable on the machine, and we are looking for just
7573 * that code point. Have to special case it */
7574 i = len;
7575 goto join_end_of_list;
7576 }
7577 }
7578 assert(current >= start);
7579
7580 /* The current range ends one below the next one, except don't go past
7581 * <end> */
7582 i++;
7583 upper = (i < len && array[i] < end) ? array[i] : end;
7584
7585 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7586 * for each code point in it */
7587 for (; current < upper; current++) {
7588 const STRLEN offset = (STRLEN)(current - start);
7589 swatch[offset >> 3] |= 1 << (offset & 7);
7590 }
7591
7592 join_end_of_list:
7593
7594 /* Quit if at the end of the list */
7595 if (i >= len) {
7596
7597 /* But first, have to deal with the highest possible code point on
7598 * the platform. The previous code assumes that <end> is one
7599 * beyond where we want to populate, but that is impossible at the
7600 * platform's infinity, so have to handle it specially */
7601 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7602 {
7603 const STRLEN offset = (STRLEN)(end - start);
7604 swatch[offset >> 3] |= 1 << (offset & 7);
7605 }
7606 return;
7607 }
7608
7609 /* Advance to the next range, which will be for code points not in the
7610 * inversion list */
7611 current = array[i];
7612 }
7613
7614 return;
7615}
7616
7617void
7618Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output)
7619{
7620 /* Take the union of two inversion lists and point <output> to it. *output
7621 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7622 * the reference count to that list will be decremented. The first list,
7623 * <a>, may be NULL, in which case a copy of the second list is returned.
7624 * If <complement_b> is TRUE, the union is taken of the complement
7625 * (inversion) of <b> instead of b itself.
7626 *
7627 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7628 * Richard Gillam, published by Addison-Wesley, and explained at some
7629 * length there. The preface says to incorporate its examples into your
7630 * code at your own risk.
7631 *
7632 * The algorithm is like a merge sort.
7633 *
7634 * XXX A potential performance improvement is to keep track as we go along
7635 * if only one of the inputs contributes to the result, meaning the other
7636 * is a subset of that one. In that case, we can skip the final copy and
7637 * return the larger of the input lists, but then outside code might need
7638 * to keep track of whether to free the input list or not */
7639
7640 const UV* array_a; /* a's array */
7641 const UV* array_b;
7642 UV len_a; /* length of a's array */
7643 UV len_b;
7644
7645 SV* u; /* the resulting union */
7646 UV* array_u;
7647 UV len_u;
7648
7649 UV i_a = 0; /* current index into a's array */
7650 UV i_b = 0;
7651 UV i_u = 0;
7652
7653 /* running count, as explained in the algorithm source book; items are
7654 * stopped accumulating and are output when the count changes to/from 0.
7655 * The count is incremented when we start a range that's in the set, and
7656 * decremented when we start a range that's not in the set. So its range
7657 * is 0 to 2. Only when the count is zero is something not in the set.
7658 */
7659 UV count = 0;
7660
7661 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7662 assert(a != b);
7663
7664 /* If either one is empty, the union is the other one */
7665 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7666 if (*output == a) {
7667 if (a != NULL) {
7668 SvREFCNT_dec_NN(a);
7669 }
7670 }
7671 if (*output != b) {
7672 *output = invlist_clone(b);
7673 if (complement_b) {
7674 _invlist_invert(*output);
7675 }
7676 } /* else *output already = b; */
7677 return;
7678 }
7679 else if ((len_b = _invlist_len(b)) == 0) {
7680 if (*output == b) {
7681 SvREFCNT_dec_NN(b);
7682 }
7683
7684 /* The complement of an empty list is a list that has everything in it,
7685 * so the union with <a> includes everything too */
7686 if (complement_b) {
7687 if (a == *output) {
7688 SvREFCNT_dec_NN(a);
7689 }
7690 *output = _new_invlist(1);
7691 _append_range_to_invlist(*output, 0, UV_MAX);
7692 }
7693 else if (*output != a) {
7694 *output = invlist_clone(a);
7695 }
7696 /* else *output already = a; */
7697 return;
7698 }
7699
7700 /* Here both lists exist and are non-empty */
7701 array_a = invlist_array(a);
7702 array_b = invlist_array(b);
7703
7704 /* If are to take the union of 'a' with the complement of b, set it
7705 * up so are looking at b's complement. */
7706 if (complement_b) {
7707
7708 /* To complement, we invert: if the first element is 0, remove it. To
7709 * do this, we just pretend the array starts one later */
7710 if (array_b[0] == 0) {
7711 array_b++;
7712 len_b--;
7713 }
7714 else {
7715
7716 /* But if the first element is not zero, we pretend the list starts
7717 * at the 0 that is always stored immediately before the array. */
7718 array_b--;
7719 len_b++;
7720 }
7721 }
7722
7723 /* Size the union for the worst case: that the sets are completely
7724 * disjoint */
7725 u = _new_invlist(len_a + len_b);
7726
7727 /* Will contain U+0000 if either component does */
7728 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7729 || (len_b > 0 && array_b[0] == 0));
7730
7731 /* Go through each list item by item, stopping when exhausted one of
7732 * them */
7733 while (i_a < len_a && i_b < len_b) {
7734 UV cp; /* The element to potentially add to the union's array */
7735 bool cp_in_set; /* is it in the the input list's set or not */
7736
7737 /* We need to take one or the other of the two inputs for the union.
7738 * Since we are merging two sorted lists, we take the smaller of the
7739 * next items. In case of a tie, we take the one that is in its set
7740 * first. If we took one not in the set first, it would decrement the
7741 * count, possibly to 0 which would cause it to be output as ending the
7742 * range, and the next time through we would take the same number, and
7743 * output it again as beginning the next range. By doing it the
7744 * opposite way, there is no possibility that the count will be
7745 * momentarily decremented to 0, and thus the two adjoining ranges will
7746 * be seamlessly merged. (In a tie and both are in the set or both not
7747 * in the set, it doesn't matter which we take first.) */
7748 if (array_a[i_a] < array_b[i_b]
7749 || (array_a[i_a] == array_b[i_b]
7750 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7751 {
7752 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7753 cp= array_a[i_a++];
7754 }
7755 else {
7756 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7757 cp = array_b[i_b++];
7758 }
7759
7760 /* Here, have chosen which of the two inputs to look at. Only output
7761 * if the running count changes to/from 0, which marks the
7762 * beginning/end of a range in that's in the set */
7763 if (cp_in_set) {
7764 if (count == 0) {
7765 array_u[i_u++] = cp;
7766 }
7767 count++;
7768 }
7769 else {
7770 count--;
7771 if (count == 0) {
7772 array_u[i_u++] = cp;
7773 }
7774 }
7775 }
7776
7777 /* Here, we are finished going through at least one of the lists, which
7778 * means there is something remaining in at most one. We check if the list
7779 * that hasn't been exhausted is positioned such that we are in the middle
7780 * of a range in its set or not. (i_a and i_b point to the element beyond
7781 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7782 * is potentially more to output.
7783 * There are four cases:
7784 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7785 * in the union is entirely from the non-exhausted set.
7786 * 2) Both were in their sets, count is 2. Nothing further should
7787 * be output, as everything that remains will be in the exhausted
7788 * list's set, hence in the union; decrementing to 1 but not 0 insures
7789 * that
7790 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7791 * Nothing further should be output because the union includes
7792 * everything from the exhausted set. Not decrementing ensures that.
7793 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7794 * decrementing to 0 insures that we look at the remainder of the
7795 * non-exhausted set */
7796 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7797 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7798 {
7799 count--;
7800 }
7801
7802 /* The final length is what we've output so far, plus what else is about to
7803 * be output. (If 'count' is non-zero, then the input list we exhausted
7804 * has everything remaining up to the machine's limit in its set, and hence
7805 * in the union, so there will be no further output. */
7806 len_u = i_u;
7807 if (count == 0) {
7808 /* At most one of the subexpressions will be non-zero */
7809 len_u += (len_a - i_a) + (len_b - i_b);
7810 }
7811
7812 /* Set result to final length, which can change the pointer to array_u, so
7813 * re-find it */
7814 if (len_u != _invlist_len(u)) {
7815 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
7816 invlist_trim(u);
7817 array_u = invlist_array(u);
7818 }
7819
7820 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7821 * the other) ended with everything above it not in its set. That means
7822 * that the remaining part of the union is precisely the same as the
7823 * non-exhausted list, so can just copy it unchanged. (If both list were
7824 * exhausted at the same time, then the operations below will be both 0.)
7825 */
7826 if (count == 0) {
7827 IV copy_count; /* At most one will have a non-zero copy count */
7828 if ((copy_count = len_a - i_a) > 0) {
7829 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7830 }
7831 else if ((copy_count = len_b - i_b) > 0) {
7832 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7833 }
7834 }
7835
7836 /* We may be removing a reference to one of the inputs */
7837 if (a == *output || b == *output) {
7838 assert(! invlist_is_iterating(*output));
7839 SvREFCNT_dec_NN(*output);
7840 }
7841
7842 *output = u;
7843 return;
7844}
7845
7846void
7847Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i)
7848{
7849 /* Take the intersection of two inversion lists and point <i> to it. *i
7850 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
7851 * the reference count to that list will be decremented.
7852 * If <complement_b> is TRUE, the result will be the intersection of <a>
7853 * and the complement (or inversion) of <b> instead of <b> directly.
7854 *
7855 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7856 * Richard Gillam, published by Addison-Wesley, and explained at some
7857 * length there. The preface says to incorporate its examples into your
7858 * code at your own risk. In fact, it had bugs
7859 *
7860 * The algorithm is like a merge sort, and is essentially the same as the
7861 * union above
7862 */
7863
7864 const UV* array_a; /* a's array */
7865 const UV* array_b;
7866 UV len_a; /* length of a's array */
7867 UV len_b;
7868
7869 SV* r; /* the resulting intersection */
7870 UV* array_r;
7871 UV len_r;
7872
7873 UV i_a = 0; /* current index into a's array */
7874 UV i_b = 0;
7875 UV i_r = 0;
7876
7877 /* running count, as explained in the algorithm source book; items are
7878 * stopped accumulating and are output when the count changes to/from 2.
7879 * The count is incremented when we start a range that's in the set, and
7880 * decremented when we start a range that's not in the set. So its range
7881 * is 0 to 2. Only when the count is 2 is something in the intersection.
7882 */
7883 UV count = 0;
7884
7885 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7886 assert(a != b);
7887
7888 /* Special case if either one is empty */
7889 len_a = (a == NULL) ? 0 : _invlist_len(a);
7890 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7891
7892 if (len_a != 0 && complement_b) {
7893
7894 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7895 * be empty. Here, also we are using 'b's complement, which hence
7896 * must be every possible code point. Thus the intersection is
7897 * simply 'a'. */
7898 if (*i != a) {
7899 if (*i == b) {
7900 SvREFCNT_dec_NN(b);
7901 }
7902
7903 *i = invlist_clone(a);
7904 }
7905 /* else *i is already 'a' */
7906 return;
7907 }
7908
7909 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7910 * intersection must be empty */
7911 if (*i == a) {
7912 SvREFCNT_dec_NN(a);
7913 }
7914 else if (*i == b) {
7915 SvREFCNT_dec_NN(b);
7916 }
7917 *i = _new_invlist(0);
7918 return;
7919 }
7920
7921 /* Here both lists exist and are non-empty */
7922 array_a = invlist_array(a);
7923 array_b = invlist_array(b);
7924
7925 /* If are to take the intersection of 'a' with the complement of b, set it
7926 * up so are looking at b's complement. */
7927 if (complement_b) {
7928
7929 /* To complement, we invert: if the first element is 0, remove it. To
7930 * do this, we just pretend the array starts one later */
7931 if (array_b[0] == 0) {
7932 array_b++;
7933 len_b--;
7934 }
7935 else {
7936
7937 /* But if the first element is not zero, we pretend the list starts
7938 * at the 0 that is always stored immediately before the array. */
7939 array_b--;
7940 len_b++;
7941 }
7942 }
7943
7944 /* Size the intersection for the worst case: that the intersection ends up
7945 * fragmenting everything to be completely disjoint */
7946 r= _new_invlist(len_a + len_b);
7947
7948 /* Will contain U+0000 iff both components do */
7949 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7950 && len_b > 0 && array_b[0] == 0);
7951
7952 /* Go through each list item by item, stopping when exhausted one of
7953 * them */
7954 while (i_a < len_a && i_b < len_b) {
7955 UV cp; /* The element to potentially add to the intersection's
7956 array */
7957 bool cp_in_set; /* Is it in the input list's set or not */
7958
7959 /* We need to take one or the other of the two inputs for the
7960 * intersection. Since we are merging two sorted lists, we take the
7961 * smaller of the next items. In case of a tie, we take the one that
7962 * is not in its set first (a difference from the union algorithm). If
7963 * we took one in the set first, it would increment the count, possibly
7964 * to 2 which would cause it to be output as starting a range in the
7965 * intersection, and the next time through we would take that same
7966 * number, and output it again as ending the set. By doing it the
7967 * opposite of this, there is no possibility that the count will be
7968 * momentarily incremented to 2. (In a tie and both are in the set or
7969 * both not in the set, it doesn't matter which we take first.) */
7970 if (array_a[i_a] < array_b[i_b]
7971 || (array_a[i_a] == array_b[i_b]
7972 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7973 {
7974 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7975 cp= array_a[i_a++];
7976 }
7977 else {
7978 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7979 cp= array_b[i_b++];
7980 }
7981
7982 /* Here, have chosen which of the two inputs to look at. Only output
7983 * if the running count changes to/from 2, which marks the
7984 * beginning/end of a range that's in the intersection */
7985 if (cp_in_set) {
7986 count++;
7987 if (count == 2) {
7988 array_r[i_r++] = cp;
7989 }
7990 }
7991 else {
7992 if (count == 2) {
7993 array_r[i_r++] = cp;
7994 }
7995 count--;
7996 }
7997 }
7998
7999 /* Here, we are finished going through at least one of the lists, which
8000 * means there is something remaining in at most one. We check if the list
8001 * that has been exhausted is positioned such that we are in the middle
8002 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8003 * the ones we care about.) There are four cases:
8004 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8005 * nothing left in the intersection.
8006 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8007 * above 2. What should be output is exactly that which is in the
8008 * non-exhausted set, as everything it has is also in the intersection
8009 * set, and everything it doesn't have can't be in the intersection
8010 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8011 * gets incremented to 2. Like the previous case, the intersection is
8012 * everything that remains in the non-exhausted set.
8013 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8014 * remains 1. And the intersection has nothing more. */
8015 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8016 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8017 {
8018 count++;
8019 }
8020
8021 /* The final length is what we've output so far plus what else is in the
8022 * intersection. At most one of the subexpressions below will be non-zero */
8023 len_r = i_r;
8024 if (count >= 2) {
8025 len_r += (len_a - i_a) + (len_b - i_b);
8026 }
8027
8028 /* Set result to final length, which can change the pointer to array_r, so
8029 * re-find it */
8030 if (len_r != _invlist_len(r)) {
8031 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8032 invlist_trim(r);
8033 array_r = invlist_array(r);
8034 }
8035
8036 /* Finish outputting any remaining */
8037 if (count >= 2) { /* At most one will have a non-zero copy count */
8038 IV copy_count;
8039 if ((copy_count = len_a - i_a) > 0) {
8040 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8041 }
8042 else if ((copy_count = len_b - i_b) > 0) {
8043 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8044 }
8045 }
8046
8047 /* We may be removing a reference to one of the inputs */
8048 if (a == *i || b == *i) {
8049 assert(! invlist_is_iterating(*i));
8050 SvREFCNT_dec_NN(*i);
8051 }
8052
8053 *i = r;
8054 return;
8055}
8056
8057SV*
8058Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8059{
8060 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8061 * set. A pointer to the inversion list is returned. This may actually be
8062 * a new list, in which case the passed in one has been destroyed. The
8063 * passed in inversion list can be NULL, in which case a new one is created
8064 * with just the one range in it */
8065
8066 SV* range_invlist;
8067 UV len;
8068
8069 if (invlist == NULL) {
8070 invlist = _new_invlist(2);
8071 len = 0;
8072 }
8073 else {
8074 len = _invlist_len(invlist);
8075 }
8076
8077 /* If comes after the final entry actually in the list, can just append it
8078 * to the end, */
8079 if (len == 0
8080 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8081 && start >= invlist_array(invlist)[len - 1]))
8082 {
8083 _append_range_to_invlist(invlist, start, end);
8084 return invlist;
8085 }
8086
8087 /* Here, can't just append things, create and return a new inversion list
8088 * which is the union of this range and the existing inversion list */
8089 range_invlist = _new_invlist(2);
8090 _append_range_to_invlist(range_invlist, start, end);
8091
8092 _invlist_union(invlist, range_invlist, &invlist);
8093
8094 /* The temporary can be freed */
8095 SvREFCNT_dec_NN(range_invlist);
8096
8097 return invlist;
8098}
8099
8100#endif
8101
8102PERL_STATIC_INLINE SV*
8103S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8104 return _add_range_to_invlist(invlist, cp, cp);
8105}
8106
8107#ifndef PERL_IN_XSUB_RE
8108void
8109Perl__invlist_invert(pTHX_ SV* const invlist)
8110{
8111 /* Complement the input inversion list. This adds a 0 if the list didn't
8112 * have a zero; removes it otherwise. As described above, the data
8113 * structure is set up so that this is very efficient */
8114
8115 PERL_ARGS_ASSERT__INVLIST_INVERT;
8116
8117 assert(! invlist_is_iterating(invlist));
8118
8119 /* The inverse of matching nothing is matching everything */
8120 if (_invlist_len(invlist) == 0) {
8121 _append_range_to_invlist(invlist, 0, UV_MAX);
8122 return;
8123 }
8124
8125 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8126}
8127
8128void
8129Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8130{
8131 /* Complement the input inversion list (which must be a Unicode property,
8132 * all of which don't match above the Unicode maximum code point.) And
8133 * Perl has chosen to not have the inversion match above that either. This
8134 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8135 */
8136
8137 UV len;
8138 UV* array;
8139
8140 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8141
8142 _invlist_invert(invlist);
8143
8144 len = _invlist_len(invlist);
8145
8146 if (len != 0) { /* If empty do nothing */
8147 array = invlist_array(invlist);
8148 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8149 /* Add 0x110000. First, grow if necessary */
8150 len++;
8151 if (invlist_max(invlist) < len) {
8152 invlist_extend(invlist, len);
8153 array = invlist_array(invlist);
8154 }
8155 invlist_set_len(invlist, len, *get_invlist_offset_addr(invlist));
8156 array[len - 1] = PERL_UNICODE_MAX + 1;
8157 }
8158 else { /* Remove the 0x110000 */
8159 invlist_set_len(invlist, len - 1, *get_invlist_offset_addr(invlist));
8160 }
8161 }
8162
8163 return;
8164}
8165#endif
8166
8167PERL_STATIC_INLINE SV*
8168S_invlist_clone(pTHX_ SV* const invlist)
8169{
8170
8171 /* Return a new inversion list that is a copy of the input one, which is
8172 * unchanged */
8173
8174 /* Need to allocate extra space to accommodate Perl's addition of a
8175 * trailing NUL to SvPV's, since it thinks they are always strings */
8176 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8177 STRLEN physical_length = SvCUR(invlist);
8178 bool offset = *(get_invlist_offset_addr(invlist));
8179
8180 PERL_ARGS_ASSERT_INVLIST_CLONE;
8181
8182 *(get_invlist_offset_addr(new_invlist)) = offset;
8183 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8184 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8185
8186 return new_invlist;
8187}
8188
8189PERL_STATIC_INLINE STRLEN*
8190S_get_invlist_iter_addr(pTHX_ SV* invlist)
8191{
8192 /* Return the address of the UV that contains the current iteration
8193 * position */
8194
8195 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8196
8197 assert(SvTYPE(invlist) == SVt_INVLIST);
8198
8199 return &(((XINVLIST*) SvANY(invlist))->iterator);
8200}
8201
8202PERL_STATIC_INLINE void
8203S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8204{
8205 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8206
8207 *get_invlist_iter_addr(invlist) = 0;
8208}
8209
8210PERL_STATIC_INLINE void
8211S_invlist_iterfinish(pTHX_ SV* invlist)
8212{
8213 /* Terminate iterator for invlist. This is to catch development errors.
8214 * Any iteration that is interrupted before completed should call this
8215 * function. Functions that add code points anywhere else but to the end
8216 * of an inversion list assert that they are not in the middle of an
8217 * iteration. If they were, the addition would make the iteration
8218 * problematical: if the iteration hadn't reached the place where things
8219 * were being added, it would be ok */
8220
8221 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8222
8223 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8224}
8225
8226STATIC bool
8227S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8228{
8229 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8230 * This call sets in <*start> and <*end>, the next range in <invlist>.
8231 * Returns <TRUE> if successful and the next call will return the next
8232 * range; <FALSE> if was already at the end of the list. If the latter,
8233 * <*start> and <*end> are unchanged, and the next call to this function
8234 * will start over at the beginning of the list */
8235
8236 STRLEN* pos = get_invlist_iter_addr(invlist);
8237 UV len = _invlist_len(invlist);
8238 UV *array;
8239
8240 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8241
8242 if (*pos >= len) {
8243 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8244 return FALSE;
8245 }
8246
8247 array = invlist_array(invlist);
8248
8249 *start = array[(*pos)++];
8250
8251 if (*pos >= len) {
8252 *end = UV_MAX;
8253 }
8254 else {
8255 *end = array[(*pos)++] - 1;
8256 }
8257
8258 return TRUE;
8259}
8260
8261PERL_STATIC_INLINE bool
8262S_invlist_is_iterating(pTHX_ SV* const invlist)
8263{
8264 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8265
8266 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8267}
8268
8269PERL_STATIC_INLINE UV
8270S_invlist_highest(pTHX_ SV* const invlist)
8271{
8272 /* Returns the highest code point that matches an inversion list. This API
8273 * has an ambiguity, as it returns 0 under either the highest is actually
8274 * 0, or if the list is empty. If this distinction matters to you, check
8275 * for emptiness before calling this function */
8276
8277 UV len = _invlist_len(invlist);
8278 UV *array;
8279
8280 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8281
8282 if (len == 0) {
8283 return 0;
8284 }
8285
8286 array = invlist_array(invlist);
8287
8288 /* The last element in the array in the inversion list always starts a
8289 * range that goes to infinity. That range may be for code points that are
8290 * matched in the inversion list, or it may be for ones that aren't
8291 * matched. In the latter case, the highest code point in the set is one
8292 * less than the beginning of this range; otherwise it is the final element
8293 * of this range: infinity */
8294 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8295 ? UV_MAX
8296 : array[len - 1] - 1;
8297}
8298
8299#ifndef PERL_IN_XSUB_RE
8300SV *
8301Perl__invlist_contents(pTHX_ SV* const invlist)
8302{
8303 /* Get the contents of an inversion list into a string SV so that they can
8304 * be printed out. It uses the format traditionally done for debug tracing
8305 */
8306
8307 UV start, end;
8308 SV* output = newSVpvs("\n");
8309
8310 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8311
8312 assert(! invlist_is_iterating(invlist));
8313
8314 invlist_iterinit(invlist);
8315 while (invlist_iternext(invlist, &start, &end)) {
8316 if (end == UV_MAX) {
8317 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8318 }
8319 else if (end != start) {
8320 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8321 start, end);
8322 }
8323 else {
8324 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8325 }
8326 }
8327
8328 return output;
8329}
8330#endif
8331
8332#ifndef PERL_IN_XSUB_RE
8333void
8334Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
8335{
8336 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
8337 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
8338 * the string 'indent'. The output looks like this:
8339 [0] 0x000A .. 0x000D
8340 [2] 0x0085
8341 [4] 0x2028 .. 0x2029
8342 [6] 0x3104 .. INFINITY
8343 * This means that the first range of code points matched by the list are
8344 * 0xA through 0xD; the second range contains only the single code point
8345 * 0x85, etc. An inversion list is an array of UVs. Two array elements
8346 * are used to define each range (except if the final range extends to
8347 * infinity, only a single element is needed). The array index of the
8348 * first element for the corresponding range is given in brackets. */
8349
8350 UV start, end;
8351 STRLEN count = 0;
8352
8353 PERL_ARGS_ASSERT__INVLIST_DUMP;
8354
8355 if (invlist_is_iterating(invlist)) {
8356 Perl_dump_indent(aTHX_ level, file,
8357 "%sCan't dump inversion list because is in middle of iterating\n",
8358 indent);
8359 return;
8360 }
8361
8362 invlist_iterinit(invlist);
8363 while (invlist_iternext(invlist, &start, &end)) {
8364 if (end == UV_MAX) {
8365 Perl_dump_indent(aTHX_ level, file,
8366 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
8367 indent, (UV)count, start);
8368 }
8369 else if (end != start) {
8370 Perl_dump_indent(aTHX_ level, file,
8371 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
8372 indent, (UV)count, start, end);
8373 }
8374 else {
8375 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
8376 indent, (UV)count, start);
8377 }
8378 count += 2;
8379 }
8380}
8381#endif
8382
8383#ifdef PERL_ARGS_ASSERT__INVLISTEQ
8384bool
8385S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
8386{
8387 /* Return a boolean as to if the two passed in inversion lists are
8388 * identical. The final argument, if TRUE, says to take the complement of
8389 * the second inversion list before doing the comparison */
8390
8391 const UV* array_a = invlist_array(a);
8392 const UV* array_b = invlist_array(b);
8393 UV len_a = _invlist_len(a);
8394 UV len_b = _invlist_len(b);
8395
8396 UV i = 0; /* current index into the arrays */
8397 bool retval = TRUE; /* Assume are identical until proven otherwise */
8398
8399 PERL_ARGS_ASSERT__INVLISTEQ;
8400
8401 /* If are to compare 'a' with the complement of b, set it
8402 * up so are looking at b's complement. */
8403 if (complement_b) {
8404
8405 /* The complement of nothing is everything, so <a> would have to have
8406 * just one element, starting at zero (ending at infinity) */
8407 if (len_b == 0) {
8408 return (len_a == 1 && array_a[0] == 0);
8409 }
8410 else if (array_b[0] == 0) {
8411
8412 /* Otherwise, to complement, we invert. Here, the first element is
8413 * 0, just remove it. To do this, we just pretend the array starts
8414 * one later */
8415
8416 array_b++;
8417 len_b--;
8418 }
8419 else {
8420
8421 /* But if the first element is not zero, we pretend the list starts
8422 * at the 0 that is always stored immediately before the array. */
8423 array_b--;
8424 len_b++;
8425 }
8426 }
8427
8428 /* Make sure that the lengths are the same, as well as the final element
8429 * before looping through the remainder. (Thus we test the length, final,
8430 * and first elements right off the bat) */
8431 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8432 retval = FALSE;
8433 }
8434 else for (i = 0; i < len_a - 1; i++) {
8435 if (array_a[i] != array_b[i]) {
8436 retval = FALSE;
8437 break;
8438 }
8439 }
8440
8441 return retval;
8442}
8443#endif
8444
8445#undef HEADER_LENGTH
8446#undef TO_INTERNAL_SIZE
8447#undef FROM_INTERNAL_SIZE
8448#undef INVLIST_VERSION_ID
8449
8450/* End of inversion list object */
8451
8452STATIC void
8453S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
8454{
8455 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
8456 * constructs, and updates RExC_flags with them. On input, RExC_parse
8457 * should point to the first flag; it is updated on output to point to the
8458 * final ')' or ':'. There needs to be at least one flag, or this will
8459 * abort */
8460
8461 /* for (?g), (?gc), and (?o) warnings; warning
8462 about (?c) will warn about (?g) -- japhy */
8463
8464#define WASTED_O 0x01
8465#define WASTED_G 0x02
8466#define WASTED_C 0x04
8467#define WASTED_GC (WASTED_G|WASTED_C)
8468 I32 wastedflags = 0x00;
8469 U32 posflags = 0, negflags = 0;
8470 U32 *flagsp = &posflags;
8471 char has_charset_modifier = '\0';
8472 regex_charset cs;
8473 bool has_use_defaults = FALSE;
8474 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
8475
8476 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
8477
8478 /* '^' as an initial flag sets certain defaults */
8479 if (UCHARAT(RExC_parse) == '^') {
8480 RExC_parse++;
8481 has_use_defaults = TRUE;
8482 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8483 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8484 ? REGEX_UNICODE_CHARSET
8485 : REGEX_DEPENDS_CHARSET);
8486 }
8487
8488 cs = get_regex_charset(RExC_flags);
8489 if (cs == REGEX_DEPENDS_CHARSET
8490 && (RExC_utf8 || RExC_uni_semantics))
8491 {
8492 cs = REGEX_UNICODE_CHARSET;
8493 }
8494
8495 while (*RExC_parse) {
8496 /* && strchr("iogcmsx", *RExC_parse) */
8497 /* (?g), (?gc) and (?o) are useless here
8498 and must be globally applied -- japhy */
8499 switch (*RExC_parse) {
8500
8501 /* Code for the imsx flags */
8502 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8503
8504 case LOCALE_PAT_MOD:
8505 if (has_charset_modifier) {
8506 goto excess_modifier;
8507 }
8508 else if (flagsp == &negflags) {
8509 goto neg_modifier;
8510 }
8511 cs = REGEX_LOCALE_CHARSET;
8512 has_charset_modifier = LOCALE_PAT_MOD;
8513 RExC_contains_locale = 1;
8514 break;
8515 case UNICODE_PAT_MOD:
8516 if (has_charset_modifier) {
8517 goto excess_modifier;
8518 }
8519 else if (flagsp == &negflags) {
8520 goto neg_modifier;
8521 }
8522 cs = REGEX_UNICODE_CHARSET;
8523 has_charset_modifier = UNICODE_PAT_MOD;
8524 break;
8525 case ASCII_RESTRICT_PAT_MOD:
8526 if (flagsp == &negflags) {
8527 goto neg_modifier;
8528 }
8529 if (has_charset_modifier) {
8530 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8531 goto excess_modifier;
8532 }
8533 /* Doubled modifier implies more restricted */
8534 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8535 }
8536 else {
8537 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8538 }
8539 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8540 break;
8541 case DEPENDS_PAT_MOD:
8542 if (has_use_defaults) {
8543 goto fail_modifiers;
8544 }
8545 else if (flagsp == &negflags) {
8546 goto neg_modifier;
8547 }
8548 else if (has_charset_modifier) {
8549 goto excess_modifier;
8550 }
8551
8552 /* The dual charset means unicode semantics if the
8553 * pattern (or target, not known until runtime) are
8554 * utf8, or something in the pattern indicates unicode
8555 * semantics */
8556 cs = (RExC_utf8 || RExC_uni_semantics)
8557 ? REGEX_UNICODE_CHARSET
8558 : REGEX_DEPENDS_CHARSET;
8559 has_charset_modifier = DEPENDS_PAT_MOD;
8560 break;
8561 excess_modifier:
8562 RExC_parse++;
8563 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8564 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8565 }
8566 else if (has_charset_modifier == *(RExC_parse - 1)) {
8567 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8568 }
8569 else {
8570 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8571 }
8572 /*NOTREACHED*/
8573 neg_modifier:
8574 RExC_parse++;
8575 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8576 /*NOTREACHED*/
8577 case ONCE_PAT_MOD: /* 'o' */
8578 case GLOBAL_PAT_MOD: /* 'g' */
8579 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8580 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8581 if (! (wastedflags & wflagbit) ) {
8582 wastedflags |= wflagbit;
8583 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8584 vWARN5(
8585 RExC_parse + 1,
8586 "Useless (%s%c) - %suse /%c modifier",
8587 flagsp == &negflags ? "?-" : "?",
8588 *RExC_parse,
8589 flagsp == &negflags ? "don't " : "",
8590 *RExC_parse
8591 );
8592 }
8593 }
8594 break;
8595
8596 case CONTINUE_PAT_MOD: /* 'c' */
8597 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8598 if (! (wastedflags & WASTED_C) ) {
8599 wastedflags |= WASTED_GC;
8600 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
8601 vWARN3(
8602 RExC_parse + 1,
8603 "Useless (%sc) - %suse /gc modifier",
8604 flagsp == &negflags ? "?-" : "?",
8605 flagsp == &negflags ? "don't " : ""
8606 );
8607 }
8608 }
8609 break;
8610 case KEEPCOPY_PAT_MOD: /* 'p' */
8611 if (flagsp == &negflags) {
8612 if (SIZE_ONLY)
8613 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8614 } else {
8615 *flagsp |= RXf_PMf_KEEPCOPY;
8616 }
8617 break;
8618 case '-':
8619 /* A flag is a default iff it is following a minus, so
8620 * if there is a minus, it means will be trying to
8621 * re-specify a default which is an error */
8622 if (has_use_defaults || flagsp == &negflags) {
8623 goto fail_modifiers;
8624 }
8625 flagsp = &negflags;
8626 wastedflags = 0; /* reset so (?g-c) warns twice */
8627 break;
8628 case ':':
8629 case ')':
8630 RExC_flags |= posflags;
8631 RExC_flags &= ~negflags;
8632 set_regex_charset(&RExC_flags, cs);
8633 return;
8634 /*NOTREACHED*/
8635 default:
8636 fail_modifiers:
8637 RExC_parse++;
8638 vFAIL3("Sequence (%.*s...) not recognized",
8639 RExC_parse-seqstart, seqstart);
8640 /*NOTREACHED*/
8641 }
8642
8643 ++RExC_parse;
8644 }
8645}
8646
8647/*
8648 - reg - regular expression, i.e. main body or parenthesized thing
8649 *
8650 * Caller must absorb opening parenthesis.
8651 *
8652 * Combining parenthesis handling with the base level of regular expression
8653 * is a trifle forced, but the need to tie the tails of the branches to what
8654 * follows makes it hard to avoid.
8655 */
8656#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8657#ifdef DEBUGGING
8658#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8659#else
8660#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8661#endif
8662
8663/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
8664 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
8665 needs to be restarted.
8666 Otherwise would only return NULL if regbranch() returns NULL, which
8667 cannot happen. */
8668STATIC regnode *
8669S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8670 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
8671 * 2 is like 1, but indicates that nextchar() has been called to advance
8672 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
8673 * this flag alerts us to the need to check for that */
8674{
8675 dVAR;
8676 regnode *ret; /* Will be the head of the group. */
8677 regnode *br;
8678 regnode *lastbr;
8679 regnode *ender = NULL;
8680 I32 parno = 0;
8681 I32 flags;
8682 U32 oregflags = RExC_flags;
8683 bool have_branch = 0;
8684 bool is_open = 0;
8685 I32 freeze_paren = 0;
8686 I32 after_freeze = 0;
8687
8688 char * parse_start = RExC_parse; /* MJD */
8689 char * const oregcomp_parse = RExC_parse;
8690
8691 GET_RE_DEBUG_FLAGS_DECL;
8692
8693 PERL_ARGS_ASSERT_REG;
8694 DEBUG_PARSE("reg ");
8695
8696 *flagp = 0; /* Tentatively. */
8697
8698
8699 /* Make an OPEN node, if parenthesized. */
8700 if (paren) {
8701
8702 /* Under /x, space and comments can be gobbled up between the '(' and
8703 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
8704 * intervening space, as the sequence is a token, and a token should be
8705 * indivisible */
8706 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
8707
8708 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8709 char *start_verb = RExC_parse;
8710 STRLEN verb_len = 0;
8711 char *start_arg = NULL;
8712 unsigned char op = 0;
8713 int argok = 1;
8714 int internal_argval = 0; /* internal_argval is only useful if !argok */
8715
8716 if (has_intervening_patws && SIZE_ONLY) {
8717 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
8718 }
8719 while ( *RExC_parse && *RExC_parse != ')' ) {
8720 if ( *RExC_parse == ':' ) {
8721 start_arg = RExC_parse + 1;
8722 break;
8723 }
8724 RExC_parse++;
8725 }
8726 ++start_verb;
8727 verb_len = RExC_parse - start_verb;
8728 if ( start_arg ) {
8729 RExC_parse++;
8730 while ( *RExC_parse && *RExC_parse != ')' )
8731 RExC_parse++;
8732 if ( *RExC_parse != ')' )
8733 vFAIL("Unterminated verb pattern argument");
8734 if ( RExC_parse == start_arg )
8735 start_arg = NULL;
8736 } else {
8737 if ( *RExC_parse != ')' )
8738 vFAIL("Unterminated verb pattern");
8739 }
8740
8741 switch ( *start_verb ) {
8742 case 'A': /* (*ACCEPT) */
8743 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8744 op = ACCEPT;
8745 internal_argval = RExC_nestroot;
8746 }
8747 break;
8748 case 'C': /* (*COMMIT) */
8749 if ( memEQs(start_verb,verb_len,"COMMIT") )
8750 op = COMMIT;
8751 break;
8752 case 'F': /* (*FAIL) */
8753 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8754 op = OPFAIL;
8755 argok = 0;
8756 }
8757 break;
8758 case ':': /* (*:NAME) */
8759 case 'M': /* (*MARK:NAME) */
8760 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8761 op = MARKPOINT;
8762 argok = -1;
8763 }
8764 break;
8765 case 'P': /* (*PRUNE) */
8766 if ( memEQs(start_verb,verb_len,"PRUNE") )
8767 op = PRUNE;
8768 break;
8769 case 'S': /* (*SKIP) */
8770 if ( memEQs(start_verb,verb_len,"SKIP") )
8771 op = SKIP;
8772 break;
8773 case 'T': /* (*THEN) */
8774 /* [19:06] <TimToady> :: is then */
8775 if ( memEQs(start_verb,verb_len,"THEN") ) {
8776 op = CUTGROUP;
8777 RExC_seen |= REG_SEEN_CUTGROUP;
8778 }
8779 break;
8780 }
8781 if ( ! op ) {
8782 RExC_parse++;
8783 vFAIL3("Unknown verb pattern '%.*s'",
8784 verb_len, start_verb);
8785 }
8786 if ( argok ) {
8787 if ( start_arg && internal_argval ) {
8788 vFAIL3("Verb pattern '%.*s' may not have an argument",
8789 verb_len, start_verb);
8790 } else if ( argok < 0 && !start_arg ) {
8791 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8792 verb_len, start_verb);
8793 } else {
8794 ret = reganode(pRExC_state, op, internal_argval);
8795 if ( ! internal_argval && ! SIZE_ONLY ) {
8796 if (start_arg) {
8797 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8798 ARG(ret) = add_data( pRExC_state, 1, "S" );
8799 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8800 ret->flags = 0;
8801 } else {
8802 ret->flags = 1;
8803 }
8804 }
8805 }
8806 if (!internal_argval)
8807 RExC_seen |= REG_SEEN_VERBARG;
8808 } else if ( start_arg ) {
8809 vFAIL3("Verb pattern '%.*s' may not have an argument",
8810 verb_len, start_verb);
8811 } else {
8812 ret = reg_node(pRExC_state, op);
8813 }
8814 nextchar(pRExC_state);
8815 return ret;
8816 }
8817 else if (*RExC_parse == '?') { /* (?...) */
8818 bool is_logical = 0;
8819 const char * const seqstart = RExC_parse;
8820 if (has_intervening_patws && SIZE_ONLY) {
8821 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
8822 }
8823
8824 RExC_parse++;
8825 paren = *RExC_parse++;
8826 ret = NULL; /* For look-ahead/behind. */
8827 switch (paren) {
8828
8829 case 'P': /* (?P...) variants for those used to PCRE/Python */
8830 paren = *RExC_parse++;
8831 if ( paren == '<') /* (?P<...>) named capture */
8832 goto named_capture;
8833 else if (paren == '>') { /* (?P>name) named recursion */
8834 goto named_recursion;
8835 }
8836 else if (paren == '=') { /* (?P=...) named backref */
8837 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8838 you change this make sure you change that */
8839 char* name_start = RExC_parse;
8840 U32 num = 0;
8841 SV *sv_dat = reg_scan_name(pRExC_state,
8842 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8843 if (RExC_parse == name_start || *RExC_parse != ')')
8844 vFAIL2("Sequence %.3s... not terminated",parse_start);
8845
8846 if (!SIZE_ONLY) {
8847 num = add_data( pRExC_state, 1, "S" );
8848 RExC_rxi->data->data[num]=(void*)sv_dat;
8849 SvREFCNT_inc_simple_void(sv_dat);
8850 }
8851 RExC_sawback = 1;
8852 ret = reganode(pRExC_state,
8853 ((! FOLD)
8854 ? NREF
8855 : (ASCII_FOLD_RESTRICTED)
8856 ? NREFFA
8857 : (AT_LEAST_UNI_SEMANTICS)
8858 ? NREFFU
8859 : (LOC)
8860 ? NREFFL
8861 : NREFF),
8862 num);
8863 *flagp |= HASWIDTH;
8864
8865 Set_Node_Offset(ret, parse_start+1);
8866 Set_Node_Cur_Length(ret, parse_start);
8867
8868 nextchar(pRExC_state);
8869 return ret;
8870 }
8871 RExC_parse++;
8872 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8873 /*NOTREACHED*/
8874 case '<': /* (?<...) */
8875 if (*RExC_parse == '!')
8876 paren = ',';
8877 else if (*RExC_parse != '=')
8878 named_capture:
8879 { /* (?<...>) */
8880 char *name_start;
8881 SV *svname;
8882 paren= '>';
8883 case '\'': /* (?'...') */
8884 name_start= RExC_parse;
8885 svname = reg_scan_name(pRExC_state,
8886 SIZE_ONLY ? /* reverse test from the others */
8887 REG_RSN_RETURN_NAME :
8888 REG_RSN_RETURN_NULL);
8889 if (RExC_parse == name_start) {
8890 RExC_parse++;
8891 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8892 /*NOTREACHED*/
8893 }
8894 if (*RExC_parse != paren)
8895 vFAIL2("Sequence (?%c... not terminated",
8896 paren=='>' ? '<' : paren);
8897 if (SIZE_ONLY) {
8898 HE *he_str;
8899 SV *sv_dat = NULL;
8900 if (!svname) /* shouldn't happen */
8901 Perl_croak(aTHX_
8902 "panic: reg_scan_name returned NULL");
8903 if (!RExC_paren_names) {
8904 RExC_paren_names= newHV();
8905 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8906#ifdef DEBUGGING
8907 RExC_paren_name_list= newAV();
8908 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8909#endif
8910 }
8911 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8912 if ( he_str )
8913 sv_dat = HeVAL(he_str);
8914 if ( ! sv_dat ) {
8915 /* croak baby croak */
8916 Perl_croak(aTHX_
8917 "panic: paren_name hash element allocation failed");
8918 } else if ( SvPOK(sv_dat) ) {
8919 /* (?|...) can mean we have dupes so scan to check
8920 its already been stored. Maybe a flag indicating
8921 we are inside such a construct would be useful,
8922 but the arrays are likely to be quite small, so
8923 for now we punt -- dmq */
8924 IV count = SvIV(sv_dat);
8925 I32 *pv = (I32*)SvPVX(sv_dat);
8926 IV i;
8927 for ( i = 0 ; i < count ; i++ ) {
8928 if ( pv[i] == RExC_npar ) {
8929 count = 0;
8930 break;
8931 }
8932 }
8933 if ( count ) {
8934 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8935 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8936 pv[count] = RExC_npar;
8937 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8938 }
8939 } else {
8940 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8941 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8942 SvIOK_on(sv_dat);
8943 SvIV_set(sv_dat, 1);
8944 }
8945#ifdef DEBUGGING
8946 /* Yes this does cause a memory leak in debugging Perls */
8947 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8948 SvREFCNT_dec_NN(svname);
8949#endif
8950
8951 /*sv_dump(sv_dat);*/
8952 }
8953 nextchar(pRExC_state);
8954 paren = 1;
8955 goto capturing_parens;
8956 }
8957 RExC_seen |= REG_SEEN_LOOKBEHIND;
8958 RExC_in_lookbehind++;
8959 RExC_parse++;
8960 case '=': /* (?=...) */
8961 RExC_seen_zerolen++;
8962 break;
8963 case '!': /* (?!...) */
8964 RExC_seen_zerolen++;
8965 if (*RExC_parse == ')') {
8966 ret=reg_node(pRExC_state, OPFAIL);
8967 nextchar(pRExC_state);
8968 return ret;
8969 }
8970 break;
8971 case '|': /* (?|...) */
8972 /* branch reset, behave like a (?:...) except that
8973 buffers in alternations share the same numbers */
8974 paren = ':';
8975 after_freeze = freeze_paren = RExC_npar;
8976 break;
8977 case ':': /* (?:...) */
8978 case '>': /* (?>...) */
8979 break;
8980 case '$': /* (?$...) */
8981 case '@': /* (?@...) */
8982 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8983 break;
8984 case '#': /* (?#...) */
8985 /* XXX As soon as we disallow separating the '?' and '*' (by
8986 * spaces or (?#...) comment), it is believed that this case
8987 * will be unreachable and can be removed. See
8988 * [perl #117327] */
8989 while (*RExC_parse && *RExC_parse != ')')
8990 RExC_parse++;
8991 if (*RExC_parse != ')')
8992 FAIL("Sequence (?#... not terminated");
8993 nextchar(pRExC_state);
8994 *flagp = TRYAGAIN;
8995 return NULL;
8996 case '0' : /* (?0) */
8997 case 'R' : /* (?R) */
8998 if (*RExC_parse != ')')
8999 FAIL("Sequence (?R) not terminated");
9000 ret = reg_node(pRExC_state, GOSTART);
9001 *flagp |= POSTPONED;
9002 nextchar(pRExC_state);
9003 return ret;
9004 /*notreached*/
9005 { /* named and numeric backreferences */
9006 I32 num;
9007 case '&': /* (?&NAME) */
9008 parse_start = RExC_parse - 1;
9009 named_recursion:
9010 {
9011 SV *sv_dat = reg_scan_name(pRExC_state,
9012 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9013 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9014 }
9015 goto gen_recurse_regop;
9016 assert(0); /* NOT REACHED */
9017 case '+':
9018 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9019 RExC_parse++;
9020 vFAIL("Illegal pattern");
9021 }
9022 goto parse_recursion;
9023 /* NOT REACHED*/
9024 case '-': /* (?-1) */
9025 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9026 RExC_parse--; /* rewind to let it be handled later */
9027 goto parse_flags;
9028 }
9029 /*FALLTHROUGH */
9030 case '1': case '2': case '3': case '4': /* (?1) */
9031 case '5': case '6': case '7': case '8': case '9':
9032 RExC_parse--;
9033 parse_recursion:
9034 num = atoi(RExC_parse);
9035 parse_start = RExC_parse - 1; /* MJD */
9036 if (*RExC_parse == '-')
9037 RExC_parse++;
9038 while (isDIGIT(*RExC_parse))
9039 RExC_parse++;
9040 if (*RExC_parse!=')')
9041 vFAIL("Expecting close bracket");
9042
9043 gen_recurse_regop:
9044 if ( paren == '-' ) {
9045 /*
9046 Diagram of capture buffer numbering.
9047 Top line is the normal capture buffer numbers
9048 Bottom line is the negative indexing as from
9049 the X (the (?-2))
9050
9051 + 1 2 3 4 5 X 6 7
9052 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9053 - 5 4 3 2 1 X x x
9054
9055 */
9056 num = RExC_npar + num;
9057 if (num < 1) {
9058 RExC_parse++;
9059 vFAIL("Reference to nonexistent group");
9060 }
9061 } else if ( paren == '+' ) {
9062 num = RExC_npar + num - 1;
9063 }
9064
9065 ret = reganode(pRExC_state, GOSUB, num);
9066 if (!SIZE_ONLY) {
9067 if (num > (I32)RExC_rx->nparens) {
9068 RExC_parse++;
9069 vFAIL("Reference to nonexistent group");
9070 }
9071 ARG2L_SET( ret, RExC_recurse_count++);
9072 RExC_emit++;
9073 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9074 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
9075 } else {
9076 RExC_size++;
9077 }
9078 RExC_seen |= REG_SEEN_RECURSE;
9079 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9080 Set_Node_Offset(ret, parse_start); /* MJD */
9081
9082 *flagp |= POSTPONED;
9083 nextchar(pRExC_state);
9084 return ret;
9085 } /* named and numeric backreferences */
9086 assert(0); /* NOT REACHED */
9087
9088 case '?': /* (??...) */
9089 is_logical = 1;
9090 if (*RExC_parse != '{') {
9091 RExC_parse++;
9092 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9093 /*NOTREACHED*/
9094 }
9095 *flagp |= POSTPONED;
9096 paren = *RExC_parse++;
9097 /* FALL THROUGH */
9098 case '{': /* (?{...}) */
9099 {
9100 U32 n = 0;
9101 struct reg_code_block *cb;
9102
9103 RExC_seen_zerolen++;
9104
9105 if ( !pRExC_state->num_code_blocks
9106 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9107 || pRExC_state->code_blocks[pRExC_state->code_index].start
9108 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9109 - RExC_start)
9110 ) {
9111 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9112 FAIL("panic: Sequence (?{...}): no code block found\n");
9113 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9114 }
9115 /* this is a pre-compiled code block (?{...}) */
9116 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9117 RExC_parse = RExC_start + cb->end;
9118 if (!SIZE_ONLY) {
9119 OP *o = cb->block;
9120 if (cb->src_regex) {
9121 n = add_data(pRExC_state, 2, "rl");
9122 RExC_rxi->data->data[n] =
9123 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9124 RExC_rxi->data->data[n+1] = (void*)o;
9125 }
9126 else {
9127 n = add_data(pRExC_state, 1,
9128 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
9129 RExC_rxi->data->data[n] = (void*)o;
9130 }
9131 }
9132 pRExC_state->code_index++;
9133 nextchar(pRExC_state);
9134
9135 if (is_logical) {
9136 regnode *eval;
9137 ret = reg_node(pRExC_state, LOGICAL);
9138 eval = reganode(pRExC_state, EVAL, n);
9139 if (!SIZE_ONLY) {
9140 ret->flags = 2;
9141 /* for later propagation into (??{}) return value */
9142 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9143 }
9144 REGTAIL(pRExC_state, ret, eval);
9145 /* deal with the length of this later - MJD */
9146 return ret;
9147 }
9148 ret = reganode(pRExC_state, EVAL, n);
9149 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9150 Set_Node_Offset(ret, parse_start);
9151 return ret;
9152 }
9153 case '(': /* (?(?{...})...) and (?(?=...)...) */
9154 {
9155 int is_define= 0;
9156 if (RExC_parse[0] == '?') { /* (?(?...)) */
9157 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9158 || RExC_parse[1] == '<'
9159 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9160 I32 flag;
9161 regnode *tail;
9162
9163 ret = reg_node(pRExC_state, LOGICAL);
9164 if (!SIZE_ONLY)
9165 ret->flags = 1;
9166
9167 tail = reg(pRExC_state, 1, &flag, depth+1);
9168 if (flag & RESTART_UTF8) {
9169 *flagp = RESTART_UTF8;
9170 return NULL;
9171 }
9172 REGTAIL(pRExC_state, ret, tail);
9173 goto insert_if;
9174 }
9175 }
9176 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9177 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9178 {
9179 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9180 char *name_start= RExC_parse++;
9181 U32 num = 0;
9182 SV *sv_dat=reg_scan_name(pRExC_state,
9183 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9184 if (RExC_parse == name_start || *RExC_parse != ch)
9185 vFAIL2("Sequence (?(%c... not terminated",
9186 (ch == '>' ? '<' : ch));
9187 RExC_parse++;
9188 if (!SIZE_ONLY) {
9189 num = add_data( pRExC_state, 1, "S" );
9190 RExC_rxi->data->data[num]=(void*)sv_dat;
9191 SvREFCNT_inc_simple_void(sv_dat);
9192 }
9193 ret = reganode(pRExC_state,NGROUPP,num);
9194 goto insert_if_check_paren;
9195 }
9196 else if (RExC_parse[0] == 'D' &&
9197 RExC_parse[1] == 'E' &&
9198 RExC_parse[2] == 'F' &&
9199 RExC_parse[3] == 'I' &&
9200 RExC_parse[4] == 'N' &&
9201 RExC_parse[5] == 'E')
9202 {
9203 ret = reganode(pRExC_state,DEFINEP,0);
9204 RExC_parse +=6 ;
9205 is_define = 1;
9206 goto insert_if_check_paren;
9207 }
9208 else if (RExC_parse[0] == 'R') {
9209 RExC_parse++;
9210 parno = 0;
9211 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9212 parno = atoi(RExC_parse++);
9213 while (isDIGIT(*RExC_parse))
9214 RExC_parse++;
9215 } else if (RExC_parse[0] == '&') {
9216 SV *sv_dat;
9217 RExC_parse++;
9218 sv_dat = reg_scan_name(pRExC_state,
9219 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9220 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9221 }
9222 ret = reganode(pRExC_state,INSUBP,parno);
9223 goto insert_if_check_paren;
9224 }
9225 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9226 /* (?(1)...) */
9227 char c;
9228 parno = atoi(RExC_parse++);
9229
9230 while (isDIGIT(*RExC_parse))
9231 RExC_parse++;
9232 ret = reganode(pRExC_state, GROUPP, parno);
9233
9234 insert_if_check_paren:
9235 if ((c = *nextchar(pRExC_state)) != ')')
9236 vFAIL("Switch condition not recognized");
9237 insert_if:
9238 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9239 br = regbranch(pRExC_state, &flags, 1,depth+1);
9240 if (br == NULL) {
9241 if (flags & RESTART_UTF8) {
9242 *flagp = RESTART_UTF8;
9243 return NULL;
9244 }
9245 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9246 (UV) flags);
9247 } else
9248 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
9249 c = *nextchar(pRExC_state);
9250 if (flags&HASWIDTH)
9251 *flagp |= HASWIDTH;
9252 if (c == '|') {
9253 if (is_define)
9254 vFAIL("(?(DEFINE)....) does not allow branches");
9255 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
9256 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9257 if (flags & RESTART_UTF8) {
9258 *flagp = RESTART_UTF8;
9259 return NULL;
9260 }
9261 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9262 (UV) flags);
9263 }
9264 REGTAIL(pRExC_state, ret, lastbr);
9265 if (flags&HASWIDTH)
9266 *flagp |= HASWIDTH;
9267 c = *nextchar(pRExC_state);
9268 }
9269 else
9270 lastbr = NULL;
9271 if (c != ')')
9272 vFAIL("Switch (?(condition)... contains too many branches");
9273 ender = reg_node(pRExC_state, TAIL);
9274 REGTAIL(pRExC_state, br, ender);
9275 if (lastbr) {
9276 REGTAIL(pRExC_state, lastbr, ender);
9277 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
9278 }
9279 else
9280 REGTAIL(pRExC_state, ret, ender);
9281 RExC_size++; /* XXX WHY do we need this?!!
9282 For large programs it seems to be required
9283 but I can't figure out why. -- dmq*/
9284 return ret;
9285 }
9286 else {
9287 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
9288 }
9289 }
9290 case '[': /* (?[ ... ]) */
9291 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
9292 oregcomp_parse);
9293 case 0:
9294 RExC_parse--; /* for vFAIL to print correctly */
9295 vFAIL("Sequence (? incomplete");
9296 break;
9297 default: /* e.g., (?i) */
9298 --RExC_parse;
9299 parse_flags:
9300 parse_lparen_question_flags(pRExC_state);
9301 if (UCHARAT(RExC_parse) != ':') {
9302 nextchar(pRExC_state);
9303 *flagp = TRYAGAIN;
9304 return NULL;
9305 }
9306 paren = ':';
9307 nextchar(pRExC_state);
9308 ret = NULL;
9309 goto parse_rest;
9310 } /* end switch */
9311 }
9312 else { /* (...) */
9313 capturing_parens:
9314 parno = RExC_npar;
9315 RExC_npar++;
9316
9317 ret = reganode(pRExC_state, OPEN, parno);
9318 if (!SIZE_ONLY ){
9319 if (!RExC_nestroot)
9320 RExC_nestroot = parno;
9321 if (RExC_seen & REG_SEEN_RECURSE
9322 && !RExC_open_parens[parno-1])
9323 {
9324 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9325 "Setting open paren #%"IVdf" to %d\n",
9326 (IV)parno, REG_NODE_NUM(ret)));
9327 RExC_open_parens[parno-1]= ret;
9328 }
9329 }
9330 Set_Node_Length(ret, 1); /* MJD */
9331 Set_Node_Offset(ret, RExC_parse); /* MJD */
9332 is_open = 1;
9333 }
9334 }
9335 else /* ! paren */
9336 ret = NULL;
9337
9338 parse_rest:
9339 /* Pick up the branches, linking them together. */
9340 parse_start = RExC_parse; /* MJD */
9341 br = regbranch(pRExC_state, &flags, 1,depth+1);
9342
9343 /* branch_len = (paren != 0); */
9344
9345 if (br == NULL) {
9346 if (flags & RESTART_UTF8) {
9347 *flagp = RESTART_UTF8;
9348 return NULL;
9349 }
9350 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9351 }
9352 if (*RExC_parse == '|') {
9353 if (!SIZE_ONLY && RExC_extralen) {
9354 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9355 }
9356 else { /* MJD */
9357 reginsert(pRExC_state, BRANCH, br, depth+1);
9358 Set_Node_Length(br, paren != 0);
9359 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9360 }
9361 have_branch = 1;
9362 if (SIZE_ONLY)
9363 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9364 }
9365 else if (paren == ':') {
9366 *flagp |= flags&SIMPLE;
9367 }
9368 if (is_open) { /* Starts with OPEN. */
9369 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9370 }
9371 else if (paren != '?') /* Not Conditional */
9372 ret = br;
9373 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9374 lastbr = br;
9375 while (*RExC_parse == '|') {
9376 if (!SIZE_ONLY && RExC_extralen) {
9377 ender = reganode(pRExC_state, LONGJMP,0);
9378 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9379 }
9380 if (SIZE_ONLY)
9381 RExC_extralen += 2; /* Account for LONGJMP. */
9382 nextchar(pRExC_state);
9383 if (freeze_paren) {
9384 if (RExC_npar > after_freeze)
9385 after_freeze = RExC_npar;
9386 RExC_npar = freeze_paren;
9387 }
9388 br = regbranch(pRExC_state, &flags, 0, depth+1);
9389
9390 if (br == NULL) {
9391 if (flags & RESTART_UTF8) {
9392 *flagp = RESTART_UTF8;
9393 return NULL;
9394 }
9395 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
9396 }
9397 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9398 lastbr = br;
9399 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9400 }
9401
9402 if (have_branch || paren != ':') {
9403 /* Make a closing node, and hook it on the end. */
9404 switch (paren) {
9405 case ':':
9406 ender = reg_node(pRExC_state, TAIL);
9407 break;
9408 case 1: case 2:
9409 ender = reganode(pRExC_state, CLOSE, parno);
9410 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9411 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9412 "Setting close paren #%"IVdf" to %d\n",
9413 (IV)parno, REG_NODE_NUM(ender)));
9414 RExC_close_parens[parno-1]= ender;
9415 if (RExC_nestroot == parno)
9416 RExC_nestroot = 0;
9417 }
9418 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9419 Set_Node_Length(ender,1); /* MJD */
9420 break;
9421 case '<':
9422 case ',':
9423 case '=':
9424 case '!':
9425 *flagp &= ~HASWIDTH;
9426 /* FALL THROUGH */
9427 case '>':
9428 ender = reg_node(pRExC_state, SUCCEED);
9429 break;
9430 case 0:
9431 ender = reg_node(pRExC_state, END);
9432 if (!SIZE_ONLY) {
9433 assert(!RExC_opend); /* there can only be one! */
9434 RExC_opend = ender;
9435 }
9436 break;
9437 }
9438 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9439 SV * const mysv_val1=sv_newmortal();
9440 SV * const mysv_val2=sv_newmortal();
9441 DEBUG_PARSE_MSG("lsbr");
9442 regprop(RExC_rx, mysv_val1, lastbr);
9443 regprop(RExC_rx, mysv_val2, ender);
9444 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9445 SvPV_nolen_const(mysv_val1),
9446 (IV)REG_NODE_NUM(lastbr),
9447 SvPV_nolen_const(mysv_val2),
9448 (IV)REG_NODE_NUM(ender),
9449 (IV)(ender - lastbr)
9450 );
9451 });
9452 REGTAIL(pRExC_state, lastbr, ender);
9453
9454 if (have_branch && !SIZE_ONLY) {
9455 char is_nothing= 1;
9456 if (depth==1)
9457 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9458
9459 /* Hook the tails of the branches to the closing node. */
9460 for (br = ret; br; br = regnext(br)) {
9461 const U8 op = PL_regkind[OP(br)];
9462 if (op == BRANCH) {
9463 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9464 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9465 is_nothing= 0;
9466 }
9467 else if (op == BRANCHJ) {
9468 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9469 /* for now we always disable this optimisation * /
9470 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9471 */
9472 is_nothing= 0;
9473 }
9474 }
9475 if (is_nothing) {
9476 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9477 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9478 SV * const mysv_val1=sv_newmortal();
9479 SV * const mysv_val2=sv_newmortal();
9480 DEBUG_PARSE_MSG("NADA");
9481 regprop(RExC_rx, mysv_val1, ret);
9482 regprop(RExC_rx, mysv_val2, ender);
9483 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9484 SvPV_nolen_const(mysv_val1),
9485 (IV)REG_NODE_NUM(ret),
9486 SvPV_nolen_const(mysv_val2),
9487 (IV)REG_NODE_NUM(ender),
9488 (IV)(ender - ret)
9489 );
9490 });
9491 OP(br)= NOTHING;
9492 if (OP(ender) == TAIL) {
9493 NEXT_OFF(br)= 0;
9494 RExC_emit= br + 1;
9495 } else {
9496 regnode *opt;
9497 for ( opt= br + 1; opt < ender ; opt++ )
9498 OP(opt)= OPTIMIZED;
9499 NEXT_OFF(br)= ender - br;
9500 }
9501 }
9502 }
9503 }
9504
9505 {
9506 const char *p;
9507 static const char parens[] = "=!<,>";
9508
9509 if (paren && (p = strchr(parens, paren))) {
9510 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9511 int flag = (p - parens) > 1;
9512
9513 if (paren == '>')
9514 node = SUSPEND, flag = 0;
9515 reginsert(pRExC_state, node,ret, depth+1);
9516 Set_Node_Cur_Length(ret, parse_start);
9517 Set_Node_Offset(ret, parse_start + 1);
9518 ret->flags = flag;
9519 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9520 }
9521 }
9522
9523 /* Check for proper termination. */
9524 if (paren) {
9525 /* restore original flags, but keep (?p) */
9526 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
9527 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9528 RExC_parse = oregcomp_parse;
9529 vFAIL("Unmatched (");
9530 }
9531 }
9532 else if (!paren && RExC_parse < RExC_end) {
9533 if (*RExC_parse == ')') {
9534 RExC_parse++;
9535 vFAIL("Unmatched )");
9536 }
9537 else
9538 FAIL("Junk on end of regexp"); /* "Can't happen". */
9539 assert(0); /* NOTREACHED */
9540 }
9541
9542 if (RExC_in_lookbehind) {
9543 RExC_in_lookbehind--;
9544 }
9545 if (after_freeze > RExC_npar)
9546 RExC_npar = after_freeze;
9547 return(ret);
9548}
9549
9550/*
9551 - regbranch - one alternative of an | operator
9552 *
9553 * Implements the concatenation operator.
9554 *
9555 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9556 * restarted.
9557 */
9558STATIC regnode *
9559S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9560{
9561 dVAR;
9562 regnode *ret;
9563 regnode *chain = NULL;
9564 regnode *latest;
9565 I32 flags = 0, c = 0;
9566 GET_RE_DEBUG_FLAGS_DECL;
9567
9568 PERL_ARGS_ASSERT_REGBRANCH;
9569
9570 DEBUG_PARSE("brnc");
9571
9572 if (first)
9573 ret = NULL;
9574 else {
9575 if (!SIZE_ONLY && RExC_extralen)
9576 ret = reganode(pRExC_state, BRANCHJ,0);
9577 else {
9578 ret = reg_node(pRExC_state, BRANCH);
9579 Set_Node_Length(ret, 1);
9580 }
9581 }
9582
9583 if (!first && SIZE_ONLY)
9584 RExC_extralen += 1; /* BRANCHJ */
9585
9586 *flagp = WORST; /* Tentatively. */
9587
9588 RExC_parse--;
9589 nextchar(pRExC_state);
9590 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9591 flags &= ~TRYAGAIN;
9592 latest = regpiece(pRExC_state, &flags,depth+1);
9593 if (latest == NULL) {
9594 if (flags & TRYAGAIN)
9595 continue;
9596 if (flags & RESTART_UTF8) {
9597 *flagp = RESTART_UTF8;
9598 return NULL;
9599 }
9600 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
9601 }
9602 else if (ret == NULL)
9603 ret = latest;
9604 *flagp |= flags&(HASWIDTH|POSTPONED);
9605 if (chain == NULL) /* First piece. */
9606 *flagp |= flags&SPSTART;
9607 else {
9608 RExC_naughty++;
9609 REGTAIL(pRExC_state, chain, latest);
9610 }
9611 chain = latest;
9612 c++;
9613 }
9614 if (chain == NULL) { /* Loop ran zero times. */
9615 chain = reg_node(pRExC_state, NOTHING);
9616 if (ret == NULL)
9617 ret = chain;
9618 }
9619 if (c == 1) {
9620 *flagp |= flags&SIMPLE;
9621 }
9622
9623 return ret;
9624}
9625
9626/*
9627 - regpiece - something followed by possible [*+?]
9628 *
9629 * Note that the branching code sequences used for ? and the general cases
9630 * of * and + are somewhat optimized: they use the same NOTHING node as
9631 * both the endmarker for their branch list and the body of the last branch.
9632 * It might seem that this node could be dispensed with entirely, but the
9633 * endmarker role is not redundant.
9634 *
9635 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
9636 * TRYAGAIN.
9637 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
9638 * restarted.
9639 */
9640STATIC regnode *
9641S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9642{
9643 dVAR;
9644 regnode *ret;
9645 char op;
9646 char *next;
9647 I32 flags;
9648 const char * const origparse = RExC_parse;
9649 I32 min;
9650 I32 max = REG_INFTY;
9651#ifdef RE_TRACK_PATTERN_OFFSETS
9652 char *parse_start;
9653#endif
9654 const char *maxpos = NULL;
9655
9656 /* Save the original in case we change the emitted regop to a FAIL. */
9657 regnode * const orig_emit = RExC_emit;
9658
9659 GET_RE_DEBUG_FLAGS_DECL;
9660
9661 PERL_ARGS_ASSERT_REGPIECE;
9662
9663 DEBUG_PARSE("piec");
9664
9665 ret = regatom(pRExC_state, &flags,depth+1);
9666 if (ret == NULL) {
9667 if (flags & (TRYAGAIN|RESTART_UTF8))
9668 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
9669 else
9670 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
9671 return(NULL);
9672 }
9673
9674 op = *RExC_parse;
9675
9676 if (op == '{' && regcurly(RExC_parse, FALSE)) {
9677 maxpos = NULL;
9678#ifdef RE_TRACK_PATTERN_OFFSETS
9679 parse_start = RExC_parse; /* MJD */
9680#endif
9681 next = RExC_parse + 1;
9682 while (isDIGIT(*next) || *next == ',') {
9683 if (*next == ',') {
9684 if (maxpos)
9685 break;
9686 else
9687 maxpos = next;
9688 }
9689 next++;
9690 }
9691 if (*next == '}') { /* got one */
9692 if (!maxpos)
9693 maxpos = next;
9694 RExC_parse++;
9695 min = atoi(RExC_parse);
9696 if (*maxpos == ',')
9697 maxpos++;
9698 else
9699 maxpos = RExC_parse;
9700 max = atoi(maxpos);
9701 if (!max && *maxpos != '0')
9702 max = REG_INFTY; /* meaning "infinity" */
9703 else if (max >= REG_INFTY)
9704 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9705 RExC_parse = next;
9706 nextchar(pRExC_state);
9707 if (max < min) { /* If can't match, warn and optimize to fail
9708 unconditionally */
9709 if (SIZE_ONLY) {
9710 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9711
9712 /* We can't back off the size because we have to reserve
9713 * enough space for all the things we are about to throw
9714 * away, but we can shrink it by the ammount we are about
9715 * to re-use here */
9716 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9717 }
9718 else {
9719 RExC_emit = orig_emit;
9720 }
9721 ret = reg_node(pRExC_state, OPFAIL);
9722 return ret;
9723 }
9724
9725 do_curly:
9726 if ((flags&SIMPLE)) {
9727 RExC_naughty += 2 + RExC_naughty / 2;
9728 reginsert(pRExC_state, CURLY, ret, depth+1);
9729 Set_Node_Offset(ret, parse_start+1); /* MJD */
9730 Set_Node_Cur_Length(ret, parse_start);
9731 }
9732 else {
9733 regnode * const w = reg_node(pRExC_state, WHILEM);
9734
9735 w->flags = 0;
9736 REGTAIL(pRExC_state, ret, w);
9737 if (!SIZE_ONLY && RExC_extralen) {
9738 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9739 reginsert(pRExC_state, NOTHING,ret, depth+1);
9740 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9741 }
9742 reginsert(pRExC_state, CURLYX,ret, depth+1);
9743 /* MJD hk */
9744 Set_Node_Offset(ret, parse_start+1);
9745 Set_Node_Length(ret,
9746 op == '{' ? (RExC_parse - parse_start) : 1);
9747
9748 if (!SIZE_ONLY && RExC_extralen)
9749 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9750 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9751 if (SIZE_ONLY)
9752 RExC_whilem_seen++, RExC_extralen += 3;
9753 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9754 }
9755 ret->flags = 0;
9756
9757 if (min > 0)
9758 *flagp = WORST;
9759 if (max > 0)
9760 *flagp |= HASWIDTH;
9761 if (!SIZE_ONLY) {
9762 ARG1_SET(ret, (U16)min);
9763 ARG2_SET(ret, (U16)max);
9764 }
9765
9766 goto nest_check;
9767 }
9768 }
9769
9770 if (!ISMULT1(op)) {
9771 *flagp = flags;
9772 return(ret);
9773 }
9774
9775#if 0 /* Now runtime fix should be reliable. */
9776
9777 /* if this is reinstated, don't forget to put this back into perldiag:
9778
9779 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9780
9781 (F) The part of the regexp subject to either the * or + quantifier
9782 could match an empty string. The {#} shows in the regular
9783 expression about where the problem was discovered.
9784
9785 */
9786
9787 if (!(flags&HASWIDTH) && op != '?')
9788 vFAIL("Regexp *+ operand could be empty");
9789#endif
9790
9791#ifdef RE_TRACK_PATTERN_OFFSETS
9792 parse_start = RExC_parse;
9793#endif
9794 nextchar(pRExC_state);
9795
9796 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9797
9798 if (op == '*' && (flags&SIMPLE)) {
9799 reginsert(pRExC_state, STAR, ret, depth+1);
9800 ret->flags = 0;
9801 RExC_naughty += 4;
9802 }
9803 else if (op == '*') {
9804 min = 0;
9805 goto do_curly;
9806 }
9807 else if (op == '+' && (flags&SIMPLE)) {
9808 reginsert(pRExC_state, PLUS, ret, depth+1);
9809 ret->flags = 0;
9810 RExC_naughty += 3;
9811 }
9812 else if (op == '+') {
9813 min = 1;
9814 goto do_curly;
9815 }
9816 else if (op == '?') {
9817 min = 0; max = 1;
9818 goto do_curly;
9819 }
9820 nest_check:
9821 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9822 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
9823 ckWARN3reg(RExC_parse,
9824 "%.*s matches null string many times",
9825 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9826 origparse);
9827 (void)ReREFCNT_inc(RExC_rx_sv);
9828 }
9829
9830 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9831 nextchar(pRExC_state);
9832 reginsert(pRExC_state, MINMOD, ret, depth+1);
9833 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9834 }
9835 else
9836 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9837 regnode *ender;
9838 nextchar(pRExC_state);
9839 ender = reg_node(pRExC_state, SUCCEED);
9840 REGTAIL(pRExC_state, ret, ender);
9841 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9842 ret->flags = 0;
9843 ender = reg_node(pRExC_state, TAIL);
9844 REGTAIL(pRExC_state, ret, ender);
9845 }
9846
9847 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9848 RExC_parse++;
9849 vFAIL("Nested quantifiers");
9850 }
9851
9852 return(ret);
9853}
9854
9855STATIC bool
9856S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
9857 const bool strict /* Apply stricter parsing rules? */
9858 )
9859{
9860
9861 /* This is expected to be called by a parser routine that has recognized '\N'
9862 and needs to handle the rest. RExC_parse is expected to point at the first
9863 char following the N at the time of the call. On successful return,
9864 RExC_parse has been updated to point to just after the sequence identified
9865 by this routine, and <*flagp> has been updated.
9866
9867 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9868 character class.
9869
9870 \N may begin either a named sequence, or if outside a character class, mean
9871 to match a non-newline. For non single-quoted regexes, the tokenizer has
9872 attempted to decide which, and in the case of a named sequence, converted it
9873 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9874 where c1... are the characters in the sequence. For single-quoted regexes,
9875 the tokenizer passes the \N sequence through unchanged; this code will not
9876 attempt to determine this nor expand those, instead raising a syntax error.
9877 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9878 or there is no '}', it signals that this \N occurrence means to match a
9879 non-newline.
9880
9881 Only the \N{U+...} form should occur in a character class, for the same
9882 reason that '.' inside a character class means to just match a period: it
9883 just doesn't make sense.
9884
9885 The function raises an error (via vFAIL), and doesn't return for various
9886 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9887 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
9888 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
9889 only possible if node_p is non-NULL.
9890
9891
9892 If <valuep> is non-null, it means the caller can accept an input sequence
9893 consisting of a just a single code point; <*valuep> is set to that value
9894 if the input is such.
9895
9896 If <node_p> is non-null it signifies that the caller can accept any other
9897 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9898 is set as follows:
9899 1) \N means not-a-NL: points to a newly created REG_ANY node;
9900 2) \N{}: points to a new NOTHING node;
9901 3) otherwise: points to a new EXACT node containing the resolved
9902 string.
9903 Note that FALSE is returned for single code point sequences if <valuep> is
9904 null.
9905 */
9906
9907 char * endbrace; /* '}' following the name */
9908 char* p;
9909 char *endchar; /* Points to '.' or '}' ending cur char in the input
9910 stream */
9911 bool has_multiple_chars; /* true if the input stream contains a sequence of
9912 more than one character */
9913
9914 GET_RE_DEBUG_FLAGS_DECL;
9915
9916 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9917
9918 GET_RE_DEBUG_FLAGS;
9919
9920 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9921
9922 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9923 * modifier. The other meaning does not */
9924 p = (RExC_flags & RXf_PMf_EXTENDED)
9925 ? regwhite( pRExC_state, RExC_parse )
9926 : RExC_parse;
9927
9928 /* Disambiguate between \N meaning a named character versus \N meaning
9929 * [^\n]. The former is assumed when it can't be the latter. */
9930 if (*p != '{' || regcurly(p, FALSE)) {
9931 RExC_parse = p;
9932 if (! node_p) {
9933 /* no bare \N in a charclass */
9934 if (in_char_class) {
9935 vFAIL("\\N in a character class must be a named character: \\N{...}");
9936 }
9937 return FALSE;
9938 }
9939 nextchar(pRExC_state);
9940 *node_p = reg_node(pRExC_state, REG_ANY);
9941 *flagp |= HASWIDTH|SIMPLE;
9942 RExC_naughty++;
9943 RExC_parse--;
9944 Set_Node_Length(*node_p, 1); /* MJD */
9945 return TRUE;
9946 }
9947
9948 /* Here, we have decided it should be a named character or sequence */
9949
9950 /* The test above made sure that the next real character is a '{', but
9951 * under the /x modifier, it could be separated by space (or a comment and
9952 * \n) and this is not allowed (for consistency with \x{...} and the
9953 * tokenizer handling of \N{NAME}). */
9954 if (*RExC_parse != '{') {
9955 vFAIL("Missing braces on \\N{}");
9956 }
9957
9958 RExC_parse++; /* Skip past the '{' */
9959
9960 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9961 || ! (endbrace == RExC_parse /* nothing between the {} */
9962 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9963 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9964 {
9965 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9966 vFAIL("\\N{NAME} must be resolved by the lexer");
9967 }
9968
9969 if (endbrace == RExC_parse) { /* empty: \N{} */
9970 bool ret = TRUE;
9971 if (node_p) {
9972 *node_p = reg_node(pRExC_state,NOTHING);
9973 }
9974 else if (in_char_class) {
9975 if (SIZE_ONLY && in_char_class) {
9976 if (strict) {
9977 RExC_parse++; /* Position after the "}" */
9978 vFAIL("Zero length \\N{}");
9979 }
9980 else {
9981 ckWARNreg(RExC_parse,
9982 "Ignoring zero length \\N{} in character class");
9983 }
9984 }
9985 ret = FALSE;
9986 }
9987 else {
9988 return FALSE;
9989 }
9990 nextchar(pRExC_state);
9991 return ret;
9992 }
9993
9994 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9995 RExC_parse += 2; /* Skip past the 'U+' */
9996
9997 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9998
9999 /* Code points are separated by dots. If none, there is only one code
10000 * point, and is terminated by the brace */
10001 has_multiple_chars = (endchar < endbrace);
10002
10003 if (valuep && (! has_multiple_chars || in_char_class)) {
10004 /* We only pay attention to the first char of
10005 multichar strings being returned in char classes. I kinda wonder
10006 if this makes sense as it does change the behaviour
10007 from earlier versions, OTOH that behaviour was broken
10008 as well. XXX Solution is to recharacterize as
10009 [rest-of-class]|multi1|multi2... */
10010
10011 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10012 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10013 | PERL_SCAN_DISALLOW_PREFIX
10014 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10015
10016 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10017
10018 /* The tokenizer should have guaranteed validity, but it's possible to
10019 * bypass it by using single quoting, so check */
10020 if (length_of_hex == 0
10021 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10022 {
10023 RExC_parse += length_of_hex; /* Includes all the valid */
10024 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10025 ? UTF8SKIP(RExC_parse)
10026 : 1;
10027 /* Guard against malformed utf8 */
10028 if (RExC_parse >= endchar) {
10029 RExC_parse = endchar;
10030 }
10031 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10032 }
10033
10034 if (in_char_class && has_multiple_chars) {
10035 if (strict) {
10036 RExC_parse = endbrace;
10037 vFAIL("\\N{} in character class restricted to one character");
10038 }
10039 else {
10040 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10041 }
10042 }
10043
10044 RExC_parse = endbrace + 1;
10045 }
10046 else if (! node_p || ! has_multiple_chars) {
10047
10048 /* Here, the input is legal, but not according to the caller's
10049 * options. We fail without advancing the parse, so that the
10050 * caller can try again */
10051 RExC_parse = p;
10052 return FALSE;
10053 }
10054 else {
10055
10056 /* What is done here is to convert this to a sub-pattern of the form
10057 * (?:\x{char1}\x{char2}...)
10058 * and then call reg recursively. That way, it retains its atomicness,
10059 * while not having to worry about special handling that some code
10060 * points may have. toke.c has converted the original Unicode values
10061 * to native, so that we can just pass on the hex values unchanged. We
10062 * do have to set a flag to keep recoding from happening in the
10063 * recursion */
10064
10065 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10066 STRLEN len;
10067 char *orig_end = RExC_end;
10068 I32 flags;
10069
10070 while (RExC_parse < endbrace) {
10071
10072 /* Convert to notation the rest of the code understands */
10073 sv_catpv(substitute_parse, "\\x{");
10074 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10075 sv_catpv(substitute_parse, "}");
10076
10077 /* Point to the beginning of the next character in the sequence. */
10078 RExC_parse = endchar + 1;
10079 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10080 }
10081 sv_catpv(substitute_parse, ")");
10082
10083 RExC_parse = SvPV(substitute_parse, len);
10084
10085 /* Don't allow empty number */
10086 if (len < 8) {
10087 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10088 }
10089 RExC_end = RExC_parse + len;
10090
10091 /* The values are Unicode, and therefore not subject to recoding */
10092 RExC_override_recoding = 1;
10093
10094 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10095 if (flags & RESTART_UTF8) {
10096 *flagp = RESTART_UTF8;
10097 return FALSE;
10098 }
10099 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10100 (UV) flags);
10101 }
10102 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10103
10104 RExC_parse = endbrace;
10105 RExC_end = orig_end;
10106 RExC_override_recoding = 0;
10107
10108 nextchar(pRExC_state);
10109 }
10110
10111 return TRUE;
10112}
10113
10114
10115/*
10116 * reg_recode
10117 *
10118 * It returns the code point in utf8 for the value in *encp.
10119 * value: a code value in the source encoding
10120 * encp: a pointer to an Encode object
10121 *
10122 * If the result from Encode is not a single character,
10123 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10124 */
10125STATIC UV
10126S_reg_recode(pTHX_ const char value, SV **encp)
10127{
10128 STRLEN numlen = 1;
10129 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10130 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10131 const STRLEN newlen = SvCUR(sv);
10132 UV uv = UNICODE_REPLACEMENT;
10133
10134 PERL_ARGS_ASSERT_REG_RECODE;
10135
10136 if (newlen)
10137 uv = SvUTF8(sv)
10138 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10139 : *(U8*)s;
10140
10141 if (!newlen || numlen != newlen) {
10142 uv = UNICODE_REPLACEMENT;
10143 *encp = NULL;
10144 }
10145 return uv;
10146}
10147
10148PERL_STATIC_INLINE U8
10149S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10150{
10151 U8 op;
10152
10153 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10154
10155 if (! FOLD) {
10156 return EXACT;
10157 }
10158
10159 op = get_regex_charset(RExC_flags);
10160 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10161 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10162 been, so there is no hole */
10163 }
10164
10165 return op + EXACTF;
10166}
10167
10168PERL_STATIC_INLINE void
10169S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
10170{
10171 /* This knows the details about sizing an EXACTish node, setting flags for
10172 * it (by setting <*flagp>, and potentially populating it with a single
10173 * character.
10174 *
10175 * If <len> (the length in bytes) is non-zero, this function assumes that
10176 * the node has already been populated, and just does the sizing. In this
10177 * case <code_point> should be the final code point that has already been
10178 * placed into the node. This value will be ignored except that under some
10179 * circumstances <*flagp> is set based on it.
10180 *
10181 * If <len> is zero, the function assumes that the node is to contain only
10182 * the single character given by <code_point> and calculates what <len>
10183 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10184 * additionally will populate the node's STRING with <code_point>, if <len>
10185 * is 0. In both cases <*flagp> is appropriately set
10186 *
10187 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10188 * 255, must be folded (the former only when the rules indicate it can
10189 * match 'ss') */
10190
10191 bool len_passed_in = cBOOL(len != 0);
10192 U8 character[UTF8_MAXBYTES_CASE+1];
10193
10194 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10195
10196 if (! len_passed_in) {
10197 if (UTF) {
10198 if (FOLD && (! LOC || code_point > 255)) {
10199 _to_uni_fold_flags(code_point,
10200 character,
10201 &len,
10202 FOLD_FLAGS_FULL | ((LOC)
10203 ? FOLD_FLAGS_LOCALE
10204 : (ASCII_FOLD_RESTRICTED)
10205 ? FOLD_FLAGS_NOMIX_ASCII
10206 : 0));
10207 }
10208 else {
10209 uvchr_to_utf8( character, code_point);
10210 len = UTF8SKIP(character);
10211 }
10212 }
10213 else if (! FOLD
10214 || code_point != LATIN_SMALL_LETTER_SHARP_S
10215 || ASCII_FOLD_RESTRICTED
10216 || ! AT_LEAST_UNI_SEMANTICS)
10217 {
10218 *character = (U8) code_point;
10219 len = 1;
10220 }
10221 else {
10222 *character = 's';
10223 *(character + 1) = 's';
10224 len = 2;
10225 }
10226 }
10227
10228 if (SIZE_ONLY) {
10229 RExC_size += STR_SZ(len);
10230 }
10231 else {
10232 RExC_emit += STR_SZ(len);
10233 STR_LEN(node) = len;
10234 if (! len_passed_in) {
10235 Copy((char *) character, STRING(node), len, char);
10236 }
10237 }
10238
10239 *flagp |= HASWIDTH;
10240
10241 /* A single character node is SIMPLE, except for the special-cased SHARP S
10242 * under /di. */
10243 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
10244 && (code_point != LATIN_SMALL_LETTER_SHARP_S
10245 || ! FOLD || ! DEPENDS_SEMANTICS))
10246 {
10247 *flagp |= SIMPLE;
10248 }
10249}
10250
10251/*
10252 - regatom - the lowest level
10253
10254 Try to identify anything special at the start of the pattern. If there
10255 is, then handle it as required. This may involve generating a single regop,
10256 such as for an assertion; or it may involve recursing, such as to
10257 handle a () structure.
10258
10259 If the string doesn't start with something special then we gobble up
10260 as much literal text as we can.
10261
10262 Once we have been able to handle whatever type of thing started the
10263 sequence, we return.
10264
10265 Note: we have to be careful with escapes, as they can be both literal
10266 and special, and in the case of \10 and friends, context determines which.
10267
10268 A summary of the code structure is:
10269
10270 switch (first_byte) {
10271 cases for each special:
10272 handle this special;
10273 break;
10274 case '\\':
10275 switch (2nd byte) {
10276 cases for each unambiguous special:
10277 handle this special;
10278 break;
10279 cases for each ambigous special/literal:
10280 disambiguate;
10281 if (special) handle here
10282 else goto defchar;
10283 default: // unambiguously literal:
10284 goto defchar;
10285 }
10286 default: // is a literal char
10287 // FALL THROUGH
10288 defchar:
10289 create EXACTish node for literal;
10290 while (more input and node isn't full) {
10291 switch (input_byte) {
10292 cases for each special;
10293 make sure parse pointer is set so that the next call to
10294 regatom will see this special first
10295 goto loopdone; // EXACTish node terminated by prev. char
10296 default:
10297 append char to EXACTISH node;
10298 }
10299 get next input byte;
10300 }
10301 loopdone:
10302 }
10303 return the generated node;
10304
10305 Specifically there are two separate switches for handling
10306 escape sequences, with the one for handling literal escapes requiring
10307 a dummy entry for all of the special escapes that are actually handled
10308 by the other.
10309
10310 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
10311 TRYAGAIN.
10312 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10313 restarted.
10314 Otherwise does not return NULL.
10315*/
10316
10317STATIC regnode *
10318S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10319{
10320 dVAR;
10321 regnode *ret = NULL;
10322 I32 flags = 0;
10323 char *parse_start = RExC_parse;
10324 U8 op;
10325 int invert = 0;
10326
10327 GET_RE_DEBUG_FLAGS_DECL;
10328
10329 *flagp = WORST; /* Tentatively. */
10330
10331 DEBUG_PARSE("atom");
10332
10333 PERL_ARGS_ASSERT_REGATOM;
10334
10335tryagain:
10336 switch ((U8)*RExC_parse) {
10337 case '^':
10338 RExC_seen_zerolen++;
10339 nextchar(pRExC_state);
10340 if (RExC_flags & RXf_PMf_MULTILINE)
10341 ret = reg_node(pRExC_state, MBOL);
10342 else if (RExC_flags & RXf_PMf_SINGLELINE)
10343 ret = reg_node(pRExC_state, SBOL);
10344 else
10345 ret = reg_node(pRExC_state, BOL);
10346 Set_Node_Length(ret, 1); /* MJD */
10347 break;
10348 case '$':
10349 nextchar(pRExC_state);
10350 if (*RExC_parse)
10351 RExC_seen_zerolen++;
10352 if (RExC_flags & RXf_PMf_MULTILINE)
10353 ret = reg_node(pRExC_state, MEOL);
10354 else if (RExC_flags & RXf_PMf_SINGLELINE)
10355 ret = reg_node(pRExC_state, SEOL);
10356 else
10357 ret = reg_node(pRExC_state, EOL);
10358 Set_Node_Length(ret, 1); /* MJD */
10359 break;
10360 case '.':
10361 nextchar(pRExC_state);
10362 if (RExC_flags & RXf_PMf_SINGLELINE)
10363 ret = reg_node(pRExC_state, SANY);
10364 else
10365 ret = reg_node(pRExC_state, REG_ANY);
10366 *flagp |= HASWIDTH|SIMPLE;
10367 RExC_naughty++;
10368 Set_Node_Length(ret, 1); /* MJD */
10369 break;
10370 case '[':
10371 {
10372 char * const oregcomp_parse = ++RExC_parse;
10373 ret = regclass(pRExC_state, flagp,depth+1,
10374 FALSE, /* means parse the whole char class */
10375 TRUE, /* allow multi-char folds */
10376 FALSE, /* don't silence non-portable warnings. */
10377 NULL);
10378 if (*RExC_parse != ']') {
10379 RExC_parse = oregcomp_parse;
10380 vFAIL("Unmatched [");
10381 }
10382 if (ret == NULL) {
10383 if (*flagp & RESTART_UTF8)
10384 return NULL;
10385 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10386 (UV) *flagp);
10387 }
10388 nextchar(pRExC_state);
10389 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10390 break;
10391 }
10392 case '(':
10393 nextchar(pRExC_state);
10394 ret = reg(pRExC_state, 2, &flags,depth+1);
10395 if (ret == NULL) {
10396 if (flags & TRYAGAIN) {
10397 if (RExC_parse == RExC_end) {
10398 /* Make parent create an empty node if needed. */
10399 *flagp |= TRYAGAIN;
10400 return(NULL);
10401 }
10402 goto tryagain;
10403 }
10404 if (flags & RESTART_UTF8) {
10405 *flagp = RESTART_UTF8;
10406 return NULL;
10407 }
10408 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", (UV) flags);
10409 }
10410 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10411 break;
10412 case '|':
10413 case ')':
10414 if (flags & TRYAGAIN) {
10415 *flagp |= TRYAGAIN;
10416 return NULL;
10417 }
10418 vFAIL("Internal urp");
10419 /* Supposed to be caught earlier. */
10420 break;
10421 case '{':
10422 if (!regcurly(RExC_parse, FALSE)) {
10423 RExC_parse++;
10424 goto defchar;
10425 }
10426 /* FALL THROUGH */
10427 case '?':
10428 case '+':
10429 case '*':
10430 RExC_parse++;
10431 vFAIL("Quantifier follows nothing");
10432 break;
10433 case '\\':
10434 /* Special Escapes
10435
10436 This switch handles escape sequences that resolve to some kind
10437 of special regop and not to literal text. Escape sequnces that
10438 resolve to literal text are handled below in the switch marked
10439 "Literal Escapes".
10440
10441 Every entry in this switch *must* have a corresponding entry
10442 in the literal escape switch. However, the opposite is not
10443 required, as the default for this switch is to jump to the
10444 literal text handling code.
10445 */
10446 switch ((U8)*++RExC_parse) {
10447 U8 arg;
10448 /* Special Escapes */
10449 case 'A':
10450 RExC_seen_zerolen++;
10451 ret = reg_node(pRExC_state, SBOL);
10452 *flagp |= SIMPLE;
10453 goto finish_meta_pat;
10454 case 'G':
10455 ret = reg_node(pRExC_state, GPOS);
10456 RExC_seen |= REG_SEEN_GPOS;
10457 *flagp |= SIMPLE;
10458 goto finish_meta_pat;
10459 case 'K':
10460 RExC_seen_zerolen++;
10461 ret = reg_node(pRExC_state, KEEPS);
10462 *flagp |= SIMPLE;
10463 /* XXX:dmq : disabling in-place substitution seems to
10464 * be necessary here to avoid cases of memory corruption, as
10465 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10466 */
10467 RExC_seen |= REG_SEEN_LOOKBEHIND;
10468 goto finish_meta_pat;
10469 case 'Z':
10470 ret = reg_node(pRExC_state, SEOL);
10471 *flagp |= SIMPLE;
10472 RExC_seen_zerolen++; /* Do not optimize RE away */
10473 goto finish_meta_pat;
10474 case 'z':
10475 ret = reg_node(pRExC_state, EOS);
10476 *flagp |= SIMPLE;
10477 RExC_seen_zerolen++; /* Do not optimize RE away */
10478 goto finish_meta_pat;
10479 case 'C':
10480 ret = reg_node(pRExC_state, CANY);
10481 RExC_seen |= REG_SEEN_CANY;
10482 *flagp |= HASWIDTH|SIMPLE;
10483 goto finish_meta_pat;
10484 case 'X':
10485 ret = reg_node(pRExC_state, CLUMP);
10486 *flagp |= HASWIDTH;
10487 goto finish_meta_pat;
10488
10489 case 'W':
10490 invert = 1;
10491 /* FALLTHROUGH */
10492 case 'w':
10493 arg = ANYOF_WORDCHAR;
10494 goto join_posix;
10495
10496 case 'b':
10497 RExC_seen_zerolen++;
10498 RExC_seen |= REG_SEEN_LOOKBEHIND;
10499 op = BOUND + get_regex_charset(RExC_flags);
10500 if (op > BOUNDA) { /* /aa is same as /a */
10501 op = BOUNDA;
10502 }
10503 ret = reg_node(pRExC_state, op);
10504 FLAGS(ret) = get_regex_charset(RExC_flags);
10505 *flagp |= SIMPLE;
10506 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10507 ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
10508 }
10509 goto finish_meta_pat;
10510 case 'B':
10511 RExC_seen_zerolen++;
10512 RExC_seen |= REG_SEEN_LOOKBEHIND;
10513 op = NBOUND + get_regex_charset(RExC_flags);
10514 if (op > NBOUNDA) { /* /aa is same as /a */
10515 op = NBOUNDA;
10516 }
10517 ret = reg_node(pRExC_state, op);
10518 FLAGS(ret) = get_regex_charset(RExC_flags);
10519 *flagp |= SIMPLE;
10520 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
10521 ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
10522 }
10523 goto finish_meta_pat;
10524
10525 case 'D':
10526 invert = 1;
10527 /* FALLTHROUGH */
10528 case 'd':
10529 arg = ANYOF_DIGIT;
10530 goto join_posix;
10531
10532 case 'R':
10533 ret = reg_node(pRExC_state, LNBREAK);
10534 *flagp |= HASWIDTH|SIMPLE;
10535 goto finish_meta_pat;
10536
10537 case 'H':
10538 invert = 1;
10539 /* FALLTHROUGH */
10540 case 'h':
10541 arg = ANYOF_BLANK;
10542 op = POSIXU;
10543 goto join_posix_op_known;
10544
10545 case 'V':
10546 invert = 1;
10547 /* FALLTHROUGH */
10548 case 'v':
10549 arg = ANYOF_VERTWS;
10550 op = POSIXU;
10551 goto join_posix_op_known;
10552
10553 case 'S':
10554 invert = 1;
10555 /* FALLTHROUGH */
10556 case 's':
10557 arg = ANYOF_SPACE;
10558
10559 join_posix:
10560
10561 op = POSIXD + get_regex_charset(RExC_flags);
10562 if (op > POSIXA) { /* /aa is same as /a */
10563 op = POSIXA;
10564 }
10565
10566 join_posix_op_known:
10567
10568 if (invert) {
10569 op += NPOSIXD - POSIXD;
10570 }
10571
10572 ret = reg_node(pRExC_state, op);
10573 if (! SIZE_ONLY) {
10574 FLAGS(ret) = namedclass_to_classnum(arg);
10575 }
10576
10577 *flagp |= HASWIDTH|SIMPLE;
10578 /* FALL THROUGH */
10579
10580 finish_meta_pat:
10581 nextchar(pRExC_state);
10582 Set_Node_Length(ret, 2); /* MJD */
10583 break;
10584 case 'p':
10585 case 'P':
10586 {
10587#ifdef DEBUGGING
10588 char* parse_start = RExC_parse - 2;
10589#endif
10590
10591 RExC_parse--;
10592
10593 ret = regclass(pRExC_state, flagp,depth+1,
10594 TRUE, /* means just parse this element */
10595 FALSE, /* don't allow multi-char folds */
10596 FALSE, /* don't silence non-portable warnings.
10597 It would be a bug if these returned
10598 non-portables */
10599 NULL);
10600 /* regclass() can only return RESTART_UTF8 if multi-char folds
10601 are allowed. */
10602 if (!ret)
10603 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
10604 (UV) *flagp);
10605
10606 RExC_parse--;
10607
10608 Set_Node_Offset(ret, parse_start + 2);
10609 Set_Node_Cur_Length(ret, parse_start);
10610 nextchar(pRExC_state);
10611 }
10612 break;
10613 case 'N':
10614 /* Handle \N and \N{NAME} with multiple code points here and not
10615 * below because it can be multicharacter. join_exact() will join
10616 * them up later on. Also this makes sure that things like
10617 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10618 * The options to the grok function call causes it to fail if the
10619 * sequence is just a single code point. We then go treat it as
10620 * just another character in the current EXACT node, and hence it
10621 * gets uniform treatment with all the other characters. The
10622 * special treatment for quantifiers is not needed for such single
10623 * character sequences */
10624 ++RExC_parse;
10625 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
10626 FALSE /* not strict */ )) {
10627 if (*flagp & RESTART_UTF8)
10628 return NULL;
10629 RExC_parse--;
10630 goto defchar;
10631 }
10632 break;
10633 case 'k': /* Handle \k<NAME> and \k'NAME' */
10634 parse_named_seq:
10635 {
10636 char ch= RExC_parse[1];
10637 if (ch != '<' && ch != '\'' && ch != '{') {
10638 RExC_parse++;
10639 vFAIL2("Sequence %.2s... not terminated",parse_start);
10640 } else {
10641 /* this pretty much dupes the code for (?P=...) in reg(), if
10642 you change this make sure you change that */
10643 char* name_start = (RExC_parse += 2);
10644 U32 num = 0;
10645 SV *sv_dat = reg_scan_name(pRExC_state,
10646 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10647 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10648 if (RExC_parse == name_start || *RExC_parse != ch)
10649 vFAIL2("Sequence %.3s... not terminated",parse_start);
10650
10651 if (!SIZE_ONLY) {
10652 num = add_data( pRExC_state, 1, "S" );
10653 RExC_rxi->data->data[num]=(void*)sv_dat;
10654 SvREFCNT_inc_simple_void(sv_dat);
10655 }
10656
10657 RExC_sawback = 1;
10658 ret = reganode(pRExC_state,
10659 ((! FOLD)
10660 ? NREF
10661 : (ASCII_FOLD_RESTRICTED)
10662 ? NREFFA
10663 : (AT_LEAST_UNI_SEMANTICS)
10664 ? NREFFU
10665 : (LOC)
10666 ? NREFFL
10667 : NREFF),
10668 num);
10669 *flagp |= HASWIDTH;
10670
10671 /* override incorrect value set in reganode MJD */
10672 Set_Node_Offset(ret, parse_start+1);
10673 Set_Node_Cur_Length(ret, parse_start);
10674 nextchar(pRExC_state);
10675
10676 }
10677 break;
10678 }
10679 case 'g':
10680 case '1': case '2': case '3': case '4':
10681 case '5': case '6': case '7': case '8': case '9':
10682 {
10683 I32 num;
10684 bool isg = *RExC_parse == 'g';
10685 bool isrel = 0;
10686 bool hasbrace = 0;
10687 if (isg) {
10688 RExC_parse++;
10689 if (*RExC_parse == '{') {
10690 RExC_parse++;
10691 hasbrace = 1;
10692 }
10693 if (*RExC_parse == '-') {
10694 RExC_parse++;
10695 isrel = 1;
10696 }
10697 if (hasbrace && !isDIGIT(*RExC_parse)) {
10698 if (isrel) RExC_parse--;
10699 RExC_parse -= 2;
10700 goto parse_named_seq;
10701 } }
10702 num = atoi(RExC_parse);
10703 if (isg && num == 0) {
10704 if (*RExC_parse == '0') {
10705 vFAIL("Reference to invalid group 0");
10706 }
10707 else {
10708 vFAIL("Unterminated \\g... pattern");
10709 }
10710 }
10711 if (isrel) {
10712 num = RExC_npar - num;
10713 if (num < 1)
10714 vFAIL("Reference to nonexistent or unclosed group");
10715 }
10716 if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
10717 /* Probably a character specified in octal, e.g. \35 */
10718 goto defchar;
10719 else {
10720#ifdef RE_TRACK_PATTERN_OFFSETS
10721 char * const parse_start = RExC_parse - 1; /* MJD */
10722#endif
10723 while (isDIGIT(*RExC_parse))
10724 RExC_parse++;
10725 if (hasbrace) {
10726 if (*RExC_parse != '}')
10727 vFAIL("Unterminated \\g{...} pattern");
10728 RExC_parse++;
10729 }
10730 if (!SIZE_ONLY) {
10731 if (num > (I32)RExC_rx->nparens)
10732 vFAIL("Reference to nonexistent group");
10733 }
10734 RExC_sawback = 1;
10735 ret = reganode(pRExC_state,
10736 ((! FOLD)
10737 ? REF
10738 : (ASCII_FOLD_RESTRICTED)
10739 ? REFFA
10740 : (AT_LEAST_UNI_SEMANTICS)
10741 ? REFFU
10742 : (LOC)
10743 ? REFFL
10744 : REFF),
10745 num);
10746 *flagp |= HASWIDTH;
10747
10748 /* override incorrect value set in reganode MJD */
10749 Set_Node_Offset(ret, parse_start+1);
10750 Set_Node_Cur_Length(ret, parse_start);
10751 RExC_parse--;
10752 nextchar(pRExC_state);
10753 }
10754 }
10755 break;
10756 case '\0':
10757 if (RExC_parse >= RExC_end)
10758 FAIL("Trailing \\");
10759 /* FALL THROUGH */
10760 default:
10761 /* Do not generate "unrecognized" warnings here, we fall
10762 back into the quick-grab loop below */
10763 parse_start--;
10764 goto defchar;
10765 }
10766 break;
10767
10768 case '#':
10769 if (RExC_flags & RXf_PMf_EXTENDED) {
10770 if ( reg_skipcomment( pRExC_state ) )
10771 goto tryagain;
10772 }
10773 /* FALL THROUGH */
10774
10775 default:
10776
10777 parse_start = RExC_parse - 1;
10778
10779 RExC_parse++;
10780
10781 defchar: {
10782 STRLEN len = 0;
10783 UV ender = 0;
10784 char *p;
10785 char *s;
10786#define MAX_NODE_STRING_SIZE 127
10787 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10788 char *s0;
10789 U8 upper_parse = MAX_NODE_STRING_SIZE;
10790 STRLEN foldlen;
10791 U8 node_type = compute_EXACTish(pRExC_state);
10792 bool next_is_quantifier;
10793 char * oldp = NULL;
10794
10795 /* We can convert EXACTF nodes to EXACTFU if they contain only
10796 * characters that match identically regardless of the target
10797 * string's UTF8ness. The reason to do this is that EXACTF is not
10798 * trie-able, EXACTFU is. (We don't need to figure this out until
10799 * pass 2) */
10800 bool maybe_exactfu = node_type == EXACTF && PASS2;
10801
10802 /* If a folding node contains only code points that don't
10803 * participate in folds, it can be changed into an EXACT node,
10804 * which allows the optimizer more things to look for */
10805 bool maybe_exact;
10806
10807 ret = reg_node(pRExC_state, node_type);
10808
10809 /* In pass1, folded, we use a temporary buffer instead of the
10810 * actual node, as the node doesn't exist yet */
10811 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10812
10813 s0 = s;
10814
10815 reparse:
10816
10817 /* We do the EXACTFish to EXACT node only if folding, and not if in
10818 * locale, as whether a character folds or not isn't known until
10819 * runtime. (And we don't need to figure this out until pass 2) */
10820 maybe_exact = FOLD && ! LOC && PASS2;
10821
10822 /* XXX The node can hold up to 255 bytes, yet this only goes to
10823 * 127. I (khw) do not know why. Keeping it somewhat less than
10824 * 255 allows us to not have to worry about overflow due to
10825 * converting to utf8 and fold expansion, but that value is
10826 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10827 * split up by this limit into a single one using the real max of
10828 * 255. Even at 127, this breaks under rare circumstances. If
10829 * folding, we do not want to split a node at a character that is a
10830 * non-final in a multi-char fold, as an input string could just
10831 * happen to want to match across the node boundary. The join
10832 * would solve that problem if the join actually happens. But a
10833 * series of more than two nodes in a row each of 127 would cause
10834 * the first join to succeed to get to 254, but then there wouldn't
10835 * be room for the next one, which could at be one of those split
10836 * multi-char folds. I don't know of any fool-proof solution. One
10837 * could back off to end with only a code point that isn't such a
10838 * non-final, but it is possible for there not to be any in the
10839 * entire node. */
10840 for (p = RExC_parse - 1;
10841 len < upper_parse && p < RExC_end;
10842 len++)
10843 {
10844 oldp = p;
10845
10846 if (RExC_flags & RXf_PMf_EXTENDED)
10847 p = regwhite( pRExC_state, p );
10848 switch ((U8)*p) {
10849 case '^':
10850 case '$':
10851 case '.':
10852 case '[':
10853 case '(':
10854 case ')':
10855 case '|':
10856 goto loopdone;
10857 case '\\':
10858 /* Literal Escapes Switch
10859
10860 This switch is meant to handle escape sequences that
10861 resolve to a literal character.
10862
10863 Every escape sequence that represents something
10864 else, like an assertion or a char class, is handled
10865 in the switch marked 'Special Escapes' above in this
10866 routine, but also has an entry here as anything that
10867 isn't explicitly mentioned here will be treated as
10868 an unescaped equivalent literal.
10869 */
10870
10871 switch ((U8)*++p) {
10872 /* These are all the special escapes. */
10873 case 'A': /* Start assertion */
10874 case 'b': case 'B': /* Word-boundary assertion*/
10875 case 'C': /* Single char !DANGEROUS! */
10876 case 'd': case 'D': /* digit class */
10877 case 'g': case 'G': /* generic-backref, pos assertion */
10878 case 'h': case 'H': /* HORIZWS */
10879 case 'k': case 'K': /* named backref, keep marker */
10880 case 'p': case 'P': /* Unicode property */
10881 case 'R': /* LNBREAK */
10882 case 's': case 'S': /* space class */
10883 case 'v': case 'V': /* VERTWS */
10884 case 'w': case 'W': /* word class */
10885 case 'X': /* eXtended Unicode "combining character sequence" */
10886 case 'z': case 'Z': /* End of line/string assertion */
10887 --p;
10888 goto loopdone;
10889
10890 /* Anything after here is an escape that resolves to a
10891 literal. (Except digits, which may or may not)
10892 */
10893 case 'n':
10894 ender = '\n';
10895 p++;
10896 break;
10897 case 'N': /* Handle a single-code point named character. */
10898 /* The options cause it to fail if a multiple code
10899 * point sequence. Handle those in the switch() above
10900 * */
10901 RExC_parse = p + 1;
10902 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10903 flagp, depth, FALSE,
10904 FALSE /* not strict */ ))
10905 {
10906 if (*flagp & RESTART_UTF8)
10907 FAIL("panic: grok_bslash_N set RESTART_UTF8");
10908 RExC_parse = p = oldp;
10909 goto loopdone;
10910 }
10911 p = RExC_parse;
10912 if (ender > 0xff) {
10913 REQUIRE_UTF8;
10914 }
10915 break;
10916 case 'r':
10917 ender = '\r';
10918 p++;
10919 break;
10920 case 't':
10921 ender = '\t';
10922 p++;
10923 break;
10924 case 'f':
10925 ender = '\f';
10926 p++;
10927 break;
10928 case 'e':
10929 ender = ASCII_TO_NATIVE('\033');
10930 p++;
10931 break;
10932 case 'a':
10933 ender = '\a';
10934 p++;
10935 break;
10936 case 'o':
10937 {
10938 UV result;
10939 const char* error_msg;
10940
10941 bool valid = grok_bslash_o(&p,
10942 &result,
10943 &error_msg,
10944 TRUE, /* out warnings */
10945 FALSE, /* not strict */
10946 TRUE, /* Output warnings
10947 for non-
10948 portables */
10949 UTF);
10950 if (! valid) {
10951 RExC_parse = p; /* going to die anyway; point
10952 to exact spot of failure */
10953 vFAIL(error_msg);
10954 }
10955 ender = result;
10956 if (PL_encoding && ender < 0x100) {
10957 goto recode_encoding;
10958 }
10959 if (ender > 0xff) {
10960 REQUIRE_UTF8;
10961 }
10962 break;
10963 }
10964 case 'x':
10965 {
10966 UV result = UV_MAX; /* initialize to erroneous
10967 value */
10968 const char* error_msg;
10969
10970 bool valid = grok_bslash_x(&p,
10971 &result,
10972 &error_msg,
10973 TRUE, /* out warnings */
10974 FALSE, /* not strict */
10975 TRUE, /* Output warnings
10976 for non-
10977 portables */
10978 UTF);
10979 if (! valid) {
10980 RExC_parse = p; /* going to die anyway; point
10981 to exact spot of failure */
10982 vFAIL(error_msg);
10983 }
10984 ender = result;
10985
10986 if (PL_encoding && ender < 0x100) {
10987 goto recode_encoding;
10988 }
10989 if (ender > 0xff) {
10990 REQUIRE_UTF8;
10991 }
10992 break;
10993 }
10994 case 'c':
10995 p++;
10996 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10997 break;
10998 case '8': case '9': /* must be a backreference */
10999 --p;
11000 goto loopdone;
11001 case '1': case '2': case '3':case '4':
11002 case '5': case '6': case '7':
11003 /* When we parse backslash escapes there is ambiguity
11004 * between backreferences and octal escapes. Any escape
11005 * from \1 - \9 is a backreference, any multi-digit
11006 * escape which does not start with 0 and which when
11007 * evaluated as decimal could refer to an already
11008 * parsed capture buffer is a backslash. Anything else
11009 * is octal.
11010 *
11011 * Note this implies that \118 could be interpreted as
11012 * 118 OR as "\11" . "8" depending on whether there
11013 * were 118 capture buffers defined already in the
11014 * pattern. */
11015 if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
11016 { /* Not to be treated as an octal constant, go
11017 find backref */
11018 --p;
11019 goto loopdone;
11020 }
11021 case '0':
11022 {
11023 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11024 STRLEN numlen = 3;
11025 ender = grok_oct(p, &numlen, &flags, NULL);
11026 if (ender > 0xff) {
11027 REQUIRE_UTF8;
11028 }
11029 p += numlen;
11030 if (SIZE_ONLY /* like \08, \178 */
11031 && numlen < 3
11032 && p < RExC_end
11033 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11034 {
11035 reg_warn_non_literal_string(
11036 p + 1,
11037 form_short_octal_warning(p, numlen));
11038 }
11039 }
11040 if (PL_encoding && ender < 0x100)
11041 goto recode_encoding;
11042 break;
11043 recode_encoding:
11044 if (! RExC_override_recoding) {
11045 SV* enc = PL_encoding;
11046 ender = reg_recode((const char)(U8)ender, &enc);
11047 if (!enc && SIZE_ONLY)
11048 ckWARNreg(p, "Invalid escape in the specified encoding");
11049 REQUIRE_UTF8;
11050 }
11051 break;
11052 case '\0':
11053 if (p >= RExC_end)
11054 FAIL("Trailing \\");
11055 /* FALL THROUGH */
11056 default:
11057 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11058 /* Include any { following the alpha to emphasize
11059 * that it could be part of an escape at some point
11060 * in the future */
11061 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11062 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11063 }
11064 goto normal_default;
11065 } /* End of switch on '\' */
11066 break;
11067 default: /* A literal character */
11068
11069 if (! SIZE_ONLY
11070 && RExC_flags & RXf_PMf_EXTENDED
11071 && ckWARN_d(WARN_DEPRECATED)
11072 && is_PATWS_non_low(p, UTF))
11073 {
11074 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11075 "Escape literal pattern white space under /x");
11076 }
11077
11078 normal_default:
11079 if (UTF8_IS_START(*p) && UTF) {
11080 STRLEN numlen;
11081 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11082 &numlen, UTF8_ALLOW_DEFAULT);
11083 p += numlen;
11084 }
11085 else
11086 ender = (U8) *p++;
11087 break;
11088 } /* End of switch on the literal */
11089
11090 /* Here, have looked at the literal character and <ender>
11091 * contains its ordinal, <p> points to the character after it
11092 */
11093
11094 if ( RExC_flags & RXf_PMf_EXTENDED)
11095 p = regwhite( pRExC_state, p );
11096
11097 /* If the next thing is a quantifier, it applies to this
11098 * character only, which means that this character has to be in
11099 * its own node and can't just be appended to the string in an
11100 * existing node, so if there are already other characters in
11101 * the node, close the node with just them, and set up to do
11102 * this character again next time through, when it will be the
11103 * only thing in its new node */
11104 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
11105 {
11106 p = oldp;
11107 goto loopdone;
11108 }
11109
11110 if (! FOLD) {
11111 if (UTF) {
11112 const STRLEN unilen = reguni(pRExC_state, ender, s);
11113 if (unilen > 0) {
11114 s += unilen;
11115 len += unilen;
11116 }
11117
11118 /* The loop increments <len> each time, as all but this
11119 * path (and one other) through it add a single byte to
11120 * the EXACTish node. But this one has changed len to
11121 * be the correct final value, so subtract one to
11122 * cancel out the increment that follows */
11123 len--;
11124 }
11125 else {
11126 REGC((char)ender, s++);
11127 }
11128 }
11129 else /* FOLD */ if (! ( UTF
11130 /* See comments for join_exact() as to why we fold this
11131 * non-UTF at compile time */
11132 || (node_type == EXACTFU
11133 && ender == LATIN_SMALL_LETTER_SHARP_S)))
11134 {
11135 if (IS_IN_SOME_FOLD_L1(ender)) {
11136 maybe_exact = FALSE;
11137
11138 /* See if the character's fold differs between /d and
11139 * /u. This includes the multi-char fold SHARP S to
11140 * 'ss' */
11141 if (maybe_exactfu
11142 && (PL_fold[ender] != PL_fold_latin1[ender]
11143 || ender == LATIN_SMALL_LETTER_SHARP_S
11144 || (len > 0
11145 && isARG2_lower_or_UPPER_ARG1('s', ender)
11146 && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
11147 {
11148 maybe_exactfu = FALSE;
11149 }
11150 }
11151 *(s++) = (char) ender;
11152 }
11153 else { /* UTF */
11154
11155 /* Prime the casefolded buffer. Locale rules, which apply
11156 * only to code points < 256, aren't known until execution,
11157 * so for them, just output the original character using
11158 * utf8. If we start to fold non-UTF patterns, be sure to
11159 * update join_exact() */
11160 if (LOC && ender < 256) {
11161 if (NATIVE_IS_INVARIANT(ender)) {
11162 *s = (U8) ender;
11163 foldlen = 1;
11164 } else {
11165 *s = UTF8_TWO_BYTE_HI(ender);
11166 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
11167 foldlen = 2;
11168 }
11169 }
11170 else {
11171 UV folded = _to_uni_fold_flags(
11172 ender,
11173 (U8 *) s,
11174 &foldlen,
11175 FOLD_FLAGS_FULL
11176 | ((LOC) ? FOLD_FLAGS_LOCALE
11177 : (ASCII_FOLD_RESTRICTED)
11178 ? FOLD_FLAGS_NOMIX_ASCII
11179 : 0)
11180 );
11181
11182 /* If this node only contains non-folding code points
11183 * so far, see if this new one is also non-folding */
11184 if (maybe_exact) {
11185 if (folded != ender) {
11186 maybe_exact = FALSE;
11187 }
11188 else {
11189 /* Here the fold is the original; we have
11190 * to check further to see if anything
11191 * folds to it */
11192 if (! PL_utf8_foldable) {
11193 SV* swash = swash_init("utf8",
11194 "_Perl_Any_Folds",
11195 &PL_sv_undef, 1, 0);
11196 PL_utf8_foldable =
11197 _get_swash_invlist(swash);
11198 SvREFCNT_dec_NN(swash);
11199 }
11200 if (_invlist_contains_cp(PL_utf8_foldable,
11201 ender))
11202 {
11203 maybe_exact = FALSE;
11204 }
11205 }
11206 }
11207 ender = folded;
11208 }
11209 s += foldlen;
11210
11211 /* The loop increments <len> each time, as all but this
11212 * path (and one other) through it add a single byte to the
11213 * EXACTish node. But this one has changed len to be the
11214 * correct final value, so subtract one to cancel out the
11215 * increment that follows */
11216 len += foldlen - 1;
11217 }
11218
11219 if (next_is_quantifier) {
11220
11221 /* Here, the next input is a quantifier, and to get here,
11222 * the current character is the only one in the node.
11223 * Also, here <len> doesn't include the final byte for this
11224 * character */
11225 len++;
11226 goto loopdone;
11227 }
11228
11229 } /* End of loop through literal characters */
11230
11231 /* Here we have either exhausted the input or ran out of room in
11232 * the node. (If we encountered a character that can't be in the
11233 * node, transfer is made directly to <loopdone>, and so we
11234 * wouldn't have fallen off the end of the loop.) In the latter
11235 * case, we artificially have to split the node into two, because
11236 * we just don't have enough space to hold everything. This
11237 * creates a problem if the final character participates in a
11238 * multi-character fold in the non-final position, as a match that
11239 * should have occurred won't, due to the way nodes are matched,
11240 * and our artificial boundary. So back off until we find a non-
11241 * problematic character -- one that isn't at the beginning or
11242 * middle of such a fold. (Either it doesn't participate in any
11243 * folds, or appears only in the final position of all the folds it
11244 * does participate in.) A better solution with far fewer false
11245 * positives, and that would fill the nodes more completely, would
11246 * be to actually have available all the multi-character folds to
11247 * test against, and to back-off only far enough to be sure that
11248 * this node isn't ending with a partial one. <upper_parse> is set
11249 * further below (if we need to reparse the node) to include just
11250 * up through that final non-problematic character that this code
11251 * identifies, so when it is set to less than the full node, we can
11252 * skip the rest of this */
11253 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
11254
11255 const STRLEN full_len = len;
11256
11257 assert(len >= MAX_NODE_STRING_SIZE);
11258
11259 /* Here, <s> points to the final byte of the final character.
11260 * Look backwards through the string until find a non-
11261 * problematic character */
11262
11263 if (! UTF) {
11264
11265 /* These two have no multi-char folds to non-UTF characters
11266 */
11267 if (ASCII_FOLD_RESTRICTED || LOC) {
11268 goto loopdone;
11269 }
11270
11271 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
11272 len = s - s0 + 1;
11273 }
11274 else {
11275 if (! PL_NonL1NonFinalFold) {
11276 PL_NonL1NonFinalFold = _new_invlist_C_array(
11277 NonL1_Perl_Non_Final_Folds_invlist);
11278 }
11279
11280 /* Point to the first byte of the final character */
11281 s = (char *) utf8_hop((U8 *) s, -1);
11282
11283 while (s >= s0) { /* Search backwards until find
11284 non-problematic char */
11285 if (UTF8_IS_INVARIANT(*s)) {
11286
11287 /* There are no ascii characters that participate
11288 * in multi-char folds under /aa. In EBCDIC, the
11289 * non-ascii invariants are all control characters,
11290 * so don't ever participate in any folds. */
11291 if (ASCII_FOLD_RESTRICTED
11292 || ! IS_NON_FINAL_FOLD(*s))
11293 {
11294 break;
11295 }
11296 }
11297 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
11298
11299 /* No Latin1 characters participate in multi-char
11300 * folds under /l */
11301 if (LOC
11302 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
11303 *s, *(s+1))))
11304 {
11305 break;
11306 }
11307 }
11308 else if (! _invlist_contains_cp(
11309 PL_NonL1NonFinalFold,
11310 valid_utf8_to_uvchr((U8 *) s, NULL)))
11311 {
11312 break;
11313 }
11314
11315 /* Here, the current character is problematic in that
11316 * it does occur in the non-final position of some
11317 * fold, so try the character before it, but have to
11318 * special case the very first byte in the string, so
11319 * we don't read outside the string */
11320 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
11321 } /* End of loop backwards through the string */
11322
11323 /* If there were only problematic characters in the string,
11324 * <s> will point to before s0, in which case the length
11325 * should be 0, otherwise include the length of the
11326 * non-problematic character just found */
11327 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
11328 }
11329
11330 /* Here, have found the final character, if any, that is
11331 * non-problematic as far as ending the node without splitting
11332 * it across a potential multi-char fold. <len> contains the
11333 * number of bytes in the node up-to and including that
11334 * character, or is 0 if there is no such character, meaning
11335 * the whole node contains only problematic characters. In
11336 * this case, give up and just take the node as-is. We can't
11337 * do any better */
11338 if (len == 0) {
11339 len = full_len;
11340
11341 /* If the node ends in an 's' we make sure it stays EXACTF,
11342 * as if it turns into an EXACTFU, it could later get
11343 * joined with another 's' that would then wrongly match
11344 * the sharp s */
11345 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
11346 {
11347 maybe_exactfu = FALSE;
11348 }
11349 } else {
11350
11351 /* Here, the node does contain some characters that aren't
11352 * problematic. If one such is the final character in the
11353 * node, we are done */
11354 if (len == full_len) {
11355 goto loopdone;
11356 }
11357 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11358
11359 /* If the final character is problematic, but the
11360 * penultimate is not, back-off that last character to
11361 * later start a new node with it */
11362 p = oldp;
11363 goto loopdone;
11364 }
11365
11366 /* Here, the final non-problematic character is earlier
11367 * in the input than the penultimate character. What we do
11368 * is reparse from the beginning, going up only as far as
11369 * this final ok one, thus guaranteeing that the node ends
11370 * in an acceptable character. The reason we reparse is
11371 * that we know how far in the character is, but we don't
11372 * know how to correlate its position with the input parse.
11373 * An alternate implementation would be to build that
11374 * correlation as we go along during the original parse,
11375 * but that would entail extra work for every node, whereas
11376 * this code gets executed only when the string is too
11377 * large for the node, and the final two characters are
11378 * problematic, an infrequent occurrence. Yet another
11379 * possible strategy would be to save the tail of the
11380 * string, and the next time regatom is called, initialize
11381 * with that. The problem with this is that unless you
11382 * back off one more character, you won't be guaranteed
11383 * regatom will get called again, unless regbranch,
11384 * regpiece ... are also changed. If you do back off that
11385 * extra character, so that there is input guaranteed to
11386 * force calling regatom, you can't handle the case where
11387 * just the first character in the node is acceptable. I
11388 * (khw) decided to try this method which doesn't have that
11389 * pitfall; if performance issues are found, we can do a
11390 * combination of the current approach plus that one */
11391 upper_parse = len;
11392 len = 0;
11393 s = s0;
11394 goto reparse;
11395 }
11396 } /* End of verifying node ends with an appropriate char */
11397
11398 loopdone: /* Jumped to when encounters something that shouldn't be in
11399 the node */
11400
11401 /* I (khw) don't know if you can get here with zero length, but the
11402 * old code handled this situation by creating a zero-length EXACT
11403 * node. Might as well be NOTHING instead */
11404 if (len == 0) {
11405 OP(ret) = NOTHING;
11406 }
11407 else {
11408 if (FOLD) {
11409 /* If 'maybe_exact' is still set here, means there are no
11410 * code points in the node that participate in folds;
11411 * similarly for 'maybe_exactfu' and code points that match
11412 * differently depending on UTF8ness of the target string
11413 * */
11414 if (maybe_exact) {
11415 OP(ret) = EXACT;
11416 }
11417 else if (maybe_exactfu) {
11418 OP(ret) = EXACTFU;
11419 }
11420 }
11421 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11422 }
11423
11424 RExC_parse = p - 1;
11425 Set_Node_Cur_Length(ret, parse_start);
11426 nextchar(pRExC_state);
11427 {
11428 /* len is STRLEN which is unsigned, need to copy to signed */
11429 IV iv = len;
11430 if (iv < 0)
11431 vFAIL("Internal disaster");
11432 }
11433
11434 } /* End of label 'defchar:' */
11435 break;
11436 } /* End of giant switch on input character */
11437
11438 return(ret);
11439}
11440
11441STATIC char *
11442S_regwhite( RExC_state_t *pRExC_state, char *p )
11443{
11444 const char *e = RExC_end;
11445
11446 PERL_ARGS_ASSERT_REGWHITE;
11447
11448 while (p < e) {
11449 if (isSPACE(*p))
11450 ++p;
11451 else if (*p == '#') {
11452 bool ended = 0;
11453 do {
11454 if (*p++ == '\n') {
11455 ended = 1;
11456 break;
11457 }
11458 } while (p < e);
11459 if (!ended)
11460 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11461 }
11462 else
11463 break;
11464 }
11465 return p;
11466}
11467
11468STATIC char *
11469S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
11470{
11471 /* Returns the next non-pattern-white space, non-comment character (the
11472 * latter only if 'recognize_comment is true) in the string p, which is
11473 * ended by RExC_end. If there is no line break ending a comment,
11474 * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
11475 const char *e = RExC_end;
11476
11477 PERL_ARGS_ASSERT_REGPATWS;
11478
11479 while (p < e) {
11480 STRLEN len;
11481 if ((len = is_PATWS_safe(p, e, UTF))) {
11482 p += len;
11483 }
11484 else if (recognize_comment && *p == '#') {
11485 bool ended = 0;
11486 do {
11487 p++;
11488 if (is_LNBREAK_safe(p, e, UTF)) {
11489 ended = 1;
11490 break;
11491 }
11492 } while (p < e);
11493 if (!ended)
11494 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11495 }
11496 else
11497 break;
11498 }
11499 return p;
11500}
11501
11502/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11503 Character classes ([:foo:]) can also be negated ([:^foo:]).
11504 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11505 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11506 but trigger failures because they are currently unimplemented. */
11507
11508#define POSIXCC_DONE(c) ((c) == ':')
11509#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11510#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11511
11512PERL_STATIC_INLINE I32
11513S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
11514{
11515 dVAR;
11516 I32 namedclass = OOB_NAMEDCLASS;
11517
11518 PERL_ARGS_ASSERT_REGPPOSIXCC;
11519
11520 if (value == '[' && RExC_parse + 1 < RExC_end &&
11521 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11522 POSIXCC(UCHARAT(RExC_parse)))
11523 {
11524 const char c = UCHARAT(RExC_parse);
11525 char* const s = RExC_parse++;
11526
11527 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11528 RExC_parse++;
11529 if (RExC_parse == RExC_end) {
11530 if (strict) {
11531
11532 /* Try to give a better location for the error (than the end of
11533 * the string) by looking for the matching ']' */
11534 RExC_parse = s;
11535 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11536 RExC_parse++;
11537 }
11538 vFAIL2("Unmatched '%c' in POSIX class", c);
11539 }
11540 /* Grandfather lone [:, [=, [. */
11541 RExC_parse = s;
11542 }
11543 else {
11544 const char* const t = RExC_parse++; /* skip over the c */
11545 assert(*t == c);
11546
11547 if (UCHARAT(RExC_parse) == ']') {
11548 const char *posixcc = s + 1;
11549 RExC_parse++; /* skip over the ending ] */
11550
11551 if (*s == ':') {
11552 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11553 const I32 skip = t - posixcc;
11554
11555 /* Initially switch on the length of the name. */
11556 switch (skip) {
11557 case 4:
11558 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
11559 this is the Perl \w
11560 */
11561 namedclass = ANYOF_WORDCHAR;
11562 break;
11563 case 5:
11564 /* Names all of length 5. */
11565 /* alnum alpha ascii blank cntrl digit graph lower
11566 print punct space upper */
11567 /* Offset 4 gives the best switch position. */
11568 switch (posixcc[4]) {
11569 case 'a':
11570 if (memEQ(posixcc, "alph", 4)) /* alpha */
11571 namedclass = ANYOF_ALPHA;
11572 break;
11573 case 'e':
11574 if (memEQ(posixcc, "spac", 4)) /* space */
11575 namedclass = ANYOF_PSXSPC;
11576 break;
11577 case 'h':
11578 if (memEQ(posixcc, "grap", 4)) /* graph */
11579 namedclass = ANYOF_GRAPH;
11580 break;
11581 case 'i':
11582 if (memEQ(posixcc, "asci", 4)) /* ascii */
11583 namedclass = ANYOF_ASCII;
11584 break;
11585 case 'k':
11586 if (memEQ(posixcc, "blan", 4)) /* blank */
11587 namedclass = ANYOF_BLANK;
11588 break;
11589 case 'l':
11590 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11591 namedclass = ANYOF_CNTRL;
11592 break;
11593 case 'm':
11594 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11595 namedclass = ANYOF_ALPHANUMERIC;
11596 break;
11597 case 'r':
11598 if (memEQ(posixcc, "lowe", 4)) /* lower */
11599 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
11600 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11601 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
11602 break;
11603 case 't':
11604 if (memEQ(posixcc, "digi", 4)) /* digit */
11605 namedclass = ANYOF_DIGIT;
11606 else if (memEQ(posixcc, "prin", 4)) /* print */
11607 namedclass = ANYOF_PRINT;
11608 else if (memEQ(posixcc, "punc", 4)) /* punct */
11609 namedclass = ANYOF_PUNCT;
11610 break;
11611 }
11612 break;
11613 case 6:
11614 if (memEQ(posixcc, "xdigit", 6))
11615 namedclass = ANYOF_XDIGIT;
11616 break;
11617 }
11618
11619 if (namedclass == OOB_NAMEDCLASS)
11620 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11621 t - s - 1, s + 1);
11622
11623 /* The #defines are structured so each complement is +1 to
11624 * the normal one */
11625 if (complement) {
11626 namedclass++;
11627 }
11628 assert (posixcc[skip] == ':');
11629 assert (posixcc[skip+1] == ']');
11630 } else if (!SIZE_ONLY) {
11631 /* [[=foo=]] and [[.foo.]] are still future. */
11632
11633 /* adjust RExC_parse so the warning shows after
11634 the class closes */
11635 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11636 RExC_parse++;
11637 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11638 }
11639 } else {
11640 /* Maternal grandfather:
11641 * "[:" ending in ":" but not in ":]" */
11642 if (strict) {
11643 vFAIL("Unmatched '[' in POSIX class");
11644 }
11645
11646 /* Grandfather lone [:, [=, [. */
11647 RExC_parse = s;
11648 }
11649 }
11650 }
11651
11652 return namedclass;
11653}
11654
11655STATIC bool
11656S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
11657{
11658 /* This applies some heuristics at the current parse position (which should
11659 * be at a '[') to see if what follows might be intended to be a [:posix:]
11660 * class. It returns true if it really is a posix class, of course, but it
11661 * also can return true if it thinks that what was intended was a posix
11662 * class that didn't quite make it.
11663 *
11664 * It will return true for
11665 * [:alphanumerics:
11666 * [:alphanumerics] (as long as the ] isn't followed immediately by a
11667 * ')' indicating the end of the (?[
11668 * [:any garbage including %^&$ punctuation:]
11669 *
11670 * This is designed to be called only from S_handle_regex_sets; it could be
11671 * easily adapted to be called from the spot at the beginning of regclass()
11672 * that checks to see in a normal bracketed class if the surrounding []
11673 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
11674 * change long-standing behavior, so I (khw) didn't do that */
11675 char* p = RExC_parse + 1;
11676 char first_char = *p;
11677
11678 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
11679
11680 assert(*(p - 1) == '[');
11681
11682 if (! POSIXCC(first_char)) {
11683 return FALSE;
11684 }
11685
11686 p++;
11687 while (p < RExC_end && isWORDCHAR(*p)) p++;
11688
11689 if (p >= RExC_end) {
11690 return FALSE;
11691 }
11692
11693 if (p - RExC_parse > 2 /* Got at least 1 word character */
11694 && (*p == first_char
11695 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
11696 {
11697 return TRUE;
11698 }
11699
11700 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
11701
11702 return (p
11703 && p - RExC_parse > 2 /* [:] evaluates to colon;
11704 [::] is a bad posix class. */
11705 && first_char == *(p - 1));
11706}
11707
11708STATIC regnode *
11709S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
11710 char * const oregcomp_parse)
11711{
11712 /* Handle the (?[...]) construct to do set operations */
11713
11714 U8 curchar;
11715 UV start, end; /* End points of code point ranges */
11716 SV* result_string;
11717 char *save_end, *save_parse;
11718 SV* final;
11719 STRLEN len;
11720 regnode* node;
11721 AV* stack;
11722 const bool save_fold = FOLD;
11723
11724 GET_RE_DEBUG_FLAGS_DECL;
11725
11726 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
11727
11728 if (LOC) {
11729 vFAIL("(?[...]) not valid in locale");
11730 }
11731 RExC_uni_semantics = 1;
11732
11733 /* This will return only an ANYOF regnode, or (unlikely) something smaller
11734 * (such as EXACT). Thus we can skip most everything if just sizing. We
11735 * call regclass to handle '[]' so as to not have to reinvent its parsing
11736 * rules here (throwing away the size it computes each time). And, we exit
11737 * upon an unescaped ']' that isn't one ending a regclass. To do both
11738 * these things, we need to realize that something preceded by a backslash
11739 * is escaped, so we have to keep track of backslashes */
11740 if (SIZE_ONLY) {
11741 UV depth = 0; /* how many nested (?[...]) constructs */
11742
11743 Perl_ck_warner_d(aTHX_
11744 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
11745 "The regex_sets feature is experimental" REPORT_LOCATION,
11746 (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
11747
11748 while (RExC_parse < RExC_end) {
11749 SV* current = NULL;
11750 RExC_parse = regpatws(pRExC_state, RExC_parse,
11751 TRUE); /* means recognize comments */
11752 switch (*RExC_parse) {
11753 case '?':
11754 if (RExC_parse[1] == '[') depth++, RExC_parse++;
11755 /* FALL THROUGH */
11756 default:
11757 break;
11758 case '\\':
11759 /* Skip the next byte (which could cause us to end up in
11760 * the middle of a UTF-8 character, but since none of those
11761 * are confusable with anything we currently handle in this
11762 * switch (invariants all), it's safe. We'll just hit the
11763 * default: case next time and keep on incrementing until
11764 * we find one of the invariants we do handle. */
11765 RExC_parse++;
11766 break;
11767 case '[':
11768 {
11769 /* If this looks like it is a [:posix:] class, leave the
11770 * parse pointer at the '[' to fool regclass() into
11771 * thinking it is part of a '[[:posix:]]'. That function
11772 * will use strict checking to force a syntax error if it
11773 * doesn't work out to a legitimate class */
11774 bool is_posix_class
11775 = could_it_be_a_POSIX_class(pRExC_state);
11776 if (! is_posix_class) {
11777 RExC_parse++;
11778 }
11779
11780 /* regclass() can only return RESTART_UTF8 if multi-char
11781 folds are allowed. */
11782 if (!regclass(pRExC_state, flagp,depth+1,
11783 is_posix_class, /* parse the whole char
11784 class only if not a
11785 posix class */
11786 FALSE, /* don't allow multi-char folds */
11787 TRUE, /* silence non-portable warnings. */
11788 &current))
11789 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11790 (UV) *flagp);
11791
11792 /* function call leaves parse pointing to the ']', except
11793 * if we faked it */
11794 if (is_posix_class) {
11795 RExC_parse--;
11796 }
11797
11798 SvREFCNT_dec(current); /* In case it returned something */
11799 break;
11800 }
11801
11802 case ']':
11803 if (depth--) break;
11804 RExC_parse++;
11805 if (RExC_parse < RExC_end
11806 && *RExC_parse == ')')
11807 {
11808 node = reganode(pRExC_state, ANYOF, 0);
11809 RExC_size += ANYOF_SKIP;
11810 nextchar(pRExC_state);
11811 Set_Node_Length(node,
11812 RExC_parse - oregcomp_parse + 1); /* MJD */
11813 return node;
11814 }
11815 goto no_close;
11816 }
11817 RExC_parse++;
11818 }
11819
11820 no_close:
11821 FAIL("Syntax error in (?[...])");
11822 }
11823
11824 /* Pass 2 only after this. Everything in this construct is a
11825 * metacharacter. Operands begin with either a '\' (for an escape
11826 * sequence), or a '[' for a bracketed character class. Any other
11827 * character should be an operator, or parenthesis for grouping. Both
11828 * types of operands are handled by calling regclass() to parse them. It
11829 * is called with a parameter to indicate to return the computed inversion
11830 * list. The parsing here is implemented via a stack. Each entry on the
11831 * stack is a single character representing one of the operators, or the
11832 * '('; or else a pointer to an operand inversion list. */
11833
11834#define IS_OPERAND(a) (! SvIOK(a))
11835
11836 /* The stack starts empty. It is a syntax error if the first thing parsed
11837 * is a binary operator; everything else is pushed on the stack. When an
11838 * operand is parsed, the top of the stack is examined. If it is a binary
11839 * operator, the item before it should be an operand, and both are replaced
11840 * by the result of doing that operation on the new operand and the one on
11841 * the stack. Thus a sequence of binary operands is reduced to a single
11842 * one before the next one is parsed.
11843 *
11844 * A unary operator may immediately follow a binary in the input, for
11845 * example
11846 * [a] + ! [b]
11847 * When an operand is parsed and the top of the stack is a unary operator,
11848 * the operation is performed, and then the stack is rechecked to see if
11849 * this new operand is part of a binary operation; if so, it is handled as
11850 * above.
11851 *
11852 * A '(' is simply pushed on the stack; it is valid only if the stack is
11853 * empty, or the top element of the stack is an operator or another '('
11854 * (for which the parenthesized expression will become an operand). By the
11855 * time the corresponding ')' is parsed everything in between should have
11856 * been parsed and evaluated to a single operand (or else is a syntax
11857 * error), and is handled as a regular operand */
11858
11859 sv_2mortal((SV *)(stack = newAV()));
11860
11861 while (RExC_parse < RExC_end) {
11862 I32 top_index = av_tindex(stack);
11863 SV** top_ptr;
11864 SV* current = NULL;
11865
11866 /* Skip white space */
11867 RExC_parse = regpatws(pRExC_state, RExC_parse,
11868 TRUE); /* means recognize comments */
11869 if (RExC_parse >= RExC_end) {
11870 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
11871 }
11872 if ((curchar = UCHARAT(RExC_parse)) == ']') {
11873 break;
11874 }
11875
11876 switch (curchar) {
11877
11878 case '?':
11879 if (av_tindex(stack) >= 0 /* This makes sure that we can
11880 safely subtract 1 from
11881 RExC_parse in the next clause.
11882 If we have something on the
11883 stack, we have parsed something
11884 */
11885 && UCHARAT(RExC_parse - 1) == '('
11886 && RExC_parse < RExC_end)
11887 {
11888 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
11889 * This happens when we have some thing like
11890 *
11891 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
11892 * ...
11893 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
11894 *
11895 * Here we would be handling the interpolated
11896 * '$thai_or_lao'. We handle this by a recursive call to
11897 * ourselves which returns the inversion list the
11898 * interpolated expression evaluates to. We use the flags
11899 * from the interpolated pattern. */
11900 U32 save_flags = RExC_flags;
11901 const char * const save_parse = ++RExC_parse;
11902
11903 parse_lparen_question_flags(pRExC_state);
11904
11905 if (RExC_parse == save_parse /* Makes sure there was at
11906 least one flag (or this
11907 embedding wasn't compiled)
11908 */
11909 || RExC_parse >= RExC_end - 4
11910 || UCHARAT(RExC_parse) != ':'
11911 || UCHARAT(++RExC_parse) != '('
11912 || UCHARAT(++RExC_parse) != '?'
11913 || UCHARAT(++RExC_parse) != '[')
11914 {
11915
11916 /* In combination with the above, this moves the
11917 * pointer to the point just after the first erroneous
11918 * character (or if there are no flags, to where they
11919 * should have been) */
11920 if (RExC_parse >= RExC_end - 4) {
11921 RExC_parse = RExC_end;
11922 }
11923 else if (RExC_parse != save_parse) {
11924 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11925 }
11926 vFAIL("Expecting '(?flags:(?[...'");
11927 }
11928 RExC_parse++;
11929 (void) handle_regex_sets(pRExC_state, &current, flagp,
11930 depth+1, oregcomp_parse);
11931
11932 /* Here, 'current' contains the embedded expression's
11933 * inversion list, and RExC_parse points to the trailing
11934 * ']'; the next character should be the ')' which will be
11935 * paired with the '(' that has been put on the stack, so
11936 * the whole embedded expression reduces to '(operand)' */
11937 RExC_parse++;
11938
11939 RExC_flags = save_flags;
11940 goto handle_operand;
11941 }
11942 /* FALL THROUGH */
11943
11944 default:
11945 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
11946 vFAIL("Unexpected character");
11947
11948 case '\\':
11949 /* regclass() can only return RESTART_UTF8 if multi-char
11950 folds are allowed. */
11951 if (!regclass(pRExC_state, flagp,depth+1,
11952 TRUE, /* means parse just the next thing */
11953 FALSE, /* don't allow multi-char folds */
11954 FALSE, /* don't silence non-portable warnings. */
11955 &current))
11956 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11957 (UV) *flagp);
11958 /* regclass() will return with parsing just the \ sequence,
11959 * leaving the parse pointer at the next thing to parse */
11960 RExC_parse--;
11961 goto handle_operand;
11962
11963 case '[': /* Is a bracketed character class */
11964 {
11965 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
11966
11967 if (! is_posix_class) {
11968 RExC_parse++;
11969 }
11970
11971 /* regclass() can only return RESTART_UTF8 if multi-char
11972 folds are allowed. */
11973 if(!regclass(pRExC_state, flagp,depth+1,
11974 is_posix_class, /* parse the whole char class
11975 only if not a posix class */
11976 FALSE, /* don't allow multi-char folds */
11977 FALSE, /* don't silence non-portable warnings. */
11978 &current))
11979 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
11980 (UV) *flagp);
11981 /* function call leaves parse pointing to the ']', except if we
11982 * faked it */
11983 if (is_posix_class) {
11984 RExC_parse--;
11985 }
11986
11987 goto handle_operand;
11988 }
11989
11990 case '&':
11991 case '|':
11992 case '+':
11993 case '-':
11994 case '^':
11995 if (top_index < 0
11996 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
11997 || ! IS_OPERAND(*top_ptr))
11998 {
11999 RExC_parse++;
12000 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12001 }
12002 av_push(stack, newSVuv(curchar));
12003 break;
12004
12005 case '!':
12006 av_push(stack, newSVuv(curchar));
12007 break;
12008
12009 case '(':
12010 if (top_index >= 0) {
12011 top_ptr = av_fetch(stack, top_index, FALSE);
12012 assert(top_ptr);
12013 if (IS_OPERAND(*top_ptr)) {
12014 RExC_parse++;
12015 vFAIL("Unexpected '(' with no preceding operator");
12016 }
12017 }
12018 av_push(stack, newSVuv(curchar));
12019 break;
12020
12021 case ')':
12022 {
12023 SV* lparen;
12024 if (top_index < 1
12025 || ! (current = av_pop(stack))
12026 || ! IS_OPERAND(current)
12027 || ! (lparen = av_pop(stack))
12028 || IS_OPERAND(lparen)
12029 || SvUV(lparen) != '(')
12030 {
12031 SvREFCNT_dec(current);
12032 RExC_parse++;
12033 vFAIL("Unexpected ')'");
12034 }
12035 top_index -= 2;
12036 SvREFCNT_dec_NN(lparen);
12037
12038 /* FALL THROUGH */
12039 }
12040
12041 handle_operand:
12042
12043 /* Here, we have an operand to process, in 'current' */
12044
12045 if (top_index < 0) { /* Just push if stack is empty */
12046 av_push(stack, current);
12047 }
12048 else {
12049 SV* top = av_pop(stack);
12050 SV *prev = NULL;
12051 char current_operator;
12052
12053 if (IS_OPERAND(top)) {
12054 SvREFCNT_dec_NN(top);
12055 SvREFCNT_dec_NN(current);
12056 vFAIL("Operand with no preceding operator");
12057 }
12058 current_operator = (char) SvUV(top);
12059 switch (current_operator) {
12060 case '(': /* Push the '(' back on followed by the new
12061 operand */
12062 av_push(stack, top);
12063 av_push(stack, current);
12064 SvREFCNT_inc(top); /* Counters the '_dec' done
12065 just after the 'break', so
12066 it doesn't get wrongly freed
12067 */
12068 break;
12069
12070 case '!':
12071 _invlist_invert(current);
12072
12073 /* Unlike binary operators, the top of the stack,
12074 * now that this unary one has been popped off, may
12075 * legally be an operator, and we now have operand
12076 * for it. */
12077 top_index--;
12078 SvREFCNT_dec_NN(top);
12079 goto handle_operand;
12080
12081 case '&':
12082 prev = av_pop(stack);
12083 _invlist_intersection(prev,
12084 current,
12085 &current);
12086 av_push(stack, current);
12087 break;
12088
12089 case '|':
12090 case '+':
12091 prev = av_pop(stack);
12092 _invlist_union(prev, current, &current);
12093 av_push(stack, current);
12094 break;
12095
12096 case '-':
12097 prev = av_pop(stack);;
12098 _invlist_subtract(prev, current, &current);
12099 av_push(stack, current);
12100 break;
12101
12102 case '^': /* The union minus the intersection */
12103 {
12104 SV* i = NULL;
12105 SV* u = NULL;
12106 SV* element;
12107
12108 prev = av_pop(stack);
12109 _invlist_union(prev, current, &u);
12110 _invlist_intersection(prev, current, &i);
12111 /* _invlist_subtract will overwrite current
12112 without freeing what it already contains */
12113 element = current;
12114 _invlist_subtract(u, i, &current);
12115 av_push(stack, current);
12116 SvREFCNT_dec_NN(i);
12117 SvREFCNT_dec_NN(u);
12118 SvREFCNT_dec_NN(element);
12119 break;
12120 }
12121
12122 default:
12123 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
12124 }
12125 SvREFCNT_dec_NN(top);
12126 SvREFCNT_dec(prev);
12127 }
12128 }
12129
12130 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12131 }
12132
12133 if (av_tindex(stack) < 0 /* Was empty */
12134 || ((final = av_pop(stack)) == NULL)
12135 || ! IS_OPERAND(final)
12136 || av_tindex(stack) >= 0) /* More left on stack */
12137 {
12138 vFAIL("Incomplete expression within '(?[ ])'");
12139 }
12140
12141 /* Here, 'final' is the resultant inversion list from evaluating the
12142 * expression. Return it if so requested */
12143 if (return_invlist) {
12144 *return_invlist = final;
12145 return END;
12146 }
12147
12148 /* Otherwise generate a resultant node, based on 'final'. regclass() is
12149 * expecting a string of ranges and individual code points */
12150 invlist_iterinit(final);
12151 result_string = newSVpvs("");
12152 while (invlist_iternext(final, &start, &end)) {
12153 if (start == end) {
12154 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
12155 }
12156 else {
12157 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
12158 start, end);
12159 }
12160 }
12161
12162 save_parse = RExC_parse;
12163 RExC_parse = SvPV(result_string, len);
12164 save_end = RExC_end;
12165 RExC_end = RExC_parse + len;
12166
12167 /* We turn off folding around the call, as the class we have constructed
12168 * already has all folding taken into consideration, and we don't want
12169 * regclass() to add to that */
12170 RExC_flags &= ~RXf_PMf_FOLD;
12171 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
12172 */
12173 node = regclass(pRExC_state, flagp,depth+1,
12174 FALSE, /* means parse the whole char class */
12175 FALSE, /* don't allow multi-char folds */
12176 TRUE, /* silence non-portable warnings. The above may very
12177 well have generated non-portable code points, but
12178 they're valid on this machine */
12179 NULL);
12180 if (!node)
12181 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
12182 PTR2UV(flagp));
12183 if (save_fold) {
12184 RExC_flags |= RXf_PMf_FOLD;
12185 }
12186 RExC_parse = save_parse + 1;
12187 RExC_end = save_end;
12188 SvREFCNT_dec_NN(final);
12189 SvREFCNT_dec_NN(result_string);
12190
12191 nextchar(pRExC_state);
12192 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
12193 return node;
12194}
12195#undef IS_OPERAND
12196
12197/* The names of properties whose definitions are not known at compile time are
12198 * stored in this SV, after a constant heading. So if the length has been
12199 * changed since initialization, then there is a run-time definition. */
12200#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
12201
12202STATIC regnode *
12203S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
12204 const bool stop_at_1, /* Just parse the next thing, don't
12205 look for a full character class */
12206 bool allow_multi_folds,
12207 const bool silence_non_portable, /* Don't output warnings
12208 about too large
12209 characters */
12210 SV** ret_invlist) /* Return an inversion list, not a node */
12211{
12212 /* parse a bracketed class specification. Most of these will produce an
12213 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
12214 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
12215 * under /i with multi-character folds: it will be rewritten following the
12216 * paradigm of this example, where the <multi-fold>s are characters which
12217 * fold to multiple character sequences:
12218 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
12219 * gets effectively rewritten as:
12220 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
12221 * reg() gets called (recursively) on the rewritten version, and this
12222 * function will return what it constructs. (Actually the <multi-fold>s
12223 * aren't physically removed from the [abcdefghi], it's just that they are
12224 * ignored in the recursion by means of a flag:
12225 * <RExC_in_multi_char_class>.)
12226 *
12227 * ANYOF nodes contain a bit map for the first 256 characters, with the
12228 * corresponding bit set if that character is in the list. For characters
12229 * above 255, a range list or swash is used. There are extra bits for \w,
12230 * etc. in locale ANYOFs, as what these match is not determinable at
12231 * compile time
12232 *
12233 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
12234 * to be restarted. This can only happen if ret_invlist is non-NULL.
12235 */
12236
12237 dVAR;
12238 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
12239 IV range = 0;
12240 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
12241 regnode *ret;
12242 STRLEN numlen;
12243 IV namedclass = OOB_NAMEDCLASS;
12244 char *rangebegin = NULL;
12245 bool need_class = 0;
12246 SV *listsv = NULL;
12247 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
12248 than just initialized. */
12249 SV* properties = NULL; /* Code points that match \p{} \P{} */
12250 SV* posixes = NULL; /* Code points that match classes like, [:word:],
12251 extended beyond the Latin1 range */
12252 UV element_count = 0; /* Number of distinct elements in the class.
12253 Optimizations may be possible if this is tiny */
12254 AV * multi_char_matches = NULL; /* Code points that fold to more than one
12255 character; used under /i */
12256 UV n;
12257 char * stop_ptr = RExC_end; /* where to stop parsing */
12258 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
12259 space? */
12260 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
12261
12262 /* Unicode properties are stored in a swash; this holds the current one
12263 * being parsed. If this swash is the only above-latin1 component of the
12264 * character class, an optimization is to pass it directly on to the
12265 * execution engine. Otherwise, it is set to NULL to indicate that there
12266 * are other things in the class that have to be dealt with at execution
12267 * time */
12268 SV* swash = NULL; /* Code points that match \p{} \P{} */
12269
12270 /* Set if a component of this character class is user-defined; just passed
12271 * on to the engine */
12272 bool has_user_defined_property = FALSE;
12273
12274 /* inversion list of code points this node matches only when the target
12275 * string is in UTF-8. (Because is under /d) */
12276 SV* depends_list = NULL;
12277
12278 /* inversion list of code points this node matches. For much of the
12279 * function, it includes only those that match regardless of the utf8ness
12280 * of the target string */
12281 SV* cp_list = NULL;
12282
12283#ifdef EBCDIC
12284 /* In a range, counts how many 0-2 of the ends of it came from literals,
12285 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
12286 UV literal_endpoint = 0;
12287#endif
12288 bool invert = FALSE; /* Is this class to be complemented */
12289
12290 /* Is there any thing like \W or [:^digit:] that matches above the legal
12291 * Unicode range? */
12292 bool runtime_posix_matches_above_Unicode = FALSE;
12293
12294 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
12295 case we need to change the emitted regop to an EXACT. */
12296 const char * orig_parse = RExC_parse;
12297 const SSize_t orig_size = RExC_size;
12298 GET_RE_DEBUG_FLAGS_DECL;
12299
12300 PERL_ARGS_ASSERT_REGCLASS;
12301#ifndef DEBUGGING
12302 PERL_UNUSED_ARG(depth);
12303#endif
12304
12305 DEBUG_PARSE("clas");
12306
12307 /* Assume we are going to generate an ANYOF node. */
12308 ret = reganode(pRExC_state, ANYOF, 0);
12309
12310 if (SIZE_ONLY) {
12311 RExC_size += ANYOF_SKIP;
12312 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
12313 }
12314 else {
12315 ANYOF_FLAGS(ret) = 0;
12316
12317 RExC_emit += ANYOF_SKIP;
12318 if (LOC) {
12319 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
12320 }
12321 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
12322 initial_listsv_len = SvCUR(listsv);
12323 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
12324 }
12325
12326 if (skip_white) {
12327 RExC_parse = regpatws(pRExC_state, RExC_parse,
12328 FALSE /* means don't recognize comments */);
12329 }
12330
12331 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
12332 RExC_parse++;
12333 invert = TRUE;
12334 allow_multi_folds = FALSE;
12335 RExC_naughty++;
12336 if (skip_white) {
12337 RExC_parse = regpatws(pRExC_state, RExC_parse,
12338 FALSE /* means don't recognize comments */);
12339 }
12340 }
12341
12342 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
12343 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
12344 const char *s = RExC_parse;
12345 const char c = *s++;
12346
12347 while (isWORDCHAR(*s))
12348 s++;
12349 if (*s && c == *s && s[1] == ']') {
12350 SAVEFREESV(RExC_rx_sv);
12351 ckWARN3reg(s+2,
12352 "POSIX syntax [%c %c] belongs inside character classes",
12353 c, c);
12354 (void)ReREFCNT_inc(RExC_rx_sv);
12355 }
12356 }
12357
12358 /* If the caller wants us to just parse a single element, accomplish this
12359 * by faking the loop ending condition */
12360 if (stop_at_1 && RExC_end > RExC_parse) {
12361 stop_ptr = RExC_parse + 1;
12362 }
12363
12364 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
12365 if (UCHARAT(RExC_parse) == ']')
12366 goto charclassloop;
12367
12368parseit:
12369 while (1) {
12370 if (RExC_parse >= stop_ptr) {
12371 break;
12372 }
12373
12374 if (skip_white) {
12375 RExC_parse = regpatws(pRExC_state, RExC_parse,
12376 FALSE /* means don't recognize comments */);
12377 }
12378
12379 if (UCHARAT(RExC_parse) == ']') {
12380 break;
12381 }
12382
12383 charclassloop:
12384
12385 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
12386 save_value = value;
12387 save_prevvalue = prevvalue;
12388
12389 if (!range) {
12390 rangebegin = RExC_parse;
12391 element_count++;
12392 }
12393 if (UTF) {
12394 value = utf8n_to_uvchr((U8*)RExC_parse,
12395 RExC_end - RExC_parse,
12396 &numlen, UTF8_ALLOW_DEFAULT);
12397 RExC_parse += numlen;
12398 }
12399 else
12400 value = UCHARAT(RExC_parse++);
12401
12402 if (value == '['
12403 && RExC_parse < RExC_end
12404 && POSIXCC(UCHARAT(RExC_parse)))
12405 {
12406 namedclass = regpposixcc(pRExC_state, value, strict);
12407 }
12408 else if (value == '\\') {
12409 if (UTF) {
12410 value = utf8n_to_uvchr((U8*)RExC_parse,
12411 RExC_end - RExC_parse,
12412 &numlen, UTF8_ALLOW_DEFAULT);
12413 RExC_parse += numlen;
12414 }
12415 else
12416 value = UCHARAT(RExC_parse++);
12417
12418 /* Some compilers cannot handle switching on 64-bit integer
12419 * values, therefore value cannot be an UV. Yes, this will
12420 * be a problem later if we want switch on Unicode.
12421 * A similar issue a little bit later when switching on
12422 * namedclass. --jhi */
12423
12424 /* If the \ is escaping white space when white space is being
12425 * skipped, it means that that white space is wanted literally, and
12426 * is already in 'value'. Otherwise, need to translate the escape
12427 * into what it signifies. */
12428 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
12429
12430 case 'w': namedclass = ANYOF_WORDCHAR; break;
12431 case 'W': namedclass = ANYOF_NWORDCHAR; break;
12432 case 's': namedclass = ANYOF_SPACE; break;
12433 case 'S': namedclass = ANYOF_NSPACE; break;
12434 case 'd': namedclass = ANYOF_DIGIT; break;
12435 case 'D': namedclass = ANYOF_NDIGIT; break;
12436 case 'v': namedclass = ANYOF_VERTWS; break;
12437 case 'V': namedclass = ANYOF_NVERTWS; break;
12438 case 'h': namedclass = ANYOF_HORIZWS; break;
12439 case 'H': namedclass = ANYOF_NHORIZWS; break;
12440 case 'N': /* Handle \N{NAME} in class */
12441 {
12442 /* We only pay attention to the first char of
12443 multichar strings being returned. I kinda wonder
12444 if this makes sense as it does change the behaviour
12445 from earlier versions, OTOH that behaviour was broken
12446 as well. */
12447 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
12448 TRUE, /* => charclass */
12449 strict))
12450 {
12451 if (*flagp & RESTART_UTF8)
12452 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12453 goto parseit;
12454 }
12455 }
12456 break;
12457 case 'p':
12458 case 'P':
12459 {
12460 char *e;
12461
12462 /* We will handle any undefined properties ourselves */
12463 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
12464
12465 if (RExC_parse >= RExC_end)
12466 vFAIL2("Empty \\%c{}", (U8)value);
12467 if (*RExC_parse == '{') {
12468 const U8 c = (U8)value;
12469 e = strchr(RExC_parse++, '}');
12470 if (!e)
12471 vFAIL2("Missing right brace on \\%c{}", c);
12472 while (isSPACE(UCHARAT(RExC_parse)))
12473 RExC_parse++;
12474 if (e == RExC_parse)
12475 vFAIL2("Empty \\%c{}", c);
12476 n = e - RExC_parse;
12477 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
12478 n--;
12479 }
12480 else {
12481 e = RExC_parse;
12482 n = 1;
12483 }
12484 if (!SIZE_ONLY) {
12485 SV* invlist;
12486 char* name;
12487
12488 if (UCHARAT(RExC_parse) == '^') {
12489 RExC_parse++;
12490 n--;
12491 /* toggle. (The rhs xor gets the single bit that
12492 * differs between P and p; the other xor inverts just
12493 * that bit) */
12494 value ^= 'P' ^ 'p';
12495
12496 while (isSPACE(UCHARAT(RExC_parse))) {
12497 RExC_parse++;
12498 n--;
12499 }
12500 }
12501 /* Try to get the definition of the property into
12502 * <invlist>. If /i is in effect, the effective property
12503 * will have its name be <__NAME_i>. The design is
12504 * discussed in commit
12505 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
12506 Newx(name, n + sizeof("_i__\n"), char);
12507
12508 sprintf(name, "%s%.*s%s\n",
12509 (FOLD) ? "__" : "",
12510 (int)n,
12511 RExC_parse,
12512 (FOLD) ? "_i" : ""
12513 );
12514
12515 /* Look up the property name, and get its swash and
12516 * inversion list, if the property is found */
12517 if (swash) {
12518 SvREFCNT_dec_NN(swash);
12519 }
12520 swash = _core_swash_init("utf8", name, &PL_sv_undef,
12521 1, /* binary */
12522 0, /* not tr/// */
12523 NULL, /* No inversion list */
12524 &swash_init_flags
12525 );
12526 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
12527 if (swash) {
12528 SvREFCNT_dec_NN(swash);
12529 swash = NULL;
12530 }
12531
12532 /* Here didn't find it. It could be a user-defined
12533 * property that will be available at run-time. If we
12534 * accept only compile-time properties, is an error;
12535 * otherwise add it to the list for run-time look up */
12536 if (ret_invlist) {
12537 RExC_parse = e + 1;
12538 vFAIL3("Property '%.*s' is unknown", (int) n, name);
12539 }
12540 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
12541 (value == 'p' ? '+' : '!'),
12542 name);
12543 has_user_defined_property = TRUE;
12544
12545 /* We don't know yet, so have to assume that the
12546 * property could match something in the Latin1 range,
12547 * hence something that isn't utf8. Note that this
12548 * would cause things in <depends_list> to match
12549 * inappropriately, except that any \p{}, including
12550 * this one forces Unicode semantics, which means there
12551 * is <no depends_list> */
12552 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
12553 }
12554 else {
12555
12556 /* Here, did get the swash and its inversion list. If
12557 * the swash is from a user-defined property, then this
12558 * whole character class should be regarded as such */
12559 has_user_defined_property =
12560 (swash_init_flags
12561 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
12562
12563 /* Invert if asking for the complement */
12564 if (value == 'P') {
12565 _invlist_union_complement_2nd(properties,
12566 invlist,
12567 &properties);
12568
12569 /* The swash can't be used as-is, because we've
12570 * inverted things; delay removing it to here after
12571 * have copied its invlist above */
12572 SvREFCNT_dec_NN(swash);
12573 swash = NULL;
12574 }
12575 else {
12576 _invlist_union(properties, invlist, &properties);
12577 }
12578 }
12579 Safefree(name);
12580 }
12581 RExC_parse = e + 1;
12582 namedclass = ANYOF_UNIPROP; /* no official name, but it's
12583 named */
12584
12585 /* \p means they want Unicode semantics */
12586 RExC_uni_semantics = 1;
12587 }
12588 break;
12589 case 'n': value = '\n'; break;
12590 case 'r': value = '\r'; break;
12591 case 't': value = '\t'; break;
12592 case 'f': value = '\f'; break;
12593 case 'b': value = '\b'; break;
12594 case 'e': value = ASCII_TO_NATIVE('\033');break;
12595 case 'a': value = '\a'; break;
12596 case 'o':
12597 RExC_parse--; /* function expects to be pointed at the 'o' */
12598 {
12599 const char* error_msg;
12600 bool valid = grok_bslash_o(&RExC_parse,
12601 &value,
12602 &error_msg,
12603 SIZE_ONLY, /* warnings in pass
12604 1 only */
12605 strict,
12606 silence_non_portable,
12607 UTF);
12608 if (! valid) {
12609 vFAIL(error_msg);
12610 }
12611 }
12612 if (PL_encoding && value < 0x100) {
12613 goto recode_encoding;
12614 }
12615 break;
12616 case 'x':
12617 RExC_parse--; /* function expects to be pointed at the 'x' */
12618 {
12619 const char* error_msg;
12620 bool valid = grok_bslash_x(&RExC_parse,
12621 &value,
12622 &error_msg,
12623 TRUE, /* Output warnings */
12624 strict,
12625 silence_non_portable,
12626 UTF);
12627 if (! valid) {
12628 vFAIL(error_msg);
12629 }
12630 }
12631 if (PL_encoding && value < 0x100)
12632 goto recode_encoding;
12633 break;
12634 case 'c':
12635 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
12636 break;
12637 case '0': case '1': case '2': case '3': case '4':
12638 case '5': case '6': case '7':
12639 {
12640 /* Take 1-3 octal digits */
12641 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12642 numlen = (strict) ? 4 : 3;
12643 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
12644 RExC_parse += numlen;
12645 if (numlen != 3) {
12646 if (strict) {
12647 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12648 vFAIL("Need exactly 3 octal digits");
12649 }
12650 else if (! SIZE_ONLY /* like \08, \178 */
12651 && numlen < 3
12652 && RExC_parse < RExC_end
12653 && isDIGIT(*RExC_parse)
12654 && ckWARN(WARN_REGEXP))
12655 {
12656 SAVEFREESV(RExC_rx_sv);
12657 reg_warn_non_literal_string(
12658 RExC_parse + 1,
12659 form_short_octal_warning(RExC_parse, numlen));
12660 (void)ReREFCNT_inc(RExC_rx_sv);
12661 }
12662 }
12663 if (PL_encoding && value < 0x100)
12664 goto recode_encoding;
12665 break;
12666 }
12667 recode_encoding:
12668 if (! RExC_override_recoding) {
12669 SV* enc = PL_encoding;
12670 value = reg_recode((const char)(U8)value, &enc);
12671 if (!enc) {
12672 if (strict) {
12673 vFAIL("Invalid escape in the specified encoding");
12674 }
12675 else if (SIZE_ONLY) {
12676 ckWARNreg(RExC_parse,
12677 "Invalid escape in the specified encoding");
12678 }
12679 }
12680 break;
12681 }
12682 default:
12683 /* Allow \_ to not give an error */
12684 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
12685 if (strict) {
12686 vFAIL2("Unrecognized escape \\%c in character class",
12687 (int)value);
12688 }
12689 else {
12690 SAVEFREESV(RExC_rx_sv);
12691 ckWARN2reg(RExC_parse,
12692 "Unrecognized escape \\%c in character class passed through",
12693 (int)value);
12694 (void)ReREFCNT_inc(RExC_rx_sv);
12695 }
12696 }
12697 break;
12698 } /* End of switch on char following backslash */
12699 } /* end of handling backslash escape sequences */
12700#ifdef EBCDIC
12701 else
12702 literal_endpoint++;
12703#endif
12704
12705 /* Here, we have the current token in 'value' */
12706
12707 /* What matches in a locale is not known until runtime. This includes
12708 * what the Posix classes (like \w, [:space:]) match. Room must be
12709 * reserved (one time per class) to store such classes, either if Perl
12710 * is compiled so that locale nodes always should have this space, or
12711 * if there is such class info to be stored. The space will contain a
12712 * bit for each named class that is to be matched against. This isn't
12713 * needed for \p{} and pseudo-classes, as they are not affected by
12714 * locale, and hence are dealt with separately */
12715 if (LOC
12716 && ! need_class
12717 && (ANYOF_LOCALE == ANYOF_CLASS
12718 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
12719 {
12720 need_class = 1;
12721 if (SIZE_ONLY) {
12722 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12723 }
12724 else {
12725 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
12726 ANYOF_CLASS_ZERO(ret);
12727 }
12728 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
12729 }
12730
12731 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
12732
12733 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
12734 * literal, as is the character that began the false range, i.e.
12735 * the 'a' in the examples */
12736 if (range) {
12737 if (!SIZE_ONLY) {
12738 const int w = (RExC_parse >= rangebegin)
12739 ? RExC_parse - rangebegin
12740 : 0;
12741 if (strict) {
12742 vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
12743 }
12744 else {
12745 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
12746 ckWARN4reg(RExC_parse,
12747 "False [] range \"%*.*s\"",
12748 w, w, rangebegin);
12749 (void)ReREFCNT_inc(RExC_rx_sv);
12750 cp_list = add_cp_to_invlist(cp_list, '-');
12751 cp_list = add_cp_to_invlist(cp_list, prevvalue);
12752 }
12753 }
12754
12755 range = 0; /* this was not a true range */
12756 element_count += 2; /* So counts for three values */
12757 }
12758
12759 if (! SIZE_ONLY) {
12760 U8 classnum = namedclass_to_classnum(namedclass);
12761 if (namedclass >= ANYOF_MAX) { /* If a special class */
12762 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
12763
12764 /* Here, should be \h, \H, \v, or \V. Neither /d nor
12765 * /l make a difference in what these match. There
12766 * would be problems if these characters had folds
12767 * other than themselves, as cp_list is subject to
12768 * folding. */
12769 if (classnum != _CC_VERTSPACE) {
12770 assert( namedclass == ANYOF_HORIZWS
12771 || namedclass == ANYOF_NHORIZWS);
12772
12773 /* It turns out that \h is just a synonym for
12774 * XPosixBlank */
12775 classnum = _CC_BLANK;
12776 }
12777
12778 _invlist_union_maybe_complement_2nd(
12779 cp_list,
12780 PL_XPosix_ptrs[classnum],
12781 cBOOL(namedclass % 2), /* Complement if odd
12782 (NHORIZWS, NVERTWS)
12783 */
12784 &cp_list);
12785 }
12786 }
12787 else if (classnum == _CC_ASCII) {
12788#ifdef HAS_ISASCII
12789 if (LOC) {
12790 ANYOF_CLASS_SET(ret, namedclass);
12791 }
12792 else
12793#endif /* Not isascii(); just use the hard-coded definition for it */
12794 _invlist_union_maybe_complement_2nd(
12795 posixes,
12796 PL_ASCII,
12797 cBOOL(namedclass % 2), /* Complement if odd
12798 (NASCII) */
12799 &posixes);
12800 }
12801 else { /* Garden variety class */
12802
12803 /* The ascii range inversion list */
12804 SV* ascii_source = PL_Posix_ptrs[classnum];
12805
12806 /* The full Latin1 range inversion list */
12807 SV* l1_source = PL_L1Posix_ptrs[classnum];
12808
12809 /* This code is structured into two major clauses. The
12810 * first is for classes whose complete definitions may not
12811 * already be known. It not, the Latin1 definition
12812 * (guaranteed to already known) is used plus code is
12813 * generated to load the rest at run-time (only if needed).
12814 * If the complete definition is known, it drops down to
12815 * the second clause, where the complete definition is
12816 * known */
12817
12818 if (classnum < _FIRST_NON_SWASH_CC) {
12819
12820 /* Here, the class has a swash, which may or not
12821 * already be loaded */
12822
12823 /* The name of the property to use to match the full
12824 * eXtended Unicode range swash for this character
12825 * class */
12826 const char *Xname = swash_property_names[classnum];
12827
12828 /* If returning the inversion list, we can't defer
12829 * getting this until runtime */
12830 if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
12831 PL_utf8_swash_ptrs[classnum] =
12832 _core_swash_init("utf8", Xname, &PL_sv_undef,
12833 1, /* binary */
12834 0, /* not tr/// */
12835 NULL, /* No inversion list */
12836 NULL /* No flags */
12837 );
12838 assert(PL_utf8_swash_ptrs[classnum]);
12839 }
12840 if ( ! PL_utf8_swash_ptrs[classnum]) {
12841 if (namedclass % 2 == 0) { /* A non-complemented
12842 class */
12843 /* If not /a matching, there are code points we
12844 * don't know at compile time. Arrange for the
12845 * unknown matches to be loaded at run-time, if
12846 * needed */
12847 if (! AT_LEAST_ASCII_RESTRICTED) {
12848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::%s\n",
12849 Xname);
12850 }
12851 if (LOC) { /* Under locale, set run-time
12852 lookup */
12853 ANYOF_CLASS_SET(ret, namedclass);
12854 }
12855 else {
12856 /* Add the current class's code points to
12857 * the running total */
12858 _invlist_union(posixes,
12859 (AT_LEAST_ASCII_RESTRICTED)
12860 ? ascii_source
12861 : l1_source,
12862 &posixes);
12863 }
12864 }
12865 else { /* A complemented class */
12866 if (AT_LEAST_ASCII_RESTRICTED) {
12867 /* Under /a should match everything above
12868 * ASCII, plus the complement of the set's
12869 * ASCII matches */
12870 _invlist_union_complement_2nd(posixes,
12871 ascii_source,
12872 &posixes);
12873 }
12874 else {
12875 /* Arrange for the unknown matches to be
12876 * loaded at run-time, if needed */
12877 Perl_sv_catpvf(aTHX_ listsv, "!utf8::%s\n",
12878 Xname);
12879 runtime_posix_matches_above_Unicode = TRUE;
12880 if (LOC) {
12881 ANYOF_CLASS_SET(ret, namedclass);
12882 }
12883 else {
12884
12885 /* We want to match everything in
12886 * Latin1, except those things that
12887 * l1_source matches */
12888 SV* scratch_list = NULL;
12889 _invlist_subtract(PL_Latin1, l1_source,
12890 &scratch_list);
12891
12892 /* Add the list from this class to the
12893 * running total */
12894 if (! posixes) {
12895 posixes = scratch_list;
12896 }
12897 else {
12898 _invlist_union(posixes,
12899 scratch_list,
12900 &posixes);
12901 SvREFCNT_dec_NN(scratch_list);
12902 }
12903 if (DEPENDS_SEMANTICS) {
12904 ANYOF_FLAGS(ret)
12905 |= ANYOF_NON_UTF8_LATIN1_ALL;
12906 }
12907 }
12908 }
12909 }
12910 goto namedclass_done;
12911 }
12912
12913 /* Here, there is a swash loaded for the class. If no
12914 * inversion list for it yet, get it */
12915 if (! PL_XPosix_ptrs[classnum]) {
12916 PL_XPosix_ptrs[classnum]
12917 = _swash_to_invlist(PL_utf8_swash_ptrs[classnum]);
12918 }
12919 }
12920
12921 /* Here there is an inversion list already loaded for the
12922 * entire class */
12923
12924 if (namedclass % 2 == 0) { /* A non-complemented class,
12925 like ANYOF_PUNCT */
12926 if (! LOC) {
12927 /* For non-locale, just add it to any existing list
12928 * */
12929 _invlist_union(posixes,
12930 (AT_LEAST_ASCII_RESTRICTED)
12931 ? ascii_source
12932 : PL_XPosix_ptrs[classnum],
12933 &posixes);
12934 }
12935 else { /* Locale */
12936 SV* scratch_list = NULL;
12937
12938 /* For above Latin1 code points, we use the full
12939 * Unicode range */
12940 _invlist_intersection(PL_AboveLatin1,
12941 PL_XPosix_ptrs[classnum],
12942 &scratch_list);
12943 /* And set the output to it, adding instead if
12944 * there already is an output. Checking if
12945 * 'posixes' is NULL first saves an extra clone.
12946 * Its reference count will be decremented at the
12947 * next union, etc, or if this is the only
12948 * instance, at the end of the routine */
12949 if (! posixes) {
12950 posixes = scratch_list;
12951 }
12952 else {
12953 _invlist_union(posixes, scratch_list, &posixes);
12954 SvREFCNT_dec_NN(scratch_list);
12955 }
12956
12957#ifndef HAS_ISBLANK
12958 if (namedclass != ANYOF_BLANK) {
12959#endif
12960 /* Set this class in the node for runtime
12961 * matching */
12962 ANYOF_CLASS_SET(ret, namedclass);
12963#ifndef HAS_ISBLANK
12964 }
12965 else {
12966 /* No isblank(), use the hard-coded ASCII-range
12967 * blanks, adding them to the running total. */
12968
12969 _invlist_union(posixes, ascii_source, &posixes);
12970 }
12971#endif
12972 }
12973 }
12974 else { /* A complemented class, like ANYOF_NPUNCT */
12975 if (! LOC) {
12976 _invlist_union_complement_2nd(
12977 posixes,
12978 (AT_LEAST_ASCII_RESTRICTED)
12979 ? ascii_source
12980 : PL_XPosix_ptrs[classnum],
12981 &posixes);
12982 /* Under /d, everything in the upper half of the
12983 * Latin1 range matches this complement */
12984 if (DEPENDS_SEMANTICS) {
12985 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
12986 }
12987 }
12988 else { /* Locale */
12989 SV* scratch_list = NULL;
12990 _invlist_subtract(PL_AboveLatin1,
12991 PL_XPosix_ptrs[classnum],
12992 &scratch_list);
12993 if (! posixes) {
12994 posixes = scratch_list;
12995 }
12996 else {
12997 _invlist_union(posixes, scratch_list, &posixes);
12998 SvREFCNT_dec_NN(scratch_list);
12999 }
13000#ifndef HAS_ISBLANK
13001 if (namedclass != ANYOF_NBLANK) {
13002#endif
13003 ANYOF_CLASS_SET(ret, namedclass);
13004#ifndef HAS_ISBLANK
13005 }
13006 else {
13007 /* Get the list of all code points in Latin1
13008 * that are not ASCII blanks, and add them to
13009 * the running total */
13010 _invlist_subtract(PL_Latin1, ascii_source,
13011 &scratch_list);
13012 _invlist_union(posixes, scratch_list, &posixes);
13013 SvREFCNT_dec_NN(scratch_list);
13014 }
13015#endif
13016 }
13017 }
13018 }
13019 namedclass_done:
13020 continue; /* Go get next character */
13021 }
13022 } /* end of namedclass \blah */
13023
13024 /* Here, we have a single value. If 'range' is set, it is the ending
13025 * of a range--check its validity. Later, we will handle each
13026 * individual code point in the range. If 'range' isn't set, this
13027 * could be the beginning of a range, so check for that by looking
13028 * ahead to see if the next real character to be processed is the range
13029 * indicator--the minus sign */
13030
13031 if (skip_white) {
13032 RExC_parse = regpatws(pRExC_state, RExC_parse,
13033 FALSE /* means don't recognize comments */);
13034 }
13035
13036 if (range) {
13037 if (prevvalue > value) /* b-a */ {
13038 const int w = RExC_parse - rangebegin;
13039 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
13040 range = 0; /* not a valid range */
13041 }
13042 }
13043 else {
13044 prevvalue = value; /* save the beginning of the potential range */
13045 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13046 && *RExC_parse == '-')
13047 {
13048 char* next_char_ptr = RExC_parse + 1;
13049 if (skip_white) { /* Get the next real char after the '-' */
13050 next_char_ptr = regpatws(pRExC_state,
13051 RExC_parse + 1,
13052 FALSE); /* means don't recognize
13053 comments */
13054 }
13055
13056 /* If the '-' is at the end of the class (just before the ']',
13057 * it is a literal minus; otherwise it is a range */
13058 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13059 RExC_parse = next_char_ptr;
13060
13061 /* a bad range like \w-, [:word:]- ? */
13062 if (namedclass > OOB_NAMEDCLASS) {
13063 if (strict || ckWARN(WARN_REGEXP)) {
13064 const int w =
13065 RExC_parse >= rangebegin ?
13066 RExC_parse - rangebegin : 0;
13067 if (strict) {
13068 vFAIL4("False [] range \"%*.*s\"",
13069 w, w, rangebegin);
13070 }
13071 else {
13072 vWARN4(RExC_parse,
13073 "False [] range \"%*.*s\"",
13074 w, w, rangebegin);
13075 }
13076 }
13077 if (!SIZE_ONLY) {
13078 cp_list = add_cp_to_invlist(cp_list, '-');
13079 }
13080 element_count++;
13081 } else
13082 range = 1; /* yeah, it's a range! */
13083 continue; /* but do it the next time */
13084 }
13085 }
13086 }
13087
13088 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13089 * if not */
13090
13091 /* non-Latin1 code point implies unicode semantics. Must be set in
13092 * pass1 so is there for the whole of pass 2 */
13093 if (value > 255) {
13094 RExC_uni_semantics = 1;
13095 }
13096
13097 /* Ready to process either the single value, or the completed range.
13098 * For single-valued non-inverted ranges, we consider the possibility
13099 * of multi-char folds. (We made a conscious decision to not do this
13100 * for the other cases because it can often lead to non-intuitive
13101 * results. For example, you have the peculiar case that:
13102 * "s s" =~ /^[^\xDF]+$/i => Y
13103 * "ss" =~ /^[^\xDF]+$/i => N
13104 *
13105 * See [perl #89750] */
13106 if (FOLD && allow_multi_folds && value == prevvalue) {
13107 if (value == LATIN_SMALL_LETTER_SHARP_S
13108 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13109 value)))
13110 {
13111 /* Here <value> is indeed a multi-char fold. Get what it is */
13112
13113 U8 foldbuf[UTF8_MAXBYTES_CASE];
13114 STRLEN foldlen;
13115
13116 UV folded = _to_uni_fold_flags(
13117 value,
13118 foldbuf,
13119 &foldlen,
13120 FOLD_FLAGS_FULL
13121 | ((LOC) ? FOLD_FLAGS_LOCALE
13122 : (ASCII_FOLD_RESTRICTED)
13123 ? FOLD_FLAGS_NOMIX_ASCII
13124 : 0)
13125 );
13126
13127 /* Here, <folded> should be the first character of the
13128 * multi-char fold of <value>, with <foldbuf> containing the
13129 * whole thing. But, if this fold is not allowed (because of
13130 * the flags), <fold> will be the same as <value>, and should
13131 * be processed like any other character, so skip the special
13132 * handling */
13133 if (folded != value) {
13134
13135 /* Skip if we are recursed, currently parsing the class
13136 * again. Otherwise add this character to the list of
13137 * multi-char folds. */
13138 if (! RExC_in_multi_char_class) {
13139 AV** this_array_ptr;
13140 AV* this_array;
13141 STRLEN cp_count = utf8_length(foldbuf,
13142 foldbuf + foldlen);
13143 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13144
13145 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13146
13147
13148 if (! multi_char_matches) {
13149 multi_char_matches = newAV();
13150 }
13151
13152 /* <multi_char_matches> is actually an array of arrays.
13153 * There will be one or two top-level elements: [2],
13154 * and/or [3]. The [2] element is an array, each
13155 * element thereof is a character which folds to TWO
13156 * characters; [3] is for folds to THREE characters.
13157 * (Unicode guarantees a maximum of 3 characters in any
13158 * fold.) When we rewrite the character class below,
13159 * we will do so such that the longest folds are
13160 * written first, so that it prefers the longest
13161 * matching strings first. This is done even if it
13162 * turns out that any quantifier is non-greedy, out of
13163 * programmer laziness. Tom Christiansen has agreed
13164 * that this is ok. This makes the test for the
13165 * ligature 'ffi' come before the test for 'ff' */
13166 if (av_exists(multi_char_matches, cp_count)) {
13167 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13168 cp_count, FALSE);
13169 this_array = *this_array_ptr;
13170 }
13171 else {
13172 this_array = newAV();
13173 av_store(multi_char_matches, cp_count,
13174 (SV*) this_array);
13175 }
13176 av_push(this_array, multi_fold);
13177 }
13178
13179 /* This element should not be processed further in this
13180 * class */
13181 element_count--;
13182 value = save_value;
13183 prevvalue = save_prevvalue;
13184 continue;
13185 }
13186 }
13187 }
13188
13189 /* Deal with this element of the class */
13190 if (! SIZE_ONLY) {
13191#ifndef EBCDIC
13192 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
13193#else
13194 SV* this_range = _new_invlist(1);
13195 _append_range_to_invlist(this_range, prevvalue, value);
13196
13197 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
13198 * If this range was specified using something like 'i-j', we want
13199 * to include only the 'i' and the 'j', and not anything in
13200 * between, so exclude non-ASCII, non-alphabetics from it.
13201 * However, if the range was specified with something like
13202 * [\x89-\x91] or [\x89-j], all code points within it should be
13203 * included. literal_endpoint==2 means both ends of the range used
13204 * a literal character, not \x{foo} */
13205 if (literal_endpoint == 2
13206 && ((prevvalue >= 'a' && value <= 'z')
13207 || (prevvalue >= 'A' && value <= 'Z')))
13208 {
13209 _invlist_intersection(this_range, PL_ASCII,
13210 &this_range);
13211 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
13212 &this_range);
13213 }
13214 _invlist_union(cp_list, this_range, &cp_list);
13215 literal_endpoint = 0;
13216#endif
13217 }
13218
13219 range = 0; /* this range (if it was one) is done now */
13220 } /* End of loop through all the text within the brackets */
13221
13222 /* If anything in the class expands to more than one character, we have to
13223 * deal with them by building up a substitute parse string, and recursively
13224 * calling reg() on it, instead of proceeding */
13225 if (multi_char_matches) {
13226 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
13227 I32 cp_count;
13228 STRLEN len;
13229 char *save_end = RExC_end;
13230 char *save_parse = RExC_parse;
13231 bool first_time = TRUE; /* First multi-char occurrence doesn't get
13232 a "|" */
13233 I32 reg_flags;
13234
13235 assert(! invert);
13236#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
13237 because too confusing */
13238 if (invert) {
13239 sv_catpv(substitute_parse, "(?:");
13240 }
13241#endif
13242
13243 /* Look at the longest folds first */
13244 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
13245
13246 if (av_exists(multi_char_matches, cp_count)) {
13247 AV** this_array_ptr;
13248 SV* this_sequence;
13249
13250 this_array_ptr = (AV**) av_fetch(multi_char_matches,
13251 cp_count, FALSE);
13252 while ((this_sequence = av_pop(*this_array_ptr)) !=
13253 &PL_sv_undef)
13254 {
13255 if (! first_time) {
13256 sv_catpv(substitute_parse, "|");
13257 }
13258 first_time = FALSE;
13259
13260 sv_catpv(substitute_parse, SvPVX(this_sequence));
13261 }
13262 }
13263 }
13264
13265 /* If the character class contains anything else besides these
13266 * multi-character folds, have to include it in recursive parsing */
13267 if (element_count) {
13268 sv_catpv(substitute_parse, "|[");
13269 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
13270 sv_catpv(substitute_parse, "]");
13271 }
13272
13273 sv_catpv(substitute_parse, ")");
13274#if 0
13275 if (invert) {
13276 /* This is a way to get the parse to skip forward a whole named
13277 * sequence instead of matching the 2nd character when it fails the
13278 * first */
13279 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
13280 }
13281#endif
13282
13283 RExC_parse = SvPV(substitute_parse, len);
13284 RExC_end = RExC_parse + len;
13285 RExC_in_multi_char_class = 1;
13286 RExC_emit = (regnode *)orig_emit;
13287
13288 ret = reg(pRExC_state, 1, &reg_flags, depth+1);
13289
13290 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
13291
13292 RExC_parse = save_parse;
13293 RExC_end = save_end;
13294 RExC_in_multi_char_class = 0;
13295 SvREFCNT_dec_NN(multi_char_matches);
13296 return ret;
13297 }
13298
13299 /* If the character class contains only a single element, it may be
13300 * optimizable into another node type which is smaller and runs faster.
13301 * Check if this is the case for this class */
13302 if (element_count == 1 && ! ret_invlist) {
13303 U8 op = END;
13304 U8 arg = 0;
13305
13306 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
13307 [:digit:] or \p{foo} */
13308
13309 /* All named classes are mapped into POSIXish nodes, with its FLAG
13310 * argument giving which class it is */
13311 switch ((I32)namedclass) {
13312 case ANYOF_UNIPROP:
13313 break;
13314
13315 /* These don't depend on the charset modifiers. They always
13316 * match under /u rules */
13317 case ANYOF_NHORIZWS:
13318 case ANYOF_HORIZWS:
13319 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
13320 /* FALLTHROUGH */
13321
13322 case ANYOF_NVERTWS:
13323 case ANYOF_VERTWS:
13324 op = POSIXU;
13325 goto join_posix;
13326
13327 /* The actual POSIXish node for all the rest depends on the
13328 * charset modifier. The ones in the first set depend only on
13329 * ASCII or, if available on this platform, locale */
13330 case ANYOF_ASCII:
13331 case ANYOF_NASCII:
13332#ifdef HAS_ISASCII
13333 op = (LOC) ? POSIXL : POSIXA;
13334#else
13335 op = POSIXA;
13336#endif
13337 goto join_posix;
13338
13339 case ANYOF_NCASED:
13340 case ANYOF_LOWER:
13341 case ANYOF_NLOWER:
13342 case ANYOF_UPPER:
13343 case ANYOF_NUPPER:
13344 /* under /a could be alpha */
13345 if (FOLD) {
13346 if (ASCII_RESTRICTED) {
13347 namedclass = ANYOF_ALPHA + (namedclass % 2);
13348 }
13349 else if (! LOC) {
13350 break;
13351 }
13352 }
13353 /* FALLTHROUGH */
13354
13355 /* The rest have more possibilities depending on the charset.
13356 * We take advantage of the enum ordering of the charset
13357 * modifiers to get the exact node type, */
13358 default:
13359 op = POSIXD + get_regex_charset(RExC_flags);
13360 if (op > POSIXA) { /* /aa is same as /a */
13361 op = POSIXA;
13362 }
13363#ifndef HAS_ISBLANK
13364 if (op == POSIXL
13365 && (namedclass == ANYOF_BLANK
13366 || namedclass == ANYOF_NBLANK))
13367 {
13368 op = POSIXA;
13369 }
13370#endif
13371
13372 join_posix:
13373 /* The odd numbered ones are the complements of the
13374 * next-lower even number one */
13375 if (namedclass % 2 == 1) {
13376 invert = ! invert;
13377 namedclass--;
13378 }
13379 arg = namedclass_to_classnum(namedclass);
13380 break;
13381 }
13382 }
13383 else if (value == prevvalue) {
13384
13385 /* Here, the class consists of just a single code point */
13386
13387 if (invert) {
13388 if (! LOC && value == '\n') {
13389 op = REG_ANY; /* Optimize [^\n] */
13390 *flagp |= HASWIDTH|SIMPLE;
13391 RExC_naughty++;
13392 }
13393 }
13394 else if (value < 256 || UTF) {
13395
13396 /* Optimize a single value into an EXACTish node, but not if it
13397 * would require converting the pattern to UTF-8. */
13398 op = compute_EXACTish(pRExC_state);
13399 }
13400 } /* Otherwise is a range */
13401 else if (! LOC) { /* locale could vary these */
13402 if (prevvalue == '0') {
13403 if (value == '9') {
13404 arg = _CC_DIGIT;
13405 op = POSIXA;
13406 }
13407 }
13408 }
13409
13410 /* Here, we have changed <op> away from its initial value iff we found
13411 * an optimization */
13412 if (op != END) {
13413
13414 /* Throw away this ANYOF regnode, and emit the calculated one,
13415 * which should correspond to the beginning, not current, state of
13416 * the parse */
13417 const char * cur_parse = RExC_parse;
13418 RExC_parse = (char *)orig_parse;
13419 if ( SIZE_ONLY) {
13420 if (! LOC) {
13421
13422 /* To get locale nodes to not use the full ANYOF size would
13423 * require moving the code above that writes the portions
13424 * of it that aren't in other nodes to after this point.
13425 * e.g. ANYOF_CLASS_SET */
13426 RExC_size = orig_size;
13427 }
13428 }
13429 else {
13430 RExC_emit = (regnode *)orig_emit;
13431 if (PL_regkind[op] == POSIXD) {
13432 if (invert) {
13433 op += NPOSIXD - POSIXD;
13434 }
13435 }
13436 }
13437
13438 ret = reg_node(pRExC_state, op);
13439
13440 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
13441 if (! SIZE_ONLY) {
13442 FLAGS(ret) = arg;
13443 }
13444 *flagp |= HASWIDTH|SIMPLE;
13445 }
13446 else if (PL_regkind[op] == EXACT) {
13447 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13448 }
13449
13450 RExC_parse = (char *) cur_parse;
13451
13452 SvREFCNT_dec(posixes);
13453 SvREFCNT_dec(cp_list);
13454 return ret;
13455 }
13456 }
13457
13458 if (SIZE_ONLY)
13459 return ret;
13460 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
13461
13462 /* If folding, we calculate all characters that could fold to or from the
13463 * ones already on the list */
13464 if (FOLD && cp_list) {
13465 UV start, end; /* End points of code point ranges */
13466
13467 SV* fold_intersection = NULL;
13468
13469 /* If the highest code point is within Latin1, we can use the
13470 * compiled-in Alphas list, and not have to go out to disk. This
13471 * yields two false positives, the masculine and feminine ordinal
13472 * indicators, which are weeded out below using the
13473 * IS_IN_SOME_FOLD_L1() macro */
13474 if (invlist_highest(cp_list) < 256) {
13475 _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
13476 &fold_intersection);
13477 }
13478 else {
13479
13480 /* Here, there are non-Latin1 code points, so we will have to go
13481 * fetch the list of all the characters that participate in folds
13482 */
13483 if (! PL_utf8_foldable) {
13484 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13485 &PL_sv_undef, 1, 0);
13486 PL_utf8_foldable = _get_swash_invlist(swash);
13487 SvREFCNT_dec_NN(swash);
13488 }
13489
13490 /* This is a hash that for a particular fold gives all characters
13491 * that are involved in it */
13492 if (! PL_utf8_foldclosures) {
13493
13494 /* If we were unable to find any folds, then we likely won't be
13495 * able to find the closures. So just create an empty list.
13496 * Folding will effectively be restricted to the non-Unicode
13497 * rules hard-coded into Perl. (This case happens legitimately
13498 * during compilation of Perl itself before the Unicode tables
13499 * are generated) */
13500 if (_invlist_len(PL_utf8_foldable) == 0) {
13501 PL_utf8_foldclosures = newHV();
13502 }
13503 else {
13504 /* If the folds haven't been read in, call a fold function
13505 * to force that */
13506 if (! PL_utf8_tofold) {
13507 U8 dummy[UTF8_MAXBYTES_CASE+1];
13508
13509 /* This string is just a short named one above \xff */
13510 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
13511 assert(PL_utf8_tofold); /* Verify that worked */
13512 }
13513 PL_utf8_foldclosures =
13514 _swash_inversion_hash(PL_utf8_tofold);
13515 }
13516 }
13517
13518 /* Only the characters in this class that participate in folds need
13519 * be checked. Get the intersection of this class and all the
13520 * possible characters that are foldable. This can quickly narrow
13521 * down a large class */
13522 _invlist_intersection(PL_utf8_foldable, cp_list,
13523 &fold_intersection);
13524 }
13525
13526 /* Now look at the foldable characters in this class individually */
13527 invlist_iterinit(fold_intersection);
13528 while (invlist_iternext(fold_intersection, &start, &end)) {
13529 UV j;
13530
13531 /* Locale folding for Latin1 characters is deferred until runtime */
13532 if (LOC && start < 256) {
13533 start = 256;
13534 }
13535
13536 /* Look at every character in the range */
13537 for (j = start; j <= end; j++) {
13538
13539 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
13540 STRLEN foldlen;
13541 SV** listp;
13542
13543 if (j < 256) {
13544
13545 /* We have the latin1 folding rules hard-coded here so that
13546 * an innocent-looking character class, like /[ks]/i won't
13547 * have to go out to disk to find the possible matches.
13548 * XXX It would be better to generate these via regen, in
13549 * case a new version of the Unicode standard adds new
13550 * mappings, though that is not really likely, and may be
13551 * caught by the default: case of the switch below. */
13552
13553 if (IS_IN_SOME_FOLD_L1(j)) {
13554
13555 /* ASCII is always matched; non-ASCII is matched only
13556 * under Unicode rules */
13557 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
13558 cp_list =
13559 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
13560 }
13561 else {
13562 depends_list =
13563 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
13564 }
13565 }
13566
13567 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
13568 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
13569 {
13570 /* Certain Latin1 characters have matches outside
13571 * Latin1. To get here, <j> is one of those
13572 * characters. None of these matches is valid for
13573 * ASCII characters under /aa, which is why the 'if'
13574 * just above excludes those. These matches only
13575 * happen when the target string is utf8. The code
13576 * below adds the single fold closures for <j> to the
13577 * inversion list. */
13578 switch (j) {
13579 case 'k':
13580 case 'K':
13581 cp_list =
13582 add_cp_to_invlist(cp_list, KELVIN_SIGN);
13583 break;
13584 case 's':
13585 case 'S':
13586 cp_list = add_cp_to_invlist(cp_list,
13587 LATIN_SMALL_LETTER_LONG_S);
13588 break;
13589 case MICRO_SIGN:
13590 cp_list = add_cp_to_invlist(cp_list,
13591 GREEK_CAPITAL_LETTER_MU);
13592 cp_list = add_cp_to_invlist(cp_list,
13593 GREEK_SMALL_LETTER_MU);
13594 break;
13595 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13596 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13597 cp_list =
13598 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
13599 break;
13600 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13601 cp_list = add_cp_to_invlist(cp_list,
13602 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13603 break;
13604 case LATIN_SMALL_LETTER_SHARP_S:
13605 cp_list = add_cp_to_invlist(cp_list,
13606 LATIN_CAPITAL_LETTER_SHARP_S);
13607 break;
13608 case 'F': case 'f':
13609 case 'I': case 'i':
13610 case 'L': case 'l':
13611 case 'T': case 't':
13612 case 'A': case 'a':
13613 case 'H': case 'h':
13614 case 'J': case 'j':
13615 case 'N': case 'n':
13616 case 'W': case 'w':
13617 case 'Y': case 'y':
13618 /* These all are targets of multi-character
13619 * folds from code points that require UTF8 to
13620 * express, so they can't match unless the
13621 * target string is in UTF-8, so no action here
13622 * is necessary, as regexec.c properly handles
13623 * the general case for UTF-8 matching and
13624 * multi-char folds */
13625 break;
13626 default:
13627 /* Use deprecated warning to increase the
13628 * chances of this being output */
13629 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
13630 break;
13631 }
13632 }
13633 continue;
13634 }
13635
13636 /* Here is an above Latin1 character. We don't have the rules
13637 * hard-coded for it. First, get its fold. This is the simple
13638 * fold, as the multi-character folds have been handled earlier
13639 * and separated out */
13640 _to_uni_fold_flags(j, foldbuf, &foldlen,
13641 ((LOC)
13642 ? FOLD_FLAGS_LOCALE
13643 : (ASCII_FOLD_RESTRICTED)
13644 ? FOLD_FLAGS_NOMIX_ASCII
13645 : 0));
13646
13647 /* Single character fold of above Latin1. Add everything in
13648 * its fold closure to the list that this node should match.
13649 * The fold closures data structure is a hash with the keys
13650 * being the UTF-8 of every character that is folded to, like
13651 * 'k', and the values each an array of all code points that
13652 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
13653 * Multi-character folds are not included */
13654 if ((listp = hv_fetch(PL_utf8_foldclosures,
13655 (char *) foldbuf, foldlen, FALSE)))
13656 {
13657 AV* list = (AV*) *listp;
13658 IV k;
13659 for (k = 0; k <= av_len(list); k++) {
13660 SV** c_p = av_fetch(list, k, FALSE);
13661 UV c;
13662 if (c_p == NULL) {
13663 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
13664 }
13665 c = SvUV(*c_p);
13666
13667 /* /aa doesn't allow folds between ASCII and non-; /l
13668 * doesn't allow them between above and below 256 */
13669 if ((ASCII_FOLD_RESTRICTED
13670 && (isASCII(c) != isASCII(j)))
13671 || (LOC && c < 256)) {
13672 continue;
13673 }
13674
13675 /* Folds involving non-ascii Latin1 characters
13676 * under /d are added to a separate list */
13677 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
13678 {
13679 cp_list = add_cp_to_invlist(cp_list, c);
13680 }
13681 else {
13682 depends_list = add_cp_to_invlist(depends_list, c);
13683 }
13684 }
13685 }
13686 }
13687 }
13688 SvREFCNT_dec_NN(fold_intersection);
13689 }
13690
13691 /* And combine the result (if any) with any inversion list from posix
13692 * classes. The lists are kept separate up to now because we don't want to
13693 * fold the classes (folding of those is automatically handled by the swash
13694 * fetching code) */
13695 if (posixes) {
13696 if (! DEPENDS_SEMANTICS) {
13697 if (cp_list) {
13698 _invlist_union(cp_list, posixes, &cp_list);
13699 SvREFCNT_dec_NN(posixes);
13700 }
13701 else {
13702 cp_list = posixes;
13703 }
13704 }
13705 else {
13706 /* Under /d, we put into a separate list the Latin1 things that
13707 * match only when the target string is utf8 */
13708 SV* nonascii_but_latin1_properties = NULL;
13709 _invlist_intersection(posixes, PL_Latin1,
13710 &nonascii_but_latin1_properties);
13711 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
13712 &nonascii_but_latin1_properties);
13713 _invlist_subtract(posixes, nonascii_but_latin1_properties,
13714 &posixes);
13715 if (cp_list) {
13716 _invlist_union(cp_list, posixes, &cp_list);
13717 SvREFCNT_dec_NN(posixes);
13718 }
13719 else {
13720 cp_list = posixes;
13721 }
13722
13723 if (depends_list) {
13724 _invlist_union(depends_list, nonascii_but_latin1_properties,
13725 &depends_list);
13726 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
13727 }
13728 else {
13729 depends_list = nonascii_but_latin1_properties;
13730 }
13731 }
13732 }
13733
13734 /* And combine the result (if any) with any inversion list from properties.
13735 * The lists are kept separate up to now so that we can distinguish the two
13736 * in regards to matching above-Unicode. A run-time warning is generated
13737 * if a Unicode property is matched against a non-Unicode code point. But,
13738 * we allow user-defined properties to match anything, without any warning,
13739 * and we also suppress the warning if there is a portion of the character
13740 * class that isn't a Unicode property, and which matches above Unicode, \W
13741 * or [\x{110000}] for example.
13742 * (Note that in this case, unlike the Posix one above, there is no
13743 * <depends_list>, because having a Unicode property forces Unicode
13744 * semantics */
13745 if (properties) {
13746 bool warn_super = ! has_user_defined_property;
13747 if (cp_list) {
13748
13749 /* If it matters to the final outcome, see if a non-property
13750 * component of the class matches above Unicode. If so, the
13751 * warning gets suppressed. This is true even if just a single
13752 * such code point is specified, as though not strictly correct if
13753 * another such code point is matched against, the fact that they
13754 * are using above-Unicode code points indicates they should know
13755 * the issues involved */
13756 if (warn_super) {
13757 bool non_prop_matches_above_Unicode =
13758 runtime_posix_matches_above_Unicode
13759 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
13760 if (invert) {
13761 non_prop_matches_above_Unicode =
13762 ! non_prop_matches_above_Unicode;
13763 }
13764 warn_super = ! non_prop_matches_above_Unicode;
13765 }
13766
13767 _invlist_union(properties, cp_list, &cp_list);
13768 SvREFCNT_dec_NN(properties);
13769 }
13770 else {
13771 cp_list = properties;
13772 }
13773
13774 if (warn_super) {
13775 OP(ret) = ANYOF_WARN_SUPER;
13776 }
13777 }
13778
13779 /* Here, we have calculated what code points should be in the character
13780 * class.
13781 *
13782 * Now we can see about various optimizations. Fold calculation (which we
13783 * did above) needs to take place before inversion. Otherwise /[^k]/i
13784 * would invert to include K, which under /i would match k, which it
13785 * shouldn't. Therefore we can't invert folded locale now, as it won't be
13786 * folded until runtime */
13787
13788 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
13789 * at compile time. Besides not inverting folded locale now, we can't
13790 * invert if there are things such as \w, which aren't known until runtime
13791 * */
13792 if (invert
13793 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
13794 && ! depends_list
13795 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13796 {
13797 _invlist_invert(cp_list);
13798
13799 /* Any swash can't be used as-is, because we've inverted things */
13800 if (swash) {
13801 SvREFCNT_dec_NN(swash);
13802 swash = NULL;
13803 }
13804
13805 /* Clear the invert flag since have just done it here */
13806 invert = FALSE;
13807 }
13808
13809 if (ret_invlist) {
13810 *ret_invlist = cp_list;
13811 SvREFCNT_dec(swash);
13812
13813 /* Discard the generated node */
13814 if (SIZE_ONLY) {
13815 RExC_size = orig_size;
13816 }
13817 else {
13818 RExC_emit = orig_emit;
13819 }
13820 return orig_emit;
13821 }
13822
13823 /* If we didn't do folding, it's because some information isn't available
13824 * until runtime; set the run-time fold flag for these. (We don't have to
13825 * worry about properties folding, as that is taken care of by the swash
13826 * fetching) */
13827 if (FOLD && LOC)
13828 {
13829 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
13830 }
13831
13832 /* Some character classes are equivalent to other nodes. Such nodes take
13833 * up less room and generally fewer operations to execute than ANYOF nodes.
13834 * Above, we checked for and optimized into some such equivalents for
13835 * certain common classes that are easy to test. Getting to this point in
13836 * the code means that the class didn't get optimized there. Since this
13837 * code is only executed in Pass 2, it is too late to save space--it has
13838 * been allocated in Pass 1, and currently isn't given back. But turning
13839 * things into an EXACTish node can allow the optimizer to join it to any
13840 * adjacent such nodes. And if the class is equivalent to things like /./,
13841 * expensive run-time swashes can be avoided. Now that we have more
13842 * complete information, we can find things necessarily missed by the
13843 * earlier code. I (khw) am not sure how much to look for here. It would
13844 * be easy, but perhaps too slow, to check any candidates against all the
13845 * node types they could possibly match using _invlistEQ(). */
13846
13847 if (cp_list
13848 && ! invert
13849 && ! depends_list
13850 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
13851 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13852 {
13853 UV start, end;
13854 U8 op = END; /* The optimzation node-type */
13855 const char * cur_parse= RExC_parse;
13856
13857 invlist_iterinit(cp_list);
13858 if (! invlist_iternext(cp_list, &start, &end)) {
13859
13860 /* Here, the list is empty. This happens, for example, when a
13861 * Unicode property is the only thing in the character class, and
13862 * it doesn't match anything. (perluniprops.pod notes such
13863 * properties) */
13864 op = OPFAIL;
13865 *flagp |= HASWIDTH|SIMPLE;
13866 }
13867 else if (start == end) { /* The range is a single code point */
13868 if (! invlist_iternext(cp_list, &start, &end)
13869
13870 /* Don't do this optimization if it would require changing
13871 * the pattern to UTF-8 */
13872 && (start < 256 || UTF))
13873 {
13874 /* Here, the list contains a single code point. Can optimize
13875 * into an EXACT node */
13876
13877 value = start;
13878
13879 if (! FOLD) {
13880 op = EXACT;
13881 }
13882 else if (LOC) {
13883
13884 /* A locale node under folding with one code point can be
13885 * an EXACTFL, as its fold won't be calculated until
13886 * runtime */
13887 op = EXACTFL;
13888 }
13889 else {
13890
13891 /* Here, we are generally folding, but there is only one
13892 * code point to match. If we have to, we use an EXACT
13893 * node, but it would be better for joining with adjacent
13894 * nodes in the optimization pass if we used the same
13895 * EXACTFish node that any such are likely to be. We can
13896 * do this iff the code point doesn't participate in any
13897 * folds. For example, an EXACTF of a colon is the same as
13898 * an EXACT one, since nothing folds to or from a colon. */
13899 if (value < 256) {
13900 if (IS_IN_SOME_FOLD_L1(value)) {
13901 op = EXACT;
13902 }
13903 }
13904 else {
13905 if (! PL_utf8_foldable) {
13906 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13907 &PL_sv_undef, 1, 0);
13908 PL_utf8_foldable = _get_swash_invlist(swash);
13909 SvREFCNT_dec_NN(swash);
13910 }
13911 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13912 op = EXACT;
13913 }
13914 }
13915
13916 /* If we haven't found the node type, above, it means we
13917 * can use the prevailing one */
13918 if (op == END) {
13919 op = compute_EXACTish(pRExC_state);
13920 }
13921 }
13922 }
13923 }
13924 else if (start == 0) {
13925 if (end == UV_MAX) {
13926 op = SANY;
13927 *flagp |= HASWIDTH|SIMPLE;
13928 RExC_naughty++;
13929 }
13930 else if (end == '\n' - 1
13931 && invlist_iternext(cp_list, &start, &end)
13932 && start == '\n' + 1 && end == UV_MAX)
13933 {
13934 op = REG_ANY;
13935 *flagp |= HASWIDTH|SIMPLE;
13936 RExC_naughty++;
13937 }
13938 }
13939 invlist_iterfinish(cp_list);
13940
13941 if (op != END) {
13942 RExC_parse = (char *)orig_parse;
13943 RExC_emit = (regnode *)orig_emit;
13944
13945 ret = reg_node(pRExC_state, op);
13946
13947 RExC_parse = (char *)cur_parse;
13948
13949 if (PL_regkind[op] == EXACT) {
13950 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13951 }
13952
13953 SvREFCNT_dec_NN(cp_list);
13954 return ret;
13955 }
13956 }
13957
13958 /* Here, <cp_list> contains all the code points we can determine at
13959 * compile time that match under all conditions. Go through it, and
13960 * for things that belong in the bitmap, put them there, and delete from
13961 * <cp_list>. While we are at it, see if everything above 255 is in the
13962 * list, and if so, set a flag to speed up execution */
13963 ANYOF_BITMAP_ZERO(ret);
13964 if (cp_list) {
13965
13966 /* This gets set if we actually need to modify things */
13967 bool change_invlist = FALSE;
13968
13969 UV start, end;
13970
13971 /* Start looking through <cp_list> */
13972 invlist_iterinit(cp_list);
13973 while (invlist_iternext(cp_list, &start, &end)) {
13974 UV high;
13975 int i;
13976
13977 if (end == UV_MAX && start <= 256) {
13978 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13979 }
13980
13981 /* Quit if are above what we should change */
13982 if (start > 255) {
13983 break;
13984 }
13985
13986 change_invlist = TRUE;
13987
13988 /* Set all the bits in the range, up to the max that we are doing */
13989 high = (end < 255) ? end : 255;
13990 for (i = start; i <= (int) high; i++) {
13991 if (! ANYOF_BITMAP_TEST(ret, i)) {
13992 ANYOF_BITMAP_SET(ret, i);
13993 }
13994 }
13995 }
13996 invlist_iterfinish(cp_list);
13997
13998 /* Done with loop; remove any code points that are in the bitmap from
13999 * <cp_list> */
14000 if (change_invlist) {
14001 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
14002 }
14003
14004 /* If have completely emptied it, remove it completely */
14005 if (_invlist_len(cp_list) == 0) {
14006 SvREFCNT_dec_NN(cp_list);
14007 cp_list = NULL;
14008 }
14009 }
14010
14011 if (invert) {
14012 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14013 }
14014
14015 /* Here, the bitmap has been populated with all the Latin1 code points that
14016 * always match. Can now add to the overall list those that match only
14017 * when the target string is UTF-8 (<depends_list>). */
14018 if (depends_list) {
14019 if (cp_list) {
14020 _invlist_union(cp_list, depends_list, &cp_list);
14021 SvREFCNT_dec_NN(depends_list);
14022 }
14023 else {
14024 cp_list = depends_list;
14025 }
14026 }
14027
14028 /* If there is a swash and more than one element, we can't use the swash in
14029 * the optimization below. */
14030 if (swash && element_count > 1) {
14031 SvREFCNT_dec_NN(swash);
14032 swash = NULL;
14033 }
14034
14035 if (! cp_list
14036 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14037 {
14038 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
14039 }
14040 else {
14041 /* av[0] stores the character class description in its textual form:
14042 * used later (regexec.c:Perl_regclass_swash()) to initialize the
14043 * appropriate swash, and is also useful for dumping the regnode.
14044 * av[1] if NULL, is a placeholder to later contain the swash computed
14045 * from av[0]. But if no further computation need be done, the
14046 * swash is stored there now.
14047 * av[2] stores the cp_list inversion list for use in addition or
14048 * instead of av[0]; used only if av[1] is NULL
14049 * av[3] is set if any component of the class is from a user-defined
14050 * property; used only if av[1] is NULL */
14051 AV * const av = newAV();
14052 SV *rv;
14053
14054 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14055 ? SvREFCNT_inc(listsv) : &PL_sv_undef);
14056 if (swash) {
14057 av_store(av, 1, swash);
14058 SvREFCNT_dec_NN(cp_list);
14059 }
14060 else {
14061 av_store(av, 1, NULL);
14062 if (cp_list) {
14063 av_store(av, 2, cp_list);
14064 av_store(av, 3, newSVuv(has_user_defined_property));
14065 }
14066 }
14067
14068 rv = newRV_noinc(MUTABLE_SV(av));
14069 n = add_data(pRExC_state, 1, "s");
14070 RExC_rxi->data->data[n] = (void*)rv;
14071 ARG_SET(ret, n);
14072 }
14073
14074 *flagp |= HASWIDTH|SIMPLE;
14075 return ret;
14076}
14077#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14078
14079
14080/* reg_skipcomment()
14081
14082 Absorbs an /x style # comments from the input stream.
14083 Returns true if there is more text remaining in the stream.
14084 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
14085 terminates the pattern without including a newline.
14086
14087 Note its the callers responsibility to ensure that we are
14088 actually in /x mode
14089
14090*/
14091
14092STATIC bool
14093S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14094{
14095 bool ended = 0;
14096
14097 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14098
14099 while (RExC_parse < RExC_end)
14100 if (*RExC_parse++ == '\n') {
14101 ended = 1;
14102 break;
14103 }
14104 if (!ended) {
14105 /* we ran off the end of the pattern without ending
14106 the comment, so we have to add an \n when wrapping */
14107 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
14108 return 0;
14109 } else
14110 return 1;
14111}
14112
14113/* nextchar()
14114
14115 Advances the parse position, and optionally absorbs
14116 "whitespace" from the inputstream.
14117
14118 Without /x "whitespace" means (?#...) style comments only,
14119 with /x this means (?#...) and # comments and whitespace proper.
14120
14121 Returns the RExC_parse point from BEFORE the scan occurs.
14122
14123 This is the /x friendly way of saying RExC_parse++.
14124*/
14125
14126STATIC char*
14127S_nextchar(pTHX_ RExC_state_t *pRExC_state)
14128{
14129 char* const retval = RExC_parse++;
14130
14131 PERL_ARGS_ASSERT_NEXTCHAR;
14132
14133 for (;;) {
14134 if (RExC_end - RExC_parse >= 3
14135 && *RExC_parse == '('
14136 && RExC_parse[1] == '?'
14137 && RExC_parse[2] == '#')
14138 {
14139 while (*RExC_parse != ')') {
14140 if (RExC_parse == RExC_end)
14141 FAIL("Sequence (?#... not terminated");
14142 RExC_parse++;
14143 }
14144 RExC_parse++;
14145 continue;
14146 }
14147 if (RExC_flags & RXf_PMf_EXTENDED) {
14148 if (isSPACE(*RExC_parse)) {
14149 RExC_parse++;
14150 continue;
14151 }
14152 else if (*RExC_parse == '#') {
14153 if ( reg_skipcomment( pRExC_state ) )
14154 continue;
14155 }
14156 }
14157 return retval;
14158 }
14159}
14160
14161/*
14162- reg_node - emit a node
14163*/
14164STATIC regnode * /* Location. */
14165S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
14166{
14167 dVAR;
14168 regnode *ptr;
14169 regnode * const ret = RExC_emit;
14170 GET_RE_DEBUG_FLAGS_DECL;
14171
14172 PERL_ARGS_ASSERT_REG_NODE;
14173
14174 if (SIZE_ONLY) {
14175 SIZE_ALIGN(RExC_size);
14176 RExC_size += 1;
14177 return(ret);
14178 }
14179 if (RExC_emit >= RExC_emit_bound)
14180 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14181 op, RExC_emit, RExC_emit_bound);
14182
14183 NODE_ALIGN_FILL(ret);
14184 ptr = ret;
14185 FILL_ADVANCE_NODE(ptr, op);
14186#ifdef RE_TRACK_PATTERN_OFFSETS
14187 if (RExC_offsets) { /* MJD */
14188 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
14189 "reg_node", __LINE__,
14190 PL_reg_name[op],
14191 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
14192 ? "Overwriting end of array!\n" : "OK",
14193 (UV)(RExC_emit - RExC_emit_start),
14194 (UV)(RExC_parse - RExC_start),
14195 (UV)RExC_offsets[0]));
14196 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
14197 }
14198#endif
14199 RExC_emit = ptr;
14200 return(ret);
14201}
14202
14203/*
14204- reganode - emit a node with an argument
14205*/
14206STATIC regnode * /* Location. */
14207S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
14208{
14209 dVAR;
14210 regnode *ptr;
14211 regnode * const ret = RExC_emit;
14212 GET_RE_DEBUG_FLAGS_DECL;
14213
14214 PERL_ARGS_ASSERT_REGANODE;
14215
14216 if (SIZE_ONLY) {
14217 SIZE_ALIGN(RExC_size);
14218 RExC_size += 2;
14219 /*
14220 We can't do this:
14221
14222 assert(2==regarglen[op]+1);
14223
14224 Anything larger than this has to allocate the extra amount.
14225 If we changed this to be:
14226
14227 RExC_size += (1 + regarglen[op]);
14228
14229 then it wouldn't matter. Its not clear what side effect
14230 might come from that so its not done so far.
14231 -- dmq
14232 */
14233 return(ret);
14234 }
14235 if (RExC_emit >= RExC_emit_bound)
14236 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
14237 op, RExC_emit, RExC_emit_bound);
14238
14239 NODE_ALIGN_FILL(ret);
14240 ptr = ret;
14241 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
14242#ifdef RE_TRACK_PATTERN_OFFSETS
14243 if (RExC_offsets) { /* MJD */
14244 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14245 "reganode",
14246 __LINE__,
14247 PL_reg_name[op],
14248 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
14249 "Overwriting end of array!\n" : "OK",
14250 (UV)(RExC_emit - RExC_emit_start),
14251 (UV)(RExC_parse - RExC_start),
14252 (UV)RExC_offsets[0]));
14253 Set_Cur_Node_Offset;
14254 }
14255#endif
14256 RExC_emit = ptr;
14257 return(ret);
14258}
14259
14260/*
14261- reguni - emit (if appropriate) a Unicode character
14262*/
14263STATIC STRLEN
14264S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
14265{
14266 dVAR;
14267
14268 PERL_ARGS_ASSERT_REGUNI;
14269
14270 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
14271}
14272
14273/*
14274- reginsert - insert an operator in front of already-emitted operand
14275*
14276* Means relocating the operand.
14277*/
14278STATIC void
14279S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
14280{
14281 dVAR;
14282 regnode *src;
14283 regnode *dst;
14284 regnode *place;
14285 const int offset = regarglen[(U8)op];
14286 const int size = NODE_STEP_REGNODE + offset;
14287 GET_RE_DEBUG_FLAGS_DECL;
14288
14289 PERL_ARGS_ASSERT_REGINSERT;
14290 PERL_UNUSED_ARG(depth);
14291/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
14292 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
14293 if (SIZE_ONLY) {
14294 RExC_size += size;
14295 return;
14296 }
14297
14298 src = RExC_emit;
14299 RExC_emit += size;
14300 dst = RExC_emit;
14301 if (RExC_open_parens) {
14302 int paren;
14303 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
14304 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
14305 if ( RExC_open_parens[paren] >= opnd ) {
14306 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
14307 RExC_open_parens[paren] += size;
14308 } else {
14309 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
14310 }
14311 if ( RExC_close_parens[paren] >= opnd ) {
14312 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
14313 RExC_close_parens[paren] += size;
14314 } else {
14315 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
14316 }
14317 }
14318 }
14319
14320 while (src > opnd) {
14321 StructCopy(--src, --dst, regnode);
14322#ifdef RE_TRACK_PATTERN_OFFSETS
14323 if (RExC_offsets) { /* MJD 20010112 */
14324 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
14325 "reg_insert",
14326 __LINE__,
14327 PL_reg_name[op],
14328 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
14329 ? "Overwriting end of array!\n" : "OK",
14330 (UV)(src - RExC_emit_start),
14331 (UV)(dst - RExC_emit_start),
14332 (UV)RExC_offsets[0]));
14333 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
14334 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
14335 }
14336#endif
14337 }
14338
14339
14340 place = opnd; /* Op node, where operand used to be. */
14341#ifdef RE_TRACK_PATTERN_OFFSETS
14342 if (RExC_offsets) { /* MJD */
14343 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
14344 "reginsert",
14345 __LINE__,
14346 PL_reg_name[op],
14347 (UV)(place - RExC_emit_start) > RExC_offsets[0]
14348 ? "Overwriting end of array!\n" : "OK",
14349 (UV)(place - RExC_emit_start),
14350 (UV)(RExC_parse - RExC_start),
14351 (UV)RExC_offsets[0]));
14352 Set_Node_Offset(place, RExC_parse);
14353 Set_Node_Length(place, 1);
14354 }
14355#endif
14356 src = NEXTOPER(place);
14357 FILL_ADVANCE_NODE(place, op);
14358 Zero(src, offset, regnode);
14359}
14360
14361/*
14362- regtail - set the next-pointer at the end of a node chain of p to val.
14363- SEE ALSO: regtail_study
14364*/
14365/* TODO: All three parms should be const */
14366STATIC void
14367S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14368{
14369 dVAR;
14370 regnode *scan;
14371 GET_RE_DEBUG_FLAGS_DECL;
14372
14373 PERL_ARGS_ASSERT_REGTAIL;
14374#ifndef DEBUGGING
14375 PERL_UNUSED_ARG(depth);
14376#endif
14377
14378 if (SIZE_ONLY)
14379 return;
14380
14381 /* Find last node. */
14382 scan = p;
14383 for (;;) {
14384 regnode * const temp = regnext(scan);
14385 DEBUG_PARSE_r({
14386 SV * const mysv=sv_newmortal();
14387 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
14388 regprop(RExC_rx, mysv, scan);
14389 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
14390 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
14391 (temp == NULL ? "->" : ""),
14392 (temp == NULL ? PL_reg_name[OP(val)] : "")
14393 );
14394 });
14395 if (temp == NULL)
14396 break;
14397 scan = temp;
14398 }
14399
14400 if (reg_off_by_arg[OP(scan)]) {
14401 ARG_SET(scan, val - scan);
14402 }
14403 else {
14404 NEXT_OFF(scan) = val - scan;
14405 }
14406}
14407
14408#ifdef DEBUGGING
14409/*
14410- regtail_study - set the next-pointer at the end of a node chain of p to val.
14411- Look for optimizable sequences at the same time.
14412- currently only looks for EXACT chains.
14413
14414This is experimental code. The idea is to use this routine to perform
14415in place optimizations on branches and groups as they are constructed,
14416with the long term intention of removing optimization from study_chunk so
14417that it is purely analytical.
14418
14419Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
14420to control which is which.
14421
14422*/
14423/* TODO: All four parms should be const */
14424
14425STATIC U8
14426S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
14427{
14428 dVAR;
14429 regnode *scan;
14430 U8 exact = PSEUDO;
14431#ifdef EXPERIMENTAL_INPLACESCAN
14432 I32 min = 0;
14433#endif
14434 GET_RE_DEBUG_FLAGS_DECL;
14435
14436 PERL_ARGS_ASSERT_REGTAIL_STUDY;
14437
14438
14439 if (SIZE_ONLY)
14440 return exact;
14441
14442 /* Find last node. */
14443
14444 scan = p;
14445 for (;;) {
14446 regnode * const temp = regnext(scan);
14447#ifdef EXPERIMENTAL_INPLACESCAN
14448 if (PL_regkind[OP(scan)] == EXACT) {
14449 bool has_exactf_sharp_s; /* Unexamined in this routine */
14450 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
14451 return EXACT;
14452 }
14453#endif
14454 if ( exact ) {
14455 switch (OP(scan)) {
14456 case EXACT:
14457 case EXACTF:
14458 case EXACTFA_NO_TRIE:
14459 case EXACTFA:
14460 case EXACTFU:
14461 case EXACTFU_SS:
14462 case EXACTFL:
14463 if( exact == PSEUDO )
14464 exact= OP(scan);
14465 else if ( exact != OP(scan) )
14466 exact= 0;
14467 case NOTHING:
14468 break;
14469 default:
14470 exact= 0;
14471 }
14472 }
14473 DEBUG_PARSE_r({
14474 SV * const mysv=sv_newmortal();
14475 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
14476 regprop(RExC_rx, mysv, scan);
14477 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
14478 SvPV_nolen_const(mysv),
14479 REG_NODE_NUM(scan),
14480 PL_reg_name[exact]);
14481 });
14482 if (temp == NULL)
14483 break;
14484 scan = temp;
14485 }
14486 DEBUG_PARSE_r({
14487 SV * const mysv_val=sv_newmortal();
14488 DEBUG_PARSE_MSG("");
14489 regprop(RExC_rx, mysv_val, val);
14490 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
14491 SvPV_nolen_const(mysv_val),
14492 (IV)REG_NODE_NUM(val),
14493 (IV)(val - scan)
14494 );
14495 });
14496 if (reg_off_by_arg[OP(scan)]) {
14497 ARG_SET(scan, val - scan);
14498 }
14499 else {
14500 NEXT_OFF(scan) = val - scan;
14501 }
14502
14503 return exact;
14504}
14505#endif
14506
14507/*
14508 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
14509 */
14510#ifdef DEBUGGING
14511
14512static void
14513S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
14514{
14515 int bit;
14516 int set=0;
14517
14518 for (bit=0; bit<32; bit++) {
14519 if (flags & (1<<bit)) {
14520 if (!set++ && lead)
14521 PerlIO_printf(Perl_debug_log, "%s",lead);
14522 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
14523 }
14524 }
14525 if (lead) {
14526 if (set)
14527 PerlIO_printf(Perl_debug_log, "\n");
14528 else
14529 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14530 }
14531}
14532
14533static void
14534S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
14535{
14536 int bit;
14537 int set=0;
14538 regex_charset cs;
14539
14540 for (bit=0; bit<32; bit++) {
14541 if (flags & (1<<bit)) {
14542 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
14543 continue;
14544 }
14545 if (!set++ && lead)
14546 PerlIO_printf(Perl_debug_log, "%s",lead);
14547 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
14548 }
14549 }
14550 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
14551 if (!set++ && lead) {
14552 PerlIO_printf(Perl_debug_log, "%s",lead);
14553 }
14554 switch (cs) {
14555 case REGEX_UNICODE_CHARSET:
14556 PerlIO_printf(Perl_debug_log, "UNICODE");
14557 break;
14558 case REGEX_LOCALE_CHARSET:
14559 PerlIO_printf(Perl_debug_log, "LOCALE");
14560 break;
14561 case REGEX_ASCII_RESTRICTED_CHARSET:
14562 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
14563 break;
14564 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
14565 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
14566 break;
14567 default:
14568 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
14569 break;
14570 }
14571 }
14572 if (lead) {
14573 if (set)
14574 PerlIO_printf(Perl_debug_log, "\n");
14575 else
14576 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
14577 }
14578}
14579#endif
14580
14581void
14582Perl_regdump(pTHX_ const regexp *r)
14583{
14584#ifdef DEBUGGING
14585 dVAR;
14586 SV * const sv = sv_newmortal();
14587 SV *dsv= sv_newmortal();
14588 RXi_GET_DECL(r,ri);
14589 GET_RE_DEBUG_FLAGS_DECL;
14590
14591 PERL_ARGS_ASSERT_REGDUMP;
14592
14593 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
14594
14595 /* Header fields of interest. */
14596 if (r->anchored_substr) {
14597 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
14598 RE_SV_DUMPLEN(r->anchored_substr), 30);
14599 PerlIO_printf(Perl_debug_log,
14600 "anchored %s%s at %"IVdf" ",
14601 s, RE_SV_TAIL(r->anchored_substr),
14602 (IV)r->anchored_offset);
14603 } else if (r->anchored_utf8) {
14604 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
14605 RE_SV_DUMPLEN(r->anchored_utf8), 30);
14606 PerlIO_printf(Perl_debug_log,
14607 "anchored utf8 %s%s at %"IVdf" ",
14608 s, RE_SV_TAIL(r->anchored_utf8),
14609 (IV)r->anchored_offset);
14610 }
14611 if (r->float_substr) {
14612 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
14613 RE_SV_DUMPLEN(r->float_substr), 30);
14614 PerlIO_printf(Perl_debug_log,
14615 "floating %s%s at %"IVdf"..%"UVuf" ",
14616 s, RE_SV_TAIL(r->float_substr),
14617 (IV)r->float_min_offset, (UV)r->float_max_offset);
14618 } else if (r->float_utf8) {
14619 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
14620 RE_SV_DUMPLEN(r->float_utf8), 30);
14621 PerlIO_printf(Perl_debug_log,
14622 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
14623 s, RE_SV_TAIL(r->float_utf8),
14624 (IV)r->float_min_offset, (UV)r->float_max_offset);
14625 }
14626 if (r->check_substr || r->check_utf8)
14627 PerlIO_printf(Perl_debug_log,
14628 (const char *)
14629 (r->check_substr == r->float_substr
14630 && r->check_utf8 == r->float_utf8
14631 ? "(checking floating" : "(checking anchored"));
14632 if (r->extflags & RXf_NOSCAN)
14633 PerlIO_printf(Perl_debug_log, " noscan");
14634 if (r->extflags & RXf_CHECK_ALL)
14635 PerlIO_printf(Perl_debug_log, " isall");
14636 if (r->check_substr || r->check_utf8)
14637 PerlIO_printf(Perl_debug_log, ") ");
14638
14639 if (ri->regstclass) {
14640 regprop(r, sv, ri->regstclass);
14641 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
14642 }
14643 if (r->extflags & RXf_ANCH) {
14644 PerlIO_printf(Perl_debug_log, "anchored");
14645 if (r->extflags & RXf_ANCH_BOL)
14646 PerlIO_printf(Perl_debug_log, "(BOL)");
14647 if (r->extflags & RXf_ANCH_MBOL)
14648 PerlIO_printf(Perl_debug_log, "(MBOL)");
14649 if (r->extflags & RXf_ANCH_SBOL)
14650 PerlIO_printf(Perl_debug_log, "(SBOL)");
14651 if (r->extflags & RXf_ANCH_GPOS)
14652 PerlIO_printf(Perl_debug_log, "(GPOS)");
14653 PerlIO_putc(Perl_debug_log, ' ');
14654 }
14655 if (r->extflags & RXf_GPOS_SEEN)
14656 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
14657 if (r->intflags & PREGf_SKIP)
14658 PerlIO_printf(Perl_debug_log, "plus ");
14659 if (r->intflags & PREGf_IMPLICIT)
14660 PerlIO_printf(Perl_debug_log, "implicit ");
14661 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
14662 if (r->extflags & RXf_EVAL_SEEN)
14663 PerlIO_printf(Perl_debug_log, "with eval ");
14664 PerlIO_printf(Perl_debug_log, "\n");
14665 DEBUG_FLAGS_r({
14666 regdump_extflags("r->extflags: ",r->extflags);
14667 regdump_intflags("r->intflags: ",r->intflags);
14668 });
14669#else
14670 PERL_ARGS_ASSERT_REGDUMP;
14671 PERL_UNUSED_CONTEXT;
14672 PERL_UNUSED_ARG(r);
14673#endif /* DEBUGGING */
14674}
14675
14676/*
14677- regprop - printable representation of opcode
14678*/
14679#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
14680STMT_START { \
14681 if (do_sep) { \
14682 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
14683 if (flags & ANYOF_INVERT) \
14684 /*make sure the invert info is in each */ \
14685 sv_catpvs(sv, "^"); \
14686 do_sep = 0; \
14687 } \
14688} STMT_END
14689
14690void
14691Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
14692{
14693#ifdef DEBUGGING
14694 dVAR;
14695 int k;
14696
14697 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
14698 static const char * const anyofs[] = {
14699#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
14700 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
14701 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
14702 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
14703 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
14704 || _CC_VERTSPACE != 16
14705 #error Need to adjust order of anyofs[]
14706#endif
14707 "[\\w]",
14708 "[\\W]",
14709 "[\\d]",
14710 "[\\D]",
14711 "[:alpha:]",
14712 "[:^alpha:]",
14713 "[:lower:]",
14714 "[:^lower:]",
14715 "[:upper:]",
14716 "[:^upper:]",
14717 "[:punct:]",
14718 "[:^punct:]",
14719 "[:print:]",
14720 "[:^print:]",
14721 "[:alnum:]",
14722 "[:^alnum:]",
14723 "[:graph:]",
14724 "[:^graph:]",
14725 "[:cased:]",
14726 "[:^cased:]",
14727 "[\\s]",
14728 "[\\S]",
14729 "[:blank:]",
14730 "[:^blank:]",
14731 "[:xdigit:]",
14732 "[:^xdigit:]",
14733 "[:space:]",
14734 "[:^space:]",
14735 "[:cntrl:]",
14736 "[:^cntrl:]",
14737 "[:ascii:]",
14738 "[:^ascii:]",
14739 "[\\v]",
14740 "[\\V]"
14741 };
14742 RXi_GET_DECL(prog,progi);
14743 GET_RE_DEBUG_FLAGS_DECL;
14744
14745 PERL_ARGS_ASSERT_REGPROP;
14746
14747 sv_setpvs(sv, "");
14748
14749 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
14750 /* It would be nice to FAIL() here, but this may be called from
14751 regexec.c, and it would be hard to supply pRExC_state. */
14752 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
14753 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
14754
14755 k = PL_regkind[OP(o)];
14756
14757 if (k == EXACT) {
14758 sv_catpvs(sv, " ");
14759 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
14760 * is a crude hack but it may be the best for now since
14761 * we have no flag "this EXACTish node was UTF-8"
14762 * --jhi */
14763 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
14764 PERL_PV_ESCAPE_UNI_DETECT |
14765 PERL_PV_ESCAPE_NONASCII |
14766 PERL_PV_PRETTY_ELLIPSES |
14767 PERL_PV_PRETTY_LTGT |
14768 PERL_PV_PRETTY_NOCLEAR
14769 );
14770 } else if (k == TRIE) {
14771 /* print the details of the trie in dumpuntil instead, as
14772 * progi->data isn't available here */
14773 const char op = OP(o);
14774 const U32 n = ARG(o);
14775 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
14776 (reg_ac_data *)progi->data->data[n] :
14777 NULL;
14778 const reg_trie_data * const trie
14779 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
14780
14781 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
14782 DEBUG_TRIE_COMPILE_r(
14783 Perl_sv_catpvf(aTHX_ sv,
14784 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
14785 (UV)trie->startstate,
14786 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
14787 (UV)trie->wordcount,
14788 (UV)trie->minlen,
14789 (UV)trie->maxlen,
14790 (UV)TRIE_CHARCOUNT(trie),
14791 (UV)trie->uniquecharcount
14792 )
14793 );
14794 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
14795 sv_catpvs(sv, "[");
14796 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
14797 ? ANYOF_BITMAP(o)
14798 : TRIE_BITMAP(trie));
14799 sv_catpvs(sv, "]");
14800 }
14801
14802 } else if (k == CURLY) {
14803 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
14804 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
14805 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
14806 }
14807 else if (k == WHILEM && o->flags) /* Ordinal/of */
14808 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
14809 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
14810 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
14811 if ( RXp_PAREN_NAMES(prog) ) {
14812 if ( k != REF || (OP(o) < NREF)) {
14813 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
14814 SV **name= av_fetch(list, ARG(o), 0 );
14815 if (name)
14816 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14817 }
14818 else {
14819 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
14820 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
14821 I32 *nums=(I32*)SvPVX(sv_dat);
14822 SV **name= av_fetch(list, nums[0], 0 );
14823 I32 n;
14824 if (name) {
14825 for ( n=0; n<SvIVX(sv_dat); n++ ) {
14826 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
14827 (n ? "," : ""), (IV)nums[n]);
14828 }
14829 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
14830 }
14831 }
14832 }
14833 } else if (k == GOSUB)
14834 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
14835 else if (k == VERB) {
14836 if (!o->flags)
14837 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
14838 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
14839 } else if (k == LOGICAL)
14840 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
14841 else if (k == ANYOF) {
14842 const U8 flags = ANYOF_FLAGS(o);
14843 int do_sep = 0;
14844
14845
14846 if (flags & ANYOF_LOCALE)
14847 sv_catpvs(sv, "{loc}");
14848 if (flags & ANYOF_LOC_FOLD)
14849 sv_catpvs(sv, "{i}");
14850 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
14851 if (flags & ANYOF_INVERT)
14852 sv_catpvs(sv, "^");
14853
14854 /* output what the standard cp 0-255 bitmap matches */
14855 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
14856
14857 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14858 /* output any special charclass tests (used entirely under use locale) */
14859 if (ANYOF_CLASS_TEST_ANY_SET(o)) {
14860 int i;
14861 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
14862 if (ANYOF_CLASS_TEST(o,i)) {
14863 sv_catpv(sv, anyofs[i]);
14864 do_sep = 1;
14865 }
14866 }
14867 }
14868
14869 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
14870
14871 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
14872 sv_catpvs(sv, "{non-utf8-latin1-all}");
14873 }
14874
14875 /* output information about the unicode matching */
14876 if (flags & ANYOF_UNICODE_ALL)
14877 sv_catpvs(sv, "{unicode_all}");
14878 else if (ANYOF_NONBITMAP(o)) {
14879 SV *lv; /* Set if there is something outside the bit map. */
14880 bool byte_output = FALSE; /* If something in the bitmap has been
14881 output */
14882
14883 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
14884 sv_catpvs(sv, "{outside bitmap}");
14885 }
14886 else {
14887 sv_catpvs(sv, "{utf8}");
14888 }
14889
14890 /* Get the stuff that wasn't in the bitmap */
14891 (void) regclass_swash(prog, o, FALSE, &lv, NULL);
14892 if (lv && lv != &PL_sv_undef) {
14893 char *s = savesvpv(lv);
14894 char * const origs = s;
14895
14896 while (*s && *s != '\n')
14897 s++;
14898
14899 if (*s == '\n') {
14900 const char * const t = ++s;
14901
14902 if (byte_output) {
14903 sv_catpvs(sv, " ");
14904 }
14905
14906 while (*s) {
14907 if (*s == '\n') {
14908
14909 /* Truncate very long output */
14910 if (s - origs > 256) {
14911 Perl_sv_catpvf(aTHX_ sv,
14912 "%.*s...",
14913 (int) (s - origs - 1),
14914 t);
14915 goto out_dump;
14916 }
14917 *s = ' ';
14918 }
14919 else if (*s == '\t') {
14920 *s = '-';
14921 }
14922 s++;
14923 }
14924 if (s[-1] == ' ')
14925 s[-1] = 0;
14926
14927 sv_catpv(sv, t);
14928 }
14929
14930 out_dump:
14931
14932 Safefree(origs);
14933 SvREFCNT_dec_NN(lv);
14934 }
14935 }
14936
14937 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14938 }
14939 else if (k == POSIXD || k == NPOSIXD) {
14940 U8 index = FLAGS(o) * 2;
14941 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14942 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14943 }
14944 else {
14945 sv_catpv(sv, anyofs[index]);
14946 }
14947 }
14948 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14949 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14950#else
14951 PERL_UNUSED_CONTEXT;
14952 PERL_UNUSED_ARG(sv);
14953 PERL_UNUSED_ARG(o);
14954 PERL_UNUSED_ARG(prog);
14955#endif /* DEBUGGING */
14956}
14957
14958SV *
14959Perl_re_intuit_string(pTHX_ REGEXP * const r)
14960{ /* Assume that RE_INTUIT is set */
14961 dVAR;
14962 struct regexp *const prog = ReANY(r);
14963 GET_RE_DEBUG_FLAGS_DECL;
14964
14965 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14966 PERL_UNUSED_CONTEXT;
14967
14968 DEBUG_COMPILE_r(
14969 {
14970 const char * const s = SvPV_nolen_const(prog->check_substr
14971 ? prog->check_substr : prog->check_utf8);
14972
14973 if (!PL_colorset) reginitcolors();
14974 PerlIO_printf(Perl_debug_log,
14975 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14976 PL_colors[4],
14977 prog->check_substr ? "" : "utf8 ",
14978 PL_colors[5],PL_colors[0],
14979 s,
14980 PL_colors[1],
14981 (strlen(s) > 60 ? "..." : ""));
14982 } );
14983
14984 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14985}
14986
14987/*
14988 pregfree()
14989
14990 handles refcounting and freeing the perl core regexp structure. When
14991 it is necessary to actually free the structure the first thing it
14992 does is call the 'free' method of the regexp_engine associated to
14993 the regexp, allowing the handling of the void *pprivate; member
14994 first. (This routine is not overridable by extensions, which is why
14995 the extensions free is called first.)
14996
14997 See regdupe and regdupe_internal if you change anything here.
14998*/
14999#ifndef PERL_IN_XSUB_RE
15000void
15001Perl_pregfree(pTHX_ REGEXP *r)
15002{
15003 SvREFCNT_dec(r);
15004}
15005
15006void
15007Perl_pregfree2(pTHX_ REGEXP *rx)
15008{
15009 dVAR;
15010 struct regexp *const r = ReANY(rx);
15011 GET_RE_DEBUG_FLAGS_DECL;
15012
15013 PERL_ARGS_ASSERT_PREGFREE2;
15014
15015 if (r->mother_re) {
15016 ReREFCNT_dec(r->mother_re);
15017 } else {
15018 CALLREGFREE_PVT(rx); /* free the private data */
15019 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15020 Safefree(r->xpv_len_u.xpvlenu_pv);
15021 }
15022 if (r->substrs) {
15023 SvREFCNT_dec(r->anchored_substr);
15024 SvREFCNT_dec(r->anchored_utf8);
15025 SvREFCNT_dec(r->float_substr);
15026 SvREFCNT_dec(r->float_utf8);
15027 Safefree(r->substrs);
15028 }
15029 RX_MATCH_COPY_FREE(rx);
15030#ifdef PERL_ANY_COW
15031 SvREFCNT_dec(r->saved_copy);
15032#endif
15033 Safefree(r->offs);
15034 SvREFCNT_dec(r->qr_anoncv);
15035 rx->sv_u.svu_rx = 0;
15036}
15037
15038/* reg_temp_copy()
15039
15040 This is a hacky workaround to the structural issue of match results
15041 being stored in the regexp structure which is in turn stored in
15042 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
15043 could be PL_curpm in multiple contexts, and could require multiple
15044 result sets being associated with the pattern simultaneously, such
15045 as when doing a recursive match with (??{$qr})
15046
15047 The solution is to make a lightweight copy of the regexp structure
15048 when a qr// is returned from the code executed by (??{$qr}) this
15049 lightweight copy doesn't actually own any of its data except for
15050 the starp/end and the actual regexp structure itself.
15051
15052*/
15053
15054
15055REGEXP *
15056Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
15057{
15058 struct regexp *ret;
15059 struct regexp *const r = ReANY(rx);
15060 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
15061
15062 PERL_ARGS_ASSERT_REG_TEMP_COPY;
15063
15064 if (!ret_x)
15065 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
15066 else {
15067 SvOK_off((SV *)ret_x);
15068 if (islv) {
15069 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
15070 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
15071 made both spots point to the same regexp body.) */
15072 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
15073 assert(!SvPVX(ret_x));
15074 ret_x->sv_u.svu_rx = temp->sv_any;
15075 temp->sv_any = NULL;
15076 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
15077 SvREFCNT_dec_NN(temp);
15078 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
15079 ing below will not set it. */
15080 SvCUR_set(ret_x, SvCUR(rx));
15081 }
15082 }
15083 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
15084 sv_force_normal(sv) is called. */
15085 SvFAKE_on(ret_x);
15086 ret = ReANY(ret_x);
15087
15088 SvFLAGS(ret_x) |= SvUTF8(rx);
15089 /* We share the same string buffer as the original regexp, on which we
15090 hold a reference count, incremented when mother_re is set below.
15091 The string pointer is copied here, being part of the regexp struct.
15092 */
15093 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
15094 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
15095 if (r->offs) {
15096 const I32 npar = r->nparens+1;
15097 Newx(ret->offs, npar, regexp_paren_pair);
15098 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15099 }
15100 if (r->substrs) {
15101 Newx(ret->substrs, 1, struct reg_substr_data);
15102 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15103
15104 SvREFCNT_inc_void(ret->anchored_substr);
15105 SvREFCNT_inc_void(ret->anchored_utf8);
15106 SvREFCNT_inc_void(ret->float_substr);
15107 SvREFCNT_inc_void(ret->float_utf8);
15108
15109 /* check_substr and check_utf8, if non-NULL, point to either their
15110 anchored or float namesakes, and don't hold a second reference. */
15111 }
15112 RX_MATCH_COPIED_off(ret_x);
15113#ifdef PERL_ANY_COW
15114 ret->saved_copy = NULL;
15115#endif
15116 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
15117 SvREFCNT_inc_void(ret->qr_anoncv);
15118
15119 return ret_x;
15120}
15121#endif
15122
15123/* regfree_internal()
15124
15125 Free the private data in a regexp. This is overloadable by
15126 extensions. Perl takes care of the regexp structure in pregfree(),
15127 this covers the *pprivate pointer which technically perl doesn't
15128 know about, however of course we have to handle the
15129 regexp_internal structure when no extension is in use.
15130
15131 Note this is called before freeing anything in the regexp
15132 structure.
15133 */
15134
15135void
15136Perl_regfree_internal(pTHX_ REGEXP * const rx)
15137{
15138 dVAR;
15139 struct regexp *const r = ReANY(rx);
15140 RXi_GET_DECL(r,ri);
15141 GET_RE_DEBUG_FLAGS_DECL;
15142
15143 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
15144
15145 DEBUG_COMPILE_r({
15146 if (!PL_colorset)
15147 reginitcolors();
15148 {
15149 SV *dsv= sv_newmortal();
15150 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
15151 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
15152 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
15153 PL_colors[4],PL_colors[5],s);
15154 }
15155 });
15156#ifdef RE_TRACK_PATTERN_OFFSETS
15157 if (ri->u.offsets)
15158 Safefree(ri->u.offsets); /* 20010421 MJD */
15159#endif
15160 if (ri->code_blocks) {
15161 int n;
15162 for (n = 0; n < ri->num_code_blocks; n++)
15163 SvREFCNT_dec(ri->code_blocks[n].src_regex);
15164 Safefree(ri->code_blocks);
15165 }
15166
15167 if (ri->data) {
15168 int n = ri->data->count;
15169
15170 while (--n >= 0) {
15171 /* If you add a ->what type here, update the comment in regcomp.h */
15172 switch (ri->data->what[n]) {
15173 case 'a':
15174 case 'r':
15175 case 's':
15176 case 'S':
15177 case 'u':
15178 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
15179 break;
15180 case 'f':
15181 Safefree(ri->data->data[n]);
15182 break;
15183 case 'l':
15184 case 'L':
15185 break;
15186 case 'T':
15187 { /* Aho Corasick add-on structure for a trie node.
15188 Used in stclass optimization only */
15189 U32 refcount;
15190 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
15191 OP_REFCNT_LOCK;
15192 refcount = --aho->refcount;
15193 OP_REFCNT_UNLOCK;
15194 if ( !refcount ) {
15195 PerlMemShared_free(aho->states);
15196 PerlMemShared_free(aho->fail);
15197 /* do this last!!!! */
15198 PerlMemShared_free(ri->data->data[n]);
15199 PerlMemShared_free(ri->regstclass);
15200 }
15201 }
15202 break;
15203 case 't':
15204 {
15205 /* trie structure. */
15206 U32 refcount;
15207 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
15208 OP_REFCNT_LOCK;
15209 refcount = --trie->refcount;
15210 OP_REFCNT_UNLOCK;
15211 if ( !refcount ) {
15212 PerlMemShared_free(trie->charmap);
15213 PerlMemShared_free(trie->states);
15214 PerlMemShared_free(trie->trans);
15215 if (trie->bitmap)
15216 PerlMemShared_free(trie->bitmap);
15217 if (trie->jump)
15218 PerlMemShared_free(trie->jump);
15219 PerlMemShared_free(trie->wordinfo);
15220 /* do this last!!!! */
15221 PerlMemShared_free(ri->data->data[n]);
15222 }
15223 }
15224 break;
15225 default:
15226 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
15227 }
15228 }
15229 Safefree(ri->data->what);
15230 Safefree(ri->data);
15231 }
15232
15233 Safefree(ri);
15234}
15235
15236#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
15237#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
15238#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
15239
15240/*
15241 re_dup - duplicate a regexp.
15242
15243 This routine is expected to clone a given regexp structure. It is only
15244 compiled under USE_ITHREADS.
15245
15246 After all of the core data stored in struct regexp is duplicated
15247 the regexp_engine.dupe method is used to copy any private data
15248 stored in the *pprivate pointer. This allows extensions to handle
15249 any duplication it needs to do.
15250
15251 See pregfree() and regfree_internal() if you change anything here.
15252*/
15253#if defined(USE_ITHREADS)
15254#ifndef PERL_IN_XSUB_RE
15255void
15256Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
15257{
15258 dVAR;
15259 I32 npar;
15260 const struct regexp *r = ReANY(sstr);
15261 struct regexp *ret = ReANY(dstr);
15262
15263 PERL_ARGS_ASSERT_RE_DUP_GUTS;
15264
15265 npar = r->nparens+1;
15266 Newx(ret->offs, npar, regexp_paren_pair);
15267 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
15268
15269 if (ret->substrs) {
15270 /* Do it this way to avoid reading from *r after the StructCopy().
15271 That way, if any of the sv_dup_inc()s dislodge *r from the L1
15272 cache, it doesn't matter. */
15273 const bool anchored = r->check_substr
15274 ? r->check_substr == r->anchored_substr
15275 : r->check_utf8 == r->anchored_utf8;
15276 Newx(ret->substrs, 1, struct reg_substr_data);
15277 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
15278
15279 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
15280 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
15281 ret->float_substr = sv_dup_inc(ret->float_substr, param);
15282 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
15283
15284 /* check_substr and check_utf8, if non-NULL, point to either their
15285 anchored or float namesakes, and don't hold a second reference. */
15286
15287 if (ret->check_substr) {
15288 if (anchored) {
15289 assert(r->check_utf8 == r->anchored_utf8);
15290 ret->check_substr = ret->anchored_substr;
15291 ret->check_utf8 = ret->anchored_utf8;
15292 } else {
15293 assert(r->check_substr == r->float_substr);
15294 assert(r->check_utf8 == r->float_utf8);
15295 ret->check_substr = ret->float_substr;
15296 ret->check_utf8 = ret->float_utf8;
15297 }
15298 } else if (ret->check_utf8) {
15299 if (anchored) {
15300 ret->check_utf8 = ret->anchored_utf8;
15301 } else {
15302 ret->check_utf8 = ret->float_utf8;
15303 }
15304 }
15305 }
15306
15307 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
15308 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
15309
15310 if (ret->pprivate)
15311 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
15312
15313 if (RX_MATCH_COPIED(dstr))
15314 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
15315 else
15316 ret->subbeg = NULL;
15317#ifdef PERL_ANY_COW
15318 ret->saved_copy = NULL;
15319#endif
15320
15321 /* Whether mother_re be set or no, we need to copy the string. We
15322 cannot refrain from copying it when the storage points directly to
15323 our mother regexp, because that's
15324 1: a buffer in a different thread
15325 2: something we no longer hold a reference on
15326 so we need to copy it locally. */
15327 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
15328 ret->mother_re = NULL;
15329}
15330#endif /* PERL_IN_XSUB_RE */
15331
15332/*
15333 regdupe_internal()
15334
15335 This is the internal complement to regdupe() which is used to copy
15336 the structure pointed to by the *pprivate pointer in the regexp.
15337 This is the core version of the extension overridable cloning hook.
15338 The regexp structure being duplicated will be copied by perl prior
15339 to this and will be provided as the regexp *r argument, however
15340 with the /old/ structures pprivate pointer value. Thus this routine
15341 may override any copying normally done by perl.
15342
15343 It returns a pointer to the new regexp_internal structure.
15344*/
15345
15346void *
15347Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
15348{
15349 dVAR;
15350 struct regexp *const r = ReANY(rx);
15351 regexp_internal *reti;
15352 int len;
15353 RXi_GET_DECL(r,ri);
15354
15355 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
15356
15357 len = ProgLen(ri);
15358
15359 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
15360 Copy(ri->program, reti->program, len+1, regnode);
15361
15362 reti->num_code_blocks = ri->num_code_blocks;
15363 if (ri->code_blocks) {
15364 int n;
15365 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
15366 struct reg_code_block);
15367 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
15368 struct reg_code_block);
15369 for (n = 0; n < ri->num_code_blocks; n++)
15370 reti->code_blocks[n].src_regex = (REGEXP*)
15371 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
15372 }
15373 else
15374 reti->code_blocks = NULL;
15375
15376 reti->regstclass = NULL;
15377
15378 if (ri->data) {
15379 struct reg_data *d;
15380 const int count = ri->data->count;
15381 int i;
15382
15383 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
15384 char, struct reg_data);
15385 Newx(d->what, count, U8);
15386
15387 d->count = count;
15388 for (i = 0; i < count; i++) {
15389 d->what[i] = ri->data->what[i];
15390 switch (d->what[i]) {
15391 /* see also regcomp.h and regfree_internal() */
15392 case 'a': /* actually an AV, but the dup function is identical. */
15393 case 'r':
15394 case 's':
15395 case 'S':
15396 case 'u': /* actually an HV, but the dup function is identical. */
15397 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
15398 break;
15399 case 'f':
15400 /* This is cheating. */
15401 Newx(d->data[i], 1, struct regnode_charclass_class);
15402 StructCopy(ri->data->data[i], d->data[i],
15403 struct regnode_charclass_class);
15404 reti->regstclass = (regnode*)d->data[i];
15405 break;
15406 case 'T':
15407 /* Trie stclasses are readonly and can thus be shared
15408 * without duplication. We free the stclass in pregfree
15409 * when the corresponding reg_ac_data struct is freed.
15410 */
15411 reti->regstclass= ri->regstclass;
15412 /* Fall through */
15413 case 't':
15414 OP_REFCNT_LOCK;
15415 ((reg_trie_data*)ri->data->data[i])->refcount++;
15416 OP_REFCNT_UNLOCK;
15417 /* Fall through */
15418 case 'l':
15419 case 'L':
15420 d->data[i] = ri->data->data[i];
15421 break;
15422 default:
15423 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
15424 }
15425 }
15426
15427 reti->data = d;
15428 }
15429 else
15430 reti->data = NULL;
15431
15432 reti->name_list_idx = ri->name_list_idx;
15433
15434#ifdef RE_TRACK_PATTERN_OFFSETS
15435 if (ri->u.offsets) {
15436 Newx(reti->u.offsets, 2*len+1, U32);
15437 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
15438 }
15439#else
15440 SetProgLen(reti,len);
15441#endif
15442
15443 return (void*)reti;
15444}
15445
15446#endif /* USE_ITHREADS */
15447
15448#ifndef PERL_IN_XSUB_RE
15449
15450/*
15451 - regnext - dig the "next" pointer out of a node
15452 */
15453regnode *
15454Perl_regnext(pTHX_ regnode *p)
15455{
15456 dVAR;
15457 I32 offset;
15458
15459 if (!p)
15460 return(NULL);
15461
15462 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
15463 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
15464 }
15465
15466 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
15467 if (offset == 0)
15468 return(NULL);
15469
15470 return(p+offset);
15471}
15472#endif
15473
15474STATIC void
15475S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
15476{
15477 va_list args;
15478 STRLEN l1 = strlen(pat1);
15479 STRLEN l2 = strlen(pat2);
15480 char buf[512];
15481 SV *msv;
15482 const char *message;
15483
15484 PERL_ARGS_ASSERT_RE_CROAK2;
15485
15486 if (l1 > 510)
15487 l1 = 510;
15488 if (l1 + l2 > 510)
15489 l2 = 510 - l1;
15490 Copy(pat1, buf, l1 , char);
15491 Copy(pat2, buf + l1, l2 , char);
15492 buf[l1 + l2] = '\n';
15493 buf[l1 + l2 + 1] = '\0';
15494#ifdef I_STDARG
15495 /* ANSI variant takes additional second argument */
15496 va_start(args, pat2);
15497#else
15498 va_start(args);
15499#endif
15500 msv = vmess(buf, &args);
15501 va_end(args);
15502 message = SvPV_const(msv,l1);
15503 if (l1 > 512)
15504 l1 = 512;
15505 Copy(message, buf, l1 , char);
15506 buf[l1-1] = '\0'; /* Overwrite \n */
15507 Perl_croak(aTHX_ "%s", buf);
15508}
15509
15510/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
15511
15512#ifndef PERL_IN_XSUB_RE
15513void
15514Perl_save_re_context(pTHX)
15515{
15516 dVAR;
15517
15518 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
15519 if (PL_curpm) {
15520 const REGEXP * const rx = PM_GETRE(PL_curpm);
15521 if (rx) {
15522 U32 i;
15523 for (i = 1; i <= RX_NPARENS(rx); i++) {
15524 char digits[TYPE_CHARS(long)];
15525 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
15526 GV *const *const gvp
15527 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
15528
15529 if (gvp) {
15530 GV * const gv = *gvp;
15531 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
15532 save_scalar(gv);
15533 }
15534 }
15535 }
15536 }
15537}
15538#endif
15539
15540#ifdef DEBUGGING
15541
15542STATIC void
15543S_put_byte(pTHX_ SV *sv, int c)
15544{
15545 PERL_ARGS_ASSERT_PUT_BYTE;
15546
15547 /* Our definition of isPRINT() ignores locales, so only bytes that are
15548 not part of UTF-8 are considered printable. I assume that the same
15549 holds for UTF-EBCDIC.
15550 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
15551 which Wikipedia says:
15552
15553 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
15554 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
15555 identical, to the ASCII delete (DEL) or rubout control character. ...
15556 it is typically mapped to hexadecimal code 9F, in order to provide a
15557 unique character mapping in both directions)
15558
15559 So the old condition can be simplified to !isPRINT(c) */
15560 if (!isPRINT(c)) {
15561 switch (c) {
15562 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
15563 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
15564 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
15565 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
15566 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
15567
15568 default:
15569 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
15570 break;
15571 }
15572 }
15573 else {
15574 const char string = c;
15575 if (c == '-' || c == ']' || c == '\\' || c == '^')
15576 sv_catpvs(sv, "\\");
15577 sv_catpvn(sv, &string, 1);
15578 }
15579}
15580
15581STATIC bool
15582S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
15583{
15584 /* Appends to 'sv' a displayable version of the innards of the bracketed
15585 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
15586 * output anything */
15587
15588 int i;
15589 int rangestart = -1;
15590 bool has_output_anything = FALSE;
15591
15592 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
15593
15594 for (i = 0; i <= 256; i++) {
15595 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
15596 if (rangestart == -1)
15597 rangestart = i;
15598 } else if (rangestart != -1) {
15599 int j = i - 1;
15600 if (i <= rangestart + 3) { /* Individual chars in short ranges */
15601 for (; rangestart < i; rangestart++)
15602 put_byte(sv, rangestart);
15603 }
15604 else if ( j > 255
15605 || ! isALPHANUMERIC(rangestart)
15606 || ! isALPHANUMERIC(j)
15607 || isDIGIT(rangestart) != isDIGIT(j)
15608 || isUPPER(rangestart) != isUPPER(j)
15609 || isLOWER(rangestart) != isLOWER(j)
15610
15611 /* This final test should get optimized out except
15612 * on EBCDIC platforms, where it causes ranges that
15613 * cross discontinuities like i/j to be shown as hex
15614 * instead of the misleading, e.g. H-K (since that
15615 * range includes more than H, I, J, K). */
15616 || (j - rangestart)
15617 != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
15618 {
15619 Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
15620 rangestart,
15621 (j < 256) ? j : 255);
15622 }
15623 else { /* Here, the ends of the range are both digits, or both
15624 uppercase, or both lowercase; and there's no
15625 discontinuity in the range (which could happen on EBCDIC
15626 platforms) */
15627 put_byte(sv, rangestart);
15628 sv_catpvs(sv, "-");
15629 put_byte(sv, j);
15630 }
15631 rangestart = -1;
15632 has_output_anything = TRUE;
15633 }
15634 }
15635
15636 return has_output_anything;
15637}
15638
15639#define CLEAR_OPTSTART \
15640 if (optstart) STMT_START { \
15641 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
15642 optstart=NULL; \
15643 } STMT_END
15644
15645#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
15646
15647STATIC const regnode *
15648S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
15649 const regnode *last, const regnode *plast,
15650 SV* sv, I32 indent, U32 depth)
15651{
15652 dVAR;
15653 U8 op = PSEUDO; /* Arbitrary non-END op. */
15654 const regnode *next;
15655 const regnode *optstart= NULL;
15656
15657 RXi_GET_DECL(r,ri);
15658 GET_RE_DEBUG_FLAGS_DECL;
15659
15660 PERL_ARGS_ASSERT_DUMPUNTIL;
15661
15662#ifdef DEBUG_DUMPUNTIL
15663 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
15664 last ? last-start : 0,plast ? plast-start : 0);
15665#endif
15666
15667 if (plast && plast < last)
15668 last= plast;
15669
15670 while (PL_regkind[op] != END && (!last || node < last)) {
15671 /* While that wasn't END last time... */
15672 NODE_ALIGN(node);
15673 op = OP(node);
15674 if (op == CLOSE || op == WHILEM)
15675 indent--;
15676 next = regnext((regnode *)node);
15677
15678 /* Where, what. */
15679 if (OP(node) == OPTIMIZED) {
15680 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
15681 optstart = node;
15682 else
15683 goto after_print;
15684 } else
15685 CLEAR_OPTSTART;
15686
15687 regprop(r, sv, node);
15688 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
15689 (int)(2*indent + 1), "", SvPVX_const(sv));
15690
15691 if (OP(node) != OPTIMIZED) {
15692 if (next == NULL) /* Next ptr. */
15693 PerlIO_printf(Perl_debug_log, " (0)");
15694 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
15695 PerlIO_printf(Perl_debug_log, " (FAIL)");
15696 else
15697 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
15698 (void)PerlIO_putc(Perl_debug_log, '\n');
15699 }
15700
15701 after_print:
15702 if (PL_regkind[(U8)op] == BRANCHJ) {
15703 assert(next);
15704 {
15705 const regnode *nnode = (OP(next) == LONGJMP
15706 ? regnext((regnode *)next)
15707 : next);
15708 if (last && nnode > last)
15709 nnode = last;
15710 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
15711 }
15712 }
15713 else if (PL_regkind[(U8)op] == BRANCH) {
15714 assert(next);
15715 DUMPUNTIL(NEXTOPER(node), next);
15716 }
15717 else if ( PL_regkind[(U8)op] == TRIE ) {
15718 const regnode *this_trie = node;
15719 const char op = OP(node);
15720 const U32 n = ARG(node);
15721 const reg_ac_data * const ac = op>=AHOCORASICK ?
15722 (reg_ac_data *)ri->data->data[n] :
15723 NULL;
15724 const reg_trie_data * const trie =
15725 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
15726#ifdef DEBUGGING
15727 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
15728#endif
15729 const regnode *nextbranch= NULL;
15730 I32 word_idx;
15731 sv_setpvs(sv, "");
15732 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
15733 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
15734
15735 PerlIO_printf(Perl_debug_log, "%*s%s ",
15736 (int)(2*(indent+3)), "",
15737 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
15738 PL_colors[0], PL_colors[1],
15739 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
15740 PERL_PV_PRETTY_ELLIPSES |
15741 PERL_PV_PRETTY_LTGT
15742 )
15743 : "???"
15744 );
15745 if (trie->jump) {
15746 U16 dist= trie->jump[word_idx+1];
15747 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
15748 (UV)((dist ? this_trie + dist : next) - start));
15749 if (dist) {
15750 if (!nextbranch)
15751 nextbranch= this_trie + trie->jump[0];
15752 DUMPUNTIL(this_trie + dist, nextbranch);
15753 }
15754 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
15755 nextbranch= regnext((regnode *)nextbranch);
15756 } else {
15757 PerlIO_printf(Perl_debug_log, "\n");
15758 }
15759 }
15760 if (last && next > last)
15761 node= last;
15762 else
15763 node= next;
15764 }
15765 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
15766 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
15767 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
15768 }
15769 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
15770 assert(next);
15771 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
15772 }
15773 else if ( op == PLUS || op == STAR) {
15774 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
15775 }
15776 else if (PL_regkind[(U8)op] == ANYOF) {
15777 /* arglen 1 + class block */
15778 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
15779 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
15780 node = NEXTOPER(node);
15781 }
15782 else if (PL_regkind[(U8)op] == EXACT) {
15783 /* Literal string, where present. */
15784 node += NODE_SZ_STR(node) - 1;
15785 node = NEXTOPER(node);
15786 }
15787 else {
15788 node = NEXTOPER(node);
15789 node += regarglen[(U8)op];
15790 }
15791 if (op == CURLYX || op == OPEN)
15792 indent++;
15793 }
15794 CLEAR_OPTSTART;
15795#ifdef DEBUG_DUMPUNTIL
15796 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
15797#endif
15798 return node;
15799}
15800
15801#endif /* DEBUGGING */
15802
15803/*
15804 * Local variables:
15805 * c-indentation-style: bsd
15806 * c-basic-offset: 4
15807 * indent-tabs-mode: nil
15808 * End:
15809 *
15810 * ex: set ts=8 sts=4 sw=4 et:
15811 */