This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Move fcn call out of loop
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84EXTERN_C const struct regexp_engine my_reg_engine;
85#else
86# include "regcomp.h"
87#endif
88
89#include "dquote_inline.h"
90#include "invlist_inline.h"
91#include "unicode_constants.h"
92
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99
100#ifndef STATIC
101#define STATIC static
102#endif
103
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
113
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
117} scan_frame;
118
119/* Certain characters are output as a sequence with the first being a
120 * backslash. */
121#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
122
123
124struct RExC_state_t {
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
127 char *precomp; /* uncompiled string. */
128 char *precomp_end; /* pointer to end of uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
132 pprivate field */
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 char *copy_start; /* start of copy of input within
137 constructed parse string */
138 char *copy_start_in_input; /* Position in input string
139 corresponding to copy_start */
140 SSize_t whilem_seen; /* number of WHILEM in this expr */
141 regnode *emit_start; /* Start of emitted-code area */
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
144 regnode_offset emit; /* Code-emit pointer */
145 I32 naughty; /* How bad is this pattern? */
146 I32 sawback; /* Did we see \1, ...? */
147 U32 seen;
148 SSize_t size; /* Number of regnode equivalents in
149 pattern */
150 I32 npar; /* Capture buffer count, (OPEN) plus
151 one. ("par" 0 is the whole
152 pattern)*/
153 I32 total_par; /* Capture buffer count after parse
154 completed, (OPEN) plus one. ("par" 0
155 is the whole pattern)*/
156 I32 nestroot; /* root parens we are in - used by
157 accept */
158 I32 extralen;
159 I32 seen_zerolen;
160 regnode_offset *open_parens; /* offsets to open parens */
161 regnode_offset *close_parens; /* offsets to close parens */
162 regnode *end_op; /* END node in program */
163 I32 utf8; /* whether the pattern is utf8 or not */
164 I32 orig_utf8; /* whether the pattern was originally in utf8 */
165 /* XXX use this for future optimisation of case
166 * where pattern must be upgraded to utf8. */
167 I32 uni_semantics; /* If a d charset modifier should use unicode
168 rules, even if the pattern is not in
169 utf8 */
170 HV *paren_names; /* Paren names */
171
172 regnode **recurse; /* Recurse regops */
173 I32 recurse_count; /* Number of recurse regops we have generated */
174 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
175 through */
176 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
177 I32 in_lookbehind;
178 I32 contains_locale;
179 I32 override_recoding;
180#ifdef EBCDIC
181 I32 recode_x_to_native;
182#endif
183 I32 in_multi_char_class;
184 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
185 within pattern */
186 int code_index; /* next code_blocks[] slot */
187 SSize_t maxlen; /* mininum possible number of chars in string to match */
188 scan_frame *frame_head;
189 scan_frame *frame_last;
190 U32 frame_count;
191 AV *warn_text;
192#ifdef ADD_TO_REGEXEC
193 char *starttry; /* -Dr: where regtry was called. */
194#define RExC_starttry (pRExC_state->starttry)
195#endif
196 SV *runtime_code_qr; /* qr with the runtime code blocks */
197#ifdef DEBUGGING
198 const char *lastparse;
199 I32 lastnum;
200 AV *paren_name_list; /* idx -> name */
201 U32 study_chunk_recursed_count;
202 SV *mysv1;
203 SV *mysv2;
204
205#define RExC_lastparse (pRExC_state->lastparse)
206#define RExC_lastnum (pRExC_state->lastnum)
207#define RExC_paren_name_list (pRExC_state->paren_name_list)
208#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
209#define RExC_mysv (pRExC_state->mysv1)
210#define RExC_mysv1 (pRExC_state->mysv1)
211#define RExC_mysv2 (pRExC_state->mysv2)
212
213#endif
214 bool seen_unfolded_sharp_s;
215 bool strict;
216 bool study_started;
217 bool in_script_run;
218 bool pass1;
219};
220
221#define RExC_flags (pRExC_state->flags)
222#define RExC_pm_flags (pRExC_state->pm_flags)
223#define RExC_precomp (pRExC_state->precomp)
224#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
225#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
226#define RExC_precomp_end (pRExC_state->precomp_end)
227#define RExC_rx_sv (pRExC_state->rx_sv)
228#define RExC_rx (pRExC_state->rx)
229#define RExC_rxi (pRExC_state->rxi)
230#define RExC_start (pRExC_state->start)
231#define RExC_end (pRExC_state->end)
232#define RExC_parse (pRExC_state->parse)
233#define RExC_whilem_seen (pRExC_state->whilem_seen)
234
235/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
236 * EXACTF node, hence was parsed under /di rules. If later in the parse,
237 * something forces the pattern into using /ui rules, the sharp s should be
238 * folded into the sequence 'ss', which takes up more space than previously
239 * calculated. This means that the sizing pass needs to be restarted. (The
240 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
241 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
242 * so there is no need to resize [perl #125990]. */
243#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
244
245#ifdef RE_TRACK_PATTERN_OFFSETS
246# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
247 others */
248#endif
249#define RExC_emit (pRExC_state->emit)
250#define RExC_pass1 (pRExC_state->pass1)
251#define RExC_emit_start (pRExC_state->emit_start)
252#define RExC_emit_bound (pRExC_state->emit_bound)
253#define RExC_sawback (pRExC_state->sawback)
254#define RExC_seen (pRExC_state->seen)
255#define RExC_size (pRExC_state->size)
256#define RExC_maxlen (pRExC_state->maxlen)
257#define RExC_npar (pRExC_state->npar)
258#define RExC_total_parens (pRExC_state->total_par)
259#define RExC_nestroot (pRExC_state->nestroot)
260#define RExC_extralen (pRExC_state->extralen)
261#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
262#define RExC_utf8 (pRExC_state->utf8)
263#define RExC_uni_semantics (pRExC_state->uni_semantics)
264#define RExC_orig_utf8 (pRExC_state->orig_utf8)
265#define RExC_open_parens (pRExC_state->open_parens)
266#define RExC_close_parens (pRExC_state->close_parens)
267#define RExC_end_op (pRExC_state->end_op)
268#define RExC_paren_names (pRExC_state->paren_names)
269#define RExC_recurse (pRExC_state->recurse)
270#define RExC_recurse_count (pRExC_state->recurse_count)
271#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
272#define RExC_study_chunk_recursed_bytes \
273 (pRExC_state->study_chunk_recursed_bytes)
274#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
275#define RExC_contains_locale (pRExC_state->contains_locale)
276#ifdef EBCDIC
277# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
278#endif
279#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
280#define RExC_frame_head (pRExC_state->frame_head)
281#define RExC_frame_last (pRExC_state->frame_last)
282#define RExC_frame_count (pRExC_state->frame_count)
283#define RExC_strict (pRExC_state->strict)
284#define RExC_study_started (pRExC_state->study_started)
285#define RExC_warn_text (pRExC_state->warn_text)
286#define RExC_in_script_run (pRExC_state->in_script_run)
287#define RExC_use_BRANCHJ (!SIZE_ONLY && RExC_extralen)
288
289/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
290 * a flag to disable back-off on the fixed/floating substrings - if it's
291 * a high complexity pattern we assume the benefit of avoiding a full match
292 * is worth the cost of checking for the substrings even if they rarely help.
293 */
294#define RExC_naughty (pRExC_state->naughty)
295#define TOO_NAUGHTY (10)
296#define MARK_NAUGHTY(add) \
297 if (RExC_naughty < TOO_NAUGHTY) \
298 RExC_naughty += (add)
299#define MARK_NAUGHTY_EXP(exp, add) \
300 if (RExC_naughty < TOO_NAUGHTY) \
301 RExC_naughty += RExC_naughty / (exp) + (add)
302
303#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
304#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
305 ((*s) == '{' && regcurly(s)))
306
307/*
308 * Flags to be passed up and down.
309 */
310#define WORST 0 /* Worst case. */
311#define HASWIDTH 0x01 /* Known to match non-null strings. */
312
313/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
314 * character. (There needs to be a case: in the switch statement in regexec.c
315 * for any node marked SIMPLE.) Note that this is not the same thing as
316 * REGNODE_SIMPLE */
317#define SIMPLE 0x02
318#define SPSTART 0x04 /* Starts with * or + */
319#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
320#define TRYAGAIN 0x10 /* Weeded out a declaration. */
321#define RESTART_PARSE 0x20 /* Need to redo the parse */
322#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
323 calcuate sizes as UTF-8 */
324
325#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
326
327/* whether trie related optimizations are enabled */
328#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
329#define TRIE_STUDY_OPT
330#define FULL_TRIE_STUDY
331#define TRIE_STCLASS
332#endif
333
334
335
336#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
337#define PBITVAL(paren) (1 << ((paren) & 7))
338#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
339#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
340#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
341
342#define REQUIRE_UTF8(flagp) STMT_START { \
343 if (!UTF) { \
344 assert(PASS1); \
345 *flagp = RESTART_PARSE|NEED_UTF8; \
346 return 0; \
347 } \
348 } STMT_END
349
350/* Change from /d into /u rules, and restart the parse if we've already seen
351 * something whose size would increase as a result, by setting *flagp and
352 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
353 * we've changed to /u during the parse. */
354#define REQUIRE_UNI_RULES(flagp, restart_retval) \
355 STMT_START { \
356 if (DEPENDS_SEMANTICS) { \
357 assert(PASS1); \
358 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
359 RExC_uni_semantics = 1; \
360 if (RExC_seen_unfolded_sharp_s) { \
361 *flagp |= RESTART_PARSE; \
362 return restart_retval; \
363 } \
364 } \
365 } STMT_END
366
367/* Executes a return statement with the value 'X', if 'flags' contains any of
368 * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
369 * flags */
370#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
371 STMT_START { \
372 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
373 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
374 return X; \
375 } \
376 } STMT_END
377
378#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
379 RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
380
381#define RETURN_X_ON_RESTART(X, flags,flagp) \
382 RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
383
384
385#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
386 if (*(flagp) & (RESTART_PARSE|(extra))) return 0
387
388#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
389
390#define RETURN_FAIL_ON_RESTART(flags,flagp) \
391 RETURN_X_ON_RESTART(0, flags,flagp)
392#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
393 RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
394
395/* This converts the named class defined in regcomp.h to its equivalent class
396 * number defined in handy.h. */
397#define namedclass_to_classnum(class) ((int) ((class) / 2))
398#define classnum_to_namedclass(classnum) ((classnum) * 2)
399
400#define _invlist_union_complement_2nd(a, b, output) \
401 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
402#define _invlist_intersection_complement_2nd(a, b, output) \
403 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
404
405/* About scan_data_t.
406
407 During optimisation we recurse through the regexp program performing
408 various inplace (keyhole style) optimisations. In addition study_chunk
409 and scan_commit populate this data structure with information about
410 what strings MUST appear in the pattern. We look for the longest
411 string that must appear at a fixed location, and we look for the
412 longest string that may appear at a floating location. So for instance
413 in the pattern:
414
415 /FOO[xX]A.*B[xX]BAR/
416
417 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
418 strings (because they follow a .* construct). study_chunk will identify
419 both FOO and BAR as being the longest fixed and floating strings respectively.
420
421 The strings can be composites, for instance
422
423 /(f)(o)(o)/
424
425 will result in a composite fixed substring 'foo'.
426
427 For each string some basic information is maintained:
428
429 - min_offset
430 This is the position the string must appear at, or not before.
431 It also implicitly (when combined with minlenp) tells us how many
432 characters must match before the string we are searching for.
433 Likewise when combined with minlenp and the length of the string it
434 tells us how many characters must appear after the string we have
435 found.
436
437 - max_offset
438 Only used for floating strings. This is the rightmost point that
439 the string can appear at. If set to SSize_t_MAX it indicates that the
440 string can occur infinitely far to the right.
441 For fixed strings, it is equal to min_offset.
442
443 - minlenp
444 A pointer to the minimum number of characters of the pattern that the
445 string was found inside. This is important as in the case of positive
446 lookahead or positive lookbehind we can have multiple patterns
447 involved. Consider
448
449 /(?=FOO).*F/
450
451 The minimum length of the pattern overall is 3, the minimum length
452 of the lookahead part is 3, but the minimum length of the part that
453 will actually match is 1. So 'FOO's minimum length is 3, but the
454 minimum length for the F is 1. This is important as the minimum length
455 is used to determine offsets in front of and behind the string being
456 looked for. Since strings can be composites this is the length of the
457 pattern at the time it was committed with a scan_commit. Note that
458 the length is calculated by study_chunk, so that the minimum lengths
459 are not known until the full pattern has been compiled, thus the
460 pointer to the value.
461
462 - lookbehind
463
464 In the case of lookbehind the string being searched for can be
465 offset past the start point of the final matching string.
466 If this value was just blithely removed from the min_offset it would
467 invalidate some of the calculations for how many chars must match
468 before or after (as they are derived from min_offset and minlen and
469 the length of the string being searched for).
470 When the final pattern is compiled and the data is moved from the
471 scan_data_t structure into the regexp structure the information
472 about lookbehind is factored in, with the information that would
473 have been lost precalculated in the end_shift field for the
474 associated string.
475
476 The fields pos_min and pos_delta are used to store the minimum offset
477 and the delta to the maximum offset at the current point in the pattern.
478
479*/
480
481struct scan_data_substrs {
482 SV *str; /* longest substring found in pattern */
483 SSize_t min_offset; /* earliest point in string it can appear */
484 SSize_t max_offset; /* latest point in string it can appear */
485 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
486 SSize_t lookbehind; /* is the pos of the string modified by LB */
487 I32 flags; /* per substring SF_* and SCF_* flags */
488};
489
490typedef struct scan_data_t {
491 /*I32 len_min; unused */
492 /*I32 len_delta; unused */
493 SSize_t pos_min;
494 SSize_t pos_delta;
495 SV *last_found;
496 SSize_t last_end; /* min value, <0 unless valid. */
497 SSize_t last_start_min;
498 SSize_t last_start_max;
499 U8 cur_is_floating; /* whether the last_* values should be set as
500 * the next fixed (0) or floating (1)
501 * substring */
502
503 /* [0] is longest fixed substring so far, [1] is longest float so far */
504 struct scan_data_substrs substrs[2];
505
506 I32 flags; /* common SF_* and SCF_* flags */
507 I32 whilem_c;
508 SSize_t *last_closep;
509 regnode_ssc *start_class;
510} scan_data_t;
511
512/*
513 * Forward declarations for pregcomp()'s friends.
514 */
515
516static const scan_data_t zero_scan_data = {
517 0, 0, NULL, 0, 0, 0, 0,
518 {
519 { NULL, 0, 0, 0, 0, 0 },
520 { NULL, 0, 0, 0, 0, 0 },
521 },
522 0, 0, NULL, NULL
523};
524
525/* study flags */
526
527#define SF_BEFORE_SEOL 0x0001
528#define SF_BEFORE_MEOL 0x0002
529#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
530
531#define SF_IS_INF 0x0040
532#define SF_HAS_PAR 0x0080
533#define SF_IN_PAR 0x0100
534#define SF_HAS_EVAL 0x0200
535
536
537/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
538 * longest substring in the pattern. When it is not set the optimiser keeps
539 * track of position, but does not keep track of the actual strings seen,
540 *
541 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
542 * /foo/i will not.
543 *
544 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
545 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
546 * turned off because of the alternation (BRANCH). */
547#define SCF_DO_SUBSTR 0x0400
548
549#define SCF_DO_STCLASS_AND 0x0800
550#define SCF_DO_STCLASS_OR 0x1000
551#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
552#define SCF_WHILEM_VISITED_POS 0x2000
553
554#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
555#define SCF_SEEN_ACCEPT 0x8000
556#define SCF_TRIE_DOING_RESTUDY 0x10000
557#define SCF_IN_DEFINE 0x20000
558
559
560
561
562#define UTF cBOOL(RExC_utf8)
563
564/* The enums for all these are ordered so things work out correctly */
565#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
566#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
567 == REGEX_DEPENDS_CHARSET)
568#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
569#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
570 >= REGEX_UNICODE_CHARSET)
571#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
572 == REGEX_ASCII_RESTRICTED_CHARSET)
573#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
574 >= REGEX_ASCII_RESTRICTED_CHARSET)
575#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
576 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
577
578#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
579
580/* For programs that want to be strictly Unicode compatible by dying if any
581 * attempt is made to match a non-Unicode code point against a Unicode
582 * property. */
583#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
584
585#define OOB_NAMEDCLASS -1
586
587/* There is no code point that is out-of-bounds, so this is problematic. But
588 * its only current use is to initialize a variable that is always set before
589 * looked at. */
590#define OOB_UNICODE 0xDEADBEEF
591
592#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
593
594
595/* length of regex to show in messages that don't mark a position within */
596#define RegexLengthToShowInErrorMessages 127
597
598/*
599 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
600 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
601 * op/pragma/warn/regcomp.
602 */
603#define MARKER1 "<-- HERE" /* marker as it appears in the description */
604#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
605
606#define REPORT_LOCATION " in regex; marked by " MARKER1 \
607 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
608
609/* The code in this file in places uses one level of recursion with parsing
610 * rebased to an alternate string constructed by us in memory. This can take
611 * the form of something that is completely different from the input, or
612 * something that uses the input as part of the alternate. In the first case,
613 * there should be no possibility of an error, as we are in complete control of
614 * the alternate string. But in the second case we don't completely control
615 * the input portion, so there may be errors in that. Here's an example:
616 * /[abc\x{DF}def]/ui
617 * is handled specially because \x{df} folds to a sequence of more than one
618 * character: 'ss'. What is done is to create and parse an alternate string,
619 * which looks like this:
620 * /(?:\x{DF}|[abc\x{DF}def])/ui
621 * where it uses the input unchanged in the middle of something it constructs,
622 * which is a branch for the DF outside the character class, and clustering
623 * parens around the whole thing. (It knows enough to skip the DF inside the
624 * class while in this substitute parse.) 'abc' and 'def' may have errors that
625 * need to be reported. The general situation looks like this:
626 *
627 * |<------- identical ------>|
628 * sI tI xI eI
629 * Input: ---------------------------------------------------------------
630 * Constructed: ---------------------------------------------------
631 * sC tC xC eC EC
632 * |<------- identical ------>|
633 *
634 * sI..eI is the portion of the input pattern we are concerned with here.
635 * sC..EC is the constructed substitute parse string.
636 * sC..tC is constructed by us
637 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
638 * In the diagram, these are vertically aligned.
639 * eC..EC is also constructed by us.
640 * xC is the position in the substitute parse string where we found a
641 * problem.
642 * xI is the position in the original pattern corresponding to xC.
643 *
644 * We want to display a message showing the real input string. Thus we need to
645 * translate from xC to xI. We know that xC >= tC, since the portion of the
646 * string sC..tC has been constructed by us, and so shouldn't have errors. We
647 * get:
648 * xI = tI + (xC - tC)
649 *
650 * When the substitute parse is constructed, the code needs to set:
651 * RExC_start (sC)
652 * RExC_end (eC)
653 * RExC_copy_start_in_input (tI)
654 * RExC_copy_start_in_constructed (tC)
655 * and restore them when done.
656 *
657 * During normal processing of the input pattern, both
658 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
659 * sI, so that xC equals xI.
660 */
661
662#define sI RExC_precomp
663#define eI RExC_precomp_end
664#define sC RExC_start
665#define eC RExC_end
666#define tI RExC_copy_start_in_input
667#define tC RExC_copy_start_in_constructed
668#define xI(xC) (tI + (xC - tC))
669#define xI_offset(xC) (xI(xC) - sI)
670
671#define REPORT_LOCATION_ARGS(xC) \
672 UTF8fARG(UTF, \
673 (xI(xC) > eI) /* Don't run off end */ \
674 ? eC - sC /* Length before the <--HERE */ \
675 : ((xI_offset(xC) >= 0) \
676 ? xI_offset(xC) \
677 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
678 IVdf " trying to output message for " \
679 " pattern %.*s", \
680 __FILE__, __LINE__, (IV) xI_offset(xC), \
681 ((int) (eC - sC)), sC), 0)), \
682 sI), /* The input pattern printed up to the <--HERE */ \
683 UTF8fARG(UTF, \
684 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
685 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
686
687/* Used to point after bad bytes for an error message, but avoid skipping
688 * past a nul byte. */
689#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
690
691/* Set up to clean up after our imminent demise */
692#define PREPARE_TO_DIE \
693 STMT_START { \
694 if (RExC_rx_sv) \
695 SAVEFREESV(RExC_rx_sv); \
696 } STMT_END
697
698/*
699 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
700 * arg. Show regex, up to a maximum length. If it's too long, chop and add
701 * "...".
702 */
703#define _FAIL(code) STMT_START { \
704 const char *ellipses = ""; \
705 IV len = RExC_precomp_end - RExC_precomp; \
706 \
707 PREPARE_TO_DIE; \
708 if (len > RegexLengthToShowInErrorMessages) { \
709 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
710 len = RegexLengthToShowInErrorMessages - 10; \
711 ellipses = "..."; \
712 } \
713 code; \
714} STMT_END
715
716#define FAIL(msg) _FAIL( \
717 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
718 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
719
720#define FAIL2(msg,arg) _FAIL( \
721 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
722 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
723
724/*
725 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
726 */
727#define Simple_vFAIL(m) STMT_START { \
728 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
729 m, REPORT_LOCATION_ARGS(RExC_parse)); \
730} STMT_END
731
732/*
733 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
734 */
735#define vFAIL(m) STMT_START { \
736 PREPARE_TO_DIE; \
737 Simple_vFAIL(m); \
738} STMT_END
739
740/*
741 * Like Simple_vFAIL(), but accepts two arguments.
742 */
743#define Simple_vFAIL2(m,a1) STMT_START { \
744 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
745 REPORT_LOCATION_ARGS(RExC_parse)); \
746} STMT_END
747
748/*
749 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
750 */
751#define vFAIL2(m,a1) STMT_START { \
752 PREPARE_TO_DIE; \
753 Simple_vFAIL2(m, a1); \
754} STMT_END
755
756
757/*
758 * Like Simple_vFAIL(), but accepts three arguments.
759 */
760#define Simple_vFAIL3(m, a1, a2) STMT_START { \
761 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
762 REPORT_LOCATION_ARGS(RExC_parse)); \
763} STMT_END
764
765/*
766 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
767 */
768#define vFAIL3(m,a1,a2) STMT_START { \
769 PREPARE_TO_DIE; \
770 Simple_vFAIL3(m, a1, a2); \
771} STMT_END
772
773/*
774 * Like Simple_vFAIL(), but accepts four arguments.
775 */
776#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
777 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
778 REPORT_LOCATION_ARGS(RExC_parse)); \
779} STMT_END
780
781#define vFAIL4(m,a1,a2,a3) STMT_START { \
782 PREPARE_TO_DIE; \
783 Simple_vFAIL4(m, a1, a2, a3); \
784} STMT_END
785
786/* A specialized version of vFAIL2 that works with UTF8f */
787#define vFAIL2utf8f(m, a1) STMT_START { \
788 PREPARE_TO_DIE; \
789 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
790 REPORT_LOCATION_ARGS(RExC_parse)); \
791} STMT_END
792
793#define vFAIL3utf8f(m, a1, a2) STMT_START { \
794 PREPARE_TO_DIE; \
795 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
796 REPORT_LOCATION_ARGS(RExC_parse)); \
797} STMT_END
798
799/* Setting this to NULL is a signal to not output warnings */
800#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
801#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
802
803/* Outputting warnings is generally deferred until the 2nd pass. This is
804 * because the first pass can be restarted, for example if the pattern has to
805 * be converted to UTF-8. If a warning had already been output earlier in the
806 * pass, it would be re-output after the restart. Pass 2 is never restarted,
807 * so the problem simply goes away if we defer the output to that pass. See
808 * [perl #122671]. 'RExC_copy_start_in_constructed' being NULL is a flag to
809 * not generate any warnings */
810#define TO_OUTPUT_WARNINGS(loc) \
811 (PASS2 && RExC_copy_start_in_constructed)
812
813#define UPDATE_WARNINGS_LOC(loc) NOOP
814
815/* 'warns' is the output of the packWARNx macro used in 'code' */
816#define _WARN_HELPER(loc, warns, code) \
817 STMT_START { \
818 if (! RExC_copy_start_in_constructed) { \
819 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
820 " expected at '%s'", \
821 __FILE__, __LINE__, loc); \
822 } \
823 if (TO_OUTPUT_WARNINGS(loc)) { \
824 if (ckDEAD(warns)) \
825 PREPARE_TO_DIE; \
826 code; \
827 UPDATE_WARNINGS_LOC(loc); \
828 } \
829 } STMT_END
830
831/* m is not necessarily a "literal string", in this macro */
832#define reg_warn_non_literal_string(loc, m) \
833 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
834 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
835 "%s" REPORT_LOCATION, \
836 m, REPORT_LOCATION_ARGS(loc)))
837
838#define ckWARNreg(loc,m) \
839 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
840 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
841 m REPORT_LOCATION, \
842 REPORT_LOCATION_ARGS(loc)))
843
844#define vWARN(loc, m) \
845 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
846 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
847 m REPORT_LOCATION, \
848 REPORT_LOCATION_ARGS(loc))) \
849
850#define vWARN_dep(loc, m) \
851 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
852 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
853 m REPORT_LOCATION, \
854 REPORT_LOCATION_ARGS(loc)))
855
856#define ckWARNdep(loc,m) \
857 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
858 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
859 m REPORT_LOCATION, \
860 REPORT_LOCATION_ARGS(loc)))
861
862#define ckWARNregdep(loc,m) \
863 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
864 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
865 WARN_REGEXP), \
866 m REPORT_LOCATION, \
867 REPORT_LOCATION_ARGS(loc)))
868
869#define ckWARN2reg_d(loc,m, a1) \
870 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
871 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
872 m REPORT_LOCATION, \
873 a1, REPORT_LOCATION_ARGS(loc)))
874
875#define ckWARN2reg(loc, m, a1) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
878 m REPORT_LOCATION, \
879 a1, REPORT_LOCATION_ARGS(loc)))
880
881#define vWARN3(loc, m, a1, a2) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
884 m REPORT_LOCATION, \
885 a1, a2, REPORT_LOCATION_ARGS(loc)))
886
887#define ckWARN3reg(loc, m, a1, a2) \
888 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
889 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
890 m REPORT_LOCATION, \
891 a1, a2, \
892 REPORT_LOCATION_ARGS(loc)))
893
894#define vWARN4(loc, m, a1, a2, a3) \
895 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
896 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
897 m REPORT_LOCATION, \
898 a1, a2, a3, \
899 REPORT_LOCATION_ARGS(loc)))
900
901#define ckWARN4reg(loc, m, a1, a2, a3) \
902 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
903 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
904 m REPORT_LOCATION, \
905 a1, a2, a3, \
906 REPORT_LOCATION_ARGS(loc)))
907
908#define vWARN5(loc, m, a1, a2, a3, a4) \
909 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
910 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
911 m REPORT_LOCATION, \
912 a1, a2, a3, a4, \
913 REPORT_LOCATION_ARGS(loc)))
914
915#define ckWARNexperimental(loc, class, m) \
916 _WARN_HELPER(loc, packWARN(class), \
917 Perl_ck_warner_d(aTHX_ packWARN(class), \
918 m REPORT_LOCATION, \
919 REPORT_LOCATION_ARGS(loc)))
920
921/* Convert between a pointer to a node and its offset from the beginning of the
922 * program */
923#define REGNODE_p(offset) (RExC_emit_start + (offset))
924#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
925
926/* Macros for recording node offsets. 20001227 mjd@plover.com
927 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
928 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
929 * Element 0 holds the number n.
930 * Position is 1 indexed.
931 */
932#ifndef RE_TRACK_PATTERN_OFFSETS
933#define Set_Node_Offset_To_R(offset,byte)
934#define Set_Node_Offset(node,byte)
935#define Set_Cur_Node_Offset
936#define Set_Node_Length_To_R(node,len)
937#define Set_Node_Length(node,len)
938#define Set_Node_Cur_Length(node,start)
939#define Node_Offset(n)
940#define Node_Length(n)
941#define Set_Node_Offset_Length(node,offset,len)
942#define ProgLen(ri) ri->u.proglen
943#define SetProgLen(ri,x) ri->u.proglen = x
944#else
945#define ProgLen(ri) ri->u.offsets[0]
946#define SetProgLen(ri,x) ri->u.offsets[0] = x
947#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
948 if (! SIZE_ONLY) { \
949 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
950 __LINE__, (int)(offset), (int)(byte))); \
951 if((offset) < 0) { \
952 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
953 (int)(offset)); \
954 } else { \
955 RExC_offsets[2*(offset)-1] = (byte); \
956 } \
957 } \
958} STMT_END
959
960#define Set_Node_Offset(node,byte) \
961 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
962#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
963
964#define Set_Node_Length_To_R(node,len) STMT_START { \
965 if (! SIZE_ONLY) { \
966 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
967 __LINE__, (int)(node), (int)(len))); \
968 if((node) < 0) { \
969 Perl_croak(aTHX_ "value of node is %d in Length macro", \
970 (int)(node)); \
971 } else { \
972 RExC_offsets[2*(node)] = (len); \
973 } \
974 } \
975} STMT_END
976
977#define Set_Node_Length(node,len) \
978 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
979#define Set_Node_Cur_Length(node, start) \
980 Set_Node_Length(node, RExC_parse - start)
981
982/* Get offsets and lengths */
983#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
984#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
985
986#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
987 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
988 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
989} STMT_END
990#endif
991
992#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
993#define EXPERIMENTAL_INPLACESCAN
994#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
995
996#ifdef DEBUGGING
997int
998Perl_re_printf(pTHX_ const char *fmt, ...)
999{
1000 va_list ap;
1001 int result;
1002 PerlIO *f= Perl_debug_log;
1003 PERL_ARGS_ASSERT_RE_PRINTF;
1004 va_start(ap, fmt);
1005 result = PerlIO_vprintf(f, fmt, ap);
1006 va_end(ap);
1007 return result;
1008}
1009
1010int
1011Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1012{
1013 va_list ap;
1014 int result;
1015 PerlIO *f= Perl_debug_log;
1016 PERL_ARGS_ASSERT_RE_INDENTF;
1017 va_start(ap, depth);
1018 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1019 result = PerlIO_vprintf(f, fmt, ap);
1020 va_end(ap);
1021 return result;
1022}
1023#endif /* DEBUGGING */
1024
1025#define DEBUG_RExC_seen() \
1026 DEBUG_OPTIMISE_MORE_r({ \
1027 Perl_re_printf( aTHX_ "RExC_seen: "); \
1028 \
1029 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1030 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1031 \
1032 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1033 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1034 \
1035 if (RExC_seen & REG_GPOS_SEEN) \
1036 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1037 \
1038 if (RExC_seen & REG_RECURSE_SEEN) \
1039 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1040 \
1041 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1042 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1043 \
1044 if (RExC_seen & REG_VERBARG_SEEN) \
1045 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1046 \
1047 if (RExC_seen & REG_CUTGROUP_SEEN) \
1048 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1049 \
1050 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1051 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1052 \
1053 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1054 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1055 \
1056 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1057 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1058 \
1059 Perl_re_printf( aTHX_ "\n"); \
1060 });
1061
1062#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1063 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1064
1065
1066#ifdef DEBUGGING
1067static void
1068S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1069 const char *close_str)
1070{
1071 if (!flags)
1072 return;
1073
1074 Perl_re_printf( aTHX_ "%s", open_str);
1075 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1076 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1077 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1078 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1079 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1080 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1081 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1082 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1083 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1084 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1085 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1086 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1087 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1088 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1089 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1090 Perl_re_printf( aTHX_ "%s", close_str);
1091}
1092
1093
1094static void
1095S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1096 U32 depth, int is_inf)
1097{
1098 GET_RE_DEBUG_FLAGS_DECL;
1099
1100 DEBUG_OPTIMISE_MORE_r({
1101 if (!data)
1102 return;
1103 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1104 depth,
1105 where,
1106 (IV)data->pos_min,
1107 (IV)data->pos_delta,
1108 (UV)data->flags
1109 );
1110
1111 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1112
1113 Perl_re_printf( aTHX_
1114 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1115 (IV)data->whilem_c,
1116 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1117 is_inf ? "INF " : ""
1118 );
1119
1120 if (data->last_found) {
1121 int i;
1122 Perl_re_printf(aTHX_
1123 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1124 SvPVX_const(data->last_found),
1125 (IV)data->last_end,
1126 (IV)data->last_start_min,
1127 (IV)data->last_start_max
1128 );
1129
1130 for (i = 0; i < 2; i++) {
1131 Perl_re_printf(aTHX_
1132 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1133 data->cur_is_floating == i ? "*" : "",
1134 i ? "Float" : "Fixed",
1135 SvPVX_const(data->substrs[i].str),
1136 (IV)data->substrs[i].min_offset,
1137 (IV)data->substrs[i].max_offset
1138 );
1139 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1140 }
1141 }
1142
1143 Perl_re_printf( aTHX_ "\n");
1144 });
1145}
1146
1147
1148static void
1149S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1150 regnode *scan, U32 depth, U32 flags)
1151{
1152 GET_RE_DEBUG_FLAGS_DECL;
1153
1154 DEBUG_OPTIMISE_r({
1155 regnode *Next;
1156
1157 if (!scan)
1158 return;
1159 Next = regnext(scan);
1160 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1161 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1162 depth,
1163 str,
1164 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1165 Next ? (REG_NODE_NUM(Next)) : 0 );
1166 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1167 Perl_re_printf( aTHX_ "\n");
1168 });
1169}
1170
1171
1172# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1173 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1174
1175# define DEBUG_PEEP(str, scan, depth, flags) \
1176 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1177
1178#else
1179# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1180# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1181#endif
1182
1183
1184/* =========================================================
1185 * BEGIN edit_distance stuff.
1186 *
1187 * This calculates how many single character changes of any type are needed to
1188 * transform a string into another one. It is taken from version 3.1 of
1189 *
1190 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1191 */
1192
1193/* Our unsorted dictionary linked list. */
1194/* Note we use UVs, not chars. */
1195
1196struct dictionary{
1197 UV key;
1198 UV value;
1199 struct dictionary* next;
1200};
1201typedef struct dictionary item;
1202
1203
1204PERL_STATIC_INLINE item*
1205push(UV key, item* curr)
1206{
1207 item* head;
1208 Newx(head, 1, item);
1209 head->key = key;
1210 head->value = 0;
1211 head->next = curr;
1212 return head;
1213}
1214
1215
1216PERL_STATIC_INLINE item*
1217find(item* head, UV key)
1218{
1219 item* iterator = head;
1220 while (iterator){
1221 if (iterator->key == key){
1222 return iterator;
1223 }
1224 iterator = iterator->next;
1225 }
1226
1227 return NULL;
1228}
1229
1230PERL_STATIC_INLINE item*
1231uniquePush(item* head, UV key)
1232{
1233 item* iterator = head;
1234
1235 while (iterator){
1236 if (iterator->key == key) {
1237 return head;
1238 }
1239 iterator = iterator->next;
1240 }
1241
1242 return push(key, head);
1243}
1244
1245PERL_STATIC_INLINE void
1246dict_free(item* head)
1247{
1248 item* iterator = head;
1249
1250 while (iterator) {
1251 item* temp = iterator;
1252 iterator = iterator->next;
1253 Safefree(temp);
1254 }
1255
1256 head = NULL;
1257}
1258
1259/* End of Dictionary Stuff */
1260
1261/* All calculations/work are done here */
1262STATIC int
1263S_edit_distance(const UV* src,
1264 const UV* tgt,
1265 const STRLEN x, /* length of src[] */
1266 const STRLEN y, /* length of tgt[] */
1267 const SSize_t maxDistance
1268)
1269{
1270 item *head = NULL;
1271 UV swapCount, swapScore, targetCharCount, i, j;
1272 UV *scores;
1273 UV score_ceil = x + y;
1274
1275 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1276
1277 /* intialize matrix start values */
1278 Newx(scores, ( (x + 2) * (y + 2)), UV);
1279 scores[0] = score_ceil;
1280 scores[1 * (y + 2) + 0] = score_ceil;
1281 scores[0 * (y + 2) + 1] = score_ceil;
1282 scores[1 * (y + 2) + 1] = 0;
1283 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1284
1285 /* work loops */
1286 /* i = src index */
1287 /* j = tgt index */
1288 for (i=1;i<=x;i++) {
1289 if (i < x)
1290 head = uniquePush(head, src[i]);
1291 scores[(i+1) * (y + 2) + 1] = i;
1292 scores[(i+1) * (y + 2) + 0] = score_ceil;
1293 swapCount = 0;
1294
1295 for (j=1;j<=y;j++) {
1296 if (i == 1) {
1297 if(j < y)
1298 head = uniquePush(head, tgt[j]);
1299 scores[1 * (y + 2) + (j + 1)] = j;
1300 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1301 }
1302
1303 targetCharCount = find(head, tgt[j-1])->value;
1304 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1305
1306 if (src[i-1] != tgt[j-1]){
1307 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1308 }
1309 else {
1310 swapCount = j;
1311 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1312 }
1313 }
1314
1315 find(head, src[i-1])->value = i;
1316 }
1317
1318 {
1319 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1320 dict_free(head);
1321 Safefree(scores);
1322 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1323 }
1324}
1325
1326/* END of edit_distance() stuff
1327 * ========================================================= */
1328
1329/* is c a control character for which we have a mnemonic? */
1330#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1331
1332STATIC const char *
1333S_cntrl_to_mnemonic(const U8 c)
1334{
1335 /* Returns the mnemonic string that represents character 'c', if one
1336 * exists; NULL otherwise. The only ones that exist for the purposes of
1337 * this routine are a few control characters */
1338
1339 switch (c) {
1340 case '\a': return "\\a";
1341 case '\b': return "\\b";
1342 case ESC_NATIVE: return "\\e";
1343 case '\f': return "\\f";
1344 case '\n': return "\\n";
1345 case '\r': return "\\r";
1346 case '\t': return "\\t";
1347 }
1348
1349 return NULL;
1350}
1351
1352/* Mark that we cannot extend a found fixed substring at this point.
1353 Update the longest found anchored substring or the longest found
1354 floating substrings if needed. */
1355
1356STATIC void
1357S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1358 SSize_t *minlenp, int is_inf)
1359{
1360 const STRLEN l = CHR_SVLEN(data->last_found);
1361 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1362 const STRLEN old_l = CHR_SVLEN(longest_sv);
1363 GET_RE_DEBUG_FLAGS_DECL;
1364
1365 PERL_ARGS_ASSERT_SCAN_COMMIT;
1366
1367 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1368 const U8 i = data->cur_is_floating;
1369 SvSetMagicSV(longest_sv, data->last_found);
1370 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1371
1372 if (!i) /* fixed */
1373 data->substrs[0].max_offset = data->substrs[0].min_offset;
1374 else { /* float */
1375 data->substrs[1].max_offset = (l
1376 ? data->last_start_max
1377 : (data->pos_delta > SSize_t_MAX - data->pos_min
1378 ? SSize_t_MAX
1379 : data->pos_min + data->pos_delta));
1380 if (is_inf
1381 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1382 data->substrs[1].max_offset = SSize_t_MAX;
1383 }
1384
1385 if (data->flags & SF_BEFORE_EOL)
1386 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1387 else
1388 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1389 data->substrs[i].minlenp = minlenp;
1390 data->substrs[i].lookbehind = 0;
1391 }
1392
1393 SvCUR_set(data->last_found, 0);
1394 {
1395 SV * const sv = data->last_found;
1396 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1397 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1398 if (mg)
1399 mg->mg_len = 0;
1400 }
1401 }
1402 data->last_end = -1;
1403 data->flags &= ~SF_BEFORE_EOL;
1404 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1405}
1406
1407/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1408 * list that describes which code points it matches */
1409
1410STATIC void
1411S_ssc_anything(pTHX_ regnode_ssc *ssc)
1412{
1413 /* Set the SSC 'ssc' to match an empty string or any code point */
1414
1415 PERL_ARGS_ASSERT_SSC_ANYTHING;
1416
1417 assert(is_ANYOF_SYNTHETIC(ssc));
1418
1419 /* mortalize so won't leak */
1420 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1421 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1422}
1423
1424STATIC int
1425S_ssc_is_anything(const regnode_ssc *ssc)
1426{
1427 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1428 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1429 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1430 * in any way, so there's no point in using it */
1431
1432 UV start, end;
1433 bool ret;
1434
1435 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1436
1437 assert(is_ANYOF_SYNTHETIC(ssc));
1438
1439 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1440 return FALSE;
1441 }
1442
1443 /* See if the list consists solely of the range 0 - Infinity */
1444 invlist_iterinit(ssc->invlist);
1445 ret = invlist_iternext(ssc->invlist, &start, &end)
1446 && start == 0
1447 && end == UV_MAX;
1448
1449 invlist_iterfinish(ssc->invlist);
1450
1451 if (ret) {
1452 return TRUE;
1453 }
1454
1455 /* If e.g., both \w and \W are set, matches everything */
1456 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1457 int i;
1458 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1459 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1460 return TRUE;
1461 }
1462 }
1463 }
1464
1465 return FALSE;
1466}
1467
1468STATIC void
1469S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1470{
1471 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1472 * string, any code point, or any posix class under locale */
1473
1474 PERL_ARGS_ASSERT_SSC_INIT;
1475
1476 Zero(ssc, 1, regnode_ssc);
1477 set_ANYOF_SYNTHETIC(ssc);
1478 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1479 ssc_anything(ssc);
1480
1481 /* If any portion of the regex is to operate under locale rules that aren't
1482 * fully known at compile time, initialization includes it. The reason
1483 * this isn't done for all regexes is that the optimizer was written under
1484 * the assumption that locale was all-or-nothing. Given the complexity and
1485 * lack of documentation in the optimizer, and that there are inadequate
1486 * test cases for locale, many parts of it may not work properly, it is
1487 * safest to avoid locale unless necessary. */
1488 if (RExC_contains_locale) {
1489 ANYOF_POSIXL_SETALL(ssc);
1490 }
1491 else {
1492 ANYOF_POSIXL_ZERO(ssc);
1493 }
1494}
1495
1496STATIC int
1497S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1498 const regnode_ssc *ssc)
1499{
1500 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1501 * to the list of code points matched, and locale posix classes; hence does
1502 * not check its flags) */
1503
1504 UV start, end;
1505 bool ret;
1506
1507 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1508
1509 assert(is_ANYOF_SYNTHETIC(ssc));
1510
1511 invlist_iterinit(ssc->invlist);
1512 ret = invlist_iternext(ssc->invlist, &start, &end)
1513 && start == 0
1514 && end == UV_MAX;
1515
1516 invlist_iterfinish(ssc->invlist);
1517
1518 if (! ret) {
1519 return FALSE;
1520 }
1521
1522 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1523 return FALSE;
1524 }
1525
1526 return TRUE;
1527}
1528
1529STATIC SV*
1530S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1531 const regnode_charclass* const node)
1532{
1533 /* Returns a mortal inversion list defining which code points are matched
1534 * by 'node', which is of type ANYOF. Handles complementing the result if
1535 * appropriate. If some code points aren't knowable at this time, the
1536 * returned list must, and will, contain every code point that is a
1537 * possibility. */
1538
1539 SV* invlist = NULL;
1540 SV* only_utf8_locale_invlist = NULL;
1541 unsigned int i;
1542 const U32 n = ARG(node);
1543 bool new_node_has_latin1 = FALSE;
1544
1545 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1546
1547 /* Look at the data structure created by S_set_ANYOF_arg() */
1548 if (n != ANYOF_ONLY_HAS_BITMAP) {
1549 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1550 AV * const av = MUTABLE_AV(SvRV(rv));
1551 SV **const ary = AvARRAY(av);
1552 assert(RExC_rxi->data->what[n] == 's');
1553
1554 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1555 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
1556 }
1557 else if (ary[0] && ary[0] != &PL_sv_undef) {
1558
1559 /* Here, no compile-time swash, and there are things that won't be
1560 * known until runtime -- we have to assume it could be anything */
1561 invlist = sv_2mortal(_new_invlist(1));
1562 return _add_range_to_invlist(invlist, 0, UV_MAX);
1563 }
1564 else if (ary[3] && ary[3] != &PL_sv_undef) {
1565
1566 /* Here no compile-time swash, and no run-time only data. Use the
1567 * node's inversion list */
1568 invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1569 }
1570
1571 /* Get the code points valid only under UTF-8 locales */
1572 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1573 && ary[2] && ary[2] != &PL_sv_undef)
1574 {
1575 only_utf8_locale_invlist = ary[2];
1576 }
1577 }
1578
1579 if (! invlist) {
1580 invlist = sv_2mortal(_new_invlist(0));
1581 }
1582
1583 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1584 * code points, and an inversion list for the others, but if there are code
1585 * points that should match only conditionally on the target string being
1586 * UTF-8, those are placed in the inversion list, and not the bitmap.
1587 * Since there are circumstances under which they could match, they are
1588 * included in the SSC. But if the ANYOF node is to be inverted, we have
1589 * to exclude them here, so that when we invert below, the end result
1590 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1591 * have to do this here before we add the unconditionally matched code
1592 * points */
1593 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1594 _invlist_intersection_complement_2nd(invlist,
1595 PL_UpperLatin1,
1596 &invlist);
1597 }
1598
1599 /* Add in the points from the bit map */
1600 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1601 if (ANYOF_BITMAP_TEST(node, i)) {
1602 unsigned int start = i++;
1603
1604 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1605 /* empty */
1606 }
1607 invlist = _add_range_to_invlist(invlist, start, i-1);
1608 new_node_has_latin1 = TRUE;
1609 }
1610 }
1611
1612 /* If this can match all upper Latin1 code points, have to add them
1613 * as well. But don't add them if inverting, as when that gets done below,
1614 * it would exclude all these characters, including the ones it shouldn't
1615 * that were added just above */
1616 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1617 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1618 {
1619 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1620 }
1621
1622 /* Similarly for these */
1623 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1624 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1625 }
1626
1627 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1628 _invlist_invert(invlist);
1629 }
1630 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1631
1632 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1633 * locale. We can skip this if there are no 0-255 at all. */
1634 _invlist_union(invlist, PL_Latin1, &invlist);
1635 }
1636
1637 /* Similarly add the UTF-8 locale possible matches. These have to be
1638 * deferred until after the non-UTF-8 locale ones are taken care of just
1639 * above, or it leads to wrong results under ANYOF_INVERT */
1640 if (only_utf8_locale_invlist) {
1641 _invlist_union_maybe_complement_2nd(invlist,
1642 only_utf8_locale_invlist,
1643 ANYOF_FLAGS(node) & ANYOF_INVERT,
1644 &invlist);
1645 }
1646
1647 return invlist;
1648}
1649
1650/* These two functions currently do the exact same thing */
1651#define ssc_init_zero ssc_init
1652
1653#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1654#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1655
1656/* 'AND' a given class with another one. Can create false positives. 'ssc'
1657 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1658 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1659
1660STATIC void
1661S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1662 const regnode_charclass *and_with)
1663{
1664 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1665 * another SSC or a regular ANYOF class. Can create false positives. */
1666
1667 SV* anded_cp_list;
1668 U8 anded_flags;
1669
1670 PERL_ARGS_ASSERT_SSC_AND;
1671
1672 assert(is_ANYOF_SYNTHETIC(ssc));
1673
1674 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1675 * the code point inversion list and just the relevant flags */
1676 if (is_ANYOF_SYNTHETIC(and_with)) {
1677 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1678 anded_flags = ANYOF_FLAGS(and_with);
1679
1680 /* XXX This is a kludge around what appears to be deficiencies in the
1681 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1682 * there are paths through the optimizer where it doesn't get weeded
1683 * out when it should. And if we don't make some extra provision for
1684 * it like the code just below, it doesn't get added when it should.
1685 * This solution is to add it only when AND'ing, which is here, and
1686 * only when what is being AND'ed is the pristine, original node
1687 * matching anything. Thus it is like adding it to ssc_anything() but
1688 * only when the result is to be AND'ed. Probably the same solution
1689 * could be adopted for the same problem we have with /l matching,
1690 * which is solved differently in S_ssc_init(), and that would lead to
1691 * fewer false positives than that solution has. But if this solution
1692 * creates bugs, the consequences are only that a warning isn't raised
1693 * that should be; while the consequences for having /l bugs is
1694 * incorrect matches */
1695 if (ssc_is_anything((regnode_ssc *)and_with)) {
1696 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1697 }
1698 }
1699 else {
1700 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1701 if (OP(and_with) == ANYOFD) {
1702 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1703 }
1704 else {
1705 anded_flags = ANYOF_FLAGS(and_with)
1706 &( ANYOF_COMMON_FLAGS
1707 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1708 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1709 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1710 anded_flags &=
1711 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1712 }
1713 }
1714 }
1715
1716 ANYOF_FLAGS(ssc) &= anded_flags;
1717
1718 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1719 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1720 * 'and_with' may be inverted. When not inverted, we have the situation of
1721 * computing:
1722 * (C1 | P1) & (C2 | P2)
1723 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1724 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1725 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1726 * <= ((C1 & C2) | P1 | P2)
1727 * Alternatively, the last few steps could be:
1728 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1729 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1730 * <= (C1 | C2 | (P1 & P2))
1731 * We favor the second approach if either P1 or P2 is non-empty. This is
1732 * because these components are a barrier to doing optimizations, as what
1733 * they match cannot be known until the moment of matching as they are
1734 * dependent on the current locale, 'AND"ing them likely will reduce or
1735 * eliminate them.
1736 * But we can do better if we know that C1,P1 are in their initial state (a
1737 * frequent occurrence), each matching everything:
1738 * (<everything>) & (C2 | P2) = C2 | P2
1739 * Similarly, if C2,P2 are in their initial state (again a frequent
1740 * occurrence), the result is a no-op
1741 * (C1 | P1) & (<everything>) = C1 | P1
1742 *
1743 * Inverted, we have
1744 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1745 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1746 * <= (C1 & ~C2) | (P1 & ~P2)
1747 * */
1748
1749 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1750 && ! is_ANYOF_SYNTHETIC(and_with))
1751 {
1752 unsigned int i;
1753
1754 ssc_intersection(ssc,
1755 anded_cp_list,
1756 FALSE /* Has already been inverted */
1757 );
1758
1759 /* If either P1 or P2 is empty, the intersection will be also; can skip
1760 * the loop */
1761 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1762 ANYOF_POSIXL_ZERO(ssc);
1763 }
1764 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1765
1766 /* Note that the Posix class component P from 'and_with' actually
1767 * looks like:
1768 * P = Pa | Pb | ... | Pn
1769 * where each component is one posix class, such as in [\w\s].
1770 * Thus
1771 * ~P = ~(Pa | Pb | ... | Pn)
1772 * = ~Pa & ~Pb & ... & ~Pn
1773 * <= ~Pa | ~Pb | ... | ~Pn
1774 * The last is something we can easily calculate, but unfortunately
1775 * is likely to have many false positives. We could do better
1776 * in some (but certainly not all) instances if two classes in
1777 * P have known relationships. For example
1778 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1779 * So
1780 * :lower: & :print: = :lower:
1781 * And similarly for classes that must be disjoint. For example,
1782 * since \s and \w can have no elements in common based on rules in
1783 * the POSIX standard,
1784 * \w & ^\S = nothing
1785 * Unfortunately, some vendor locales do not meet the Posix
1786 * standard, in particular almost everything by Microsoft.
1787 * The loop below just changes e.g., \w into \W and vice versa */
1788
1789 regnode_charclass_posixl temp;
1790 int add = 1; /* To calculate the index of the complement */
1791
1792 Zero(&temp, 1, regnode_charclass_posixl);
1793 ANYOF_POSIXL_ZERO(&temp);
1794 for (i = 0; i < ANYOF_MAX; i++) {
1795 assert(i % 2 != 0
1796 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1797 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1798
1799 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1800 ANYOF_POSIXL_SET(&temp, i + add);
1801 }
1802 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1803 }
1804 ANYOF_POSIXL_AND(&temp, ssc);
1805
1806 } /* else ssc already has no posixes */
1807 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1808 in its initial state */
1809 else if (! is_ANYOF_SYNTHETIC(and_with)
1810 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1811 {
1812 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1813 * copy it over 'ssc' */
1814 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1815 if (is_ANYOF_SYNTHETIC(and_with)) {
1816 StructCopy(and_with, ssc, regnode_ssc);
1817 }
1818 else {
1819 ssc->invlist = anded_cp_list;
1820 ANYOF_POSIXL_ZERO(ssc);
1821 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1822 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1823 }
1824 }
1825 }
1826 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1827 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1828 {
1829 /* One or the other of P1, P2 is non-empty. */
1830 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1831 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1832 }
1833 ssc_union(ssc, anded_cp_list, FALSE);
1834 }
1835 else { /* P1 = P2 = empty */
1836 ssc_intersection(ssc, anded_cp_list, FALSE);
1837 }
1838 }
1839}
1840
1841STATIC void
1842S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1843 const regnode_charclass *or_with)
1844{
1845 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1846 * another SSC or a regular ANYOF class. Can create false positives if
1847 * 'or_with' is to be inverted. */
1848
1849 SV* ored_cp_list;
1850 U8 ored_flags;
1851
1852 PERL_ARGS_ASSERT_SSC_OR;
1853
1854 assert(is_ANYOF_SYNTHETIC(ssc));
1855
1856 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1857 * the code point inversion list and just the relevant flags */
1858 if (is_ANYOF_SYNTHETIC(or_with)) {
1859 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1860 ored_flags = ANYOF_FLAGS(or_with);
1861 }
1862 else {
1863 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1864 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1865 if (OP(or_with) != ANYOFD) {
1866 ored_flags
1867 |= ANYOF_FLAGS(or_with)
1868 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1869 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1870 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1871 ored_flags |=
1872 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1873 }
1874 }
1875 }
1876
1877 ANYOF_FLAGS(ssc) |= ored_flags;
1878
1879 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1880 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1881 * 'or_with' may be inverted. When not inverted, we have the simple
1882 * situation of computing:
1883 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1884 * If P1|P2 yields a situation with both a class and its complement are
1885 * set, like having both \w and \W, this matches all code points, and we
1886 * can delete these from the P component of the ssc going forward. XXX We
1887 * might be able to delete all the P components, but I (khw) am not certain
1888 * about this, and it is better to be safe.
1889 *
1890 * Inverted, we have
1891 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1892 * <= (C1 | P1) | ~C2
1893 * <= (C1 | ~C2) | P1
1894 * (which results in actually simpler code than the non-inverted case)
1895 * */
1896
1897 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1898 && ! is_ANYOF_SYNTHETIC(or_with))
1899 {
1900 /* We ignore P2, leaving P1 going forward */
1901 } /* else Not inverted */
1902 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1903 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1904 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1905 unsigned int i;
1906 for (i = 0; i < ANYOF_MAX; i += 2) {
1907 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1908 {
1909 ssc_match_all_cp(ssc);
1910 ANYOF_POSIXL_CLEAR(ssc, i);
1911 ANYOF_POSIXL_CLEAR(ssc, i+1);
1912 }
1913 }
1914 }
1915 }
1916
1917 ssc_union(ssc,
1918 ored_cp_list,
1919 FALSE /* Already has been inverted */
1920 );
1921}
1922
1923PERL_STATIC_INLINE void
1924S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1925{
1926 PERL_ARGS_ASSERT_SSC_UNION;
1927
1928 assert(is_ANYOF_SYNTHETIC(ssc));
1929
1930 _invlist_union_maybe_complement_2nd(ssc->invlist,
1931 invlist,
1932 invert2nd,
1933 &ssc->invlist);
1934}
1935
1936PERL_STATIC_INLINE void
1937S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1938 SV* const invlist,
1939 const bool invert2nd)
1940{
1941 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1942
1943 assert(is_ANYOF_SYNTHETIC(ssc));
1944
1945 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1946 invlist,
1947 invert2nd,
1948 &ssc->invlist);
1949}
1950
1951PERL_STATIC_INLINE void
1952S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1953{
1954 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1955
1956 assert(is_ANYOF_SYNTHETIC(ssc));
1957
1958 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1959}
1960
1961PERL_STATIC_INLINE void
1962S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1963{
1964 /* AND just the single code point 'cp' into the SSC 'ssc' */
1965
1966 SV* cp_list = _new_invlist(2);
1967
1968 PERL_ARGS_ASSERT_SSC_CP_AND;
1969
1970 assert(is_ANYOF_SYNTHETIC(ssc));
1971
1972 cp_list = add_cp_to_invlist(cp_list, cp);
1973 ssc_intersection(ssc, cp_list,
1974 FALSE /* Not inverted */
1975 );
1976 SvREFCNT_dec_NN(cp_list);
1977}
1978
1979PERL_STATIC_INLINE void
1980S_ssc_clear_locale(regnode_ssc *ssc)
1981{
1982 /* Set the SSC 'ssc' to not match any locale things */
1983 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1984
1985 assert(is_ANYOF_SYNTHETIC(ssc));
1986
1987 ANYOF_POSIXL_ZERO(ssc);
1988 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1989}
1990
1991#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1992
1993STATIC bool
1994S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1995{
1996 /* The synthetic start class is used to hopefully quickly winnow down
1997 * places where a pattern could start a match in the target string. If it
1998 * doesn't really narrow things down that much, there isn't much point to
1999 * having the overhead of using it. This function uses some very crude
2000 * heuristics to decide if to use the ssc or not.
2001 *
2002 * It returns TRUE if 'ssc' rules out more than half what it considers to
2003 * be the "likely" possible matches, but of course it doesn't know what the
2004 * actual things being matched are going to be; these are only guesses
2005 *
2006 * For /l matches, it assumes that the only likely matches are going to be
2007 * in the 0-255 range, uniformly distributed, so half of that is 127
2008 * For /a and /d matches, it assumes that the likely matches will be just
2009 * the ASCII range, so half of that is 63
2010 * For /u and there isn't anything matching above the Latin1 range, it
2011 * assumes that that is the only range likely to be matched, and uses
2012 * half that as the cut-off: 127. If anything matches above Latin1,
2013 * it assumes that all of Unicode could match (uniformly), except for
2014 * non-Unicode code points and things in the General Category "Other"
2015 * (unassigned, private use, surrogates, controls and formats). This
2016 * is a much large number. */
2017
2018 U32 count = 0; /* Running total of number of code points matched by
2019 'ssc' */
2020 UV start, end; /* Start and end points of current range in inversion
2021 list */
2022 const U32 max_code_points = (LOC)
2023 ? 256
2024 : (( ! UNI_SEMANTICS
2025 || invlist_highest(ssc->invlist) < 256)
2026 ? 128
2027 : NON_OTHER_COUNT);
2028 const U32 max_match = max_code_points / 2;
2029
2030 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2031
2032 invlist_iterinit(ssc->invlist);
2033 while (invlist_iternext(ssc->invlist, &start, &end)) {
2034 if (start >= max_code_points) {
2035 break;
2036 }
2037 end = MIN(end, max_code_points - 1);
2038 count += end - start + 1;
2039 if (count >= max_match) {
2040 invlist_iterfinish(ssc->invlist);
2041 return FALSE;
2042 }
2043 }
2044
2045 return TRUE;
2046}
2047
2048
2049STATIC void
2050S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2051{
2052 /* The inversion list in the SSC is marked mortal; now we need a more
2053 * permanent copy, which is stored the same way that is done in a regular
2054 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2055 * map */
2056
2057 SV* invlist = invlist_clone(ssc->invlist, NULL);
2058
2059 PERL_ARGS_ASSERT_SSC_FINALIZE;
2060
2061 assert(is_ANYOF_SYNTHETIC(ssc));
2062
2063 /* The code in this file assumes that all but these flags aren't relevant
2064 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2065 * by the time we reach here */
2066 assert(! (ANYOF_FLAGS(ssc)
2067 & ~( ANYOF_COMMON_FLAGS
2068 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2069 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2070
2071 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2072
2073 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2074 NULL, NULL, NULL, FALSE);
2075
2076 /* Make sure is clone-safe */
2077 ssc->invlist = NULL;
2078
2079 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2080 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2081 OP(ssc) = ANYOFPOSIXL;
2082 }
2083 else if (RExC_contains_locale) {
2084 OP(ssc) = ANYOFL;
2085 }
2086
2087 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2088}
2089
2090#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2091#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2092#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2093#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2094 ? (TRIE_LIST_CUR( idx ) - 1) \
2095 : 0 )
2096
2097
2098#ifdef DEBUGGING
2099/*
2100 dump_trie(trie,widecharmap,revcharmap)
2101 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2102 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2103
2104 These routines dump out a trie in a somewhat readable format.
2105 The _interim_ variants are used for debugging the interim
2106 tables that are used to generate the final compressed
2107 representation which is what dump_trie expects.
2108
2109 Part of the reason for their existence is to provide a form
2110 of documentation as to how the different representations function.
2111
2112*/
2113
2114/*
2115 Dumps the final compressed table form of the trie to Perl_debug_log.
2116 Used for debugging make_trie().
2117*/
2118
2119STATIC void
2120S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2121 AV *revcharmap, U32 depth)
2122{
2123 U32 state;
2124 SV *sv=sv_newmortal();
2125 int colwidth= widecharmap ? 6 : 4;
2126 U16 word;
2127 GET_RE_DEBUG_FLAGS_DECL;
2128
2129 PERL_ARGS_ASSERT_DUMP_TRIE;
2130
2131 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2132 depth+1, "Match","Base","Ofs" );
2133
2134 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2135 SV ** const tmp = av_fetch( revcharmap, state, 0);
2136 if ( tmp ) {
2137 Perl_re_printf( aTHX_ "%*s",
2138 colwidth,
2139 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2140 PL_colors[0], PL_colors[1],
2141 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2142 PERL_PV_ESCAPE_FIRSTCHAR
2143 )
2144 );
2145 }
2146 }
2147 Perl_re_printf( aTHX_ "\n");
2148 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2149
2150 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2151 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2152 Perl_re_printf( aTHX_ "\n");
2153
2154 for( state = 1 ; state < trie->statecount ; state++ ) {
2155 const U32 base = trie->states[ state ].trans.base;
2156
2157 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2158
2159 if ( trie->states[ state ].wordnum ) {
2160 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2161 } else {
2162 Perl_re_printf( aTHX_ "%6s", "" );
2163 }
2164
2165 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2166
2167 if ( base ) {
2168 U32 ofs = 0;
2169
2170 while( ( base + ofs < trie->uniquecharcount ) ||
2171 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2172 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2173 != state))
2174 ofs++;
2175
2176 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2177
2178 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2179 if ( ( base + ofs >= trie->uniquecharcount )
2180 && ( base + ofs - trie->uniquecharcount
2181 < trie->lasttrans )
2182 && trie->trans[ base + ofs
2183 - trie->uniquecharcount ].check == state )
2184 {
2185 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2186 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2187 );
2188 } else {
2189 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2190 }
2191 }
2192
2193 Perl_re_printf( aTHX_ "]");
2194
2195 }
2196 Perl_re_printf( aTHX_ "\n" );
2197 }
2198 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2199 depth);
2200 for (word=1; word <= trie->wordcount; word++) {
2201 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2202 (int)word, (int)(trie->wordinfo[word].prev),
2203 (int)(trie->wordinfo[word].len));
2204 }
2205 Perl_re_printf( aTHX_ "\n" );
2206}
2207/*
2208 Dumps a fully constructed but uncompressed trie in list form.
2209 List tries normally only are used for construction when the number of
2210 possible chars (trie->uniquecharcount) is very high.
2211 Used for debugging make_trie().
2212*/
2213STATIC void
2214S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2215 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2216 U32 depth)
2217{
2218 U32 state;
2219 SV *sv=sv_newmortal();
2220 int colwidth= widecharmap ? 6 : 4;
2221 GET_RE_DEBUG_FLAGS_DECL;
2222
2223 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2224
2225 /* print out the table precompression. */
2226 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2227 depth+1 );
2228 Perl_re_indentf( aTHX_ "%s",
2229 depth+1, "------:-----+-----------------\n" );
2230
2231 for( state=1 ; state < next_alloc ; state ++ ) {
2232 U16 charid;
2233
2234 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2235 depth+1, (UV)state );
2236 if ( ! trie->states[ state ].wordnum ) {
2237 Perl_re_printf( aTHX_ "%5s| ","");
2238 } else {
2239 Perl_re_printf( aTHX_ "W%4x| ",
2240 trie->states[ state ].wordnum
2241 );
2242 }
2243 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2244 SV ** const tmp = av_fetch( revcharmap,
2245 TRIE_LIST_ITEM(state, charid).forid, 0);
2246 if ( tmp ) {
2247 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2248 colwidth,
2249 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2250 colwidth,
2251 PL_colors[0], PL_colors[1],
2252 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2253 | PERL_PV_ESCAPE_FIRSTCHAR
2254 ) ,
2255 TRIE_LIST_ITEM(state, charid).forid,
2256 (UV)TRIE_LIST_ITEM(state, charid).newstate
2257 );
2258 if (!(charid % 10))
2259 Perl_re_printf( aTHX_ "\n%*s| ",
2260 (int)((depth * 2) + 14), "");
2261 }
2262 }
2263 Perl_re_printf( aTHX_ "\n");
2264 }
2265}
2266
2267/*
2268 Dumps a fully constructed but uncompressed trie in table form.
2269 This is the normal DFA style state transition table, with a few
2270 twists to facilitate compression later.
2271 Used for debugging make_trie().
2272*/
2273STATIC void
2274S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2275 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2276 U32 depth)
2277{
2278 U32 state;
2279 U16 charid;
2280 SV *sv=sv_newmortal();
2281 int colwidth= widecharmap ? 6 : 4;
2282 GET_RE_DEBUG_FLAGS_DECL;
2283
2284 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2285
2286 /*
2287 print out the table precompression so that we can do a visual check
2288 that they are identical.
2289 */
2290
2291 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2292
2293 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2294 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2295 if ( tmp ) {
2296 Perl_re_printf( aTHX_ "%*s",
2297 colwidth,
2298 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2299 PL_colors[0], PL_colors[1],
2300 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2301 PERL_PV_ESCAPE_FIRSTCHAR
2302 )
2303 );
2304 }
2305 }
2306
2307 Perl_re_printf( aTHX_ "\n");
2308 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2309
2310 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2311 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2312 }
2313
2314 Perl_re_printf( aTHX_ "\n" );
2315
2316 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2317
2318 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2319 depth+1,
2320 (UV)TRIE_NODENUM( state ) );
2321
2322 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2323 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2324 if (v)
2325 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2326 else
2327 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2328 }
2329 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2330 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2331 (UV)trie->trans[ state ].check );
2332 } else {
2333 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2334 (UV)trie->trans[ state ].check,
2335 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2336 }
2337 }
2338}
2339
2340#endif
2341
2342
2343/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2344 startbranch: the first branch in the whole branch sequence
2345 first : start branch of sequence of branch-exact nodes.
2346 May be the same as startbranch
2347 last : Thing following the last branch.
2348 May be the same as tail.
2349 tail : item following the branch sequence
2350 count : words in the sequence
2351 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2352 depth : indent depth
2353
2354Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2355
2356A trie is an N'ary tree where the branches are determined by digital
2357decomposition of the key. IE, at the root node you look up the 1st character and
2358follow that branch repeat until you find the end of the branches. Nodes can be
2359marked as "accepting" meaning they represent a complete word. Eg:
2360
2361 /he|she|his|hers/
2362
2363would convert into the following structure. Numbers represent states, letters
2364following numbers represent valid transitions on the letter from that state, if
2365the number is in square brackets it represents an accepting state, otherwise it
2366will be in parenthesis.
2367
2368 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2369 | |
2370 | (2)
2371 | |
2372 (1) +-i->(6)-+-s->[7]
2373 |
2374 +-s->(3)-+-h->(4)-+-e->[5]
2375
2376 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2377
2378This shows that when matching against the string 'hers' we will begin at state 1
2379read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2380then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2381is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2382single traverse. We store a mapping from accepting to state to which word was
2383matched, and then when we have multiple possibilities we try to complete the
2384rest of the regex in the order in which they occurred in the alternation.
2385
2386The only prior NFA like behaviour that would be changed by the TRIE support is
2387the silent ignoring of duplicate alternations which are of the form:
2388
2389 / (DUPE|DUPE) X? (?{ ... }) Y /x
2390
2391Thus EVAL blocks following a trie may be called a different number of times with
2392and without the optimisation. With the optimisations dupes will be silently
2393ignored. This inconsistent behaviour of EVAL type nodes is well established as
2394the following demonstrates:
2395
2396 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2397
2398which prints out 'word' three times, but
2399
2400 'words'=~/(word|word|word)(?{ print $1 })S/
2401
2402which doesnt print it out at all. This is due to other optimisations kicking in.
2403
2404Example of what happens on a structural level:
2405
2406The regexp /(ac|ad|ab)+/ will produce the following debug output:
2407
2408 1: CURLYM[1] {1,32767}(18)
2409 5: BRANCH(8)
2410 6: EXACT <ac>(16)
2411 8: BRANCH(11)
2412 9: EXACT <ad>(16)
2413 11: BRANCH(14)
2414 12: EXACT <ab>(16)
2415 16: SUCCEED(0)
2416 17: NOTHING(18)
2417 18: END(0)
2418
2419This would be optimizable with startbranch=5, first=5, last=16, tail=16
2420and should turn into:
2421
2422 1: CURLYM[1] {1,32767}(18)
2423 5: TRIE(16)
2424 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2425 <ac>
2426 <ad>
2427 <ab>
2428 16: SUCCEED(0)
2429 17: NOTHING(18)
2430 18: END(0)
2431
2432Cases where tail != last would be like /(?foo|bar)baz/:
2433
2434 1: BRANCH(4)
2435 2: EXACT <foo>(8)
2436 4: BRANCH(7)
2437 5: EXACT <bar>(8)
2438 7: TAIL(8)
2439 8: EXACT <baz>(10)
2440 10: END(0)
2441
2442which would be optimizable with startbranch=1, first=1, last=7, tail=8
2443and would end up looking like:
2444
2445 1: TRIE(8)
2446 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2447 <foo>
2448 <bar>
2449 7: TAIL(8)
2450 8: EXACT <baz>(10)
2451 10: END(0)
2452
2453 d = uvchr_to_utf8_flags(d, uv, 0);
2454
2455is the recommended Unicode-aware way of saying
2456
2457 *(d++) = uv;
2458*/
2459
2460#define TRIE_STORE_REVCHAR(val) \
2461 STMT_START { \
2462 if (UTF) { \
2463 SV *zlopp = newSV(UTF8_MAXBYTES); \
2464 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2465 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2466 SvCUR_set(zlopp, kapow - flrbbbbb); \
2467 SvPOK_on(zlopp); \
2468 SvUTF8_on(zlopp); \
2469 av_push(revcharmap, zlopp); \
2470 } else { \
2471 char ooooff = (char)val; \
2472 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2473 } \
2474 } STMT_END
2475
2476/* This gets the next character from the input, folding it if not already
2477 * folded. */
2478#define TRIE_READ_CHAR STMT_START { \
2479 wordlen++; \
2480 if ( UTF ) { \
2481 /* if it is UTF then it is either already folded, or does not need \
2482 * folding */ \
2483 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2484 } \
2485 else if (folder == PL_fold_latin1) { \
2486 /* This folder implies Unicode rules, which in the range expressible \
2487 * by not UTF is the lower case, with the two exceptions, one of \
2488 * which should have been taken care of before calling this */ \
2489 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2490 uvc = toLOWER_L1(*uc); \
2491 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2492 len = 1; \
2493 } else { \
2494 /* raw data, will be folded later if needed */ \
2495 uvc = (U32)*uc; \
2496 len = 1; \
2497 } \
2498} STMT_END
2499
2500
2501
2502#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2503 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2504 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2505 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2506 TRIE_LIST_LEN( state ) = ging; \
2507 } \
2508 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2509 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2510 TRIE_LIST_CUR( state )++; \
2511} STMT_END
2512
2513#define TRIE_LIST_NEW(state) STMT_START { \
2514 Newx( trie->states[ state ].trans.list, \
2515 4, reg_trie_trans_le ); \
2516 TRIE_LIST_CUR( state ) = 1; \
2517 TRIE_LIST_LEN( state ) = 4; \
2518} STMT_END
2519
2520#define TRIE_HANDLE_WORD(state) STMT_START { \
2521 U16 dupe= trie->states[ state ].wordnum; \
2522 regnode * const noper_next = regnext( noper ); \
2523 \
2524 DEBUG_r({ \
2525 /* store the word for dumping */ \
2526 SV* tmp; \
2527 if (OP(noper) != NOTHING) \
2528 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2529 else \
2530 tmp = newSVpvn_utf8( "", 0, UTF ); \
2531 av_push( trie_words, tmp ); \
2532 }); \
2533 \
2534 curword++; \
2535 trie->wordinfo[curword].prev = 0; \
2536 trie->wordinfo[curword].len = wordlen; \
2537 trie->wordinfo[curword].accept = state; \
2538 \
2539 if ( noper_next < tail ) { \
2540 if (!trie->jump) \
2541 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2542 sizeof(U16) ); \
2543 trie->jump[curword] = (U16)(noper_next - convert); \
2544 if (!jumper) \
2545 jumper = noper_next; \
2546 if (!nextbranch) \
2547 nextbranch= regnext(cur); \
2548 } \
2549 \
2550 if ( dupe ) { \
2551 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2552 /* chain, so that when the bits of chain are later */\
2553 /* linked together, the dups appear in the chain */\
2554 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2555 trie->wordinfo[dupe].prev = curword; \
2556 } else { \
2557 /* we haven't inserted this word yet. */ \
2558 trie->states[ state ].wordnum = curword; \
2559 } \
2560} STMT_END
2561
2562
2563#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2564 ( ( base + charid >= ucharcount \
2565 && base + charid < ubound \
2566 && state == trie->trans[ base - ucharcount + charid ].check \
2567 && trie->trans[ base - ucharcount + charid ].next ) \
2568 ? trie->trans[ base - ucharcount + charid ].next \
2569 : ( state==1 ? special : 0 ) \
2570 )
2571
2572#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2573STMT_START { \
2574 TRIE_BITMAP_SET(trie, uvc); \
2575 /* store the folded codepoint */ \
2576 if ( folder ) \
2577 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2578 \
2579 if ( !UTF ) { \
2580 /* store first byte of utf8 representation of */ \
2581 /* variant codepoints */ \
2582 if (! UVCHR_IS_INVARIANT(uvc)) { \
2583 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2584 } \
2585 } \
2586} STMT_END
2587#define MADE_TRIE 1
2588#define MADE_JUMP_TRIE 2
2589#define MADE_EXACT_TRIE 4
2590
2591STATIC I32
2592S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2593 regnode *first, regnode *last, regnode *tail,
2594 U32 word_count, U32 flags, U32 depth)
2595{
2596 /* first pass, loop through and scan words */
2597 reg_trie_data *trie;
2598 HV *widecharmap = NULL;
2599 AV *revcharmap = newAV();
2600 regnode *cur;
2601 STRLEN len = 0;
2602 UV uvc = 0;
2603 U16 curword = 0;
2604 U32 next_alloc = 0;
2605 regnode *jumper = NULL;
2606 regnode *nextbranch = NULL;
2607 regnode *convert = NULL;
2608 U32 *prev_states; /* temp array mapping each state to previous one */
2609 /* we just use folder as a flag in utf8 */
2610 const U8 * folder = NULL;
2611
2612 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2613 * which stands for one trie structure, one hash, optionally followed
2614 * by two arrays */
2615#ifdef DEBUGGING
2616 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2617 AV *trie_words = NULL;
2618 /* along with revcharmap, this only used during construction but both are
2619 * useful during debugging so we store them in the struct when debugging.
2620 */
2621#else
2622 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2623 STRLEN trie_charcount=0;
2624#endif
2625 SV *re_trie_maxbuff;
2626 GET_RE_DEBUG_FLAGS_DECL;
2627
2628 PERL_ARGS_ASSERT_MAKE_TRIE;
2629#ifndef DEBUGGING
2630 PERL_UNUSED_ARG(depth);
2631#endif
2632
2633 switch (flags) {
2634 case EXACT: case EXACTL: break;
2635 case EXACTFAA:
2636 case EXACTFU_SS:
2637 case EXACTFU:
2638 case EXACTFLU8: folder = PL_fold_latin1; break;
2639 case EXACTF: folder = PL_fold; break;
2640 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2641 }
2642
2643 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2644 trie->refcount = 1;
2645 trie->startstate = 1;
2646 trie->wordcount = word_count;
2647 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2648 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2649 if (flags == EXACT || flags == EXACTL)
2650 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2651 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2652 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2653
2654 DEBUG_r({
2655 trie_words = newAV();
2656 });
2657
2658 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2659 assert(re_trie_maxbuff);
2660 if (!SvIOK(re_trie_maxbuff)) {
2661 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2662 }
2663 DEBUG_TRIE_COMPILE_r({
2664 Perl_re_indentf( aTHX_
2665 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2666 depth+1,
2667 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2668 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2669 });
2670
2671 /* Find the node we are going to overwrite */
2672 if ( first == startbranch && OP( last ) != BRANCH ) {
2673 /* whole branch chain */
2674 convert = first;
2675 } else {
2676 /* branch sub-chain */
2677 convert = NEXTOPER( first );
2678 }
2679
2680 /* -- First loop and Setup --
2681
2682 We first traverse the branches and scan each word to determine if it
2683 contains widechars, and how many unique chars there are, this is
2684 important as we have to build a table with at least as many columns as we
2685 have unique chars.
2686
2687 We use an array of integers to represent the character codes 0..255
2688 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2689 the native representation of the character value as the key and IV's for
2690 the coded index.
2691
2692 *TODO* If we keep track of how many times each character is used we can
2693 remap the columns so that the table compression later on is more
2694 efficient in terms of memory by ensuring the most common value is in the
2695 middle and the least common are on the outside. IMO this would be better
2696 than a most to least common mapping as theres a decent chance the most
2697 common letter will share a node with the least common, meaning the node
2698 will not be compressible. With a middle is most common approach the worst
2699 case is when we have the least common nodes twice.
2700
2701 */
2702
2703 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2704 regnode *noper = NEXTOPER( cur );
2705 const U8 *uc;
2706 const U8 *e;
2707 int foldlen = 0;
2708 U32 wordlen = 0; /* required init */
2709 STRLEN minchars = 0;
2710 STRLEN maxchars = 0;
2711 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2712 bitmap?*/
2713
2714 if (OP(noper) == NOTHING) {
2715 /* skip past a NOTHING at the start of an alternation
2716 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2717 */
2718 regnode *noper_next= regnext(noper);
2719 if (noper_next < tail)
2720 noper= noper_next;
2721 }
2722
2723 if ( noper < tail &&
2724 (
2725 OP(noper) == flags ||
2726 (
2727 flags == EXACTFU &&
2728 OP(noper) == EXACTFU_SS
2729 )
2730 )
2731 ) {
2732 uc= (U8*)STRING(noper);
2733 e= uc + STR_LEN(noper);
2734 } else {
2735 trie->minlen= 0;
2736 continue;
2737 }
2738
2739
2740 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2741 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2742 regardless of encoding */
2743 if (OP( noper ) == EXACTFU_SS) {
2744 /* false positives are ok, so just set this */
2745 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2746 }
2747 }
2748
2749 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2750 branch */
2751 TRIE_CHARCOUNT(trie)++;
2752 TRIE_READ_CHAR;
2753
2754 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2755 * is in effect. Under /i, this character can match itself, or
2756 * anything that folds to it. If not under /i, it can match just
2757 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2758 * all fold to k, and all are single characters. But some folds
2759 * expand to more than one character, so for example LATIN SMALL
2760 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2761 * the string beginning at 'uc' is 'ffi', it could be matched by
2762 * three characters, or just by the one ligature character. (It
2763 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2764 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2765 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2766 * match.) The trie needs to know the minimum and maximum number
2767 * of characters that could match so that it can use size alone to
2768 * quickly reject many match attempts. The max is simple: it is
2769 * the number of folded characters in this branch (since a fold is
2770 * never shorter than what folds to it. */
2771
2772 maxchars++;
2773
2774 /* And the min is equal to the max if not under /i (indicated by
2775 * 'folder' being NULL), or there are no multi-character folds. If
2776 * there is a multi-character fold, the min is incremented just
2777 * once, for the character that folds to the sequence. Each
2778 * character in the sequence needs to be added to the list below of
2779 * characters in the trie, but we count only the first towards the
2780 * min number of characters needed. This is done through the
2781 * variable 'foldlen', which is returned by the macros that look
2782 * for these sequences as the number of bytes the sequence
2783 * occupies. Each time through the loop, we decrement 'foldlen' by
2784 * how many bytes the current char occupies. Only when it reaches
2785 * 0 do we increment 'minchars' or look for another multi-character
2786 * sequence. */
2787 if (folder == NULL) {
2788 minchars++;
2789 }
2790 else if (foldlen > 0) {
2791 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2792 }
2793 else {
2794 minchars++;
2795
2796 /* See if *uc is the beginning of a multi-character fold. If
2797 * so, we decrement the length remaining to look at, to account
2798 * for the current character this iteration. (We can use 'uc'
2799 * instead of the fold returned by TRIE_READ_CHAR because for
2800 * non-UTF, the latin1_safe macro is smart enough to account
2801 * for all the unfolded characters, and because for UTF, the
2802 * string will already have been folded earlier in the
2803 * compilation process */
2804 if (UTF) {
2805 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2806 foldlen -= UTF8SKIP(uc);
2807 }
2808 }
2809 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2810 foldlen--;
2811 }
2812 }
2813
2814 /* The current character (and any potential folds) should be added
2815 * to the possible matching characters for this position in this
2816 * branch */
2817 if ( uvc < 256 ) {
2818 if ( folder ) {
2819 U8 folded= folder[ (U8) uvc ];
2820 if ( !trie->charmap[ folded ] ) {
2821 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2822 TRIE_STORE_REVCHAR( folded );
2823 }
2824 }
2825 if ( !trie->charmap[ uvc ] ) {
2826 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2827 TRIE_STORE_REVCHAR( uvc );
2828 }
2829 if ( set_bit ) {
2830 /* store the codepoint in the bitmap, and its folded
2831 * equivalent. */
2832 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2833 set_bit = 0; /* We've done our bit :-) */
2834 }
2835 } else {
2836
2837 /* XXX We could come up with the list of code points that fold
2838 * to this using PL_utf8_foldclosures, except not for
2839 * multi-char folds, as there may be multiple combinations
2840 * there that could work, which needs to wait until runtime to
2841 * resolve (The comment about LIGATURE FFI above is such an
2842 * example */
2843
2844 SV** svpp;
2845 if ( !widecharmap )
2846 widecharmap = newHV();
2847
2848 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2849
2850 if ( !svpp )
2851 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2852
2853 if ( !SvTRUE( *svpp ) ) {
2854 sv_setiv( *svpp, ++trie->uniquecharcount );
2855 TRIE_STORE_REVCHAR(uvc);
2856 }
2857 }
2858 } /* end loop through characters in this branch of the trie */
2859
2860 /* We take the min and max for this branch and combine to find the min
2861 * and max for all branches processed so far */
2862 if( cur == first ) {
2863 trie->minlen = minchars;
2864 trie->maxlen = maxchars;
2865 } else if (minchars < trie->minlen) {
2866 trie->minlen = minchars;
2867 } else if (maxchars > trie->maxlen) {
2868 trie->maxlen = maxchars;
2869 }
2870 } /* end first pass */
2871 DEBUG_TRIE_COMPILE_r(
2872 Perl_re_indentf( aTHX_
2873 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2874 depth+1,
2875 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2876 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2877 (int)trie->minlen, (int)trie->maxlen )
2878 );
2879
2880 /*
2881 We now know what we are dealing with in terms of unique chars and
2882 string sizes so we can calculate how much memory a naive
2883 representation using a flat table will take. If it's over a reasonable
2884 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2885 conservative but potentially much slower representation using an array
2886 of lists.
2887
2888 At the end we convert both representations into the same compressed
2889 form that will be used in regexec.c for matching with. The latter
2890 is a form that cannot be used to construct with but has memory
2891 properties similar to the list form and access properties similar
2892 to the table form making it both suitable for fast searches and
2893 small enough that its feasable to store for the duration of a program.
2894
2895 See the comment in the code where the compressed table is produced
2896 inplace from the flat tabe representation for an explanation of how
2897 the compression works.
2898
2899 */
2900
2901
2902 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2903 prev_states[1] = 0;
2904
2905 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2906 > SvIV(re_trie_maxbuff) )
2907 {
2908 /*
2909 Second Pass -- Array Of Lists Representation
2910
2911 Each state will be represented by a list of charid:state records
2912 (reg_trie_trans_le) the first such element holds the CUR and LEN
2913 points of the allocated array. (See defines above).
2914
2915 We build the initial structure using the lists, and then convert
2916 it into the compressed table form which allows faster lookups
2917 (but cant be modified once converted).
2918 */
2919
2920 STRLEN transcount = 1;
2921
2922 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2923 depth+1));
2924
2925 trie->states = (reg_trie_state *)
2926 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2927 sizeof(reg_trie_state) );
2928 TRIE_LIST_NEW(1);
2929 next_alloc = 2;
2930
2931 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2932
2933 regnode *noper = NEXTOPER( cur );
2934 U32 state = 1; /* required init */
2935 U16 charid = 0; /* sanity init */
2936 U32 wordlen = 0; /* required init */
2937
2938 if (OP(noper) == NOTHING) {
2939 regnode *noper_next= regnext(noper);
2940 if (noper_next < tail)
2941 noper= noper_next;
2942 }
2943
2944 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2945 const U8 *uc= (U8*)STRING(noper);
2946 const U8 *e= uc + STR_LEN(noper);
2947
2948 for ( ; uc < e ; uc += len ) {
2949
2950 TRIE_READ_CHAR;
2951
2952 if ( uvc < 256 ) {
2953 charid = trie->charmap[ uvc ];
2954 } else {
2955 SV** const svpp = hv_fetch( widecharmap,
2956 (char*)&uvc,
2957 sizeof( UV ),
2958 0);
2959 if ( !svpp ) {
2960 charid = 0;
2961 } else {
2962 charid=(U16)SvIV( *svpp );
2963 }
2964 }
2965 /* charid is now 0 if we dont know the char read, or
2966 * nonzero if we do */
2967 if ( charid ) {
2968
2969 U16 check;
2970 U32 newstate = 0;
2971
2972 charid--;
2973 if ( !trie->states[ state ].trans.list ) {
2974 TRIE_LIST_NEW( state );
2975 }
2976 for ( check = 1;
2977 check <= TRIE_LIST_USED( state );
2978 check++ )
2979 {
2980 if ( TRIE_LIST_ITEM( state, check ).forid
2981 == charid )
2982 {
2983 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2984 break;
2985 }
2986 }
2987 if ( ! newstate ) {
2988 newstate = next_alloc++;
2989 prev_states[newstate] = state;
2990 TRIE_LIST_PUSH( state, charid, newstate );
2991 transcount++;
2992 }
2993 state = newstate;
2994 } else {
2995 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
2996 }
2997 }
2998 }
2999 TRIE_HANDLE_WORD(state);
3000
3001 } /* end second pass */
3002
3003 /* next alloc is the NEXT state to be allocated */
3004 trie->statecount = next_alloc;
3005 trie->states = (reg_trie_state *)
3006 PerlMemShared_realloc( trie->states,
3007 next_alloc
3008 * sizeof(reg_trie_state) );
3009
3010 /* and now dump it out before we compress it */
3011 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3012 revcharmap, next_alloc,
3013 depth+1)
3014 );
3015
3016 trie->trans = (reg_trie_trans *)
3017 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3018 {
3019 U32 state;
3020 U32 tp = 0;
3021 U32 zp = 0;
3022
3023
3024 for( state=1 ; state < next_alloc ; state ++ ) {
3025 U32 base=0;
3026
3027 /*
3028 DEBUG_TRIE_COMPILE_MORE_r(
3029 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3030 );
3031 */
3032
3033 if (trie->states[state].trans.list) {
3034 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3035 U16 maxid=minid;
3036 U16 idx;
3037
3038 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3039 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3040 if ( forid < minid ) {
3041 minid=forid;
3042 } else if ( forid > maxid ) {
3043 maxid=forid;
3044 }
3045 }
3046 if ( transcount < tp + maxid - minid + 1) {
3047 transcount *= 2;
3048 trie->trans = (reg_trie_trans *)
3049 PerlMemShared_realloc( trie->trans,
3050 transcount
3051 * sizeof(reg_trie_trans) );
3052 Zero( trie->trans + (transcount / 2),
3053 transcount / 2,
3054 reg_trie_trans );
3055 }
3056 base = trie->uniquecharcount + tp - minid;
3057 if ( maxid == minid ) {
3058 U32 set = 0;
3059 for ( ; zp < tp ; zp++ ) {
3060 if ( ! trie->trans[ zp ].next ) {
3061 base = trie->uniquecharcount + zp - minid;
3062 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3063 1).newstate;
3064 trie->trans[ zp ].check = state;
3065 set = 1;
3066 break;
3067 }
3068 }
3069 if ( !set ) {
3070 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3071 1).newstate;
3072 trie->trans[ tp ].check = state;
3073 tp++;
3074 zp = tp;
3075 }
3076 } else {
3077 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3078 const U32 tid = base
3079 - trie->uniquecharcount
3080 + TRIE_LIST_ITEM( state, idx ).forid;
3081 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3082 idx ).newstate;
3083 trie->trans[ tid ].check = state;
3084 }
3085 tp += ( maxid - minid + 1 );
3086 }
3087 Safefree(trie->states[ state ].trans.list);
3088 }
3089 /*
3090 DEBUG_TRIE_COMPILE_MORE_r(
3091 Perl_re_printf( aTHX_ " base: %d\n",base);
3092 );
3093 */
3094 trie->states[ state ].trans.base=base;
3095 }
3096 trie->lasttrans = tp + 1;
3097 }
3098 } else {
3099 /*
3100 Second Pass -- Flat Table Representation.
3101
3102 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3103 each. We know that we will need Charcount+1 trans at most to store
3104 the data (one row per char at worst case) So we preallocate both
3105 structures assuming worst case.
3106
3107 We then construct the trie using only the .next slots of the entry
3108 structs.
3109
3110 We use the .check field of the first entry of the node temporarily
3111 to make compression both faster and easier by keeping track of how
3112 many non zero fields are in the node.
3113
3114 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3115 transition.
3116
3117 There are two terms at use here: state as a TRIE_NODEIDX() which is
3118 a number representing the first entry of the node, and state as a
3119 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3120 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3121 if there are 2 entrys per node. eg:
3122
3123 A B A B
3124 1. 2 4 1. 3 7
3125 2. 0 3 3. 0 5
3126 3. 0 0 5. 0 0
3127 4. 0 0 7. 0 0
3128
3129 The table is internally in the right hand, idx form. However as we
3130 also have to deal with the states array which is indexed by nodenum
3131 we have to use TRIE_NODENUM() to convert.
3132
3133 */
3134 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3135 depth+1));
3136
3137 trie->trans = (reg_trie_trans *)
3138 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3139 * trie->uniquecharcount + 1,
3140 sizeof(reg_trie_trans) );
3141 trie->states = (reg_trie_state *)
3142 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3143 sizeof(reg_trie_state) );
3144 next_alloc = trie->uniquecharcount + 1;
3145
3146
3147 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3148
3149 regnode *noper = NEXTOPER( cur );
3150
3151 U32 state = 1; /* required init */
3152
3153 U16 charid = 0; /* sanity init */
3154 U32 accept_state = 0; /* sanity init */
3155
3156 U32 wordlen = 0; /* required init */
3157
3158 if (OP(noper) == NOTHING) {
3159 regnode *noper_next= regnext(noper);
3160 if (noper_next < tail)
3161 noper= noper_next;
3162 }
3163
3164 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3165 const U8 *uc= (U8*)STRING(noper);
3166 const U8 *e= uc + STR_LEN(noper);
3167
3168 for ( ; uc < e ; uc += len ) {
3169
3170 TRIE_READ_CHAR;
3171
3172 if ( uvc < 256 ) {
3173 charid = trie->charmap[ uvc ];
3174 } else {
3175 SV* const * const svpp = hv_fetch( widecharmap,
3176 (char*)&uvc,
3177 sizeof( UV ),
3178 0);
3179 charid = svpp ? (U16)SvIV(*svpp) : 0;
3180 }
3181 if ( charid ) {
3182 charid--;
3183 if ( !trie->trans[ state + charid ].next ) {
3184 trie->trans[ state + charid ].next = next_alloc;
3185 trie->trans[ state ].check++;
3186 prev_states[TRIE_NODENUM(next_alloc)]
3187 = TRIE_NODENUM(state);
3188 next_alloc += trie->uniquecharcount;
3189 }
3190 state = trie->trans[ state + charid ].next;
3191 } else {
3192 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3193 }
3194 /* charid is now 0 if we dont know the char read, or
3195 * nonzero if we do */
3196 }
3197 }
3198 accept_state = TRIE_NODENUM( state );
3199 TRIE_HANDLE_WORD(accept_state);
3200
3201 } /* end second pass */
3202
3203 /* and now dump it out before we compress it */
3204 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3205 revcharmap,
3206 next_alloc, depth+1));
3207
3208 {
3209 /*
3210 * Inplace compress the table.*
3211
3212 For sparse data sets the table constructed by the trie algorithm will
3213 be mostly 0/FAIL transitions or to put it another way mostly empty.
3214 (Note that leaf nodes will not contain any transitions.)
3215
3216 This algorithm compresses the tables by eliminating most such
3217 transitions, at the cost of a modest bit of extra work during lookup:
3218
3219 - Each states[] entry contains a .base field which indicates the
3220 index in the state[] array wheres its transition data is stored.
3221
3222 - If .base is 0 there are no valid transitions from that node.
3223
3224 - If .base is nonzero then charid is added to it to find an entry in
3225 the trans array.
3226
3227 -If trans[states[state].base+charid].check!=state then the
3228 transition is taken to be a 0/Fail transition. Thus if there are fail
3229 transitions at the front of the node then the .base offset will point
3230 somewhere inside the previous nodes data (or maybe even into a node
3231 even earlier), but the .check field determines if the transition is
3232 valid.
3233
3234 XXX - wrong maybe?
3235 The following process inplace converts the table to the compressed
3236 table: We first do not compress the root node 1,and mark all its
3237 .check pointers as 1 and set its .base pointer as 1 as well. This
3238 allows us to do a DFA construction from the compressed table later,
3239 and ensures that any .base pointers we calculate later are greater
3240 than 0.
3241
3242 - We set 'pos' to indicate the first entry of the second node.
3243
3244 - We then iterate over the columns of the node, finding the first and
3245 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3246 and set the .check pointers accordingly, and advance pos
3247 appropriately and repreat for the next node. Note that when we copy
3248 the next pointers we have to convert them from the original
3249 NODEIDX form to NODENUM form as the former is not valid post
3250 compression.
3251
3252 - If a node has no transitions used we mark its base as 0 and do not
3253 advance the pos pointer.
3254
3255 - If a node only has one transition we use a second pointer into the
3256 structure to fill in allocated fail transitions from other states.
3257 This pointer is independent of the main pointer and scans forward
3258 looking for null transitions that are allocated to a state. When it
3259 finds one it writes the single transition into the "hole". If the
3260 pointer doesnt find one the single transition is appended as normal.
3261
3262 - Once compressed we can Renew/realloc the structures to release the
3263 excess space.
3264
3265 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3266 specifically Fig 3.47 and the associated pseudocode.
3267
3268 demq
3269 */
3270 const U32 laststate = TRIE_NODENUM( next_alloc );
3271 U32 state, charid;
3272 U32 pos = 0, zp=0;
3273 trie->statecount = laststate;
3274
3275 for ( state = 1 ; state < laststate ; state++ ) {
3276 U8 flag = 0;
3277 const U32 stateidx = TRIE_NODEIDX( state );
3278 const U32 o_used = trie->trans[ stateidx ].check;
3279 U32 used = trie->trans[ stateidx ].check;
3280 trie->trans[ stateidx ].check = 0;
3281
3282 for ( charid = 0;
3283 used && charid < trie->uniquecharcount;
3284 charid++ )
3285 {
3286 if ( flag || trie->trans[ stateidx + charid ].next ) {
3287 if ( trie->trans[ stateidx + charid ].next ) {
3288 if (o_used == 1) {
3289 for ( ; zp < pos ; zp++ ) {
3290 if ( ! trie->trans[ zp ].next ) {
3291 break;
3292 }
3293 }
3294 trie->states[ state ].trans.base
3295 = zp
3296 + trie->uniquecharcount
3297 - charid ;
3298 trie->trans[ zp ].next
3299 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3300 + charid ].next );
3301 trie->trans[ zp ].check = state;
3302 if ( ++zp > pos ) pos = zp;
3303 break;
3304 }
3305 used--;
3306 }
3307 if ( !flag ) {
3308 flag = 1;
3309 trie->states[ state ].trans.base
3310 = pos + trie->uniquecharcount - charid ;
3311 }
3312 trie->trans[ pos ].next
3313 = SAFE_TRIE_NODENUM(
3314 trie->trans[ stateidx + charid ].next );
3315 trie->trans[ pos ].check = state;
3316 pos++;
3317 }
3318 }
3319 }
3320 trie->lasttrans = pos + 1;
3321 trie->states = (reg_trie_state *)
3322 PerlMemShared_realloc( trie->states, laststate
3323 * sizeof(reg_trie_state) );
3324 DEBUG_TRIE_COMPILE_MORE_r(
3325 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3326 depth+1,
3327 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3328 + 1 ),
3329 (IV)next_alloc,
3330 (IV)pos,
3331 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3332 );
3333
3334 } /* end table compress */
3335 }
3336 DEBUG_TRIE_COMPILE_MORE_r(
3337 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3338 depth+1,
3339 (UV)trie->statecount,
3340 (UV)trie->lasttrans)
3341 );
3342 /* resize the trans array to remove unused space */
3343 trie->trans = (reg_trie_trans *)
3344 PerlMemShared_realloc( trie->trans, trie->lasttrans
3345 * sizeof(reg_trie_trans) );
3346
3347 { /* Modify the program and insert the new TRIE node */
3348 U8 nodetype =(U8)(flags & 0xFF);
3349 char *str=NULL;
3350
3351#ifdef DEBUGGING
3352 regnode *optimize = NULL;
3353#ifdef RE_TRACK_PATTERN_OFFSETS
3354
3355 U32 mjd_offset = 0;
3356 U32 mjd_nodelen = 0;
3357#endif /* RE_TRACK_PATTERN_OFFSETS */
3358#endif /* DEBUGGING */
3359 /*
3360 This means we convert either the first branch or the first Exact,
3361 depending on whether the thing following (in 'last') is a branch
3362 or not and whther first is the startbranch (ie is it a sub part of
3363 the alternation or is it the whole thing.)
3364 Assuming its a sub part we convert the EXACT otherwise we convert
3365 the whole branch sequence, including the first.
3366 */
3367 /* Find the node we are going to overwrite */
3368 if ( first != startbranch || OP( last ) == BRANCH ) {
3369 /* branch sub-chain */
3370 NEXT_OFF( first ) = (U16)(last - first);
3371#ifdef RE_TRACK_PATTERN_OFFSETS
3372 DEBUG_r({
3373 mjd_offset= Node_Offset((convert));
3374 mjd_nodelen= Node_Length((convert));
3375 });
3376#endif
3377 /* whole branch chain */
3378 }
3379#ifdef RE_TRACK_PATTERN_OFFSETS
3380 else {
3381 DEBUG_r({
3382 const regnode *nop = NEXTOPER( convert );
3383 mjd_offset= Node_Offset((nop));
3384 mjd_nodelen= Node_Length((nop));
3385 });
3386 }
3387 DEBUG_OPTIMISE_r(
3388 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3389 depth+1,
3390 (UV)mjd_offset, (UV)mjd_nodelen)
3391 );
3392#endif
3393 /* But first we check to see if there is a common prefix we can
3394 split out as an EXACT and put in front of the TRIE node. */
3395 trie->startstate= 1;
3396 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3397 /* we want to find the first state that has more than
3398 * one transition, if that state is not the first state
3399 * then we have a common prefix which we can remove.
3400 */
3401 U32 state;
3402 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3403 U32 ofs = 0;
3404 I32 first_ofs = -1; /* keeps track of the ofs of the first
3405 transition, -1 means none */
3406 U32 count = 0;
3407 const U32 base = trie->states[ state ].trans.base;
3408
3409 /* does this state terminate an alternation? */
3410 if ( trie->states[state].wordnum )
3411 count = 1;
3412
3413 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3414 if ( ( base + ofs >= trie->uniquecharcount ) &&
3415 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3416 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3417 {
3418 if ( ++count > 1 ) {
3419 /* we have more than one transition */
3420 SV **tmp;
3421 U8 *ch;
3422 /* if this is the first state there is no common prefix
3423 * to extract, so we can exit */
3424 if ( state == 1 ) break;
3425 tmp = av_fetch( revcharmap, ofs, 0);
3426 ch = (U8*)SvPV_nolen_const( *tmp );
3427
3428 /* if we are on count 2 then we need to initialize the
3429 * bitmap, and store the previous char if there was one
3430 * in it*/
3431 if ( count == 2 ) {
3432 /* clear the bitmap */
3433 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3434 DEBUG_OPTIMISE_r(
3435 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3436 depth+1,
3437 (UV)state));
3438 if (first_ofs >= 0) {
3439 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3440 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3441
3442 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3443 DEBUG_OPTIMISE_r(
3444 Perl_re_printf( aTHX_ "%s", (char*)ch)
3445 );
3446 }
3447 }
3448 /* store the current firstchar in the bitmap */
3449 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3450 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3451 }
3452 first_ofs = ofs;
3453 }
3454 }
3455 if ( count == 1 ) {
3456 /* This state has only one transition, its transition is part
3457 * of a common prefix - we need to concatenate the char it
3458 * represents to what we have so far. */
3459 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3460 STRLEN len;
3461 char *ch = SvPV( *tmp, len );
3462 DEBUG_OPTIMISE_r({
3463 SV *sv=sv_newmortal();
3464 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3465 depth+1,
3466 (UV)state, (UV)first_ofs,
3467 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3468 PL_colors[0], PL_colors[1],
3469 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3470 PERL_PV_ESCAPE_FIRSTCHAR
3471 )
3472 );
3473 });
3474 if ( state==1 ) {
3475 OP( convert ) = nodetype;
3476 str=STRING(convert);
3477 STR_LEN(convert)=0;
3478 }
3479 STR_LEN(convert) += len;
3480 while (len--)
3481 *str++ = *ch++;
3482 } else {
3483#ifdef DEBUGGING
3484 if (state>1)
3485 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3486#endif
3487 break;
3488 }
3489 }
3490 trie->prefixlen = (state-1);
3491 if (str) {
3492 regnode *n = convert+NODE_SZ_STR(convert);
3493 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3494 trie->startstate = state;
3495 trie->minlen -= (state - 1);
3496 trie->maxlen -= (state - 1);
3497#ifdef DEBUGGING
3498 /* At least the UNICOS C compiler choked on this
3499 * being argument to DEBUG_r(), so let's just have
3500 * it right here. */
3501 if (
3502#ifdef PERL_EXT_RE_BUILD
3503 1
3504#else
3505 DEBUG_r_TEST
3506#endif
3507 ) {
3508 regnode *fix = convert;
3509 U32 word = trie->wordcount;
3510#ifdef RE_TRACK_PATTERN_OFFSETS
3511 mjd_nodelen++;
3512#endif
3513 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3514 while( ++fix < n ) {
3515 Set_Node_Offset_Length(fix, 0, 0);
3516 }
3517 while (word--) {
3518 SV ** const tmp = av_fetch( trie_words, word, 0 );
3519 if (tmp) {
3520 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3521 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3522 else
3523 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3524 }
3525 }
3526 }
3527#endif
3528 if (trie->maxlen) {
3529 convert = n;
3530 } else {
3531 NEXT_OFF(convert) = (U16)(tail - convert);
3532 DEBUG_r(optimize= n);
3533 }
3534 }
3535 }
3536 if (!jumper)
3537 jumper = last;
3538 if ( trie->maxlen ) {
3539 NEXT_OFF( convert ) = (U16)(tail - convert);
3540 ARG_SET( convert, data_slot );
3541 /* Store the offset to the first unabsorbed branch in
3542 jump[0], which is otherwise unused by the jump logic.
3543 We use this when dumping a trie and during optimisation. */
3544 if (trie->jump)
3545 trie->jump[0] = (U16)(nextbranch - convert);
3546
3547 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3548 * and there is a bitmap
3549 * and the first "jump target" node we found leaves enough room
3550 * then convert the TRIE node into a TRIEC node, with the bitmap
3551 * embedded inline in the opcode - this is hypothetically faster.
3552 */
3553 if ( !trie->states[trie->startstate].wordnum
3554 && trie->bitmap
3555 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3556 {
3557 OP( convert ) = TRIEC;
3558 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3559 PerlMemShared_free(trie->bitmap);
3560 trie->bitmap= NULL;
3561 } else
3562 OP( convert ) = TRIE;
3563
3564 /* store the type in the flags */
3565 convert->flags = nodetype;
3566 DEBUG_r({
3567 optimize = convert
3568 + NODE_STEP_REGNODE
3569 + regarglen[ OP( convert ) ];
3570 });
3571 /* XXX We really should free up the resource in trie now,
3572 as we won't use them - (which resources?) dmq */
3573 }
3574 /* needed for dumping*/
3575 DEBUG_r(if (optimize) {
3576 regnode *opt = convert;
3577
3578 while ( ++opt < optimize) {
3579 Set_Node_Offset_Length(opt, 0, 0);
3580 }
3581 /*
3582 Try to clean up some of the debris left after the
3583 optimisation.
3584 */
3585 while( optimize < jumper ) {
3586#ifdef RE_TRACK_PATTERN_OFFSETS
3587 mjd_nodelen += Node_Length((optimize));
3588#endif
3589 OP( optimize ) = OPTIMIZED;
3590 Set_Node_Offset_Length(optimize, 0, 0);
3591 optimize++;
3592 }
3593 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3594 });
3595 } /* end node insert */
3596
3597 /* Finish populating the prev field of the wordinfo array. Walk back
3598 * from each accept state until we find another accept state, and if
3599 * so, point the first word's .prev field at the second word. If the
3600 * second already has a .prev field set, stop now. This will be the
3601 * case either if we've already processed that word's accept state,
3602 * or that state had multiple words, and the overspill words were
3603 * already linked up earlier.
3604 */
3605 {
3606 U16 word;
3607 U32 state;
3608 U16 prev;
3609
3610 for (word=1; word <= trie->wordcount; word++) {
3611 prev = 0;
3612 if (trie->wordinfo[word].prev)
3613 continue;
3614 state = trie->wordinfo[word].accept;
3615 while (state) {
3616 state = prev_states[state];
3617 if (!state)
3618 break;
3619 prev = trie->states[state].wordnum;
3620 if (prev)
3621 break;
3622 }
3623 trie->wordinfo[word].prev = prev;
3624 }
3625 Safefree(prev_states);
3626 }
3627
3628
3629 /* and now dump out the compressed format */
3630 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3631
3632 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3633#ifdef DEBUGGING
3634 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3635 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3636#else
3637 SvREFCNT_dec_NN(revcharmap);
3638#endif
3639 return trie->jump
3640 ? MADE_JUMP_TRIE
3641 : trie->startstate>1
3642 ? MADE_EXACT_TRIE
3643 : MADE_TRIE;
3644}
3645
3646STATIC regnode *
3647S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3648{
3649/* The Trie is constructed and compressed now so we can build a fail array if
3650 * it's needed
3651
3652 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3653 3.32 in the
3654 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3655 Ullman 1985/88
3656 ISBN 0-201-10088-6
3657
3658 We find the fail state for each state in the trie, this state is the longest
3659 proper suffix of the current state's 'word' that is also a proper prefix of
3660 another word in our trie. State 1 represents the word '' and is thus the
3661 default fail state. This allows the DFA not to have to restart after its
3662 tried and failed a word at a given point, it simply continues as though it
3663 had been matching the other word in the first place.
3664 Consider
3665 'abcdgu'=~/abcdefg|cdgu/
3666 When we get to 'd' we are still matching the first word, we would encounter
3667 'g' which would fail, which would bring us to the state representing 'd' in
3668 the second word where we would try 'g' and succeed, proceeding to match
3669 'cdgu'.
3670 */
3671 /* add a fail transition */
3672 const U32 trie_offset = ARG(source);
3673 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3674 U32 *q;
3675 const U32 ucharcount = trie->uniquecharcount;
3676 const U32 numstates = trie->statecount;
3677 const U32 ubound = trie->lasttrans + ucharcount;
3678 U32 q_read = 0;
3679 U32 q_write = 0;
3680 U32 charid;
3681 U32 base = trie->states[ 1 ].trans.base;
3682 U32 *fail;
3683 reg_ac_data *aho;
3684 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3685 regnode *stclass;
3686 GET_RE_DEBUG_FLAGS_DECL;
3687
3688 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3689 PERL_UNUSED_CONTEXT;
3690#ifndef DEBUGGING
3691 PERL_UNUSED_ARG(depth);
3692#endif
3693
3694 if ( OP(source) == TRIE ) {
3695 struct regnode_1 *op = (struct regnode_1 *)
3696 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3697 StructCopy(source, op, struct regnode_1);
3698 stclass = (regnode *)op;
3699 } else {
3700 struct regnode_charclass *op = (struct regnode_charclass *)
3701 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3702 StructCopy(source, op, struct regnode_charclass);
3703 stclass = (regnode *)op;
3704 }
3705 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3706
3707 ARG_SET( stclass, data_slot );
3708 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3709 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3710 aho->trie=trie_offset;
3711 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3712 Copy( trie->states, aho->states, numstates, reg_trie_state );
3713 Newx( q, numstates, U32);
3714 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3715 aho->refcount = 1;
3716 fail = aho->fail;
3717 /* initialize fail[0..1] to be 1 so that we always have
3718 a valid final fail state */
3719 fail[ 0 ] = fail[ 1 ] = 1;
3720
3721 for ( charid = 0; charid < ucharcount ; charid++ ) {
3722 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3723 if ( newstate ) {
3724 q[ q_write ] = newstate;
3725 /* set to point at the root */
3726 fail[ q[ q_write++ ] ]=1;
3727 }
3728 }
3729 while ( q_read < q_write) {
3730 const U32 cur = q[ q_read++ % numstates ];
3731 base = trie->states[ cur ].trans.base;
3732
3733 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3734 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3735 if (ch_state) {
3736 U32 fail_state = cur;
3737 U32 fail_base;
3738 do {
3739 fail_state = fail[ fail_state ];
3740 fail_base = aho->states[ fail_state ].trans.base;
3741 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3742
3743 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3744 fail[ ch_state ] = fail_state;
3745 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3746 {
3747 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3748 }
3749 q[ q_write++ % numstates] = ch_state;
3750 }
3751 }
3752 }
3753 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3754 when we fail in state 1, this allows us to use the
3755 charclass scan to find a valid start char. This is based on the principle
3756 that theres a good chance the string being searched contains lots of stuff
3757 that cant be a start char.
3758 */
3759 fail[ 0 ] = fail[ 1 ] = 0;
3760 DEBUG_TRIE_COMPILE_r({
3761 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3762 depth, (UV)numstates
3763 );
3764 for( q_read=1; q_read<numstates; q_read++ ) {
3765 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3766 }
3767 Perl_re_printf( aTHX_ "\n");
3768 });
3769 Safefree(q);
3770 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3771 return stclass;
3772}
3773
3774
3775/* The below joins as many adjacent EXACTish nodes as possible into a single
3776 * one. The regop may be changed if the node(s) contain certain sequences that
3777 * require special handling. The joining is only done if:
3778 * 1) there is room in the current conglomerated node to entirely contain the
3779 * next one.
3780 * 2) they are the exact same node type
3781 *
3782 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3783 * these get optimized out
3784 *
3785 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3786 * as possible, even if that means splitting an existing node so that its first
3787 * part is moved to the preceeding node. This would maximise the efficiency of
3788 * memEQ during matching.
3789 *
3790 * If a node is to match under /i (folded), the number of characters it matches
3791 * can be different than its character length if it contains a multi-character
3792 * fold. *min_subtract is set to the total delta number of characters of the
3793 * input nodes.
3794 *
3795 * And *unfolded_multi_char is set to indicate whether or not the node contains
3796 * an unfolded multi-char fold. This happens when it won't be known until
3797 * runtime whether the fold is valid or not; namely
3798 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3799 * target string being matched against turns out to be UTF-8 is that fold
3800 * valid; or
3801 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3802 * runtime.
3803 * (Multi-char folds whose components are all above the Latin1 range are not
3804 * run-time locale dependent, and have already been folded by the time this
3805 * function is called.)
3806 *
3807 * This is as good a place as any to discuss the design of handling these
3808 * multi-character fold sequences. It's been wrong in Perl for a very long
3809 * time. There are three code points in Unicode whose multi-character folds
3810 * were long ago discovered to mess things up. The previous designs for
3811 * dealing with these involved assigning a special node for them. This
3812 * approach doesn't always work, as evidenced by this example:
3813 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3814 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3815 * would match just the \xDF, it won't be able to handle the case where a
3816 * successful match would have to cross the node's boundary. The new approach
3817 * that hopefully generally solves the problem generates an EXACTFU_SS node
3818 * that is "sss" in this case.
3819 *
3820 * It turns out that there are problems with all multi-character folds, and not
3821 * just these three. Now the code is general, for all such cases. The
3822 * approach taken is:
3823 * 1) This routine examines each EXACTFish node that could contain multi-
3824 * character folded sequences. Since a single character can fold into
3825 * such a sequence, the minimum match length for this node is less than
3826 * the number of characters in the node. This routine returns in
3827 * *min_subtract how many characters to subtract from the the actual
3828 * length of the string to get a real minimum match length; it is 0 if
3829 * there are no multi-char foldeds. This delta is used by the caller to
3830 * adjust the min length of the match, and the delta between min and max,
3831 * so that the optimizer doesn't reject these possibilities based on size
3832 * constraints.
3833 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3834 * is used for an EXACTFU node that contains at least one "ss" sequence in
3835 * it. For non-UTF-8 patterns and strings, this is the only case where
3836 * there is a possible fold length change. That means that a regular
3837 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3838 * with length changes, and so can be processed faster. regexec.c takes
3839 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3840 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3841 * known until runtime). This saves effort in regex matching. However,
3842 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3843 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3844 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3845 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3846 * possibilities for the non-UTF8 patterns are quite simple, except for
3847 * the sharp s. All the ones that don't involve a UTF-8 target string are
3848 * members of a fold-pair, and arrays are set up for all of them so that
3849 * the other member of the pair can be found quickly. Code elsewhere in
3850 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3851 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3852 * described in the next item.
3853 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3854 * validity of the fold won't be known until runtime, and so must remain
3855 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
3856 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3857 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3858 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3859 * The reason this is a problem is that the optimizer part of regexec.c
3860 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3861 * that a character in the pattern corresponds to at most a single
3862 * character in the target string. (And I do mean character, and not byte
3863 * here, unlike other parts of the documentation that have never been
3864 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3865 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3866 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
3867 * EXACTFL nodes, violate the assumption, and they are the only instances
3868 * where it is violated. I'm reluctant to try to change the assumption,
3869 * as the code involved is impenetrable to me (khw), so instead the code
3870 * here punts. This routine examines EXACTFL nodes, and (when the pattern
3871 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3872 * boolean indicating whether or not the node contains such a fold. When
3873 * it is true, the caller sets a flag that later causes the optimizer in
3874 * this file to not set values for the floating and fixed string lengths,
3875 * and thus avoids the optimizer code in regexec.c that makes the invalid
3876 * assumption. Thus, there is no optimization based on string lengths for
3877 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3878 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
3879 * assumption is wrong only in these cases is that all other non-UTF-8
3880 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3881 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3882 * EXACTF nodes because we don't know at compile time if it actually
3883 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3884 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3885 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
3886 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3887 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3888 * string would require the pattern to be forced into UTF-8, the overhead
3889 * of which we want to avoid. Similarly the unfolded multi-char folds in
3890 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3891 * locale.)
3892 *
3893 * Similarly, the code that generates tries doesn't currently handle
3894 * not-already-folded multi-char folds, and it looks like a pain to change
3895 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
3896 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
3897 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
3898 * using /iaa matching will be doing so almost entirely with ASCII
3899 * strings, so this should rarely be encountered in practice */
3900
3901#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3902 if (PL_regkind[OP(scan)] == EXACT) \
3903 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3904
3905STATIC U32
3906S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3907 UV *min_subtract, bool *unfolded_multi_char,
3908 U32 flags, regnode *val, U32 depth)
3909{
3910 /* Merge several consecutive EXACTish nodes into one. */
3911 regnode *n = regnext(scan);
3912 U32 stringok = 1;
3913 regnode *next = scan + NODE_SZ_STR(scan);
3914 U32 merged = 0;
3915 U32 stopnow = 0;
3916#ifdef DEBUGGING
3917 regnode *stop = scan;
3918 GET_RE_DEBUG_FLAGS_DECL;
3919#else
3920 PERL_UNUSED_ARG(depth);
3921#endif
3922
3923 PERL_ARGS_ASSERT_JOIN_EXACT;
3924#ifndef EXPERIMENTAL_INPLACESCAN
3925 PERL_UNUSED_ARG(flags);
3926 PERL_UNUSED_ARG(val);
3927#endif
3928 DEBUG_PEEP("join", scan, depth, 0);
3929
3930 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3931 * EXACT ones that are mergeable to the current one. */
3932 while (n
3933 && (PL_regkind[OP(n)] == NOTHING
3934 || (stringok && OP(n) == OP(scan)))
3935 && NEXT_OFF(n)
3936 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3937 {
3938
3939 if (OP(n) == TAIL || n > next)
3940 stringok = 0;
3941 if (PL_regkind[OP(n)] == NOTHING) {
3942 DEBUG_PEEP("skip:", n, depth, 0);
3943 NEXT_OFF(scan) += NEXT_OFF(n);
3944 next = n + NODE_STEP_REGNODE;
3945#ifdef DEBUGGING
3946 if (stringok)
3947 stop = n;
3948#endif
3949 n = regnext(n);
3950 }
3951 else if (stringok) {
3952 const unsigned int oldl = STR_LEN(scan);
3953 regnode * const nnext = regnext(n);
3954
3955 /* XXX I (khw) kind of doubt that this works on platforms (should
3956 * Perl ever run on one) where U8_MAX is above 255 because of lots
3957 * of other assumptions */
3958 /* Don't join if the sum can't fit into a single node */
3959 if (oldl + STR_LEN(n) > U8_MAX)
3960 break;
3961
3962 DEBUG_PEEP("merg", n, depth, 0);
3963 merged++;
3964
3965 NEXT_OFF(scan) += NEXT_OFF(n);
3966 STR_LEN(scan) += STR_LEN(n);
3967 next = n + NODE_SZ_STR(n);
3968 /* Now we can overwrite *n : */
3969 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3970#ifdef DEBUGGING
3971 stop = next - 1;
3972#endif
3973 n = nnext;
3974 if (stopnow) break;
3975 }
3976
3977#ifdef EXPERIMENTAL_INPLACESCAN
3978 if (flags && !NEXT_OFF(n)) {
3979 DEBUG_PEEP("atch", val, depth, 0);
3980 if (reg_off_by_arg[OP(n)]) {
3981 ARG_SET(n, val - n);
3982 }
3983 else {
3984 NEXT_OFF(n) = val - n;
3985 }
3986 stopnow = 1;
3987 }
3988#endif
3989 }
3990
3991 *min_subtract = 0;
3992 *unfolded_multi_char = FALSE;
3993
3994 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3995 * can now analyze for sequences of problematic code points. (Prior to
3996 * this final joining, sequences could have been split over boundaries, and
3997 * hence missed). The sequences only happen in folding, hence for any
3998 * non-EXACT EXACTish node */
3999 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
4000 U8* s0 = (U8*) STRING(scan);
4001 U8* s = s0;
4002 U8* s_end = s0 + STR_LEN(scan);
4003
4004 int total_count_delta = 0; /* Total delta number of characters that
4005 multi-char folds expand to */
4006
4007 /* One pass is made over the node's string looking for all the
4008 * possibilities. To avoid some tests in the loop, there are two main
4009 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4010 * non-UTF-8 */
4011 if (UTF) {
4012 U8* folded = NULL;
4013
4014 if (OP(scan) == EXACTFL) {
4015 U8 *d;
4016
4017 /* An EXACTFL node would already have been changed to another
4018 * node type unless there is at least one character in it that
4019 * is problematic; likely a character whose fold definition
4020 * won't be known until runtime, and so has yet to be folded.
4021 * For all but the UTF-8 locale, folds are 1-1 in length, but
4022 * to handle the UTF-8 case, we need to create a temporary
4023 * folded copy using UTF-8 locale rules in order to analyze it.
4024 * This is because our macros that look to see if a sequence is
4025 * a multi-char fold assume everything is folded (otherwise the
4026 * tests in those macros would be too complicated and slow).
4027 * Note that here, the non-problematic folds will have already
4028 * been done, so we can just copy such characters. We actually
4029 * don't completely fold the EXACTFL string. We skip the
4030 * unfolded multi-char folds, as that would just create work
4031 * below to figure out the size they already are */
4032
4033 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4034 d = folded;
4035 while (s < s_end) {
4036 STRLEN s_len = UTF8SKIP(s);
4037 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4038 Copy(s, d, s_len, U8);
4039 d += s_len;
4040 }
4041 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4042 *unfolded_multi_char = TRUE;
4043 Copy(s, d, s_len, U8);
4044 d += s_len;
4045 }
4046 else if (isASCII(*s)) {
4047 *(d++) = toFOLD(*s);
4048 }
4049 else {
4050 STRLEN len;
4051 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4052 d += len;
4053 }
4054 s += s_len;
4055 }
4056
4057 /* Point the remainder of the routine to look at our temporary
4058 * folded copy */
4059 s = folded;
4060 s_end = d;
4061 } /* End of creating folded copy of EXACTFL string */
4062
4063 /* Examine the string for a multi-character fold sequence. UTF-8
4064 * patterns have all characters pre-folded by the time this code is
4065 * executed */
4066 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4067 length sequence we are looking for is 2 */
4068 {
4069 int count = 0; /* How many characters in a multi-char fold */
4070 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4071 if (! len) { /* Not a multi-char fold: get next char */
4072 s += UTF8SKIP(s);
4073 continue;
4074 }
4075
4076 /* Nodes with 'ss' require special handling, except for
4077 * EXACTFAA-ish for which there is no multi-char fold to this */
4078 if (len == 2 && *s == 's' && *(s+1) == 's'
4079 && OP(scan) != EXACTFAA
4080 && OP(scan) != EXACTFAA_NO_TRIE)
4081 {
4082 count = 2;
4083 if (OP(scan) != EXACTFL) {
4084 OP(scan) = EXACTFU_SS;
4085 }
4086 s += 2;
4087 }
4088 else { /* Here is a generic multi-char fold. */
4089 U8* multi_end = s + len;
4090
4091 /* Count how many characters are in it. In the case of
4092 * /aa, no folds which contain ASCII code points are
4093 * allowed, so check for those, and skip if found. */
4094 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4095 count = utf8_length(s, multi_end);
4096 s = multi_end;
4097 }
4098 else {
4099 while (s < multi_end) {
4100 if (isASCII(*s)) {
4101 s++;
4102 goto next_iteration;
4103 }
4104 else {
4105 s += UTF8SKIP(s);
4106 }
4107 count++;
4108 }
4109 }
4110 }
4111
4112 /* The delta is how long the sequence is minus 1 (1 is how long
4113 * the character that folds to the sequence is) */
4114 total_count_delta += count - 1;
4115 next_iteration: ;
4116 }
4117
4118 /* We created a temporary folded copy of the string in EXACTFL
4119 * nodes. Therefore we need to be sure it doesn't go below zero,
4120 * as the real string could be shorter */
4121 if (OP(scan) == EXACTFL) {
4122 int total_chars = utf8_length((U8*) STRING(scan),
4123 (U8*) STRING(scan) + STR_LEN(scan));
4124 if (total_count_delta > total_chars) {
4125 total_count_delta = total_chars;
4126 }
4127 }
4128
4129 *min_subtract += total_count_delta;
4130 Safefree(folded);
4131 }
4132 else if (OP(scan) == EXACTFAA) {
4133
4134 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4135 * fold to the ASCII range (and there are no existing ones in the
4136 * upper latin1 range). But, as outlined in the comments preceding
4137 * this function, we need to flag any occurrences of the sharp s.
4138 * This character forbids trie formation (because of added
4139 * complexity) */
4140#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4141 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4142 || UNICODE_DOT_DOT_VERSION > 0)
4143 while (s < s_end) {
4144 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4145 OP(scan) = EXACTFAA_NO_TRIE;
4146 *unfolded_multi_char = TRUE;
4147 break;
4148 }
4149 s++;
4150 }
4151 }
4152 else {
4153
4154 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4155 * folds that are all Latin1. As explained in the comments
4156 * preceding this function, we look also for the sharp s in EXACTF
4157 * and EXACTFL nodes; it can be in the final position. Otherwise
4158 * we can stop looking 1 byte earlier because have to find at least
4159 * two characters for a multi-fold */
4160 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4161 ? s_end
4162 : s_end -1;
4163
4164 while (s < upper) {
4165 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4166 if (! len) { /* Not a multi-char fold. */
4167 if (*s == LATIN_SMALL_LETTER_SHARP_S
4168 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4169 {
4170 *unfolded_multi_char = TRUE;
4171 }
4172 s++;
4173 continue;
4174 }
4175
4176 if (len == 2
4177 && isALPHA_FOLD_EQ(*s, 's')
4178 && isALPHA_FOLD_EQ(*(s+1), 's'))
4179 {
4180
4181 /* EXACTF nodes need to know that the minimum length
4182 * changed so that a sharp s in the string can match this
4183 * ss in the pattern, but they remain EXACTF nodes, as they
4184 * won't match this unless the target string is is UTF-8,
4185 * which we don't know until runtime. EXACTFL nodes can't
4186 * transform into EXACTFU nodes */
4187 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4188 OP(scan) = EXACTFU_SS;
4189 }
4190 }
4191
4192 *min_subtract += len - 1;
4193 s += len;
4194 }
4195#endif
4196 }
4197 }
4198
4199#ifdef DEBUGGING
4200 /* Allow dumping but overwriting the collection of skipped
4201 * ops and/or strings with fake optimized ops */
4202 n = scan + NODE_SZ_STR(scan);
4203 while (n <= stop) {
4204 OP(n) = OPTIMIZED;
4205 FLAGS(n) = 0;
4206 NEXT_OFF(n) = 0;
4207 n++;
4208 }
4209#endif
4210 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4211 return stopnow;
4212}
4213
4214/* REx optimizer. Converts nodes into quicker variants "in place".
4215 Finds fixed substrings. */
4216
4217/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4218 to the position after last scanned or to NULL. */
4219
4220#define INIT_AND_WITHP \
4221 assert(!and_withp); \
4222 Newx(and_withp, 1, regnode_ssc); \
4223 SAVEFREEPV(and_withp)
4224
4225
4226static void
4227S_unwind_scan_frames(pTHX_ const void *p)
4228{
4229 scan_frame *f= (scan_frame *)p;
4230 do {
4231 scan_frame *n= f->next_frame;
4232 Safefree(f);
4233 f= n;
4234 } while (f);
4235}
4236
4237/* the return from this sub is the minimum length that could possibly match */
4238STATIC SSize_t
4239S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4240 SSize_t *minlenp, SSize_t *deltap,
4241 regnode *last,
4242 scan_data_t *data,
4243 I32 stopparen,
4244 U32 recursed_depth,
4245 regnode_ssc *and_withp,
4246 U32 flags, U32 depth)
4247 /* scanp: Start here (read-write). */
4248 /* deltap: Write maxlen-minlen here. */
4249 /* last: Stop before this one. */
4250 /* data: string data about the pattern */
4251 /* stopparen: treat close N as END */
4252 /* recursed: which subroutines have we recursed into */
4253 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4254{
4255 /* There must be at least this number of characters to match */
4256 SSize_t min = 0;
4257 I32 pars = 0, code;
4258 regnode *scan = *scanp, *next;
4259 SSize_t delta = 0;
4260 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4261 int is_inf_internal = 0; /* The studied chunk is infinite */
4262 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4263 scan_data_t data_fake;
4264 SV *re_trie_maxbuff = NULL;
4265 regnode *first_non_open = scan;
4266 SSize_t stopmin = SSize_t_MAX;
4267 scan_frame *frame = NULL;
4268 GET_RE_DEBUG_FLAGS_DECL;
4269
4270 PERL_ARGS_ASSERT_STUDY_CHUNK;
4271 RExC_study_started= 1;
4272
4273 Zero(&data_fake, 1, scan_data_t);
4274
4275 if ( depth == 0 ) {
4276 while (first_non_open && OP(first_non_open) == OPEN)
4277 first_non_open=regnext(first_non_open);
4278 }
4279
4280
4281 fake_study_recurse:
4282 DEBUG_r(
4283 RExC_study_chunk_recursed_count++;
4284 );
4285 DEBUG_OPTIMISE_MORE_r(
4286 {
4287 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4288 depth, (long)stopparen,
4289 (unsigned long)RExC_study_chunk_recursed_count,
4290 (unsigned long)depth, (unsigned long)recursed_depth,
4291 scan,
4292 last);
4293 if (recursed_depth) {
4294 U32 i;
4295 U32 j;
4296 for ( j = 0 ; j < recursed_depth ; j++ ) {
4297 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4298 if (
4299 PAREN_TEST(RExC_study_chunk_recursed +
4300 ( j * RExC_study_chunk_recursed_bytes), i )
4301 && (
4302 !j ||
4303 !PAREN_TEST(RExC_study_chunk_recursed +
4304 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4305 )
4306 ) {
4307 Perl_re_printf( aTHX_ " %d",(int)i);
4308 break;
4309 }
4310 }
4311 if ( j + 1 < recursed_depth ) {
4312 Perl_re_printf( aTHX_ ",");
4313 }
4314 }
4315 }
4316 Perl_re_printf( aTHX_ "\n");
4317 }
4318 );
4319 while ( scan && OP(scan) != END && scan < last ){
4320 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4321 node length to get a real minimum (because
4322 the folded version may be shorter) */
4323 bool unfolded_multi_char = FALSE;
4324 /* Peephole optimizer: */
4325 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4326 DEBUG_PEEP("Peep", scan, depth, flags);
4327
4328
4329 /* The reason we do this here is that we need to deal with things like
4330 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4331 * parsing code, as each (?:..) is handled by a different invocation of
4332 * reg() -- Yves
4333 */
4334 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4335
4336 /* Follow the next-chain of the current node and optimize
4337 away all the NOTHINGs from it. */
4338 if (OP(scan) != CURLYX) {
4339 const int max = (reg_off_by_arg[OP(scan)]
4340 ? I32_MAX
4341 /* I32 may be smaller than U16 on CRAYs! */
4342 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4343 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4344 int noff;
4345 regnode *n = scan;
4346
4347 /* Skip NOTHING and LONGJMP. */
4348 while ((n = regnext(n))
4349 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4350 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4351 && off + noff < max)
4352 off += noff;
4353 if (reg_off_by_arg[OP(scan)])
4354 ARG(scan) = off;
4355 else
4356 NEXT_OFF(scan) = off;
4357 }
4358
4359 /* The principal pseudo-switch. Cannot be a switch, since we
4360 look into several different things. */
4361 if ( OP(scan) == DEFINEP ) {
4362 SSize_t minlen = 0;
4363 SSize_t deltanext = 0;
4364 SSize_t fake_last_close = 0;
4365 I32 f = SCF_IN_DEFINE;
4366
4367 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4368 scan = regnext(scan);
4369 assert( OP(scan) == IFTHEN );
4370 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4371
4372 data_fake.last_closep= &fake_last_close;
4373 minlen = *minlenp;
4374 next = regnext(scan);
4375 scan = NEXTOPER(NEXTOPER(scan));
4376 DEBUG_PEEP("scan", scan, depth, flags);
4377 DEBUG_PEEP("next", next, depth, flags);
4378
4379 /* we suppose the run is continuous, last=next...
4380 * NOTE we dont use the return here! */
4381 /* DEFINEP study_chunk() recursion */
4382 (void)study_chunk(pRExC_state, &scan, &minlen,
4383 &deltanext, next, &data_fake, stopparen,
4384 recursed_depth, NULL, f, depth+1);
4385
4386 scan = next;
4387 } else
4388 if (
4389 OP(scan) == BRANCH ||
4390 OP(scan) == BRANCHJ ||
4391 OP(scan) == IFTHEN
4392 ) {
4393 next = regnext(scan);
4394 code = OP(scan);
4395
4396 /* The op(next)==code check below is to see if we
4397 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4398 * IFTHEN is special as it might not appear in pairs.
4399 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4400 * we dont handle it cleanly. */
4401 if (OP(next) == code || code == IFTHEN) {
4402 /* NOTE - There is similar code to this block below for
4403 * handling TRIE nodes on a re-study. If you change stuff here
4404 * check there too. */
4405 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4406 regnode_ssc accum;
4407 regnode * const startbranch=scan;
4408
4409 if (flags & SCF_DO_SUBSTR) {
4410 /* Cannot merge strings after this. */
4411 scan_commit(pRExC_state, data, minlenp, is_inf);
4412 }
4413
4414 if (flags & SCF_DO_STCLASS)
4415 ssc_init_zero(pRExC_state, &accum);
4416
4417 while (OP(scan) == code) {
4418 SSize_t deltanext, minnext, fake;
4419 I32 f = 0;
4420 regnode_ssc this_class;
4421
4422 DEBUG_PEEP("Branch", scan, depth, flags);
4423
4424 num++;
4425 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4426 if (data) {
4427 data_fake.whilem_c = data->whilem_c;
4428 data_fake.last_closep = data->last_closep;
4429 }
4430 else
4431 data_fake.last_closep = &fake;
4432
4433 data_fake.pos_delta = delta;
4434 next = regnext(scan);
4435
4436 scan = NEXTOPER(scan); /* everything */
4437 if (code != BRANCH) /* everything but BRANCH */
4438 scan = NEXTOPER(scan);
4439
4440 if (flags & SCF_DO_STCLASS) {
4441 ssc_init(pRExC_state, &this_class);
4442 data_fake.start_class = &this_class;
4443 f = SCF_DO_STCLASS_AND;
4444 }
4445 if (flags & SCF_WHILEM_VISITED_POS)
4446 f |= SCF_WHILEM_VISITED_POS;
4447
4448 /* we suppose the run is continuous, last=next...*/
4449 /* recurse study_chunk() for each BRANCH in an alternation */
4450 minnext = study_chunk(pRExC_state, &scan, minlenp,
4451 &deltanext, next, &data_fake, stopparen,
4452 recursed_depth, NULL, f, depth+1);
4453
4454 if (min1 > minnext)
4455 min1 = minnext;
4456 if (deltanext == SSize_t_MAX) {
4457 is_inf = is_inf_internal = 1;
4458 max1 = SSize_t_MAX;
4459 } else if (max1 < minnext + deltanext)
4460 max1 = minnext + deltanext;
4461 scan = next;
4462 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4463 pars++;
4464 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4465 if ( stopmin > minnext)
4466 stopmin = min + min1;
4467 flags &= ~SCF_DO_SUBSTR;
4468 if (data)
4469 data->flags |= SCF_SEEN_ACCEPT;
4470 }
4471 if (data) {
4472 if (data_fake.flags & SF_HAS_EVAL)
4473 data->flags |= SF_HAS_EVAL;
4474 data->whilem_c = data_fake.whilem_c;
4475 }
4476 if (flags & SCF_DO_STCLASS)
4477 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4478 }
4479 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4480 min1 = 0;
4481 if (flags & SCF_DO_SUBSTR) {
4482 data->pos_min += min1;
4483 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4484 data->pos_delta = SSize_t_MAX;
4485 else
4486 data->pos_delta += max1 - min1;
4487 if (max1 != min1 || is_inf)
4488 data->cur_is_floating = 1;
4489 }
4490 min += min1;
4491 if (delta == SSize_t_MAX
4492 || SSize_t_MAX - delta - (max1 - min1) < 0)
4493 delta = SSize_t_MAX;
4494 else
4495 delta += max1 - min1;
4496 if (flags & SCF_DO_STCLASS_OR) {
4497 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4498 if (min1) {
4499 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4500 flags &= ~SCF_DO_STCLASS;
4501 }
4502 }
4503 else if (flags & SCF_DO_STCLASS_AND) {
4504 if (min1) {
4505 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4506 flags &= ~SCF_DO_STCLASS;
4507 }
4508 else {
4509 /* Switch to OR mode: cache the old value of
4510 * data->start_class */
4511 INIT_AND_WITHP;
4512 StructCopy(data->start_class, and_withp, regnode_ssc);
4513 flags &= ~SCF_DO_STCLASS_AND;
4514 StructCopy(&accum, data->start_class, regnode_ssc);
4515 flags |= SCF_DO_STCLASS_OR;
4516 }
4517 }
4518
4519 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4520 OP( startbranch ) == BRANCH )
4521 {
4522 /* demq.
4523
4524 Assuming this was/is a branch we are dealing with: 'scan'
4525 now points at the item that follows the branch sequence,
4526 whatever it is. We now start at the beginning of the
4527 sequence and look for subsequences of
4528
4529 BRANCH->EXACT=>x1
4530 BRANCH->EXACT=>x2
4531 tail
4532
4533 which would be constructed from a pattern like
4534 /A|LIST|OF|WORDS/
4535
4536 If we can find such a subsequence we need to turn the first
4537 element into a trie and then add the subsequent branch exact
4538 strings to the trie.
4539
4540 We have two cases
4541
4542 1. patterns where the whole set of branches can be
4543 converted.
4544
4545 2. patterns where only a subset can be converted.
4546
4547 In case 1 we can replace the whole set with a single regop
4548 for the trie. In case 2 we need to keep the start and end
4549 branches so
4550
4551 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4552 becomes BRANCH TRIE; BRANCH X;
4553
4554 There is an additional case, that being where there is a
4555 common prefix, which gets split out into an EXACT like node
4556 preceding the TRIE node.
4557
4558 If x(1..n)==tail then we can do a simple trie, if not we make
4559 a "jump" trie, such that when we match the appropriate word
4560 we "jump" to the appropriate tail node. Essentially we turn
4561 a nested if into a case structure of sorts.
4562
4563 */
4564
4565 int made=0;
4566 if (!re_trie_maxbuff) {
4567 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4568 if (!SvIOK(re_trie_maxbuff))
4569 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4570 }
4571 if ( SvIV(re_trie_maxbuff)>=0 ) {
4572 regnode *cur;
4573 regnode *first = (regnode *)NULL;
4574 regnode *last = (regnode *)NULL;
4575 regnode *tail = scan;
4576 U8 trietype = 0;
4577 U32 count=0;
4578
4579 /* var tail is used because there may be a TAIL
4580 regop in the way. Ie, the exacts will point to the
4581 thing following the TAIL, but the last branch will
4582 point at the TAIL. So we advance tail. If we
4583 have nested (?:) we may have to move through several
4584 tails.
4585 */
4586
4587 while ( OP( tail ) == TAIL ) {
4588 /* this is the TAIL generated by (?:) */
4589 tail = regnext( tail );
4590 }
4591
4592
4593 DEBUG_TRIE_COMPILE_r({
4594 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4595 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4596 depth+1,
4597 "Looking for TRIE'able sequences. Tail node is ",
4598 (UV) REGNODE_OFFSET(tail),
4599 SvPV_nolen_const( RExC_mysv )
4600 );
4601 });
4602
4603 /*
4604
4605 Step through the branches
4606 cur represents each branch,
4607 noper is the first thing to be matched as part
4608 of that branch
4609 noper_next is the regnext() of that node.
4610
4611 We normally handle a case like this
4612 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4613 support building with NOJUMPTRIE, which restricts
4614 the trie logic to structures like /FOO|BAR/.
4615
4616 If noper is a trieable nodetype then the branch is
4617 a possible optimization target. If we are building
4618 under NOJUMPTRIE then we require that noper_next is
4619 the same as scan (our current position in the regex
4620 program).
4621
4622 Once we have two or more consecutive such branches
4623 we can create a trie of the EXACT's contents and
4624 stitch it in place into the program.
4625
4626 If the sequence represents all of the branches in
4627 the alternation we replace the entire thing with a
4628 single TRIE node.
4629
4630 Otherwise when it is a subsequence we need to
4631 stitch it in place and replace only the relevant
4632 branches. This means the first branch has to remain
4633 as it is used by the alternation logic, and its
4634 next pointer, and needs to be repointed at the item
4635 on the branch chain following the last branch we
4636 have optimized away.
4637
4638 This could be either a BRANCH, in which case the
4639 subsequence is internal, or it could be the item
4640 following the branch sequence in which case the
4641 subsequence is at the end (which does not
4642 necessarily mean the first node is the start of the
4643 alternation).
4644
4645 TRIE_TYPE(X) is a define which maps the optype to a
4646 trietype.
4647
4648 optype | trietype
4649 ----------------+-----------
4650 NOTHING | NOTHING
4651 EXACT | EXACT
4652 EXACTFU | EXACTFU
4653 EXACTFU_SS | EXACTFU
4654 EXACTFAA | EXACTFAA
4655 EXACTL | EXACTL
4656 EXACTFLU8 | EXACTFLU8
4657
4658
4659 */
4660#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4661 ? NOTHING \
4662 : ( EXACT == (X) ) \
4663 ? EXACT \
4664 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4665 ? EXACTFU \
4666 : ( EXACTFAA == (X) ) \
4667 ? EXACTFAA \
4668 : ( EXACTL == (X) ) \
4669 ? EXACTL \
4670 : ( EXACTFLU8 == (X) ) \
4671 ? EXACTFLU8 \
4672 : 0 )
4673
4674 /* dont use tail as the end marker for this traverse */
4675 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4676 regnode * const noper = NEXTOPER( cur );
4677 U8 noper_type = OP( noper );
4678 U8 noper_trietype = TRIE_TYPE( noper_type );
4679#if defined(DEBUGGING) || defined(NOJUMPTRIE)
4680 regnode * const noper_next = regnext( noper );
4681 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4682 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4683#endif
4684
4685 DEBUG_TRIE_COMPILE_r({
4686 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4687 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4688 depth+1,
4689 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4690
4691 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4692 Perl_re_printf( aTHX_ " -> %d:%s",
4693 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4694
4695 if ( noper_next ) {
4696 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4697 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4698 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4699 }
4700 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4701 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4702 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4703 );
4704 });
4705
4706 /* Is noper a trieable nodetype that can be merged
4707 * with the current trie (if there is one)? */
4708 if ( noper_trietype
4709 &&
4710 (
4711 ( noper_trietype == NOTHING )
4712 || ( trietype == NOTHING )
4713 || ( trietype == noper_trietype )
4714 )
4715#ifdef NOJUMPTRIE
4716 && noper_next >= tail
4717#endif
4718 && count < U16_MAX)
4719 {
4720 /* Handle mergable triable node Either we are
4721 * the first node in a new trieable sequence,
4722 * in which case we do some bookkeeping,
4723 * otherwise we update the end pointer. */
4724 if ( !first ) {
4725 first = cur;
4726 if ( noper_trietype == NOTHING ) {
4727#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4728 regnode * const noper_next = regnext( noper );
4729 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4730 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4731#endif
4732
4733 if ( noper_next_trietype ) {
4734 trietype = noper_next_trietype;
4735 } else if (noper_next_type) {
4736 /* a NOTHING regop is 1 regop wide.
4737 * We need at least two for a trie
4738 * so we can't merge this in */
4739 first = NULL;
4740 }
4741 } else {
4742 trietype = noper_trietype;
4743 }
4744 } else {
4745 if ( trietype == NOTHING )
4746 trietype = noper_trietype;
4747 last = cur;
4748 }
4749 if (first)
4750 count++;
4751 } /* end handle mergable triable node */
4752 else {
4753 /* handle unmergable node -
4754 * noper may either be a triable node which can
4755 * not be tried together with the current trie,
4756 * or a non triable node */
4757 if ( last ) {
4758 /* If last is set and trietype is not
4759 * NOTHING then we have found at least two
4760 * triable branch sequences in a row of a
4761 * similar trietype so we can turn them
4762 * into a trie. If/when we allow NOTHING to
4763 * start a trie sequence this condition
4764 * will be required, and it isn't expensive
4765 * so we leave it in for now. */
4766 if ( trietype && trietype != NOTHING )
4767 make_trie( pRExC_state,
4768 startbranch, first, cur, tail,
4769 count, trietype, depth+1 );
4770 last = NULL; /* note: we clear/update
4771 first, trietype etc below,
4772 so we dont do it here */
4773 }
4774 if ( noper_trietype
4775#ifdef NOJUMPTRIE
4776 && noper_next >= tail
4777#endif
4778 ){
4779 /* noper is triable, so we can start a new
4780 * trie sequence */
4781 count = 1;
4782 first = cur;
4783 trietype = noper_trietype;
4784 } else if (first) {
4785 /* if we already saw a first but the
4786 * current node is not triable then we have
4787 * to reset the first information. */
4788 count = 0;
4789 first = NULL;
4790 trietype = 0;
4791 }
4792 } /* end handle unmergable node */
4793 } /* loop over branches */
4794 DEBUG_TRIE_COMPILE_r({
4795 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4796 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
4797 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4798 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4799 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4800 PL_reg_name[trietype]
4801 );
4802
4803 });
4804 if ( last && trietype ) {
4805 if ( trietype != NOTHING ) {
4806 /* the last branch of the sequence was part of
4807 * a trie, so we have to construct it here
4808 * outside of the loop */
4809 made= make_trie( pRExC_state, startbranch,
4810 first, scan, tail, count,
4811 trietype, depth+1 );
4812#ifdef TRIE_STUDY_OPT
4813 if ( ((made == MADE_EXACT_TRIE &&
4814 startbranch == first)
4815 || ( first_non_open == first )) &&
4816 depth==0 ) {
4817 flags |= SCF_TRIE_RESTUDY;
4818 if ( startbranch == first
4819 && scan >= tail )
4820 {
4821 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4822 }
4823 }
4824#endif
4825 } else {
4826 /* at this point we know whatever we have is a
4827 * NOTHING sequence/branch AND if 'startbranch'
4828 * is 'first' then we can turn the whole thing
4829 * into a NOTHING
4830 */
4831 if ( startbranch == first ) {
4832 regnode *opt;
4833 /* the entire thing is a NOTHING sequence,
4834 * something like this: (?:|) So we can
4835 * turn it into a plain NOTHING op. */
4836 DEBUG_TRIE_COMPILE_r({
4837 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4838 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4839 depth+1,
4840 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4841
4842 });
4843 OP(startbranch)= NOTHING;
4844 NEXT_OFF(startbranch)= tail - startbranch;
4845 for ( opt= startbranch + 1; opt < tail ; opt++ )
4846 OP(opt)= OPTIMIZED;
4847 }
4848 }
4849 } /* end if ( last) */
4850 } /* TRIE_MAXBUF is non zero */
4851
4852 } /* do trie */
4853
4854 }
4855 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4856 scan = NEXTOPER(NEXTOPER(scan));
4857 } else /* single branch is optimized. */
4858 scan = NEXTOPER(scan);
4859 continue;
4860 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4861 I32 paren = 0;
4862 regnode *start = NULL;
4863 regnode *end = NULL;
4864 U32 my_recursed_depth= recursed_depth;
4865
4866 if (OP(scan) != SUSPEND) { /* GOSUB */
4867 /* Do setup, note this code has side effects beyond
4868 * the rest of this block. Specifically setting
4869 * RExC_recurse[] must happen at least once during
4870 * study_chunk(). */
4871 paren = ARG(scan);
4872 RExC_recurse[ARG2L(scan)] = scan;
4873 start = REGNODE_p(RExC_open_parens[paren]);
4874 end = REGNODE_p(RExC_close_parens[paren]);
4875
4876 /* NOTE we MUST always execute the above code, even
4877 * if we do nothing with a GOSUB */
4878 if (
4879 ( flags & SCF_IN_DEFINE )
4880 ||
4881 (
4882 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4883 &&
4884 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4885 )
4886 ) {
4887 /* no need to do anything here if we are in a define. */
4888 /* or we are after some kind of infinite construct
4889 * so we can skip recursing into this item.
4890 * Since it is infinite we will not change the maxlen
4891 * or delta, and if we miss something that might raise
4892 * the minlen it will merely pessimise a little.
4893 *
4894 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4895 * might result in a minlen of 1 and not of 4,
4896 * but this doesn't make us mismatch, just try a bit
4897 * harder than we should.
4898 * */
4899 scan= regnext(scan);
4900 continue;
4901 }
4902
4903 if (
4904 !recursed_depth
4905 ||
4906 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4907 ) {
4908 /* it is quite possible that there are more efficient ways
4909 * to do this. We maintain a bitmap per level of recursion
4910 * of which patterns we have entered so we can detect if a
4911 * pattern creates a possible infinite loop. When we
4912 * recurse down a level we copy the previous levels bitmap
4913 * down. When we are at recursion level 0 we zero the top
4914 * level bitmap. It would be nice to implement a different
4915 * more efficient way of doing this. In particular the top
4916 * level bitmap may be unnecessary.
4917 */
4918 if (!recursed_depth) {
4919 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4920 } else {
4921 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4922 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4923 RExC_study_chunk_recursed_bytes, U8);
4924 }
4925 /* we havent recursed into this paren yet, so recurse into it */
4926 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
4927 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4928 my_recursed_depth= recursed_depth + 1;
4929 } else {
4930 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
4931 /* some form of infinite recursion, assume infinite length
4932 * */
4933 if (flags & SCF_DO_SUBSTR) {
4934 scan_commit(pRExC_state, data, minlenp, is_inf);
4935 data->cur_is_floating = 1;
4936 }
4937 is_inf = is_inf_internal = 1;
4938 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4939 ssc_anything(data->start_class);
4940 flags &= ~SCF_DO_STCLASS;
4941
4942 start= NULL; /* reset start so we dont recurse later on. */
4943 }
4944 } else {
4945 paren = stopparen;
4946 start = scan + 2;
4947 end = regnext(scan);
4948 }
4949 if (start) {
4950 scan_frame *newframe;
4951 assert(end);
4952 if (!RExC_frame_last) {
4953 Newxz(newframe, 1, scan_frame);
4954 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4955 RExC_frame_head= newframe;
4956 RExC_frame_count++;
4957 } else if (!RExC_frame_last->next_frame) {
4958 Newxz(newframe, 1, scan_frame);
4959 RExC_frame_last->next_frame= newframe;
4960 newframe->prev_frame= RExC_frame_last;
4961 RExC_frame_count++;
4962 } else {
4963 newframe= RExC_frame_last->next_frame;
4964 }
4965 RExC_frame_last= newframe;
4966
4967 newframe->next_regnode = regnext(scan);
4968 newframe->last_regnode = last;
4969 newframe->stopparen = stopparen;
4970 newframe->prev_recursed_depth = recursed_depth;
4971 newframe->this_prev_frame= frame;
4972
4973 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
4974 DEBUG_PEEP("fnew", scan, depth, flags);
4975
4976 frame = newframe;
4977 scan = start;
4978 stopparen = paren;
4979 last = end;
4980 depth = depth + 1;
4981 recursed_depth= my_recursed_depth;
4982
4983 continue;
4984 }
4985 }
4986 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4987 SSize_t l = STR_LEN(scan);
4988 UV uc;
4989 assert(l);
4990 if (UTF) {
4991 const U8 * const s = (U8*)STRING(scan);
4992 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4993 l = utf8_length(s, s + l);
4994 } else {
4995 uc = *((U8*)STRING(scan));
4996 }
4997 min += l;
4998 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4999 /* The code below prefers earlier match for fixed
5000 offset, later match for variable offset. */
5001 if (data->last_end == -1) { /* Update the start info. */
5002 data->last_start_min = data->pos_min;
5003 data->last_start_max = is_inf
5004 ? SSize_t_MAX : data->pos_min + data->pos_delta;
5005 }
5006 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5007 if (UTF)
5008 SvUTF8_on(data->last_found);
5009 {
5010 SV * const sv = data->last_found;
5011 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5012 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5013 if (mg && mg->mg_len >= 0)
5014 mg->mg_len += utf8_length((U8*)STRING(scan),
5015 (U8*)STRING(scan)+STR_LEN(scan));
5016 }
5017 data->last_end = data->pos_min + l;
5018 data->pos_min += l; /* As in the first entry. */
5019 data->flags &= ~SF_BEFORE_EOL;
5020 }
5021
5022 /* ANDing the code point leaves at most it, and not in locale, and
5023 * can't match null string */
5024 if (flags & SCF_DO_STCLASS_AND) {
5025 ssc_cp_and(data->start_class, uc);
5026 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5027 ssc_clear_locale(data->start_class);
5028 }
5029 else if (flags & SCF_DO_STCLASS_OR) {
5030 ssc_add_cp(data->start_class, uc);
5031 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5032
5033 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5034 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5035 }
5036 flags &= ~SCF_DO_STCLASS;
5037 }
5038 else if (PL_regkind[OP(scan)] == EXACT) {
5039 /* But OP != EXACT!, so is EXACTFish */
5040 SSize_t l = STR_LEN(scan);
5041 const U8 * s = (U8*)STRING(scan);
5042
5043 /* Search for fixed substrings supports EXACT only. */
5044 if (flags & SCF_DO_SUBSTR) {
5045 assert(data);
5046 scan_commit(pRExC_state, data, minlenp, is_inf);
5047 }
5048 if (UTF) {
5049 l = utf8_length(s, s + l);
5050 }
5051 if (unfolded_multi_char) {
5052 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5053 }
5054 min += l - min_subtract;
5055 assert (min >= 0);
5056 delta += min_subtract;
5057 if (flags & SCF_DO_SUBSTR) {
5058 data->pos_min += l - min_subtract;
5059 if (data->pos_min < 0) {
5060 data->pos_min = 0;
5061 }
5062 data->pos_delta += min_subtract;
5063 if (min_subtract) {
5064 data->cur_is_floating = 1; /* float */
5065 }
5066 }
5067
5068 if (flags & SCF_DO_STCLASS) {
5069 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5070
5071 assert(EXACTF_invlist);
5072 if (flags & SCF_DO_STCLASS_AND) {
5073 if (OP(scan) != EXACTFL)
5074 ssc_clear_locale(data->start_class);
5075 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5076 ANYOF_POSIXL_ZERO(data->start_class);
5077 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5078 }
5079 else { /* SCF_DO_STCLASS_OR */
5080 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5081 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5082
5083 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5084 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5085 }
5086 flags &= ~SCF_DO_STCLASS;
5087 SvREFCNT_dec(EXACTF_invlist);
5088 }
5089 }
5090 else if (REGNODE_VARIES(OP(scan))) {
5091 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5092 I32 fl = 0, f = flags;
5093 regnode * const oscan = scan;
5094 regnode_ssc this_class;
5095 regnode_ssc *oclass = NULL;
5096 I32 next_is_eval = 0;
5097
5098 switch (PL_regkind[OP(scan)]) {
5099 case WHILEM: /* End of (?:...)* . */
5100 scan = NEXTOPER(scan);
5101 goto finish;
5102 case PLUS:
5103 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5104 next = NEXTOPER(scan);
5105 if (OP(next) == EXACT
5106 || OP(next) == EXACTL
5107 || (flags & SCF_DO_STCLASS))
5108 {
5109 mincount = 1;
5110 maxcount = REG_INFTY;
5111 next = regnext(scan);
5112 scan = NEXTOPER(scan);
5113 goto do_curly;
5114 }
5115 }
5116 if (flags & SCF_DO_SUBSTR)
5117 data->pos_min++;
5118 min++;
5119 /* FALLTHROUGH */
5120 case STAR:
5121 if (flags & SCF_DO_STCLASS) {
5122 mincount = 0;
5123 maxcount = REG_INFTY;
5124 next = regnext(scan);
5125 scan = NEXTOPER(scan);
5126 goto do_curly;
5127 }
5128 if (flags & SCF_DO_SUBSTR) {
5129 scan_commit(pRExC_state, data, minlenp, is_inf);
5130 /* Cannot extend fixed substrings */
5131 data->cur_is_floating = 1; /* float */
5132 }
5133 is_inf = is_inf_internal = 1;
5134 scan = regnext(scan);
5135 goto optimize_curly_tail;
5136 case CURLY:
5137 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5138 && (scan->flags == stopparen))
5139 {
5140 mincount = 1;
5141 maxcount = 1;
5142 } else {
5143 mincount = ARG1(scan);
5144 maxcount = ARG2(scan);
5145 }
5146 next = regnext(scan);
5147 if (OP(scan) == CURLYX) {
5148 I32 lp = (data ? *(data->last_closep) : 0);
5149 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5150 }
5151 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5152 next_is_eval = (OP(scan) == EVAL);
5153 do_curly:
5154 if (flags & SCF_DO_SUBSTR) {
5155 if (mincount == 0)
5156 scan_commit(pRExC_state, data, minlenp, is_inf);
5157 /* Cannot extend fixed substrings */
5158 pos_before = data->pos_min;
5159 }
5160 if (data) {
5161 fl = data->flags;
5162 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5163 if (is_inf)
5164 data->flags |= SF_IS_INF;
5165 }
5166 if (flags & SCF_DO_STCLASS) {
5167 ssc_init(pRExC_state, &this_class);
5168 oclass = data->start_class;
5169 data->start_class = &this_class;
5170 f |= SCF_DO_STCLASS_AND;
5171 f &= ~SCF_DO_STCLASS_OR;
5172 }
5173 /* Exclude from super-linear cache processing any {n,m}
5174 regops for which the combination of input pos and regex
5175 pos is not enough information to determine if a match
5176 will be possible.
5177
5178 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5179 regex pos at the \s*, the prospects for a match depend not
5180 only on the input position but also on how many (bar\s*)
5181 repeats into the {4,8} we are. */
5182 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5183 f &= ~SCF_WHILEM_VISITED_POS;
5184
5185 /* This will finish on WHILEM, setting scan, or on NULL: */
5186 /* recurse study_chunk() on loop bodies */
5187 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5188 last, data, stopparen, recursed_depth, NULL,
5189 (mincount == 0
5190 ? (f & ~SCF_DO_SUBSTR)
5191 : f)
5192 ,depth+1);
5193
5194 if (flags & SCF_DO_STCLASS)
5195 data->start_class = oclass;
5196 if (mincount == 0 || minnext == 0) {
5197 if (flags & SCF_DO_STCLASS_OR) {
5198 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5199 }
5200 else if (flags & SCF_DO_STCLASS_AND) {
5201 /* Switch to OR mode: cache the old value of
5202 * data->start_class */
5203 INIT_AND_WITHP;
5204 StructCopy(data->start_class, and_withp, regnode_ssc);
5205 flags &= ~SCF_DO_STCLASS_AND;
5206 StructCopy(&this_class, data->start_class, regnode_ssc);
5207 flags |= SCF_DO_STCLASS_OR;
5208 ANYOF_FLAGS(data->start_class)
5209 |= SSC_MATCHES_EMPTY_STRING;
5210 }
5211 } else { /* Non-zero len */
5212 if (flags & SCF_DO_STCLASS_OR) {
5213 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5214 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5215 }
5216 else if (flags & SCF_DO_STCLASS_AND)
5217 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5218 flags &= ~SCF_DO_STCLASS;
5219 }
5220 if (!scan) /* It was not CURLYX, but CURLY. */
5221 scan = next;
5222 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5223 /* ? quantifier ok, except for (?{ ... }) */
5224 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5225 && (minnext == 0) && (deltanext == 0)
5226 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5227 && maxcount <= REG_INFTY/3) /* Complement check for big
5228 count */
5229 {
5230 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5231 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5232 "Quantifier unexpected on zero-length expression "
5233 "in regex m/%" UTF8f "/",
5234 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5235 RExC_precomp)));
5236 }
5237
5238 min += minnext * mincount;
5239 is_inf_internal |= deltanext == SSize_t_MAX
5240 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5241 is_inf |= is_inf_internal;
5242 if (is_inf) {
5243 delta = SSize_t_MAX;
5244 } else {
5245 delta += (minnext + deltanext) * maxcount
5246 - minnext * mincount;
5247 }
5248 /* Try powerful optimization CURLYX => CURLYN. */
5249 if ( OP(oscan) == CURLYX && data
5250 && data->flags & SF_IN_PAR
5251 && !(data->flags & SF_HAS_EVAL)
5252 && !deltanext && minnext == 1 ) {
5253 /* Try to optimize to CURLYN. */
5254 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5255 regnode * const nxt1 = nxt;
5256#ifdef DEBUGGING
5257 regnode *nxt2;
5258#endif
5259
5260 /* Skip open. */
5261 nxt = regnext(nxt);
5262 if (!REGNODE_SIMPLE(OP(nxt))
5263 && !(PL_regkind[OP(nxt)] == EXACT
5264 && STR_LEN(nxt) == 1))
5265 goto nogo;
5266#ifdef DEBUGGING
5267 nxt2 = nxt;
5268#endif
5269 nxt = regnext(nxt);
5270 if (OP(nxt) != CLOSE)
5271 goto nogo;
5272 if (RExC_open_parens) {
5273
5274 /*open->CURLYM*/
5275 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5276
5277 /*close->while*/
5278 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5279 }
5280 /* Now we know that nxt2 is the only contents: */
5281 oscan->flags = (U8)ARG(nxt);
5282 OP(oscan) = CURLYN;
5283 OP(nxt1) = NOTHING; /* was OPEN. */
5284
5285#ifdef DEBUGGING
5286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5287 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5288 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5289 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5290 OP(nxt + 1) = OPTIMIZED; /* was count. */
5291 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5292#endif
5293 }
5294 nogo:
5295
5296 /* Try optimization CURLYX => CURLYM. */
5297 if ( OP(oscan) == CURLYX && data
5298 && !(data->flags & SF_HAS_PAR)
5299 && !(data->flags & SF_HAS_EVAL)
5300 && !deltanext /* atom is fixed width */
5301 && minnext != 0 /* CURLYM can't handle zero width */
5302
5303 /* Nor characters whose fold at run-time may be
5304 * multi-character */
5305 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5306 ) {
5307 /* XXXX How to optimize if data == 0? */
5308 /* Optimize to a simpler form. */
5309 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5310 regnode *nxt2;
5311
5312 OP(oscan) = CURLYM;
5313 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5314 && (OP(nxt2) != WHILEM))
5315 nxt = nxt2;
5316 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5317 /* Need to optimize away parenths. */
5318 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5319 /* Set the parenth number. */
5320 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5321
5322 oscan->flags = (U8)ARG(nxt);
5323 if (RExC_open_parens) {
5324 /*open->CURLYM*/
5325 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5326
5327 /*close->NOTHING*/
5328 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5329 + 1;
5330 }
5331 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5332 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5333
5334#ifdef DEBUGGING
5335 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5336 OP(nxt + 1) = OPTIMIZED; /* was count. */
5337 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5338 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5339#endif
5340#if 0
5341 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5342 regnode *nnxt = regnext(nxt1);
5343 if (nnxt == nxt) {
5344 if (reg_off_by_arg[OP(nxt1)])
5345 ARG_SET(nxt1, nxt2 - nxt1);
5346 else if (nxt2 - nxt1 < U16_MAX)
5347 NEXT_OFF(nxt1) = nxt2 - nxt1;
5348 else
5349 OP(nxt) = NOTHING; /* Cannot beautify */
5350 }
5351 nxt1 = nnxt;
5352 }
5353#endif
5354 /* Optimize again: */
5355 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5356 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5357 NULL, stopparen, recursed_depth, NULL, 0,
5358 depth+1);
5359 }
5360 else
5361 oscan->flags = 0;
5362 }
5363 else if ((OP(oscan) == CURLYX)
5364 && (flags & SCF_WHILEM_VISITED_POS)
5365 /* See the comment on a similar expression above.
5366 However, this time it's not a subexpression
5367 we care about, but the expression itself. */
5368 && (maxcount == REG_INFTY)
5369 && data) {
5370 /* This stays as CURLYX, we can put the count/of pair. */
5371 /* Find WHILEM (as in regexec.c) */
5372 regnode *nxt = oscan + NEXT_OFF(oscan);
5373
5374 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5375 nxt += ARG(nxt);
5376 nxt = PREVOPER(nxt);
5377 if (nxt->flags & 0xf) {
5378 /* we've already set whilem count on this node */
5379 } else if (++data->whilem_c < 16) {
5380 assert(data->whilem_c <= RExC_whilem_seen);
5381 nxt->flags = (U8)(data->whilem_c
5382 | (RExC_whilem_seen << 4)); /* On WHILEM */
5383 }
5384 }
5385 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5386 pars++;
5387 if (flags & SCF_DO_SUBSTR) {
5388 SV *last_str = NULL;
5389 STRLEN last_chrs = 0;
5390 int counted = mincount != 0;
5391
5392 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5393 string. */
5394 SSize_t b = pos_before >= data->last_start_min
5395 ? pos_before : data->last_start_min;
5396 STRLEN l;
5397 const char * const s = SvPV_const(data->last_found, l);
5398 SSize_t old = b - data->last_start_min;
5399
5400 if (UTF)
5401 old = utf8_hop((U8*)s, old) - (U8*)s;
5402 l -= old;
5403 /* Get the added string: */
5404 last_str = newSVpvn_utf8(s + old, l, UTF);
5405 last_chrs = UTF ? utf8_length((U8*)(s + old),
5406 (U8*)(s + old + l)) : l;
5407 if (deltanext == 0 && pos_before == b) {
5408 /* What was added is a constant string */
5409 if (mincount > 1) {
5410
5411 SvGROW(last_str, (mincount * l) + 1);
5412 repeatcpy(SvPVX(last_str) + l,
5413 SvPVX_const(last_str), l,
5414 mincount - 1);
5415 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5416 /* Add additional parts. */
5417 SvCUR_set(data->last_found,
5418 SvCUR(data->last_found) - l);
5419 sv_catsv(data->last_found, last_str);
5420 {
5421 SV * sv = data->last_found;
5422 MAGIC *mg =
5423 SvUTF8(sv) && SvMAGICAL(sv) ?
5424 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5425 if (mg && mg->mg_len >= 0)
5426 mg->mg_len += last_chrs * (mincount-1);
5427 }
5428 last_chrs *= mincount;
5429 data->last_end += l * (mincount - 1);
5430 }
5431 } else {
5432 /* start offset must point into the last copy */
5433 data->last_start_min += minnext * (mincount - 1);
5434 data->last_start_max =
5435 is_inf
5436 ? SSize_t_MAX
5437 : data->last_start_max +
5438 (maxcount - 1) * (minnext + data->pos_delta);
5439 }
5440 }
5441 /* It is counted once already... */
5442 data->pos_min += minnext * (mincount - counted);
5443#if 0
5444Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5445 " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5446 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5447 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5448 (UV)mincount);
5449if (deltanext != SSize_t_MAX)
5450Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5451 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5452 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5453#endif
5454 if (deltanext == SSize_t_MAX
5455 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5456 data->pos_delta = SSize_t_MAX;
5457 else
5458 data->pos_delta += - counted * deltanext +
5459 (minnext + deltanext) * maxcount - minnext * mincount;
5460 if (mincount != maxcount) {
5461 /* Cannot extend fixed substrings found inside
5462 the group. */
5463 scan_commit(pRExC_state, data, minlenp, is_inf);
5464 if (mincount && last_str) {
5465 SV * const sv = data->last_found;
5466 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5467 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5468
5469 if (mg)
5470 mg->mg_len = -1;
5471 sv_setsv(sv, last_str);
5472 data->last_end = data->pos_min;
5473 data->last_start_min = data->pos_min - last_chrs;
5474 data->last_start_max = is_inf
5475 ? SSize_t_MAX
5476 : data->pos_min + data->pos_delta - last_chrs;
5477 }
5478 data->cur_is_floating = 1; /* float */
5479 }
5480 SvREFCNT_dec(last_str);
5481 }
5482 if (data && (fl & SF_HAS_EVAL))
5483 data->flags |= SF_HAS_EVAL;
5484 optimize_curly_tail:
5485 if (OP(oscan) != CURLYX) {
5486 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5487 && NEXT_OFF(next))
5488 NEXT_OFF(oscan) += NEXT_OFF(next);
5489 }
5490 continue;
5491
5492 default:
5493#ifdef DEBUGGING
5494 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5495 OP(scan));
5496#endif
5497 case REF:
5498 case CLUMP:
5499 if (flags & SCF_DO_SUBSTR) {
5500 /* Cannot expect anything... */
5501 scan_commit(pRExC_state, data, minlenp, is_inf);
5502 data->cur_is_floating = 1; /* float */
5503 }
5504 is_inf = is_inf_internal = 1;
5505 if (flags & SCF_DO_STCLASS_OR) {
5506 if (OP(scan) == CLUMP) {
5507 /* Actually is any start char, but very few code points
5508 * aren't start characters */
5509 ssc_match_all_cp(data->start_class);
5510 }
5511 else {
5512 ssc_anything(data->start_class);
5513 }
5514 }
5515 flags &= ~SCF_DO_STCLASS;
5516 break;
5517 }
5518 }
5519 else if (OP(scan) == LNBREAK) {
5520 if (flags & SCF_DO_STCLASS) {
5521 if (flags & SCF_DO_STCLASS_AND) {
5522 ssc_intersection(data->start_class,
5523 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5524 ssc_clear_locale(data->start_class);
5525 ANYOF_FLAGS(data->start_class)
5526 &= ~SSC_MATCHES_EMPTY_STRING;
5527 }
5528 else if (flags & SCF_DO_STCLASS_OR) {
5529 ssc_union(data->start_class,
5530 PL_XPosix_ptrs[_CC_VERTSPACE],
5531 FALSE);
5532 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5533
5534 /* See commit msg for
5535 * 749e076fceedeb708a624933726e7989f2302f6a */
5536 ANYOF_FLAGS(data->start_class)
5537 &= ~SSC_MATCHES_EMPTY_STRING;
5538 }
5539 flags &= ~SCF_DO_STCLASS;
5540 }
5541 min++;
5542 if (delta != SSize_t_MAX)
5543 delta++; /* Because of the 2 char string cr-lf */
5544 if (flags & SCF_DO_SUBSTR) {
5545 /* Cannot expect anything... */
5546 scan_commit(pRExC_state, data, minlenp, is_inf);
5547 data->pos_min += 1;
5548 if (data->pos_delta != SSize_t_MAX) {
5549 data->pos_delta += 1;
5550 }
5551 data->cur_is_floating = 1; /* float */
5552 }
5553 }
5554 else if (REGNODE_SIMPLE(OP(scan))) {
5555
5556 if (flags & SCF_DO_SUBSTR) {
5557 scan_commit(pRExC_state, data, minlenp, is_inf);
5558 data->pos_min++;
5559 }
5560 min++;
5561 if (flags & SCF_DO_STCLASS) {
5562 bool invert = 0;
5563 SV* my_invlist = NULL;
5564 U8 namedclass;
5565
5566 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5567 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5568
5569 /* Some of the logic below assumes that switching
5570 locale on will only add false positives. */
5571 switch (OP(scan)) {
5572
5573 default:
5574#ifdef DEBUGGING
5575 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5576 OP(scan));
5577#endif
5578 case SANY:
5579 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5580 ssc_match_all_cp(data->start_class);
5581 break;
5582
5583 case REG_ANY:
5584 {
5585 SV* REG_ANY_invlist = _new_invlist(2);
5586 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5587 '\n');
5588 if (flags & SCF_DO_STCLASS_OR) {
5589 ssc_union(data->start_class,
5590 REG_ANY_invlist,
5591 TRUE /* TRUE => invert, hence all but \n
5592 */
5593 );
5594 }
5595 else if (flags & SCF_DO_STCLASS_AND) {
5596 ssc_intersection(data->start_class,
5597 REG_ANY_invlist,
5598 TRUE /* TRUE => invert */
5599 );
5600 ssc_clear_locale(data->start_class);
5601 }
5602 SvREFCNT_dec_NN(REG_ANY_invlist);
5603 }
5604 break;
5605
5606 case ANYOFD:
5607 case ANYOFL:
5608 case ANYOFPOSIXL:
5609 case ANYOF:
5610 if (flags & SCF_DO_STCLASS_AND)
5611 ssc_and(pRExC_state, data->start_class,
5612 (regnode_charclass *) scan);
5613 else
5614 ssc_or(pRExC_state, data->start_class,
5615 (regnode_charclass *) scan);
5616 break;
5617
5618 case ANYOFM:
5619 {
5620 SV* cp_list = get_ANYOFM_contents(scan);
5621
5622 if (flags & SCF_DO_STCLASS_OR) {
5623 ssc_union(data->start_class,
5624 cp_list,
5625 FALSE /* don't invert */
5626 );
5627 }
5628 else if (flags & SCF_DO_STCLASS_AND) {
5629 ssc_intersection(data->start_class,
5630 cp_list,
5631 FALSE /* don't invert */
5632 );
5633 }
5634
5635 SvREFCNT_dec_NN(cp_list);
5636 break;
5637 }
5638
5639 case NPOSIXL:
5640 invert = 1;
5641 /* FALLTHROUGH */
5642
5643 case POSIXL:
5644 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5645 if (flags & SCF_DO_STCLASS_AND) {
5646 bool was_there = cBOOL(
5647 ANYOF_POSIXL_TEST(data->start_class,
5648 namedclass));
5649 ANYOF_POSIXL_ZERO(data->start_class);
5650 if (was_there) { /* Do an AND */
5651 ANYOF_POSIXL_SET(data->start_class, namedclass);
5652 }
5653 /* No individual code points can now match */
5654 data->start_class->invlist
5655 = sv_2mortal(_new_invlist(0));
5656 }
5657 else {
5658 int complement = namedclass + ((invert) ? -1 : 1);
5659
5660 assert(flags & SCF_DO_STCLASS_OR);
5661
5662 /* If the complement of this class was already there,
5663 * the result is that they match all code points,
5664 * (\d + \D == everything). Remove the classes from
5665 * future consideration. Locale is not relevant in
5666 * this case */
5667 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5668 ssc_match_all_cp(data->start_class);
5669 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5670 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5671 }
5672 else { /* The usual case; just add this class to the
5673 existing set */
5674 ANYOF_POSIXL_SET(data->start_class, namedclass);
5675 }
5676 }
5677 break;
5678
5679 case NASCII:
5680 invert = 1;
5681 /* FALLTHROUGH */
5682 case ASCII:
5683 my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII], NULL);
5684
5685 /* This can be handled as a Posix class */
5686 goto join_posix_and_ascii;
5687
5688 case NPOSIXA: /* For these, we always know the exact set of
5689 what's matched */
5690 invert = 1;
5691 /* FALLTHROUGH */
5692 case POSIXA:
5693 assert(FLAGS(scan) != _CC_ASCII);
5694 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5695 goto join_posix_and_ascii;
5696
5697 case NPOSIXD:
5698 case NPOSIXU:
5699 invert = 1;
5700 /* FALLTHROUGH */
5701 case POSIXD:
5702 case POSIXU:
5703 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5704
5705 /* NPOSIXD matches all upper Latin1 code points unless the
5706 * target string being matched is UTF-8, which is
5707 * unknowable until match time. Since we are going to
5708 * invert, we want to get rid of all of them so that the
5709 * inversion will match all */
5710 if (OP(scan) == NPOSIXD) {
5711 _invlist_subtract(my_invlist, PL_UpperLatin1,
5712 &my_invlist);
5713 }
5714
5715 join_posix_and_ascii:
5716
5717 if (flags & SCF_DO_STCLASS_AND) {
5718 ssc_intersection(data->start_class, my_invlist, invert);
5719 ssc_clear_locale(data->start_class);
5720 }
5721 else {
5722 assert(flags & SCF_DO_STCLASS_OR);
5723 ssc_union(data->start_class, my_invlist, invert);
5724 }
5725 SvREFCNT_dec(my_invlist);
5726 }
5727 if (flags & SCF_DO_STCLASS_OR)
5728 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5729 flags &= ~SCF_DO_STCLASS;
5730 }
5731 }
5732 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5733 data->flags |= (OP(scan) == MEOL
5734 ? SF_BEFORE_MEOL
5735 : SF_BEFORE_SEOL);
5736 scan_commit(pRExC_state, data, minlenp, is_inf);
5737
5738 }
5739 else if ( PL_regkind[OP(scan)] == BRANCHJ
5740 /* Lookbehind, or need to calculate parens/evals/stclass: */
5741 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5742 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5743 {
5744 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5745 || OP(scan) == UNLESSM )
5746 {
5747 /* Negative Lookahead/lookbehind
5748 In this case we can't do fixed string optimisation.
5749 */
5750
5751 SSize_t deltanext, minnext, fake = 0;
5752 regnode *nscan;
5753 regnode_ssc intrnl;
5754 int f = 0;
5755
5756 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5757 if (data) {
5758 data_fake.whilem_c = data->whilem_c;
5759 data_fake.last_closep = data->last_closep;
5760 }
5761 else
5762 data_fake.last_closep = &fake;
5763 data_fake.pos_delta = delta;
5764 if ( flags & SCF_DO_STCLASS && !scan->flags
5765 && OP(scan) == IFMATCH ) { /* Lookahead */
5766 ssc_init(pRExC_state, &intrnl);
5767 data_fake.start_class = &intrnl;
5768 f |= SCF_DO_STCLASS_AND;
5769 }
5770 if (flags & SCF_WHILEM_VISITED_POS)
5771 f |= SCF_WHILEM_VISITED_POS;
5772 next = regnext(scan);
5773 nscan = NEXTOPER(NEXTOPER(scan));
5774
5775 /* recurse study_chunk() for lookahead body */
5776 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5777 last, &data_fake, stopparen,
5778 recursed_depth, NULL, f, depth+1);
5779 if (scan->flags) {
5780 if (deltanext) {
5781 FAIL("Variable length lookbehind not implemented");
5782 }
5783 else if (minnext > (I32)U8_MAX) {
5784 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5785 (UV)U8_MAX);
5786 }
5787 scan->flags = (U8)minnext;
5788 }
5789 if (data) {
5790 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5791 pars++;
5792 if (data_fake.flags & SF_HAS_EVAL)
5793 data->flags |= SF_HAS_EVAL;
5794 data->whilem_c = data_fake.whilem_c;
5795 }
5796 if (f & SCF_DO_STCLASS_AND) {
5797 if (flags & SCF_DO_STCLASS_OR) {
5798 /* OR before, AND after: ideally we would recurse with
5799 * data_fake to get the AND applied by study of the
5800 * remainder of the pattern, and then derecurse;
5801 * *** HACK *** for now just treat as "no information".
5802 * See [perl #56690].
5803 */
5804 ssc_init(pRExC_state, data->start_class);
5805 } else {
5806 /* AND before and after: combine and continue. These
5807 * assertions are zero-length, so can match an EMPTY
5808 * string */
5809 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5810 ANYOF_FLAGS(data->start_class)
5811 |= SSC_MATCHES_EMPTY_STRING;
5812 }
5813 }
5814 }
5815#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5816 else {
5817 /* Positive Lookahead/lookbehind
5818 In this case we can do fixed string optimisation,
5819 but we must be careful about it. Note in the case of
5820 lookbehind the positions will be offset by the minimum
5821 length of the pattern, something we won't know about
5822 until after the recurse.
5823 */
5824 SSize_t deltanext, fake = 0;
5825 regnode *nscan;
5826 regnode_ssc intrnl;
5827 int f = 0;
5828 /* We use SAVEFREEPV so that when the full compile
5829 is finished perl will clean up the allocated
5830 minlens when it's all done. This way we don't
5831 have to worry about freeing them when we know
5832 they wont be used, which would be a pain.
5833 */
5834 SSize_t *minnextp;
5835 Newx( minnextp, 1, SSize_t );
5836 SAVEFREEPV(minnextp);
5837
5838 if (data) {
5839 StructCopy(data, &data_fake, scan_data_t);
5840 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5841 f |= SCF_DO_SUBSTR;
5842 if (scan->flags)
5843 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5844 data_fake.last_found=newSVsv(data->last_found);
5845 }
5846 }
5847 else
5848 data_fake.last_closep = &fake;
5849 data_fake.flags = 0;
5850 data_fake.substrs[0].flags = 0;
5851 data_fake.substrs[1].flags = 0;
5852 data_fake.pos_delta = delta;
5853 if (is_inf)
5854 data_fake.flags |= SF_IS_INF;
5855 if ( flags & SCF_DO_STCLASS && !scan->flags
5856 && OP(scan) == IFMATCH ) { /* Lookahead */
5857 ssc_init(pRExC_state, &intrnl);
5858 data_fake.start_class = &intrnl;
5859 f |= SCF_DO_STCLASS_AND;
5860 }
5861 if (flags & SCF_WHILEM_VISITED_POS)
5862 f |= SCF_WHILEM_VISITED_POS;
5863 next = regnext(scan);
5864 nscan = NEXTOPER(NEXTOPER(scan));
5865
5866 /* positive lookahead study_chunk() recursion */
5867 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5868 &deltanext, last, &data_fake,
5869 stopparen, recursed_depth, NULL,
5870 f, depth+1);
5871 if (scan->flags) {
5872 if (deltanext) {
5873 FAIL("Variable length lookbehind not implemented");
5874 }
5875 else if (*minnextp > (I32)U8_MAX) {
5876 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5877 (UV)U8_MAX);
5878 }
5879 scan->flags = (U8)*minnextp;
5880 }
5881
5882 *minnextp += min;
5883
5884 if (f & SCF_DO_STCLASS_AND) {
5885 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5886 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5887 }
5888 if (data) {
5889 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5890 pars++;
5891 if (data_fake.flags & SF_HAS_EVAL)
5892 data->flags |= SF_HAS_EVAL;
5893 data->whilem_c = data_fake.whilem_c;
5894 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5895 int i;
5896 if (RExC_rx->minlen<*minnextp)
5897 RExC_rx->minlen=*minnextp;
5898 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5899 SvREFCNT_dec_NN(data_fake.last_found);
5900
5901 for (i = 0; i < 2; i++) {
5902 if (data_fake.substrs[i].minlenp != minlenp) {
5903 data->substrs[i].min_offset =
5904 data_fake.substrs[i].min_offset;
5905 data->substrs[i].max_offset =
5906 data_fake.substrs[i].max_offset;
5907 data->substrs[i].minlenp =
5908 data_fake.substrs[i].minlenp;
5909 data->substrs[i].lookbehind += scan->flags;
5910 }
5911 }
5912 }
5913 }
5914 }
5915#endif
5916 }
5917
5918 else if (OP(scan) == OPEN) {
5919 if (stopparen != (I32)ARG(scan))
5920 pars++;
5921 }
5922 else if (OP(scan) == CLOSE) {
5923 if (stopparen == (I32)ARG(scan)) {
5924 break;
5925 }
5926 if ((I32)ARG(scan) == is_par) {
5927 next = regnext(scan);
5928
5929 if ( next && (OP(next) != WHILEM) && next < last)
5930 is_par = 0; /* Disable optimization */
5931 }
5932 if (data)
5933 *(data->last_closep) = ARG(scan);
5934 }
5935 else if (OP(scan) == EVAL) {
5936 if (data)
5937 data->flags |= SF_HAS_EVAL;
5938 }
5939 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5940 if (flags & SCF_DO_SUBSTR) {
5941 scan_commit(pRExC_state, data, minlenp, is_inf);
5942 flags &= ~SCF_DO_SUBSTR;
5943 }
5944 if (data && OP(scan)==ACCEPT) {
5945 data->flags |= SCF_SEEN_ACCEPT;
5946 if (stopmin > min)
5947 stopmin = min;
5948 }
5949 }
5950 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5951 {
5952 if (flags & SCF_DO_SUBSTR) {
5953 scan_commit(pRExC_state, data, minlenp, is_inf);
5954 data->cur_is_floating = 1; /* float */
5955 }
5956 is_inf = is_inf_internal = 1;
5957 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5958 ssc_anything(data->start_class);
5959 flags &= ~SCF_DO_STCLASS;
5960 }
5961 else if (OP(scan) == GPOS) {
5962 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5963 !(delta || is_inf || (data && data->pos_delta)))
5964 {
5965 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5966 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5967 if (RExC_rx->gofs < (STRLEN)min)
5968 RExC_rx->gofs = min;
5969 } else {
5970 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5971 RExC_rx->gofs = 0;
5972 }
5973 }
5974#ifdef TRIE_STUDY_OPT
5975#ifdef FULL_TRIE_STUDY
5976 else if (PL_regkind[OP(scan)] == TRIE) {
5977 /* NOTE - There is similar code to this block above for handling
5978 BRANCH nodes on the initial study. If you change stuff here
5979 check there too. */
5980 regnode *trie_node= scan;
5981 regnode *tail= regnext(scan);
5982 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5983 SSize_t max1 = 0, min1 = SSize_t_MAX;
5984 regnode_ssc accum;
5985
5986 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5987 /* Cannot merge strings after this. */
5988 scan_commit(pRExC_state, data, minlenp, is_inf);
5989 }
5990 if (flags & SCF_DO_STCLASS)
5991 ssc_init_zero(pRExC_state, &accum);
5992
5993 if (!trie->jump) {
5994 min1= trie->minlen;
5995 max1= trie->maxlen;
5996 } else {
5997 const regnode *nextbranch= NULL;
5998 U32 word;
5999
6000 for ( word=1 ; word <= trie->wordcount ; word++)
6001 {
6002 SSize_t deltanext=0, minnext=0, f = 0, fake;
6003 regnode_ssc this_class;
6004
6005 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6006 if (data) {
6007 data_fake.whilem_c = data->whilem_c;
6008 data_fake.last_closep = data->last_closep;
6009 }
6010 else
6011 data_fake.last_closep = &fake;
6012 data_fake.pos_delta = delta;
6013 if (flags & SCF_DO_STCLASS) {
6014 ssc_init(pRExC_state, &this_class);
6015 data_fake.start_class = &this_class;
6016 f = SCF_DO_STCLASS_AND;
6017 }
6018 if (flags & SCF_WHILEM_VISITED_POS)
6019 f |= SCF_WHILEM_VISITED_POS;
6020
6021 if (trie->jump[word]) {
6022 if (!nextbranch)
6023 nextbranch = trie_node + trie->jump[0];
6024 scan= trie_node + trie->jump[word];
6025 /* We go from the jump point to the branch that follows
6026 it. Note this means we need the vestigal unused
6027 branches even though they arent otherwise used. */
6028 /* optimise study_chunk() for TRIE */
6029 minnext = study_chunk(pRExC_state, &scan, minlenp,
6030 &deltanext, (regnode *)nextbranch, &data_fake,
6031 stopparen, recursed_depth, NULL, f, depth+1);
6032 }
6033 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6034 nextbranch= regnext((regnode*)nextbranch);
6035
6036 if (min1 > (SSize_t)(minnext + trie->minlen))
6037 min1 = minnext + trie->minlen;
6038 if (deltanext == SSize_t_MAX) {
6039 is_inf = is_inf_internal = 1;
6040 max1 = SSize_t_MAX;
6041 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6042 max1 = minnext + deltanext + trie->maxlen;
6043
6044 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6045 pars++;
6046 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6047 if ( stopmin > min + min1)
6048 stopmin = min + min1;
6049 flags &= ~SCF_DO_SUBSTR;
6050 if (data)
6051 data->flags |= SCF_SEEN_ACCEPT;
6052 }
6053 if (data) {
6054 if (data_fake.flags & SF_HAS_EVAL)
6055 data->flags |= SF_HAS_EVAL;
6056 data->whilem_c = data_fake.whilem_c;
6057 }
6058 if (flags & SCF_DO_STCLASS)
6059 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6060 }
6061 }
6062 if (flags & SCF_DO_SUBSTR) {
6063 data->pos_min += min1;
6064 data->pos_delta += max1 - min1;
6065 if (max1 != min1 || is_inf)
6066 data->cur_is_floating = 1; /* float */
6067 }
6068 min += min1;
6069 if (delta != SSize_t_MAX) {
6070 if (SSize_t_MAX - (max1 - min1) >= delta)
6071 delta += max1 - min1;
6072 else
6073 delta = SSize_t_MAX;
6074 }
6075 if (flags & SCF_DO_STCLASS_OR) {
6076 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6077 if (min1) {
6078 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6079 flags &= ~SCF_DO_STCLASS;
6080 }
6081 }
6082 else if (flags & SCF_DO_STCLASS_AND) {
6083 if (min1) {
6084 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6085 flags &= ~SCF_DO_STCLASS;
6086 }
6087 else {
6088 /* Switch to OR mode: cache the old value of
6089 * data->start_class */
6090 INIT_AND_WITHP;
6091 StructCopy(data->start_class, and_withp, regnode_ssc);
6092 flags &= ~SCF_DO_STCLASS_AND;
6093 StructCopy(&accum, data->start_class, regnode_ssc);
6094 flags |= SCF_DO_STCLASS_OR;
6095 }
6096 }
6097 scan= tail;
6098 continue;
6099 }
6100#else
6101 else if (PL_regkind[OP(scan)] == TRIE) {
6102 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6103 U8*bang=NULL;
6104
6105 min += trie->minlen;
6106 delta += (trie->maxlen - trie->minlen);
6107 flags &= ~SCF_DO_STCLASS; /* xxx */
6108 if (flags & SCF_DO_SUBSTR) {
6109 /* Cannot expect anything... */
6110 scan_commit(pRExC_state, data, minlenp, is_inf);
6111 data->pos_min += trie->minlen;
6112 data->pos_delta += (trie->maxlen - trie->minlen);
6113 if (trie->maxlen != trie->minlen)
6114 data->cur_is_floating = 1; /* float */
6115 }
6116 if (trie->jump) /* no more substrings -- for now /grr*/
6117 flags &= ~SCF_DO_SUBSTR;
6118 }
6119#endif /* old or new */
6120#endif /* TRIE_STUDY_OPT */
6121
6122 /* Else: zero-length, ignore. */
6123 scan = regnext(scan);
6124 }
6125
6126 finish:
6127 if (frame) {
6128 /* we need to unwind recursion. */
6129 depth = depth - 1;
6130
6131 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6132 DEBUG_PEEP("fend", scan, depth, flags);
6133
6134 /* restore previous context */
6135 last = frame->last_regnode;
6136 scan = frame->next_regnode;
6137 stopparen = frame->stopparen;
6138 recursed_depth = frame->prev_recursed_depth;
6139
6140 RExC_frame_last = frame->prev_frame;
6141 frame = frame->this_prev_frame;
6142 goto fake_study_recurse;
6143 }
6144
6145 assert(!frame);
6146 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6147
6148 *scanp = scan;
6149 *deltap = is_inf_internal ? SSize_t_MAX : delta;
6150
6151 if (flags & SCF_DO_SUBSTR && is_inf)
6152 data->pos_delta = SSize_t_MAX - data->pos_min;
6153 if (is_par > (I32)U8_MAX)
6154 is_par = 0;
6155 if (is_par && pars==1 && data) {
6156 data->flags |= SF_IN_PAR;
6157 data->flags &= ~SF_HAS_PAR;
6158 }
6159 else if (pars && data) {
6160 data->flags |= SF_HAS_PAR;
6161 data->flags &= ~SF_IN_PAR;
6162 }
6163 if (flags & SCF_DO_STCLASS_OR)
6164 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6165 if (flags & SCF_TRIE_RESTUDY)
6166 data->flags |= SCF_TRIE_RESTUDY;
6167
6168 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6169
6170 {
6171 SSize_t final_minlen= min < stopmin ? min : stopmin;
6172
6173 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6174 if (final_minlen > SSize_t_MAX - delta)
6175 RExC_maxlen = SSize_t_MAX;
6176 else if (RExC_maxlen < final_minlen + delta)
6177 RExC_maxlen = final_minlen + delta;
6178 }
6179 return final_minlen;
6180 }
6181 NOT_REACHED; /* NOTREACHED */
6182}
6183
6184STATIC U32
6185S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6186{
6187 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6188
6189 PERL_ARGS_ASSERT_ADD_DATA;
6190
6191 Renewc(RExC_rxi->data,
6192 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6193 char, struct reg_data);
6194 if(count)
6195 Renew(RExC_rxi->data->what, count + n, U8);
6196 else
6197 Newx(RExC_rxi->data->what, n, U8);
6198 RExC_rxi->data->count = count + n;
6199 Copy(s, RExC_rxi->data->what + count, n, U8);
6200 return count;
6201}
6202
6203/*XXX: todo make this not included in a non debugging perl, but appears to be
6204 * used anyway there, in 'use re' */
6205#ifndef PERL_IN_XSUB_RE
6206void
6207Perl_reginitcolors(pTHX)
6208{
6209 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6210 if (s) {
6211 char *t = savepv(s);
6212 int i = 0;
6213 PL_colors[0] = t;
6214 while (++i < 6) {
6215 t = strchr(t, '\t');
6216 if (t) {
6217 *t = '\0';
6218 PL_colors[i] = ++t;
6219 }
6220 else
6221 PL_colors[i] = t = (char *)"";
6222 }
6223 } else {
6224 int i = 0;
6225 while (i < 6)
6226 PL_colors[i++] = (char *)"";
6227 }
6228 PL_colorset = 1;
6229}
6230#endif
6231
6232
6233#ifdef TRIE_STUDY_OPT
6234#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6235 STMT_START { \
6236 if ( \
6237 (data.flags & SCF_TRIE_RESTUDY) \
6238 && ! restudied++ \
6239 ) { \
6240 dOsomething; \
6241 goto reStudy; \
6242 } \
6243 } STMT_END
6244#else
6245#define CHECK_RESTUDY_GOTO_butfirst
6246#endif
6247
6248/*
6249 * pregcomp - compile a regular expression into internal code
6250 *
6251 * Decides which engine's compiler to call based on the hint currently in
6252 * scope
6253 */
6254
6255#ifndef PERL_IN_XSUB_RE
6256
6257/* return the currently in-scope regex engine (or the default if none) */
6258
6259regexp_engine const *
6260Perl_current_re_engine(pTHX)
6261{
6262 if (IN_PERL_COMPILETIME) {
6263 HV * const table = GvHV(PL_hintgv);
6264 SV **ptr;
6265
6266 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6267 return &PL_core_reg_engine;
6268 ptr = hv_fetchs(table, "regcomp", FALSE);
6269 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6270 return &PL_core_reg_engine;
6271 return INT2PTR(regexp_engine*, SvIV(*ptr));
6272 }
6273 else {
6274 SV *ptr;
6275 if (!PL_curcop->cop_hints_hash)
6276 return &PL_core_reg_engine;
6277 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6278 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6279 return &PL_core_reg_engine;
6280 return INT2PTR(regexp_engine*, SvIV(ptr));
6281 }
6282}
6283
6284
6285REGEXP *
6286Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6287{
6288 regexp_engine const *eng = current_re_engine();
6289 GET_RE_DEBUG_FLAGS_DECL;
6290
6291 PERL_ARGS_ASSERT_PREGCOMP;
6292
6293 /* Dispatch a request to compile a regexp to correct regexp engine. */
6294 DEBUG_COMPILE_r({
6295 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6296 PTR2UV(eng));
6297 });
6298 return CALLREGCOMP_ENG(eng, pattern, flags);
6299}
6300#endif
6301
6302/* public(ish) entry point for the perl core's own regex compiling code.
6303 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6304 * pattern rather than a list of OPs, and uses the internal engine rather
6305 * than the current one */
6306
6307REGEXP *
6308Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6309{
6310 SV *pat = pattern; /* defeat constness! */
6311 PERL_ARGS_ASSERT_RE_COMPILE;
6312 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6313#ifdef PERL_IN_XSUB_RE
6314 &my_reg_engine,
6315#else
6316 &PL_core_reg_engine,
6317#endif
6318 NULL, NULL, rx_flags, 0);
6319}
6320
6321
6322static void
6323S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6324{
6325 int n;
6326
6327 if (--cbs->refcnt > 0)
6328 return;
6329 for (n = 0; n < cbs->count; n++) {
6330 REGEXP *rx = cbs->cb[n].src_regex;
6331 cbs->cb[n].src_regex = NULL;
6332 SvREFCNT_dec(rx);
6333 }
6334 Safefree(cbs->cb);
6335 Safefree(cbs);
6336}
6337
6338
6339static struct reg_code_blocks *
6340S_alloc_code_blocks(pTHX_ int ncode)
6341{
6342 struct reg_code_blocks *cbs;
6343 Newx(cbs, 1, struct reg_code_blocks);
6344 cbs->count = ncode;
6345 cbs->refcnt = 1;
6346 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6347 if (ncode)
6348 Newx(cbs->cb, ncode, struct reg_code_block);
6349 else
6350 cbs->cb = NULL;
6351 return cbs;
6352}
6353
6354
6355/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6356 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6357 * point to the realloced string and length.
6358 *
6359 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6360 * stuff added */
6361
6362static void
6363S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6364 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6365{
6366 U8 *const src = (U8*)*pat_p;
6367 U8 *dst, *d;
6368 int n=0;
6369 STRLEN s = 0;
6370 bool do_end = 0;
6371 GET_RE_DEBUG_FLAGS_DECL;
6372
6373 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6374 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6375
6376 Newx(dst, *plen_p * 2 + 1, U8);
6377 d = dst;
6378
6379 while (s < *plen_p) {
6380 append_utf8_from_native_byte(src[s], &d);
6381
6382 if (n < num_code_blocks) {
6383 assert(pRExC_state->code_blocks);
6384 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6385 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6386 assert(*(d - 1) == '(');
6387 do_end = 1;
6388 }
6389 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6390 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6391 assert(*(d - 1) == ')');
6392 do_end = 0;
6393 n++;
6394 }
6395 }
6396 s++;
6397 }
6398 *d = '\0';
6399 *plen_p = d - dst;
6400 *pat_p = (char*) dst;
6401 SAVEFREEPV(*pat_p);
6402 RExC_orig_utf8 = RExC_utf8 = 1;
6403}
6404
6405
6406
6407/* S_concat_pat(): concatenate a list of args to the pattern string pat,
6408 * while recording any code block indices, and handling overloading,
6409 * nested qr// objects etc. If pat is null, it will allocate a new
6410 * string, or just return the first arg, if there's only one.
6411 *
6412 * Returns the malloced/updated pat.
6413 * patternp and pat_count is the array of SVs to be concatted;
6414 * oplist is the optional list of ops that generated the SVs;
6415 * recompile_p is a pointer to a boolean that will be set if
6416 * the regex will need to be recompiled.
6417 * delim, if non-null is an SV that will be inserted between each element
6418 */
6419
6420static SV*
6421S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6422 SV *pat, SV ** const patternp, int pat_count,
6423 OP *oplist, bool *recompile_p, SV *delim)
6424{
6425 SV **svp;
6426 int n = 0;
6427 bool use_delim = FALSE;
6428 bool alloced = FALSE;
6429
6430 /* if we know we have at least two args, create an empty string,
6431 * then concatenate args to that. For no args, return an empty string */
6432 if (!pat && pat_count != 1) {
6433 pat = newSVpvs("");
6434 SAVEFREESV(pat);
6435 alloced = TRUE;
6436 }
6437
6438 for (svp = patternp; svp < patternp + pat_count; svp++) {
6439 SV *sv;
6440 SV *rx = NULL;
6441 STRLEN orig_patlen = 0;
6442 bool code = 0;
6443 SV *msv = use_delim ? delim : *svp;
6444 if (!msv) msv = &PL_sv_undef;
6445
6446 /* if we've got a delimiter, we go round the loop twice for each
6447 * svp slot (except the last), using the delimiter the second
6448 * time round */
6449 if (use_delim) {
6450 svp--;
6451 use_delim = FALSE;
6452 }
6453 else if (delim)
6454 use_delim = TRUE;
6455
6456 if (SvTYPE(msv) == SVt_PVAV) {
6457 /* we've encountered an interpolated array within
6458 * the pattern, e.g. /...@a..../. Expand the list of elements,
6459 * then recursively append elements.
6460 * The code in this block is based on S_pushav() */
6461
6462 AV *const av = (AV*)msv;
6463 const SSize_t maxarg = AvFILL(av) + 1;
6464 SV **array;
6465
6466 if (oplist) {
6467 assert(oplist->op_type == OP_PADAV
6468 || oplist->op_type == OP_RV2AV);
6469 oplist = OpSIBLING(oplist);
6470 }
6471
6472 if (SvRMAGICAL(av)) {
6473 SSize_t i;
6474
6475 Newx(array, maxarg, SV*);
6476 SAVEFREEPV(array);
6477 for (i=0; i < maxarg; i++) {
6478 SV ** const svp = av_fetch(av, i, FALSE);
6479 array[i] = svp ? *svp : &PL_sv_undef;
6480 }
6481 }
6482 else
6483 array = AvARRAY(av);
6484
6485 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6486 array, maxarg, NULL, recompile_p,
6487 /* $" */
6488 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6489
6490 continue;
6491 }
6492
6493
6494 /* we make the assumption here that each op in the list of
6495 * op_siblings maps to one SV pushed onto the stack,
6496 * except for code blocks, with have both an OP_NULL and
6497 * and OP_CONST.
6498 * This allows us to match up the list of SVs against the
6499 * list of OPs to find the next code block.
6500 *
6501 * Note that PUSHMARK PADSV PADSV ..
6502 * is optimised to
6503 * PADRANGE PADSV PADSV ..
6504 * so the alignment still works. */
6505
6506 if (oplist) {
6507 if (oplist->op_type == OP_NULL
6508 && (oplist->op_flags & OPf_SPECIAL))
6509 {
6510 assert(n < pRExC_state->code_blocks->count);
6511 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6512 pRExC_state->code_blocks->cb[n].block = oplist;
6513 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6514 n++;
6515 code = 1;
6516 oplist = OpSIBLING(oplist); /* skip CONST */
6517 assert(oplist);
6518 }
6519 oplist = OpSIBLING(oplist);;
6520 }
6521
6522 /* apply magic and QR overloading to arg */
6523
6524 SvGETMAGIC(msv);
6525 if (SvROK(msv) && SvAMAGIC(msv)) {
6526 SV *sv = AMG_CALLunary(msv, regexp_amg);
6527 if (sv) {
6528 if (SvROK(sv))
6529 sv = SvRV(sv);
6530 if (SvTYPE(sv) != SVt_REGEXP)
6531 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6532 msv = sv;
6533 }
6534 }
6535
6536 /* try concatenation overload ... */
6537 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6538 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6539 {
6540 sv_setsv(pat, sv);
6541 /* overloading involved: all bets are off over literal
6542 * code. Pretend we haven't seen it */
6543 if (n)
6544 pRExC_state->code_blocks->count -= n;
6545 n = 0;
6546 }
6547 else {
6548 /* ... or failing that, try "" overload */
6549 while (SvAMAGIC(msv)
6550 && (sv = AMG_CALLunary(msv, string_amg))
6551 && sv != msv
6552 && !( SvROK(msv)
6553 && SvROK(sv)
6554 && SvRV(msv) == SvRV(sv))
6555 ) {
6556 msv = sv;
6557 SvGETMAGIC(msv);
6558 }
6559 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6560 msv = SvRV(msv);
6561
6562 if (pat) {
6563 /* this is a partially unrolled
6564 * sv_catsv_nomg(pat, msv);
6565 * that allows us to adjust code block indices if
6566 * needed */
6567 STRLEN dlen;
6568 char *dst = SvPV_force_nomg(pat, dlen);
6569 orig_patlen = dlen;
6570 if (SvUTF8(msv) && !SvUTF8(pat)) {
6571 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6572 sv_setpvn(pat, dst, dlen);
6573 SvUTF8_on(pat);
6574 }
6575 sv_catsv_nomg(pat, msv);
6576 rx = msv;
6577 }
6578 else {
6579 /* We have only one SV to process, but we need to verify
6580 * it is properly null terminated or we will fail asserts
6581 * later. In theory we probably shouldn't get such SV's,
6582 * but if we do we should handle it gracefully. */
6583 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6584 /* not a string, or a string with a trailing null */
6585 pat = msv;
6586 } else {
6587 /* a string with no trailing null, we need to copy it
6588 * so it has a trailing null */
6589 pat = sv_2mortal(newSVsv(msv));
6590 }
6591 }
6592
6593 if (code)
6594 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6595 }
6596
6597 /* extract any code blocks within any embedded qr//'s */
6598 if (rx && SvTYPE(rx) == SVt_REGEXP
6599 && RX_ENGINE((REGEXP*)rx)->op_comp)
6600 {
6601
6602 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6603 if (ri->code_blocks && ri->code_blocks->count) {
6604 int i;
6605 /* the presence of an embedded qr// with code means
6606 * we should always recompile: the text of the
6607 * qr// may not have changed, but it may be a
6608 * different closure than last time */
6609 *recompile_p = 1;
6610 if (pRExC_state->code_blocks) {
6611 int new_count = pRExC_state->code_blocks->count
6612 + ri->code_blocks->count;
6613 Renew(pRExC_state->code_blocks->cb,
6614 new_count, struct reg_code_block);
6615 pRExC_state->code_blocks->count = new_count;
6616 }
6617 else
6618 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6619 ri->code_blocks->count);
6620
6621 for (i=0; i < ri->code_blocks->count; i++) {
6622 struct reg_code_block *src, *dst;
6623 STRLEN offset = orig_patlen
6624 + ReANY((REGEXP *)rx)->pre_prefix;
6625 assert(n < pRExC_state->code_blocks->count);
6626 src = &ri->code_blocks->cb[i];
6627 dst = &pRExC_state->code_blocks->cb[n];
6628 dst->start = src->start + offset;
6629 dst->end = src->end + offset;
6630 dst->block = src->block;
6631 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6632 src->src_regex
6633 ? src->src_regex
6634 : (REGEXP*)rx);
6635 n++;
6636 }
6637 }
6638 }
6639 }
6640 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6641 if (alloced)
6642 SvSETMAGIC(pat);
6643
6644 return pat;
6645}
6646
6647
6648
6649/* see if there are any run-time code blocks in the pattern.
6650 * False positives are allowed */
6651
6652static bool
6653S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6654 char *pat, STRLEN plen)
6655{
6656 int n = 0;
6657 STRLEN s;
6658
6659 PERL_UNUSED_CONTEXT;
6660
6661 for (s = 0; s < plen; s++) {
6662 if ( pRExC_state->code_blocks
6663 && n < pRExC_state->code_blocks->count
6664 && s == pRExC_state->code_blocks->cb[n].start)
6665 {
6666 s = pRExC_state->code_blocks->cb[n].end;
6667 n++;
6668 continue;
6669 }
6670 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6671 * positives here */
6672 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6673 (pat[s+2] == '{'
6674 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6675 )
6676 return 1;
6677 }
6678 return 0;
6679}
6680
6681/* Handle run-time code blocks. We will already have compiled any direct
6682 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6683 * copy of it, but with any literal code blocks blanked out and
6684 * appropriate chars escaped; then feed it into
6685 *
6686 * eval "qr'modified_pattern'"
6687 *
6688 * For example,
6689 *
6690 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6691 *
6692 * becomes
6693 *
6694 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6695 *
6696 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6697 * and merge them with any code blocks of the original regexp.
6698 *
6699 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6700 * instead, just save the qr and return FALSE; this tells our caller that
6701 * the original pattern needs upgrading to utf8.
6702 */
6703
6704static bool
6705S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6706 char *pat, STRLEN plen)
6707{
6708 SV *qr;
6709
6710 GET_RE_DEBUG_FLAGS_DECL;
6711
6712 if (pRExC_state->runtime_code_qr) {
6713 /* this is the second time we've been called; this should
6714 * only happen if the main pattern got upgraded to utf8
6715 * during compilation; re-use the qr we compiled first time
6716 * round (which should be utf8 too)
6717 */
6718 qr = pRExC_state->runtime_code_qr;
6719 pRExC_state->runtime_code_qr = NULL;
6720 assert(RExC_utf8 && SvUTF8(qr));
6721 }
6722 else {
6723 int n = 0;
6724 STRLEN s;
6725 char *p, *newpat;
6726 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6727 SV *sv, *qr_ref;
6728 dSP;
6729
6730 /* determine how many extra chars we need for ' and \ escaping */
6731 for (s = 0; s < plen; s++) {
6732 if (pat[s] == '\'' || pat[s] == '\\')
6733 newlen++;
6734 }
6735
6736 Newx(newpat, newlen, char);
6737 p = newpat;
6738 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6739
6740 for (s = 0; s < plen; s++) {
6741 if ( pRExC_state->code_blocks
6742 && n < pRExC_state->code_blocks->count
6743 && s == pRExC_state->code_blocks->cb[n].start)
6744 {
6745 /* blank out literal code block */
6746 assert(pat[s] == '(');
6747 while (s <= pRExC_state->code_blocks->cb[n].end) {
6748 *p++ = '_';
6749 s++;
6750 }
6751 s--;
6752 n++;
6753 continue;
6754 }
6755 if (pat[s] == '\'' || pat[s] == '\\')
6756 *p++ = '\\';
6757 *p++ = pat[s];
6758 }
6759 *p++ = '\'';
6760 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6761 *p++ = 'x';
6762 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6763 *p++ = 'x';
6764 }
6765 }
6766 *p++ = '\0';
6767 DEBUG_COMPILE_r({
6768 Perl_re_printf( aTHX_
6769 "%sre-parsing pattern for runtime code:%s %s\n",
6770 PL_colors[4], PL_colors[5], newpat);
6771 });
6772
6773 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6774 Safefree(newpat);
6775
6776 ENTER;
6777 SAVETMPS;
6778 save_re_context();
6779 PUSHSTACKi(PERLSI_REQUIRE);
6780 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6781 * parsing qr''; normally only q'' does this. It also alters
6782 * hints handling */
6783 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6784 SvREFCNT_dec_NN(sv);
6785 SPAGAIN;
6786 qr_ref = POPs;
6787 PUTBACK;
6788 {
6789 SV * const errsv = ERRSV;
6790 if (SvTRUE_NN(errsv))
6791 /* use croak_sv ? */
6792 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6793 }
6794 assert(SvROK(qr_ref));
6795 qr = SvRV(qr_ref);
6796 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6797 /* the leaving below frees the tmp qr_ref.
6798 * Give qr a life of its own */
6799 SvREFCNT_inc(qr);
6800 POPSTACK;
6801 FREETMPS;
6802 LEAVE;
6803
6804 }
6805
6806 if (!RExC_utf8 && SvUTF8(qr)) {
6807 /* first time through; the pattern got upgraded; save the
6808 * qr for the next time through */
6809 assert(!pRExC_state->runtime_code_qr);
6810 pRExC_state->runtime_code_qr = qr;
6811 return 0;
6812 }
6813
6814
6815 /* extract any code blocks within the returned qr// */
6816
6817
6818 /* merge the main (r1) and run-time (r2) code blocks into one */
6819 {
6820 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6821 struct reg_code_block *new_block, *dst;
6822 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6823 int i1 = 0, i2 = 0;
6824 int r1c, r2c;
6825
6826 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
6827 {
6828 SvREFCNT_dec_NN(qr);
6829 return 1;
6830 }
6831
6832 if (!r1->code_blocks)
6833 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
6834
6835 r1c = r1->code_blocks->count;
6836 r2c = r2->code_blocks->count;
6837
6838 Newx(new_block, r1c + r2c, struct reg_code_block);
6839
6840 dst = new_block;
6841
6842 while (i1 < r1c || i2 < r2c) {
6843 struct reg_code_block *src;
6844 bool is_qr = 0;
6845
6846 if (i1 == r1c) {
6847 src = &r2->code_blocks->cb[i2++];
6848 is_qr = 1;
6849 }
6850 else if (i2 == r2c)
6851 src = &r1->code_blocks->cb[i1++];
6852 else if ( r1->code_blocks->cb[i1].start
6853 < r2->code_blocks->cb[i2].start)
6854 {
6855 src = &r1->code_blocks->cb[i1++];
6856 assert(src->end < r2->code_blocks->cb[i2].start);
6857 }
6858 else {
6859 assert( r1->code_blocks->cb[i1].start
6860 > r2->code_blocks->cb[i2].start);
6861 src = &r2->code_blocks->cb[i2++];
6862 is_qr = 1;
6863 assert(src->end < r1->code_blocks->cb[i1].start);
6864 }
6865
6866 assert(pat[src->start] == '(');
6867 assert(pat[src->end] == ')');
6868 dst->start = src->start;
6869 dst->end = src->end;
6870 dst->block = src->block;
6871 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6872 : src->src_regex;
6873 dst++;
6874 }
6875 r1->code_blocks->count += r2c;
6876 Safefree(r1->code_blocks->cb);
6877 r1->code_blocks->cb = new_block;
6878 }
6879
6880 SvREFCNT_dec_NN(qr);
6881 return 1;
6882}
6883
6884
6885STATIC bool
6886S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
6887 struct reg_substr_datum *rsd,
6888 struct scan_data_substrs *sub,
6889 STRLEN longest_length)
6890{
6891 /* This is the common code for setting up the floating and fixed length
6892 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6893 * as to whether succeeded or not */
6894
6895 I32 t;
6896 SSize_t ml;
6897 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
6898 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
6899
6900 if (! (longest_length
6901 || (eol /* Can't have SEOL and MULTI */
6902 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6903 )
6904 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6905 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6906 {
6907 return FALSE;
6908 }
6909
6910 /* copy the information about the longest from the reg_scan_data
6911 over to the program. */
6912 if (SvUTF8(sub->str)) {
6913 rsd->substr = NULL;
6914 rsd->utf8_substr = sub->str;
6915 } else {
6916 rsd->substr = sub->str;
6917 rsd->utf8_substr = NULL;
6918 }
6919 /* end_shift is how many chars that must be matched that
6920 follow this item. We calculate it ahead of time as once the
6921 lookbehind offset is added in we lose the ability to correctly
6922 calculate it.*/
6923 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
6924 rsd->end_shift = ml - sub->min_offset
6925 - longest_length
6926 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
6927 * intead? - DAPM
6928 + (SvTAIL(sub->str) != 0)
6929 */
6930 + sub->lookbehind;
6931
6932 t = (eol/* Can't have SEOL and MULTI */
6933 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6934 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
6935
6936 return TRUE;
6937}
6938
6939/*
6940 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6941 * regular expression into internal code.
6942 * The pattern may be passed either as:
6943 * a list of SVs (patternp plus pat_count)
6944 * a list of OPs (expr)
6945 * If both are passed, the SV list is used, but the OP list indicates
6946 * which SVs are actually pre-compiled code blocks
6947 *
6948 * The SVs in the list have magic and qr overloading applied to them (and
6949 * the list may be modified in-place with replacement SVs in the latter
6950 * case).
6951 *
6952 * If the pattern hasn't changed from old_re, then old_re will be
6953 * returned.
6954 *
6955 * eng is the current engine. If that engine has an op_comp method, then
6956 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6957 * do the initial concatenation of arguments and pass on to the external
6958 * engine.
6959 *
6960 * If is_bare_re is not null, set it to a boolean indicating whether the
6961 * arg list reduced (after overloading) to a single bare regex which has
6962 * been returned (i.e. /$qr/).
6963 *
6964 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6965 *
6966 * pm_flags contains the PMf_* flags, typically based on those from the
6967 * pm_flags field of the related PMOP. Currently we're only interested in
6968 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6969 *
6970 * We can't allocate space until we know how big the compiled form will be,
6971 * but we can't compile it (and thus know how big it is) until we've got a
6972 * place to put the code. So we cheat: we compile it twice, once with code
6973 * generation turned off and size counting turned on, and once "for real".
6974 * This also means that we don't allocate space until we are sure that the
6975 * thing really will compile successfully, and we never have to move the
6976 * code and thus invalidate pointers into it. (Note that it has to be in
6977 * one piece because free() must be able to free it all.) [NB: not true in perl]
6978 *
6979 * Beware that the optimization-preparation code in here knows about some
6980 * of the structure of the compiled regexp. [I'll say.]
6981 */
6982
6983REGEXP *
6984Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6985 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6986 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
6987{
6988 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
6989 struct regexp *r;
6990 STRLEN plen;
6991 char *exp;
6992 regnode *scan;
6993 I32 flags;
6994 SSize_t minlen = 0;
6995 U32 rx_flags;
6996 SV *pat;
6997 SV** new_patternp = patternp;
6998
6999 /* these are all flags - maybe they should be turned
7000 * into a single int with different bit masks */
7001 I32 sawlookahead = 0;
7002 I32 sawplus = 0;
7003 I32 sawopen = 0;
7004 I32 sawminmod = 0;
7005
7006 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7007 bool recompile = 0;
7008 bool runtime_code = 0;
7009 scan_data_t data;
7010 RExC_state_t RExC_state;
7011 RExC_state_t * const pRExC_state = &RExC_state;
7012#ifdef TRIE_STUDY_OPT
7013 int restudied = 0;
7014 RExC_state_t copyRExC_state;
7015#endif
7016 GET_RE_DEBUG_FLAGS_DECL;
7017
7018 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7019
7020 DEBUG_r(if (!PL_colorset) reginitcolors());
7021
7022 /* Initialize these here instead of as-needed, as is quick and avoids
7023 * having to test them each time otherwise */
7024 if (! PL_InBitmap) {
7025#ifdef DEBUGGING
7026 char * dump_len_string;
7027#endif
7028
7029 /* This is calculated here, because the Perl program that generates the
7030 * static global ones doesn't currently have access to
7031 * NUM_ANYOF_CODE_POINTS */
7032 PL_InBitmap = _new_invlist(2);
7033 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7034 NUM_ANYOF_CODE_POINTS - 1);
7035#ifdef DEBUGGING
7036 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7037 if ( ! dump_len_string
7038 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7039 {
7040 PL_dump_re_max_len = 60; /* A reasonable default */
7041 }
7042#endif
7043 }
7044
7045 pRExC_state->warn_text = NULL;
7046 pRExC_state->code_blocks = NULL;
7047
7048 if (is_bare_re)
7049 *is_bare_re = FALSE;
7050
7051 if (expr && (expr->op_type == OP_LIST ||
7052 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7053 /* allocate code_blocks if needed */
7054 OP *o;
7055 int ncode = 0;
7056
7057 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7058 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7059 ncode++; /* count of DO blocks */
7060
7061 if (ncode)
7062 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7063 }
7064
7065 if (!pat_count) {
7066 /* compile-time pattern with just OP_CONSTs and DO blocks */
7067
7068 int n;
7069 OP *o;
7070
7071 /* find how many CONSTs there are */
7072 assert(expr);
7073 n = 0;
7074 if (expr->op_type == OP_CONST)
7075 n = 1;
7076 else
7077 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7078 if (o->op_type == OP_CONST)
7079 n++;
7080 }
7081
7082 /* fake up an SV array */
7083
7084 assert(!new_patternp);
7085 Newx(new_patternp, n, SV*);
7086 SAVEFREEPV(new_patternp);
7087 pat_count = n;
7088
7089 n = 0;
7090 if (expr->op_type == OP_CONST)
7091 new_patternp[n] = cSVOPx_sv(expr);
7092 else
7093 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7094 if (o->op_type == OP_CONST)
7095 new_patternp[n++] = cSVOPo_sv;
7096 }
7097
7098 }
7099
7100 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7101 "Assembling pattern from %d elements%s\n", pat_count,
7102 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7103
7104 /* set expr to the first arg op */
7105
7106 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7107 && expr->op_type != OP_CONST)
7108 {
7109 expr = cLISTOPx(expr)->op_first;
7110 assert( expr->op_type == OP_PUSHMARK
7111 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7112 || expr->op_type == OP_PADRANGE);
7113 expr = OpSIBLING(expr);
7114 }
7115
7116 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7117 expr, &recompile, NULL);
7118
7119 /* handle bare (possibly after overloading) regex: foo =~ $re */
7120 {
7121 SV *re = pat;
7122 if (SvROK(re))
7123 re = SvRV(re);
7124 if (SvTYPE(re) == SVt_REGEXP) {
7125 if (is_bare_re)
7126 *is_bare_re = TRUE;
7127 SvREFCNT_inc(re);
7128 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7129 "Precompiled pattern%s\n",
7130 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7131
7132 return (REGEXP*)re;
7133 }
7134 }
7135
7136 exp = SvPV_nomg(pat, plen);
7137
7138 if (!eng->op_comp) {
7139 if ((SvUTF8(pat) && IN_BYTES)
7140 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7141 {
7142 /* make a temporary copy; either to convert to bytes,
7143 * or to avoid repeating get-magic / overloaded stringify */
7144 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7145 (IN_BYTES ? 0 : SvUTF8(pat)));
7146 }
7147 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7148 }
7149
7150 /* ignore the utf8ness if the pattern is 0 length */
7151 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7152
7153 RExC_rx_sv = NULL; \
7154 RExC_uni_semantics = 0;
7155 RExC_seen_unfolded_sharp_s = 0;
7156 RExC_contains_locale = 0;
7157 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7158 RExC_in_script_run = 0;
7159 RExC_study_started = 0;
7160 pRExC_state->runtime_code_qr = NULL;
7161 RExC_frame_head= NULL;
7162 RExC_frame_last= NULL;
7163 RExC_frame_count= 0;
7164 RExC_total_parens = 0;
7165
7166 DEBUG_r({
7167 RExC_mysv1= sv_newmortal();
7168 RExC_mysv2= sv_newmortal();
7169 });
7170
7171 DEBUG_COMPILE_r({
7172 SV *dsv= sv_newmortal();
7173 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7174 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7175 PL_colors[4], PL_colors[5], s);
7176 });
7177
7178 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7179 * to utf8 */
7180
7181 if ((pm_flags & PMf_USE_RE_EVAL)
7182 /* this second condition covers the non-regex literal case,
7183 * i.e. $foo =~ '(?{})'. */
7184 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7185 )
7186 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7187
7188 redo_parse:
7189 /* return old regex if pattern hasn't changed */
7190 /* XXX: note in the below we have to check the flags as well as the
7191 * pattern.
7192 *
7193 * Things get a touch tricky as we have to compare the utf8 flag
7194 * independently from the compile flags. */
7195
7196 if ( old_re
7197 && !recompile
7198 && !!RX_UTF8(old_re) == !!RExC_utf8
7199 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7200 && RX_PRECOMP(old_re)
7201 && RX_PRELEN(old_re) == plen
7202 && memEQ(RX_PRECOMP(old_re), exp, plen)
7203 && !runtime_code /* with runtime code, always recompile */ )
7204 {
7205 return old_re;
7206 }
7207
7208 rx_flags = orig_rx_flags;
7209
7210 if ( initial_charset == REGEX_DEPENDS_CHARSET
7211 && (RExC_utf8 ||RExC_uni_semantics))
7212 {
7213
7214 /* Set to use unicode semantics if the pattern is in utf8 and has the
7215 * 'depends' charset specified, as it means unicode when utf8 */
7216 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7217 }
7218
7219 RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7220 RExC_flags = rx_flags;
7221 RExC_pm_flags = pm_flags;
7222
7223 if (runtime_code) {
7224 assert(TAINTING_get || !TAINT_get);
7225 if (TAINT_get)
7226 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7227
7228 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7229 /* whoops, we have a non-utf8 pattern, whilst run-time code
7230 * got compiled as utf8. Try again with a utf8 pattern */
7231 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7232 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7233 goto redo_parse;
7234 }
7235 }
7236 assert(!pRExC_state->runtime_code_qr);
7237
7238 RExC_sawback = 0;
7239
7240 RExC_seen = 0;
7241 RExC_maxlen = 0;
7242 RExC_in_lookbehind = 0;
7243 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7244 RExC_extralen = 0;
7245#ifdef EBCDIC
7246 RExC_recode_x_to_native = 0;
7247#endif
7248 RExC_in_multi_char_class = 0;
7249
7250 /* First pass: determine size, legality. */
7251 RExC_pass1 = TRUE;
7252 RExC_parse = exp;
7253 RExC_start = RExC_copy_start_in_constructed = exp;
7254 RExC_end = exp + plen;
7255 RExC_precomp_end = RExC_end;
7256 RExC_naughty = 0;
7257 RExC_npar = 1;
7258 RExC_nestroot = 0;
7259 RExC_size = 0L;
7260 RExC_emit = 1;
7261 RExC_whilem_seen = 0;
7262 RExC_open_parens = 0;
7263 RExC_close_parens = 0;
7264 RExC_end_op = NULL;
7265 RExC_paren_names = NULL;
7266#ifdef DEBUGGING
7267 RExC_paren_name_list = NULL;
7268#endif
7269 RExC_recurse = NULL;
7270 RExC_study_chunk_recursed = NULL;
7271 RExC_study_chunk_recursed_bytes= 0;
7272 RExC_recurse_count = 0;
7273 pRExC_state->code_index = 0;
7274
7275 /* We allocate scratch space as large as the largest node, for use in the
7276 * first pass. Since many functions return RExC_emit on success, and '0'
7277 * if an error, RExC_emit must never be 0, so we set it to 1 and double
7278 * the scratch space */
7279 Newxc(RExC_emit_start, 2 * sizeof(regnode_ssc), char, regnode);
7280 SAVEFREEPV(RExC_emit_start);
7281
7282 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7283 * code makes sure the final byte is an uncounted NUL. But should this
7284 * ever not be the case, lots of things could read beyond the end of the
7285 * buffer: loops like
7286 * while(isFOO(*RExC_parse)) RExC_parse++;
7287 * strchr(RExC_parse, "foo");
7288 * etc. So it is worth noting. */
7289 assert(*RExC_end == '\0');
7290
7291 DEBUG_PARSE_r(
7292 Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
7293 RExC_lastnum=0;
7294 RExC_lastparse=NULL;
7295 );
7296
7297 if (reg(pRExC_state, 0, &flags, 1) == 0) {
7298 /* It's possible to write a regexp in ascii that represents Unicode
7299 codepoints outside of the byte range, such as via \x{100}. If we
7300 detect such a sequence we have to convert the entire pattern to utf8
7301 and then recompile, as our sizing calculation will have been based
7302 on 1 byte == 1 character, but we will need to use utf8 to encode
7303 at least some part of the pattern, and therefore must convert the whole
7304 thing.
7305 -- dmq */
7306 if (MUST_RESTART(flags)) {
7307 if (flags & NEED_UTF8) {
7308 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7309 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7310 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7311 }
7312 else {
7313 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7314 }
7315
7316 goto redo_parse;
7317 }
7318 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
7319 }
7320
7321 DEBUG_PARSE_r({
7322 Perl_re_printf( aTHX_
7323 "Required size %" IVdf " nodes\n"
7324 "Starting second pass (creation)\n",
7325 (IV)RExC_size);
7326 RExC_lastnum=0;
7327 RExC_lastparse=NULL;
7328 });
7329
7330 /* The first pass could have found things that force Unicode semantics */
7331 if ((RExC_utf8 || RExC_uni_semantics)
7332 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
7333 {
7334 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7335 }
7336
7337 /* Small enough for pointer-storage convention?
7338 If extralen==0, this means that we will not need long jumps. */
7339 if (RExC_size >= 0x10000L && RExC_extralen)
7340 RExC_size += RExC_extralen;
7341 else
7342 RExC_extralen = 0;
7343 if (RExC_whilem_seen > 15)
7344 RExC_whilem_seen = 15;
7345
7346 /* Allocate space and zero-initialize. Note, the two step process
7347 of zeroing when in debug mode, thus anything assigned has to
7348 happen after that */
7349 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7350 r = ReANY(Rx);
7351 Newxc(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7352 char, regexp_internal);
7353 if ( r == NULL || RExC_rxi == NULL )
7354 FAIL("Regexp out of space");
7355#ifdef DEBUGGING
7356 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
7357 Zero(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
7358 char);
7359#else
7360 /* bulk initialize base fields with 0. */
7361 Zero(RExC_rxi, sizeof(regexp_internal), char);
7362#endif
7363
7364 /* non-zero initialization begins here */
7365 RXi_SET( r, RExC_rxi );
7366 r->engine= eng;
7367 r->extflags = rx_flags;
7368 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7369
7370 if (pm_flags & PMf_IS_QR) {
7371 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7372 if (RExC_rxi->code_blocks)
7373 RExC_rxi->code_blocks->refcnt++;
7374 }
7375
7376 {
7377 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7378 bool has_charset = (get_regex_charset(r->extflags)
7379 != REGEX_DEPENDS_CHARSET);
7380
7381 /* The caret is output if there are any defaults: if not all the STD
7382 * flags are set, or if no character set specifier is needed */
7383 bool has_default =
7384 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7385 || ! has_charset);
7386 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7387 == REG_RUN_ON_COMMENT_SEEN);
7388 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
7389 >> RXf_PMf_STD_PMMOD_SHIFT);
7390 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7391 char *p;
7392
7393 /* We output all the necessary flags; we never output a minus, as all
7394 * those are defaults, so are
7395 * covered by the caret */
7396 const STRLEN wraplen = plen + has_p + has_runon
7397 + has_default /* If needs a caret */
7398 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7399
7400 /* If needs a character set specifier */
7401 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7402 + (sizeof("(?:)") - 1);
7403
7404 /* make sure PL_bitcount bounds not exceeded */
7405 assert(sizeof(STD_PAT_MODS) <= 8);
7406
7407 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7408 SvPOK_on(Rx);
7409 if (RExC_utf8)
7410 SvFLAGS(Rx) |= SVf_UTF8;
7411 *p++='('; *p++='?';
7412
7413 /* If a default, cover it using the caret */
7414 if (has_default) {
7415 *p++= DEFAULT_PAT_MOD;
7416 }
7417 if (has_charset) {
7418 STRLEN len;
7419 const char* const name = get_regex_charset_name(r->extflags, &len);
7420 Copy(name, p, len, char);
7421 p += len;
7422 }
7423 if (has_p)
7424 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7425 {
7426 char ch;
7427 while((ch = *fptr++)) {
7428 if(reganch & 1)
7429 *p++ = ch;
7430 reganch >>= 1;
7431 }
7432 }
7433
7434 *p++ = ':';
7435 Copy(RExC_precomp, p, plen, char);
7436 assert ((RX_WRAPPED(Rx) - p) < 16);
7437 r->pre_prefix = p - RX_WRAPPED(Rx);
7438 p += plen;
7439
7440 /* Adding a trailing \n causes this to compile properly:
7441 my $R = qr / A B C # D E/x; /($R)/
7442 Otherwise the parens are considered part of the comment */
7443 if (has_runon)
7444 *p++ = '\n';
7445 *p++ = ')';
7446 *p = 0;
7447 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7448 }
7449
7450 r->intflags = 0;
7451 RExC_total_parens = RExC_npar;
7452 r->nparens = RExC_total_parens - 1; /* set early to validate backrefs */
7453
7454 /* Useful during FAIL. */
7455#ifdef RE_TRACK_PATTERN_OFFSETS
7456 Newxz(RExC_offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7457 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7458 "%s %" UVuf " bytes for offset annotations.\n",
7459 RExC_offsets ? "Got" : "Couldn't get",
7460 (UV)((2*RExC_size+1) * sizeof(U32))));
7461#endif
7462 SetProgLen(RExC_rxi, RExC_size);
7463 RExC_rx_sv = Rx;
7464 RExC_rx = r;
7465
7466 /* Second pass: emit code. */
7467 RExC_pass1 = FALSE;
7468 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7469 RExC_pm_flags = pm_flags;
7470 RExC_parse = exp;
7471 RExC_end = exp + plen;
7472 RExC_naughty = 0;
7473 RExC_emit_start = RExC_rxi->program;
7474 RExC_emit = 1;
7475 RExC_emit_bound = RExC_rxi->program + RExC_size + 1;
7476 pRExC_state->code_index = 0;
7477
7478 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7479 /* setup various meta data about recursion, this all requires
7480 * RExC_npar to be correctly set, and a bit later on we clear it */
7481 if (RExC_seen & REG_RECURSE_SEEN) {
7482 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
7483 "%*s%*s Setting up open/close parens\n",
7484 22, "| |", (int)(0 * 2 + 1), ""));
7485
7486 /* setup RExC_open_parens, which holds the address of each
7487 * OPEN tag, and to make things simpler for the 0 index
7488 * the start of the program - this is used later for offsets */
7489 Newxz(RExC_open_parens, RExC_npar, regnode_offset);
7490 SAVEFREEPV(RExC_open_parens);
7491 RExC_open_parens[0] = RExC_emit;
7492
7493 /* setup RExC_close_parens, which holds the address of each
7494 * CLOSE tag, and to make things simpler for the 0 index
7495 * the end of the program - this is used later for offsets */
7496 Newxz(RExC_close_parens, RExC_npar, regnode_offset);
7497 SAVEFREEPV(RExC_close_parens);
7498 /* we dont know where end op starts yet, so we dont
7499 * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
7500
7501 /* Note, RExC_npar is 1 + the number of parens in a pattern.
7502 * So its 1 if there are no parens. */
7503 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7504 ((RExC_npar & 0x07) != 0);
7505 Newx(RExC_study_chunk_recursed,
7506 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7507 SAVEFREEPV(RExC_study_chunk_recursed);
7508 }
7509 RExC_npar = 1;
7510 if (reg(pRExC_state, 0, &flags, 1) == 0) {
7511 ReREFCNT_dec(Rx);
7512 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
7513 }
7514 DEBUG_OPTIMISE_r(
7515 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7516 );
7517
7518 /* XXXX To minimize changes to RE engine we always allocate
7519 3-units-long substrs field. */
7520 Newx(r->substrs, 1, struct reg_substr_data);
7521 if (RExC_recurse_count) {
7522 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7523 SAVEFREEPV(RExC_recurse);
7524 }
7525
7526 reStudy:
7527 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7528 DEBUG_r(
7529 RExC_study_chunk_recursed_count= 0;
7530 );
7531 Zero(r->substrs, 1, struct reg_substr_data);
7532 if (RExC_study_chunk_recursed) {
7533 Zero(RExC_study_chunk_recursed,
7534 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7535 }
7536
7537
7538#ifdef TRIE_STUDY_OPT
7539 if (!restudied) {
7540 StructCopy(&zero_scan_data, &data, scan_data_t);
7541 copyRExC_state = RExC_state;
7542 } else {
7543 U32 seen=RExC_seen;
7544 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7545
7546 RExC_state = copyRExC_state;
7547 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7548 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7549 else
7550 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7551 StructCopy(&zero_scan_data, &data, scan_data_t);
7552 }
7553#else
7554 StructCopy(&zero_scan_data, &data, scan_data_t);
7555#endif
7556
7557 /* Dig out information for optimizations. */
7558 r->extflags = RExC_flags; /* was pm_op */
7559 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7560
7561 if (UTF)
7562 SvUTF8_on(Rx); /* Unicode in it? */
7563 RExC_rxi->regstclass = NULL;
7564 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7565 r->intflags |= PREGf_NAUGHTY;
7566 scan = RExC_rxi->program + 1; /* First BRANCH. */
7567
7568 /* testing for BRANCH here tells us whether there is "must appear"
7569 data in the pattern. If there is then we can use it for optimisations */
7570 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7571 */
7572 SSize_t fake;
7573 STRLEN longest_length[2];
7574 regnode_ssc ch_class; /* pointed to by data */
7575 int stclass_flag;
7576 SSize_t last_close = 0; /* pointed to by data */
7577 regnode *first= scan;
7578 regnode *first_next= regnext(first);
7579 int i;
7580
7581 /*
7582 * Skip introductions and multiplicators >= 1
7583 * so that we can extract the 'meat' of the pattern that must
7584 * match in the large if() sequence following.
7585 * NOTE that EXACT is NOT covered here, as it is normally
7586 * picked up by the optimiser separately.
7587 *
7588 * This is unfortunate as the optimiser isnt handling lookahead
7589 * properly currently.
7590 *
7591 */
7592 while ((OP(first) == OPEN && (sawopen = 1)) ||
7593 /* An OR of *one* alternative - should not happen now. */
7594 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7595 /* for now we can't handle lookbehind IFMATCH*/
7596 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7597 (OP(first) == PLUS) ||
7598 (OP(first) == MINMOD) ||
7599 /* An {n,m} with n>0 */
7600 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7601 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7602 {
7603 /*
7604 * the only op that could be a regnode is PLUS, all the rest
7605 * will be regnode_1 or regnode_2.
7606 *
7607 * (yves doesn't think this is true)
7608 */
7609 if (OP(first) == PLUS)
7610 sawplus = 1;
7611 else {
7612 if (OP(first) == MINMOD)
7613 sawminmod = 1;
7614 first += regarglen[OP(first)];
7615 }
7616 first = NEXTOPER(first);
7617 first_next= regnext(first);
7618 }
7619
7620 /* Starting-point info. */
7621 again:
7622 DEBUG_PEEP("first:", first, 0, 0);
7623 /* Ignore EXACT as we deal with it later. */
7624 if (PL_regkind[OP(first)] == EXACT) {
7625 if (OP(first) == EXACT || OP(first) == EXACTL)
7626 NOOP; /* Empty, get anchored substr later. */
7627 else
7628 RExC_rxi->regstclass = first;
7629 }
7630#ifdef TRIE_STCLASS
7631 else if (PL_regkind[OP(first)] == TRIE &&
7632 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7633 {
7634 /* this can happen only on restudy */
7635 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7636 }
7637#endif
7638 else if (REGNODE_SIMPLE(OP(first)))
7639 RExC_rxi->regstclass = first;
7640 else if (PL_regkind[OP(first)] == BOUND ||
7641 PL_regkind[OP(first)] == NBOUND)
7642 RExC_rxi->regstclass = first;
7643 else if (PL_regkind[OP(first)] == BOL) {
7644 r->intflags |= (OP(first) == MBOL
7645 ? PREGf_ANCH_MBOL
7646 : PREGf_ANCH_SBOL);
7647 first = NEXTOPER(first);
7648 goto again;
7649 }
7650 else if (OP(first) == GPOS) {
7651 r->intflags |= PREGf_ANCH_GPOS;
7652 first = NEXTOPER(first);
7653 goto again;
7654 }
7655 else if ((!sawopen || !RExC_sawback) &&
7656 !sawlookahead &&
7657 (OP(first) == STAR &&
7658 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7659 !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7660 {
7661 /* turn .* into ^.* with an implied $*=1 */
7662 const int type =
7663 (OP(NEXTOPER(first)) == REG_ANY)
7664 ? PREGf_ANCH_MBOL
7665 : PREGf_ANCH_SBOL;
7666 r->intflags |= (type | PREGf_IMPLICIT);
7667 first = NEXTOPER(first);
7668 goto again;
7669 }
7670 if (sawplus && !sawminmod && !sawlookahead
7671 && (!sawopen || !RExC_sawback)
7672 && !pRExC_state->code_blocks) /* May examine pos and $& */
7673 /* x+ must match at the 1st pos of run of x's */
7674 r->intflags |= PREGf_SKIP;
7675
7676 /* Scan is after the zeroth branch, first is atomic matcher. */
7677#ifdef TRIE_STUDY_OPT
7678 DEBUG_PARSE_r(
7679 if (!restudied)
7680 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7681 (IV)(first - scan + 1))
7682 );
7683#else
7684 DEBUG_PARSE_r(
7685 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7686 (IV)(first - scan + 1))
7687 );
7688#endif
7689
7690
7691 /*
7692 * If there's something expensive in the r.e., find the
7693 * longest literal string that must appear and make it the
7694 * regmust. Resolve ties in favor of later strings, since
7695 * the regstart check works with the beginning of the r.e.
7696 * and avoiding duplication strengthens checking. Not a
7697 * strong reason, but sufficient in the absence of others.
7698 * [Now we resolve ties in favor of the earlier string if
7699 * it happens that c_offset_min has been invalidated, since the
7700 * earlier string may buy us something the later one won't.]
7701 */
7702
7703 data.substrs[0].str = newSVpvs("");
7704 data.substrs[1].str = newSVpvs("");
7705 data.last_found = newSVpvs("");
7706 data.cur_is_floating = 0; /* initially any found substring is fixed */
7707 ENTER_with_name("study_chunk");
7708 SAVEFREESV(data.substrs[0].str);
7709 SAVEFREESV(data.substrs[1].str);
7710 SAVEFREESV(data.last_found);
7711 first = scan;
7712 if (!RExC_rxi->regstclass) {
7713 ssc_init(pRExC_state, &ch_class);
7714 data.start_class = &ch_class;
7715 stclass_flag = SCF_DO_STCLASS_AND;
7716 } else /* XXXX Check for BOUND? */
7717 stclass_flag = 0;
7718 data.last_closep = &last_close;
7719
7720 DEBUG_RExC_seen();
7721 /*
7722 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7723 * (NO top level branches)
7724 */
7725 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7726 scan + RExC_size, /* Up to end */
7727 &data, -1, 0, NULL,
7728 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7729 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7730 0);
7731
7732
7733 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7734
7735
7736 if ( RExC_total_parens == 1 && !data.cur_is_floating
7737 && data.last_start_min == 0 && data.last_end > 0
7738 && !RExC_seen_zerolen
7739 && !(RExC_seen & REG_VERBARG_SEEN)
7740 && !(RExC_seen & REG_GPOS_SEEN)
7741 ){
7742 r->extflags |= RXf_CHECK_ALL;
7743 }
7744 scan_commit(pRExC_state, &data,&minlen, 0);
7745
7746
7747 /* XXX this is done in reverse order because that's the way the
7748 * code was before it was parameterised. Don't know whether it
7749 * actually needs doing in reverse order. DAPM */
7750 for (i = 1; i >= 0; i--) {
7751 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
7752
7753 if ( !( i
7754 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
7755 && data.substrs[0].min_offset
7756 == data.substrs[1].min_offset
7757 && SvCUR(data.substrs[0].str)
7758 == SvCUR(data.substrs[1].str)
7759 )
7760 && S_setup_longest (aTHX_ pRExC_state,
7761 &(r->substrs->data[i]),
7762 &(data.substrs[i]),
7763 longest_length[i]))
7764 {
7765 r->substrs->data[i].min_offset =
7766 data.substrs[i].min_offset - data.substrs[i].lookbehind;
7767
7768 r->substrs->data[i].max_offset = data.substrs[i].max_offset;
7769 /* Don't offset infinity */
7770 if (data.substrs[i].max_offset < SSize_t_MAX)
7771 r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
7772 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
7773 }
7774 else {
7775 r->substrs->data[i].substr = NULL;
7776 r->substrs->data[i].utf8_substr = NULL;
7777 longest_length[i] = 0;
7778 }
7779 }
7780
7781 LEAVE_with_name("study_chunk");
7782
7783 if (RExC_rxi->regstclass
7784 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
7785 RExC_rxi->regstclass = NULL;
7786
7787 if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
7788 || r->substrs->data[0].min_offset)
7789 && stclass_flag
7790 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7791 && is_ssc_worth_it(pRExC_state, data.start_class))
7792 {
7793 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7794
7795 ssc_finalize(pRExC_state, data.start_class);
7796
7797 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7798 StructCopy(data.start_class,
7799 (regnode_ssc*)RExC_rxi->data->data[n],
7800 regnode_ssc);
7801 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
7802 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7803 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7804 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7805 Perl_re_printf( aTHX_
7806 "synthetic stclass \"%s\".\n",
7807 SvPVX_const(sv));});
7808 data.start_class = NULL;
7809 }
7810
7811 /* A temporary algorithm prefers floated substr to fixed one of
7812 * same length to dig more info. */
7813 i = (longest_length[0] <= longest_length[1]);
7814 r->substrs->check_ix = i;
7815 r->check_end_shift = r->substrs->data[i].end_shift;
7816 r->check_substr = r->substrs->data[i].substr;
7817 r->check_utf8 = r->substrs->data[i].utf8_substr;
7818 r->check_offset_min = r->substrs->data[i].min_offset;
7819 r->check_offset_max = r->substrs->data[i].max_offset;
7820 if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
7821 r->intflags |= PREGf_NOSCAN;
7822
7823 if ((r->check_substr || r->check_utf8) ) {
7824 r->extflags |= RXf_USE_INTUIT;
7825 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7826 r->extflags |= RXf_INTUIT_TAIL;
7827 }
7828
7829 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7830 if ( (STRLEN)minlen < longest_length[1] )
7831 minlen= longest_length[1];
7832 if ( (STRLEN)minlen < longest_length[0] )
7833 minlen= longest_length[0];
7834 */
7835 }
7836 else {
7837 /* Several toplevels. Best we can is to set minlen. */
7838 SSize_t fake;
7839 regnode_ssc ch_class;
7840 SSize_t last_close = 0;
7841
7842 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
7843
7844 scan = RExC_rxi->program + 1;
7845 ssc_init(pRExC_state, &ch_class);
7846 data.start_class = &ch_class;
7847 data.last_closep = &last_close;
7848
7849 DEBUG_RExC_seen();
7850 /*
7851 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
7852 * (patterns WITH top level branches)
7853 */
7854 minlen = study_chunk(pRExC_state,
7855 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7856 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7857 ? SCF_TRIE_DOING_RESTUDY
7858 : 0),
7859 0);
7860
7861 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7862
7863 r->check_substr = NULL;
7864 r->check_utf8 = NULL;
7865 r->substrs->data[0].substr = NULL;
7866 r->substrs->data[0].utf8_substr = NULL;
7867 r->substrs->data[1].substr = NULL;
7868 r->substrs->data[1].utf8_substr = NULL;
7869
7870 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7871 && is_ssc_worth_it(pRExC_state, data.start_class))
7872 {
7873 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7874
7875 ssc_finalize(pRExC_state, data.start_class);
7876
7877 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7878 StructCopy(data.start_class,
7879 (regnode_ssc*)RExC_rxi->data->data[n],
7880 regnode_ssc);
7881 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
7882 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7883 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7884 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7885 Perl_re_printf( aTHX_
7886 "synthetic stclass \"%s\".\n",
7887 SvPVX_const(sv));});
7888 data.start_class = NULL;
7889 }
7890 }
7891
7892 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7893 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7894 r->maxlen = REG_INFTY;
7895 }
7896 else {
7897 r->maxlen = RExC_maxlen;
7898 }
7899
7900 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7901 the "real" pattern. */
7902 DEBUG_OPTIMISE_r({
7903 Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
7904 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7905 });
7906 r->minlenret = minlen;
7907 if (r->minlen < minlen)
7908 r->minlen = minlen;
7909
7910 if (RExC_seen & REG_RECURSE_SEEN ) {
7911 r->intflags |= PREGf_RECURSE_SEEN;
7912 Newx(r->recurse_locinput, r->nparens + 1, char *);
7913 }
7914 if (RExC_seen & REG_GPOS_SEEN)
7915 r->intflags |= PREGf_GPOS_SEEN;
7916 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7917 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7918 lookbehind */
7919 if (pRExC_state->code_blocks)
7920 r->extflags |= RXf_EVAL_SEEN;
7921 if (RExC_seen & REG_VERBARG_SEEN)
7922 {
7923 r->intflags |= PREGf_VERBARG_SEEN;
7924 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7925 }
7926 if (RExC_seen & REG_CUTGROUP_SEEN)
7927 r->intflags |= PREGf_CUTGROUP_SEEN;
7928 if (pm_flags & PMf_USE_RE_EVAL)
7929 r->intflags |= PREGf_USE_RE_EVAL;
7930 if (RExC_paren_names)
7931 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7932 else
7933 RXp_PAREN_NAMES(r) = NULL;
7934
7935 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7936 * so it can be used in pp.c */
7937 if (r->intflags & PREGf_ANCH)
7938 r->extflags |= RXf_IS_ANCHORED;
7939
7940
7941 {
7942 /* this is used to identify "special" patterns that might result
7943 * in Perl NOT calling the regex engine and instead doing the match "itself",
7944 * particularly special cases in split//. By having the regex compiler
7945 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7946 * we avoid weird issues with equivalent patterns resulting in different behavior,
7947 * AND we allow non Perl engines to get the same optimizations by the setting the
7948 * flags appropriately - Yves */
7949 regnode *first = RExC_rxi->program + 1;
7950 U8 fop = OP(first);
7951 regnode *next = regnext(first);
7952 U8 nop = OP(next);
7953
7954 if (PL_regkind[fop] == NOTHING && nop == END)
7955 r->extflags |= RXf_NULL;
7956 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7957 /* when fop is SBOL first->flags will be true only when it was
7958 * produced by parsing /\A/, and not when parsing /^/. This is
7959 * very important for the split code as there we want to
7960 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7961 * See rt #122761 for more details. -- Yves */
7962 r->extflags |= RXf_START_ONLY;
7963 else if (fop == PLUS
7964 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7965 && nop == END)
7966 r->extflags |= RXf_WHITE;
7967 else if ( r->extflags & RXf_SPLIT
7968 && (fop == EXACT || fop == EXACTL)
7969 && STR_LEN(first) == 1
7970 && *(STRING(first)) == ' '
7971 && nop == END )
7972 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7973
7974 }
7975
7976 if (RExC_contains_locale) {
7977 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7978 }
7979
7980#ifdef DEBUGGING
7981 if (RExC_paren_names) {
7982 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7983 RExC_rxi->data->data[RExC_rxi->name_list_idx]
7984 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7985 } else
7986#endif
7987 RExC_rxi->name_list_idx = 0;
7988
7989 while ( RExC_recurse_count > 0 ) {
7990 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
7991 /*
7992 * This data structure is set up in study_chunk() and is used
7993 * to calculate the distance between a GOSUB regopcode and
7994 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
7995 * it refers to.
7996 *
7997 * If for some reason someone writes code that optimises
7998 * away a GOSUB opcode then the assert should be changed to
7999 * an if(scan) to guard the ARG2L_SET() - Yves
8000 *
8001 */
8002 assert(scan && OP(scan) == GOSUB);
8003 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8004 }
8005
8006 Newxz(r->offs, RExC_total_parens, regexp_paren_pair);
8007 /* assume we don't need to swap parens around before we match */
8008 DEBUG_TEST_r({
8009 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8010 (unsigned long)RExC_study_chunk_recursed_count);
8011 });
8012 DEBUG_DUMP_r({
8013 DEBUG_RExC_seen();
8014 Perl_re_printf( aTHX_ "Final program:\n");
8015 regdump(r);
8016 });
8017#ifdef RE_TRACK_PATTERN_OFFSETS
8018 DEBUG_OFFSETS_r(if (RExC_offsets) {
8019 const STRLEN len = RExC_offsets[0];
8020 STRLEN i;
8021 GET_RE_DEBUG_FLAGS_DECL;
8022 Perl_re_printf( aTHX_
8023 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
8024 for (i = 1; i <= len; i++) {
8025 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
8026 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
8027 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
8028 }
8029 Perl_re_printf( aTHX_ "\n");
8030 });
8031#endif
8032
8033#ifdef USE_ITHREADS
8034 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8035 * by setting the regexp SV to readonly-only instead. If the
8036 * pattern's been recompiled, the USEDness should remain. */
8037 if (old_re && SvREADONLY(old_re))
8038 SvREADONLY_on(Rx);
8039#endif
8040 return Rx;
8041}
8042
8043
8044SV*
8045Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8046 const U32 flags)
8047{
8048 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8049
8050 PERL_UNUSED_ARG(value);
8051
8052 if (flags & RXapif_FETCH) {
8053 return reg_named_buff_fetch(rx, key, flags);
8054 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8055 Perl_croak_no_modify();
8056 return NULL;
8057 } else if (flags & RXapif_EXISTS) {
8058 return reg_named_buff_exists(rx, key, flags)
8059 ? &PL_sv_yes
8060 : &PL_sv_no;
8061 } else if (flags & RXapif_REGNAMES) {
8062 return reg_named_buff_all(rx, flags);
8063 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8064 return reg_named_buff_scalar(rx, flags);
8065 } else {
8066 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8067 return NULL;
8068 }
8069}
8070
8071SV*
8072Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8073 const U32 flags)
8074{
8075 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8076 PERL_UNUSED_ARG(lastkey);
8077
8078 if (flags & RXapif_FIRSTKEY)
8079 return reg_named_buff_firstkey(rx, flags);
8080 else if (flags & RXapif_NEXTKEY)
8081 return reg_named_buff_nextkey(rx, flags);
8082 else {
8083 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8084 (int)flags);
8085 return NULL;
8086 }
8087}
8088
8089SV*
8090Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8091 const U32 flags)
8092{
8093 SV *ret;
8094 struct regexp *const rx = ReANY(r);
8095
8096 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8097
8098 if (rx && RXp_PAREN_NAMES(rx)) {
8099 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8100 if (he_str) {
8101 IV i;
8102 SV* sv_dat=HeVAL(he_str);
8103 I32 *nums=(I32*)SvPVX(sv_dat);
8104 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8105 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8106 if ((I32)(rx->nparens) >= nums[i]
8107 && rx->offs[nums[i]].start != -1
8108 && rx->offs[nums[i]].end != -1)
8109 {
8110 ret = newSVpvs("");
8111 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8112 if (!retarray)
8113 return ret;
8114 } else {
8115 if (retarray)
8116 ret = newSVsv(&PL_sv_undef);
8117 }
8118 if (retarray)
8119 av_push(retarray, ret);
8120 }
8121 if (retarray)
8122 return newRV_noinc(MUTABLE_SV(retarray));
8123 }
8124 }
8125 return NULL;
8126}
8127
8128bool
8129Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8130 const U32 flags)
8131{
8132 struct regexp *const rx = ReANY(r);
8133
8134 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8135
8136 if (rx && RXp_PAREN_NAMES(rx)) {
8137 if (flags & RXapif_ALL) {
8138 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8139 } else {
8140 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8141 if (sv) {
8142 SvREFCNT_dec_NN(sv);
8143 return TRUE;
8144 } else {
8145 return FALSE;
8146 }
8147 }
8148 } else {
8149 return FALSE;
8150 }
8151}
8152
8153SV*
8154Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8155{
8156 struct regexp *const rx = ReANY(r);
8157
8158 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8159
8160 if ( rx && RXp_PAREN_NAMES(rx) ) {
8161 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8162
8163 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8164 } else {
8165 return FALSE;
8166 }
8167}
8168
8169SV*
8170Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8171{
8172 struct regexp *const rx = ReANY(r);
8173 GET_RE_DEBUG_FLAGS_DECL;
8174
8175 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8176
8177 if (rx && RXp_PAREN_NAMES(rx)) {
8178 HV *hv = RXp_PAREN_NAMES(rx);
8179 HE *temphe;
8180 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8181 IV i;
8182 IV parno = 0;
8183 SV* sv_dat = HeVAL(temphe);
8184 I32 *nums = (I32*)SvPVX(sv_dat);
8185 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8186 if ((I32)(rx->lastparen) >= nums[i] &&
8187 rx->offs[nums[i]].start != -1 &&
8188 rx->offs[nums[i]].end != -1)
8189 {
8190 parno = nums[i];
8191 break;
8192 }
8193 }
8194 if (parno || flags & RXapif_ALL) {
8195 return newSVhek(HeKEY_hek(temphe));
8196 }
8197 }
8198 }
8199 return NULL;
8200}
8201
8202SV*
8203Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8204{
8205 SV *ret;
8206 AV *av;
8207 SSize_t length;
8208 struct regexp *const rx = ReANY(r);
8209
8210 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8211
8212 if (rx && RXp_PAREN_NAMES(rx)) {
8213 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8214 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8215 } else if (flags & RXapif_ONE) {
8216 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8217 av = MUTABLE_AV(SvRV(ret));
8218 length = av_tindex(av);
8219 SvREFCNT_dec_NN(ret);
8220 return newSViv(length + 1);
8221 } else {
8222 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8223 (int)flags);
8224 return NULL;
8225 }
8226 }
8227 return &PL_sv_undef;
8228}
8229
8230SV*
8231Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8232{
8233 struct regexp *const rx = ReANY(r);
8234 AV *av = newAV();
8235
8236 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8237
8238 if (rx && RXp_PAREN_NAMES(rx)) {
8239 HV *hv= RXp_PAREN_NAMES(rx);
8240 HE *temphe;
8241 (void)hv_iterinit(hv);
8242 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8243 IV i;
8244 IV parno = 0;
8245 SV* sv_dat = HeVAL(temphe);
8246 I32 *nums = (I32*)SvPVX(sv_dat);
8247 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8248 if ((I32)(rx->lastparen) >= nums[i] &&
8249 rx->offs[nums[i]].start != -1 &&
8250 rx->offs[nums[i]].end != -1)
8251 {
8252 parno = nums[i];
8253 break;
8254 }
8255 }
8256 if (parno || flags & RXapif_ALL) {
8257 av_push(av, newSVhek(HeKEY_hek(temphe)));
8258 }
8259 }
8260 }
8261
8262 return newRV_noinc(MUTABLE_SV(av));
8263}
8264
8265void
8266Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8267 SV * const sv)
8268{
8269 struct regexp *const rx = ReANY(r);
8270 char *s = NULL;
8271 SSize_t i = 0;
8272 SSize_t s1, t1;
8273 I32 n = paren;
8274
8275 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8276
8277 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8278 || n == RX_BUFF_IDX_CARET_FULLMATCH
8279 || n == RX_BUFF_IDX_CARET_POSTMATCH
8280 )
8281 {
8282 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8283 if (!keepcopy) {
8284 /* on something like
8285 * $r = qr/.../;
8286 * /$qr/p;
8287 * the KEEPCOPY is set on the PMOP rather than the regex */
8288 if (PL_curpm && r == PM_GETRE(PL_curpm))
8289 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8290 }
8291 if (!keepcopy)
8292 goto ret_undef;
8293 }
8294
8295 if (!rx->subbeg)
8296 goto ret_undef;
8297
8298 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8299 /* no need to distinguish between them any more */
8300 n = RX_BUFF_IDX_FULLMATCH;
8301
8302 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8303 && rx->offs[0].start != -1)
8304 {
8305 /* $`, ${^PREMATCH} */
8306 i = rx->offs[0].start;
8307 s = rx->subbeg;
8308 }
8309 else
8310 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8311 && rx->offs[0].end != -1)
8312 {
8313 /* $', ${^POSTMATCH} */
8314 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8315 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8316 }
8317 else
8318 if ( 0 <= n && n <= (I32)rx->nparens &&
8319 (s1 = rx->offs[n].start) != -1 &&
8320 (t1 = rx->offs[n].end) != -1)
8321 {
8322 /* $&, ${^MATCH}, $1 ... */
8323 i = t1 - s1;
8324 s = rx->subbeg + s1 - rx->suboffset;
8325 } else {
8326 goto ret_undef;
8327 }
8328
8329 assert(s >= rx->subbeg);
8330 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8331 if (i >= 0) {
8332#ifdef NO_TAINT_SUPPORT
8333 sv_setpvn(sv, s, i);
8334#else
8335 const int oldtainted = TAINT_get;
8336 TAINT_NOT;
8337 sv_setpvn(sv, s, i);
8338 TAINT_set(oldtainted);
8339#endif
8340 if (RXp_MATCH_UTF8(rx))
8341 SvUTF8_on(sv);
8342 else
8343 SvUTF8_off(sv);
8344 if (TAINTING_get) {
8345 if (RXp_MATCH_TAINTED(rx)) {
8346 if (SvTYPE(sv) >= SVt_PVMG) {
8347 MAGIC* const mg = SvMAGIC(sv);
8348 MAGIC* mgt;
8349 TAINT;
8350 SvMAGIC_set(sv, mg->mg_moremagic);
8351 SvTAINT(sv);
8352 if ((mgt = SvMAGIC(sv))) {
8353 mg->mg_moremagic = mgt;
8354 SvMAGIC_set(sv, mg);
8355 }
8356 } else {
8357 TAINT;
8358 SvTAINT(sv);
8359 }
8360 } else
8361 SvTAINTED_off(sv);
8362 }
8363 } else {
8364 ret_undef:
8365 sv_set_undef(sv);
8366 return;
8367 }
8368}
8369
8370void
8371Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8372 SV const * const value)
8373{
8374 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8375
8376 PERL_UNUSED_ARG(rx);
8377 PERL_UNUSED_ARG(paren);
8378 PERL_UNUSED_ARG(value);
8379
8380 if (!PL_localizing)
8381 Perl_croak_no_modify();
8382}
8383
8384I32
8385Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8386 const I32 paren)
8387{
8388 struct regexp *const rx = ReANY(r);
8389 I32 i;
8390 I32 s1, t1;
8391
8392 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8393
8394 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8395 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8396 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8397 )
8398 {
8399 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8400 if (!keepcopy) {
8401 /* on something like
8402 * $r = qr/.../;
8403 * /$qr/p;
8404 * the KEEPCOPY is set on the PMOP rather than the regex */
8405 if (PL_curpm && r == PM_GETRE(PL_curpm))
8406 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8407 }
8408 if (!keepcopy)
8409 goto warn_undef;
8410 }
8411
8412 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8413 switch (paren) {
8414 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8415 case RX_BUFF_IDX_PREMATCH: /* $` */
8416 if (rx->offs[0].start != -1) {
8417 i = rx->offs[0].start;
8418 if (i > 0) {
8419 s1 = 0;
8420 t1 = i;
8421 goto getlen;
8422 }
8423 }
8424 return 0;
8425
8426 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8427 case RX_BUFF_IDX_POSTMATCH: /* $' */
8428 if (rx->offs[0].end != -1) {
8429 i = rx->sublen - rx->offs[0].end;
8430 if (i > 0) {
8431 s1 = rx->offs[0].end;
8432 t1 = rx->sublen;
8433 goto getlen;
8434 }
8435 }
8436 return 0;
8437
8438 default: /* $& / ${^MATCH}, $1, $2, ... */
8439 if (paren <= (I32)rx->nparens &&
8440 (s1 = rx->offs[paren].start) != -1 &&
8441 (t1 = rx->offs[paren].end) != -1)
8442 {
8443 i = t1 - s1;
8444 goto getlen;
8445 } else {
8446 warn_undef:
8447 if (ckWARN(WARN_UNINITIALIZED))
8448 report_uninit((const SV *)sv);
8449 return 0;
8450 }
8451 }
8452 getlen:
8453 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8454 const char * const s = rx->subbeg - rx->suboffset + s1;
8455 const U8 *ep;
8456 STRLEN el;
8457
8458 i = t1 - s1;
8459 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8460 i = el;
8461 }
8462 return i;
8463}
8464
8465SV*
8466Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8467{
8468 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8469 PERL_UNUSED_ARG(rx);
8470 if (0)
8471 return NULL;
8472 else
8473 return newSVpvs("Regexp");
8474}
8475
8476/* Scans the name of a named buffer from the pattern.
8477 * If flags is REG_RSN_RETURN_NULL returns null.
8478 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8479 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8480 * to the parsed name as looked up in the RExC_paren_names hash.
8481 * If there is an error throws a vFAIL().. type exception.
8482 */
8483
8484#define REG_RSN_RETURN_NULL 0
8485#define REG_RSN_RETURN_NAME 1
8486#define REG_RSN_RETURN_DATA 2
8487
8488STATIC SV*
8489S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8490{
8491 char *name_start = RExC_parse;
8492 SV* sv_name;
8493
8494 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8495
8496 assert (RExC_parse <= RExC_end);
8497 if (RExC_parse == RExC_end) NOOP;
8498 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8499 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8500 * using do...while */
8501 if (UTF)
8502 do {
8503 RExC_parse += UTF8SKIP(RExC_parse);
8504 } while ( RExC_parse < RExC_end
8505 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8506 else
8507 do {
8508 RExC_parse++;
8509 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8510 } else {
8511 RExC_parse++; /* so the <- from the vFAIL is after the offending
8512 character */
8513 vFAIL("Group name must start with a non-digit word character");
8514 }
8515 if ( flags ) {
8516 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8517 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8518 if ( flags == REG_RSN_RETURN_NAME)
8519 return sv_name;
8520 else if (flags==REG_RSN_RETURN_DATA) {
8521 HE *he_str = NULL;
8522 SV *sv_dat = NULL;
8523 if ( ! sv_name ) /* should not happen*/
8524 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8525 if (RExC_paren_names)
8526 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8527 if ( he_str )
8528 sv_dat = HeVAL(he_str);
8529 if ( ! sv_dat ) /* Didn't find group */
8530 vFAIL("Reference to nonexistent named group");
8531 return sv_dat;
8532 }
8533 else {
8534 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8535 (unsigned long) flags);
8536 }
8537 NOT_REACHED; /* NOTREACHED */
8538 }
8539 return NULL;
8540}
8541
8542#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8543 int num; \
8544 if (RExC_lastparse!=RExC_parse) { \
8545 Perl_re_printf( aTHX_ "%s", \
8546 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8547 RExC_end - RExC_parse, 16, \
8548 "", "", \
8549 PERL_PV_ESCAPE_UNI_DETECT | \
8550 PERL_PV_PRETTY_ELLIPSES | \
8551 PERL_PV_PRETTY_LTGT | \
8552 PERL_PV_ESCAPE_RE | \
8553 PERL_PV_PRETTY_EXACTSIZE \
8554 ) \
8555 ); \
8556 } else \
8557 Perl_re_printf( aTHX_ "%16s",""); \
8558 \
8559 if (SIZE_ONLY) \
8560 num = RExC_size + 1; \
8561 else \
8562 num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
8563 if (RExC_lastnum!=num) \
8564 Perl_re_printf( aTHX_ "|%4d", num); \
8565 else \
8566 Perl_re_printf( aTHX_ "|%4s",""); \
8567 Perl_re_printf( aTHX_ "|%*s%-4s", \
8568 (int)((depth*2)), "", \
8569 (funcname) \
8570 ); \
8571 RExC_lastnum=num; \
8572 RExC_lastparse=RExC_parse; \
8573})
8574
8575
8576
8577#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8578 DEBUG_PARSE_MSG((funcname)); \
8579 Perl_re_printf( aTHX_ "%4s","\n"); \
8580})
8581#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8582 DEBUG_PARSE_MSG((funcname)); \
8583 Perl_re_printf( aTHX_ fmt "\n",args); \
8584})
8585
8586/* This section of code defines the inversion list object and its methods. The
8587 * interfaces are highly subject to change, so as much as possible is static to
8588 * this file. An inversion list is here implemented as a malloc'd C UV array
8589 * as an SVt_INVLIST scalar.
8590 *
8591 * An inversion list for Unicode is an array of code points, sorted by ordinal
8592 * number. Each element gives the code point that begins a range that extends
8593 * up-to but not including the code point given by the next element. The final
8594 * element gives the first code point of a range that extends to the platform's
8595 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
8596 * ...) give ranges whose code points are all in the inversion list. We say
8597 * that those ranges are in the set. The odd-numbered elements give ranges
8598 * whose code points are not in the inversion list, and hence not in the set.
8599 * Thus, element [0] is the first code point in the list. Element [1]
8600 * is the first code point beyond that not in the list; and element [2] is the
8601 * first code point beyond that that is in the list. In other words, the first
8602 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8603 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
8604 * all code points in that range are not in the inversion list. The third
8605 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8606 * list, and so forth. Thus every element whose index is divisible by two
8607 * gives the beginning of a range that is in the list, and every element whose
8608 * index is not divisible by two gives the beginning of a range not in the
8609 * list. If the final element's index is divisible by two, the inversion list
8610 * extends to the platform's infinity; otherwise the highest code point in the
8611 * inversion list is the contents of that element minus 1.
8612 *
8613 * A range that contains just a single code point N will look like
8614 * invlist[i] == N
8615 * invlist[i+1] == N+1
8616 *
8617 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8618 * impossible to represent, so element [i+1] is omitted. The single element
8619 * inversion list
8620 * invlist[0] == UV_MAX
8621 * contains just UV_MAX, but is interpreted as matching to infinity.
8622 *
8623 * Taking the complement (inverting) an inversion list is quite simple, if the
8624 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8625 * This implementation reserves an element at the beginning of each inversion
8626 * list to always contain 0; there is an additional flag in the header which
8627 * indicates if the list begins at the 0, or is offset to begin at the next
8628 * element. This means that the inversion list can be inverted without any
8629 * copying; just flip the flag.
8630 *
8631 * More about inversion lists can be found in "Unicode Demystified"
8632 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8633 *
8634 * The inversion list data structure is currently implemented as an SV pointing
8635 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8636 * array of UV whose memory management is automatically handled by the existing
8637 * facilities for SV's.
8638 *
8639 * Some of the methods should always be private to the implementation, and some
8640 * should eventually be made public */
8641
8642/* The header definitions are in F<invlist_inline.h> */
8643
8644#ifndef PERL_IN_XSUB_RE
8645
8646PERL_STATIC_INLINE UV*
8647S__invlist_array_init(SV* const invlist, const bool will_have_0)
8648{
8649 /* Returns a pointer to the first element in the inversion list's array.
8650 * This is called upon initialization of an inversion list. Where the
8651 * array begins depends on whether the list has the code point U+0000 in it
8652 * or not. The other parameter tells it whether the code that follows this
8653 * call is about to put a 0 in the inversion list or not. The first
8654 * element is either the element reserved for 0, if TRUE, or the element
8655 * after it, if FALSE */
8656
8657 bool* offset = get_invlist_offset_addr(invlist);
8658 UV* zero_addr = (UV *) SvPVX(invlist);
8659
8660 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8661
8662 /* Must be empty */
8663 assert(! _invlist_len(invlist));
8664
8665 *zero_addr = 0;
8666
8667 /* 1^1 = 0; 1^0 = 1 */
8668 *offset = 1 ^ will_have_0;
8669 return zero_addr + *offset;
8670}
8671
8672PERL_STATIC_INLINE void
8673S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8674{
8675 /* Sets the current number of elements stored in the inversion list.
8676 * Updates SvCUR correspondingly */
8677 PERL_UNUSED_CONTEXT;
8678 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8679
8680 assert(is_invlist(invlist));
8681
8682 SvCUR_set(invlist,
8683 (len == 0)
8684 ? 0
8685 : TO_INTERNAL_SIZE(len + offset));
8686 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8687}
8688
8689STATIC void
8690S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8691{
8692 /* Replaces the inversion list in 'dest' with the one from 'src'. It
8693 * steals the list from 'src', so 'src' is made to have a NULL list. This
8694 * is similar to what SvSetMagicSV() would do, if it were implemented on
8695 * inversion lists, though this routine avoids a copy */
8696
8697 const UV src_len = _invlist_len(src);
8698 const bool src_offset = *get_invlist_offset_addr(src);
8699 const STRLEN src_byte_len = SvLEN(src);
8700 char * array = SvPVX(src);
8701
8702 const int oldtainted = TAINT_get;
8703
8704 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8705
8706 assert(is_invlist(src));
8707 assert(is_invlist(dest));
8708 assert(! invlist_is_iterating(src));
8709 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8710
8711 /* Make sure it ends in the right place with a NUL, as our inversion list
8712 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8713 * asserts it */
8714 array[src_byte_len - 1] = '\0';
8715
8716 TAINT_NOT; /* Otherwise it breaks */
8717 sv_usepvn_flags(dest,
8718 (char *) array,
8719 src_byte_len - 1,
8720
8721 /* This flag is documented to cause a copy to be avoided */
8722 SV_HAS_TRAILING_NUL);
8723 TAINT_set(oldtainted);
8724 SvPV_set(src, 0);
8725 SvLEN_set(src, 0);
8726 SvCUR_set(src, 0);
8727
8728 /* Finish up copying over the other fields in an inversion list */
8729 *get_invlist_offset_addr(dest) = src_offset;
8730 invlist_set_len(dest, src_len, src_offset);
8731 *get_invlist_previous_index_addr(dest) = 0;
8732 invlist_iterfinish(dest);
8733}
8734
8735PERL_STATIC_INLINE IV*
8736S_get_invlist_previous_index_addr(SV* invlist)
8737{
8738 /* Return the address of the IV that is reserved to hold the cached index
8739 * */
8740 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8741
8742 assert(is_invlist(invlist));
8743
8744 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8745}
8746
8747PERL_STATIC_INLINE IV
8748S_invlist_previous_index(SV* const invlist)
8749{
8750 /* Returns cached index of previous search */
8751
8752 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8753
8754 return *get_invlist_previous_index_addr(invlist);
8755}
8756
8757PERL_STATIC_INLINE void
8758S_invlist_set_previous_index(SV* const invlist, const IV index)
8759{
8760 /* Caches <index> for later retrieval */
8761
8762 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8763
8764 assert(index == 0 || index < (int) _invlist_len(invlist));
8765
8766 *get_invlist_previous_index_addr(invlist) = index;
8767}
8768
8769PERL_STATIC_INLINE void
8770S_invlist_trim(SV* invlist)
8771{
8772 /* Free the not currently-being-used space in an inversion list */
8773
8774 /* But don't free up the space needed for the 0 UV that is always at the
8775 * beginning of the list, nor the trailing NUL */
8776 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8777
8778 PERL_ARGS_ASSERT_INVLIST_TRIM;
8779
8780 assert(is_invlist(invlist));
8781
8782 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8783}
8784
8785PERL_STATIC_INLINE void
8786S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
8787{
8788 PERL_ARGS_ASSERT_INVLIST_CLEAR;
8789
8790 assert(is_invlist(invlist));
8791
8792 invlist_set_len(invlist, 0, 0);
8793 invlist_trim(invlist);
8794}
8795
8796#endif /* ifndef PERL_IN_XSUB_RE */
8797
8798PERL_STATIC_INLINE bool
8799S_invlist_is_iterating(SV* const invlist)
8800{
8801 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8802
8803 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8804}
8805
8806#ifndef PERL_IN_XSUB_RE
8807
8808PERL_STATIC_INLINE UV
8809S_invlist_max(SV* const invlist)
8810{
8811 /* Returns the maximum number of elements storable in the inversion list's
8812 * array, without having to realloc() */
8813
8814 PERL_ARGS_ASSERT_INVLIST_MAX;
8815
8816 assert(is_invlist(invlist));
8817
8818 /* Assumes worst case, in which the 0 element is not counted in the
8819 * inversion list, so subtracts 1 for that */
8820 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8821 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8822 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8823}
8824
8825STATIC void
8826S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
8827{
8828 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
8829
8830 /* First 1 is in case the zero element isn't in the list; second 1 is for
8831 * trailing NUL */
8832 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8833 invlist_set_len(invlist, 0, 0);
8834
8835 /* Force iterinit() to be used to get iteration to work */
8836 invlist_iterfinish(invlist);
8837
8838 *get_invlist_previous_index_addr(invlist) = 0;
8839}
8840
8841SV*
8842Perl__new_invlist(pTHX_ IV initial_size)
8843{
8844
8845 /* Return a pointer to a newly constructed inversion list, with enough
8846 * space to store 'initial_size' elements. If that number is negative, a
8847 * system default is used instead */
8848
8849 SV* new_list;
8850
8851 if (initial_size < 0) {
8852 initial_size = 10;
8853 }
8854
8855 /* Allocate the initial space */
8856 new_list = newSV_type(SVt_INVLIST);
8857
8858 initialize_invlist_guts(new_list, initial_size);
8859
8860 return new_list;
8861}
8862
8863SV*
8864Perl__new_invlist_C_array(pTHX_ const UV* const list)
8865{
8866 /* Return a pointer to a newly constructed inversion list, initialized to
8867 * point to <list>, which has to be in the exact correct inversion list
8868 * form, including internal fields. Thus this is a dangerous routine that
8869 * should not be used in the wrong hands. The passed in 'list' contains
8870 * several header fields at the beginning that are not part of the
8871 * inversion list body proper */
8872
8873 const STRLEN length = (STRLEN) list[0];
8874 const UV version_id = list[1];
8875 const bool offset = cBOOL(list[2]);
8876#define HEADER_LENGTH 3
8877 /* If any of the above changes in any way, you must change HEADER_LENGTH
8878 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8879 * perl -E 'say int(rand 2**31-1)'
8880 */
8881#define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8882 data structure type, so that one being
8883 passed in can be validated to be an
8884 inversion list of the correct vintage.
8885 */
8886
8887 SV* invlist = newSV_type(SVt_INVLIST);
8888
8889 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8890
8891 if (version_id != INVLIST_VERSION_ID) {
8892 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8893 }
8894
8895 /* The generated array passed in includes header elements that aren't part
8896 * of the list proper, so start it just after them */
8897 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8898
8899 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8900 shouldn't touch it */
8901
8902 *(get_invlist_offset_addr(invlist)) = offset;
8903
8904 /* The 'length' passed to us is the physical number of elements in the
8905 * inversion list. But if there is an offset the logical number is one
8906 * less than that */
8907 invlist_set_len(invlist, length - offset, offset);
8908
8909 invlist_set_previous_index(invlist, 0);
8910
8911 /* Initialize the iteration pointer. */
8912 invlist_iterfinish(invlist);
8913
8914 SvREADONLY_on(invlist);
8915
8916 return invlist;
8917}
8918
8919STATIC void
8920S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8921{
8922 /* Grow the maximum size of an inversion list */
8923
8924 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8925
8926 assert(is_invlist(invlist));
8927
8928 /* Add one to account for the zero element at the beginning which may not
8929 * be counted by the calling parameters */
8930 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8931}
8932
8933STATIC void
8934S__append_range_to_invlist(pTHX_ SV* const invlist,
8935 const UV start, const UV end)
8936{
8937 /* Subject to change or removal. Append the range from 'start' to 'end' at
8938 * the end of the inversion list. The range must be above any existing
8939 * ones. */
8940
8941 UV* array;
8942 UV max = invlist_max(invlist);
8943 UV len = _invlist_len(invlist);
8944 bool offset;
8945
8946 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8947
8948 if (len == 0) { /* Empty lists must be initialized */
8949 offset = start != 0;
8950 array = _invlist_array_init(invlist, ! offset);
8951 }
8952 else {
8953 /* Here, the existing list is non-empty. The current max entry in the
8954 * list is generally the first value not in the set, except when the
8955 * set extends to the end of permissible values, in which case it is
8956 * the first entry in that final set, and so this call is an attempt to
8957 * append out-of-order */
8958
8959 UV final_element = len - 1;
8960 array = invlist_array(invlist);
8961 if ( array[final_element] > start
8962 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8963 {
8964 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
8965 array[final_element], start,
8966 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8967 }
8968
8969 /* Here, it is a legal append. If the new range begins 1 above the end
8970 * of the range below it, it is extending the range below it, so the
8971 * new first value not in the set is one greater than the newly
8972 * extended range. */
8973 offset = *get_invlist_offset_addr(invlist);
8974 if (array[final_element] == start) {
8975 if (end != UV_MAX) {
8976 array[final_element] = end + 1;
8977 }
8978 else {
8979 /* But if the end is the maximum representable on the machine,
8980 * assume that infinity was actually what was meant. Just let
8981 * the range that this would extend to have no end */
8982 invlist_set_len(invlist, len - 1, offset);
8983 }
8984 return;
8985 }
8986 }
8987
8988 /* Here the new range doesn't extend any existing set. Add it */
8989
8990 len += 2; /* Includes an element each for the start and end of range */
8991
8992 /* If wll overflow the existing space, extend, which may cause the array to
8993 * be moved */
8994 if (max < len) {
8995 invlist_extend(invlist, len);
8996
8997 /* Have to set len here to avoid assert failure in invlist_array() */
8998 invlist_set_len(invlist, len, offset);
8999
9000 array = invlist_array(invlist);
9001 }
9002 else {
9003 invlist_set_len(invlist, len, offset);
9004 }
9005
9006 /* The next item on the list starts the range, the one after that is
9007 * one past the new range. */
9008 array[len - 2] = start;
9009 if (end != UV_MAX) {
9010 array[len - 1] = end + 1;
9011 }
9012 else {
9013 /* But if the end is the maximum representable on the machine, just let
9014 * the range have no end */
9015 invlist_set_len(invlist, len - 1, offset);
9016 }
9017}
9018
9019SSize_t
9020Perl__invlist_search(SV* const invlist, const UV cp)
9021{
9022 /* Searches the inversion list for the entry that contains the input code
9023 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9024 * return value is the index into the list's array of the range that
9025 * contains <cp>, that is, 'i' such that
9026 * array[i] <= cp < array[i+1]
9027 */
9028
9029 IV low = 0;
9030 IV mid;
9031 IV high = _invlist_len(invlist);
9032 const IV highest_element = high - 1;
9033 const UV* array;
9034
9035 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9036
9037 /* If list is empty, return failure. */
9038 if (high == 0) {
9039 return -1;
9040 }
9041
9042 /* (We can't get the array unless we know the list is non-empty) */
9043 array = invlist_array(invlist);
9044
9045 mid = invlist_previous_index(invlist);
9046 assert(mid >=0);
9047 if (mid > highest_element) {
9048 mid = highest_element;
9049 }
9050
9051 /* <mid> contains the cache of the result of the previous call to this
9052 * function (0 the first time). See if this call is for the same result,
9053 * or if it is for mid-1. This is under the theory that calls to this
9054 * function will often be for related code points that are near each other.
9055 * And benchmarks show that caching gives better results. We also test
9056 * here if the code point is within the bounds of the list. These tests
9057 * replace others that would have had to be made anyway to make sure that
9058 * the array bounds were not exceeded, and these give us extra information
9059 * at the same time */
9060 if (cp >= array[mid]) {
9061 if (cp >= array[highest_element]) {
9062 return highest_element;
9063 }
9064
9065 /* Here, array[mid] <= cp < array[highest_element]. This means that
9066 * the final element is not the answer, so can exclude it; it also
9067 * means that <mid> is not the final element, so can refer to 'mid + 1'
9068 * safely */
9069 if (cp < array[mid + 1]) {
9070 return mid;
9071 }
9072 high--;
9073 low = mid + 1;
9074 }
9075 else { /* cp < aray[mid] */
9076 if (cp < array[0]) { /* Fail if outside the array */
9077 return -1;
9078 }
9079 high = mid;
9080 if (cp >= array[mid - 1]) {
9081 goto found_entry;
9082 }
9083 }
9084
9085 /* Binary search. What we are looking for is <i> such that
9086 * array[i] <= cp < array[i+1]
9087 * The loop below converges on the i+1. Note that there may not be an
9088 * (i+1)th element in the array, and things work nonetheless */
9089 while (low < high) {
9090 mid = (low + high) / 2;
9091 assert(mid <= highest_element);
9092 if (array[mid] <= cp) { /* cp >= array[mid] */
9093 low = mid + 1;
9094
9095 /* We could do this extra test to exit the loop early.
9096 if (cp < array[low]) {
9097 return mid;
9098 }
9099 */
9100 }
9101 else { /* cp < array[mid] */
9102 high = mid;
9103 }
9104 }
9105
9106 found_entry:
9107 high--;
9108 invlist_set_previous_index(invlist, high);
9109 return high;
9110}
9111
9112void
9113Perl__invlist_populate_swatch(SV* const invlist,
9114 const UV start, const UV end, U8* swatch)
9115{
9116 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
9117 * but is used when the swash has an inversion list. This makes this much
9118 * faster, as it uses a binary search instead of a linear one. This is
9119 * intimately tied to that function, and perhaps should be in utf8.c,
9120 * except it is intimately tied to inversion lists as well. It assumes
9121 * that <swatch> is all 0's on input */
9122
9123 UV current = start;
9124 const IV len = _invlist_len(invlist);
9125 IV i;
9126 const UV * array;
9127
9128 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
9129
9130 if (len == 0) { /* Empty inversion list */
9131 return;
9132 }
9133
9134 array = invlist_array(invlist);
9135
9136 /* Find which element it is */
9137 i = _invlist_search(invlist, start);
9138
9139 /* We populate from <start> to <end> */
9140 while (current < end) {
9141 UV upper;
9142
9143 /* The inversion list gives the results for every possible code point
9144 * after the first one in the list. Only those ranges whose index is
9145 * even are ones that the inversion list matches. For the odd ones,
9146 * and if the initial code point is not in the list, we have to skip
9147 * forward to the next element */
9148 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9149 i++;
9150 if (i >= len) { /* Finished if beyond the end of the array */
9151 return;
9152 }
9153 current = array[i];
9154 if (current >= end) { /* Finished if beyond the end of what we
9155 are populating */
9156 if (LIKELY(end < UV_MAX)) {
9157 return;
9158 }
9159
9160 /* We get here when the upper bound is the maximum
9161 * representable on the machine, and we are looking for just
9162 * that code point. Have to special case it */
9163 i = len;
9164 goto join_end_of_list;
9165 }
9166 }
9167 assert(current >= start);
9168
9169 /* The current range ends one below the next one, except don't go past
9170 * <end> */
9171 i++;
9172 upper = (i < len && array[i] < end) ? array[i] : end;
9173
9174 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
9175 * for each code point in it */
9176 for (; current < upper; current++) {
9177 const STRLEN offset = (STRLEN)(current - start);
9178 swatch[offset >> 3] |= 1 << (offset & 7);
9179 }
9180
9181 join_end_of_list:
9182
9183 /* Quit if at the end of the list */
9184 if (i >= len) {
9185
9186 /* But first, have to deal with the highest possible code point on
9187 * the platform. The previous code assumes that <end> is one
9188 * beyond where we want to populate, but that is impossible at the
9189 * platform's infinity, so have to handle it specially */
9190 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9191 {
9192 const STRLEN offset = (STRLEN)(end - start);
9193 swatch[offset >> 3] |= 1 << (offset & 7);
9194 }
9195 return;
9196 }
9197
9198 /* Advance to the next range, which will be for code points not in the
9199 * inversion list */
9200 current = array[i];
9201 }
9202
9203 return;
9204}
9205
9206void
9207Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9208 const bool complement_b, SV** output)
9209{
9210 /* Take the union of two inversion lists and point '*output' to it. On
9211 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9212 * even 'a' or 'b'). If to an inversion list, the contents of the original
9213 * list will be replaced by the union. The first list, 'a', may be
9214 * NULL, in which case a copy of the second list is placed in '*output'.
9215 * If 'complement_b' is TRUE, the union is taken of the complement
9216 * (inversion) of 'b' instead of b itself.
9217 *
9218 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9219 * Richard Gillam, published by Addison-Wesley, and explained at some
9220 * length there. The preface says to incorporate its examples into your
9221 * code at your own risk.
9222 *
9223 * The algorithm is like a merge sort. */
9224
9225 const UV* array_a; /* a's array */
9226 const UV* array_b;
9227 UV len_a; /* length of a's array */
9228 UV len_b;
9229
9230 SV* u; /* the resulting union */
9231 UV* array_u;
9232 UV len_u = 0;
9233
9234 UV i_a = 0; /* current index into a's array */
9235 UV i_b = 0;
9236 UV i_u = 0;
9237
9238 /* running count, as explained in the algorithm source book; items are
9239 * stopped accumulating and are output when the count changes to/from 0.
9240 * The count is incremented when we start a range that's in an input's set,
9241 * and decremented when we start a range that's not in a set. So this
9242 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9243 * and hence nothing goes into the union; 1, just one of the inputs is in
9244 * its set (and its current range gets added to the union); and 2 when both
9245 * inputs are in their sets. */
9246 UV count = 0;
9247
9248 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9249 assert(a != b);
9250 assert(*output == NULL || is_invlist(*output));
9251
9252 len_b = _invlist_len(b);
9253 if (len_b == 0) {
9254
9255 /* Here, 'b' is empty, hence it's complement is all possible code
9256 * points. So if the union includes the complement of 'b', it includes
9257 * everything, and we need not even look at 'a'. It's easiest to
9258 * create a new inversion list that matches everything. */
9259 if (complement_b) {
9260 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9261
9262 if (*output == NULL) { /* If the output didn't exist, just point it
9263 at the new list */
9264 *output = everything;
9265 }
9266 else { /* Otherwise, replace its contents with the new list */
9267 invlist_replace_list_destroys_src(*output, everything);
9268 SvREFCNT_dec_NN(everything);
9269 }
9270
9271 return;
9272 }
9273
9274 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9275 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9276 * output will be empty */
9277
9278 if (a == NULL || _invlist_len(a) == 0) {
9279 if (*output == NULL) {
9280 *output = _new_invlist(0);
9281 }
9282 else {
9283 invlist_clear(*output);
9284 }
9285 return;
9286 }
9287
9288 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9289 * union. We can just return a copy of 'a' if '*output' doesn't point
9290 * to an existing list */
9291 if (*output == NULL) {
9292 *output = invlist_clone(a, NULL);
9293 return;
9294 }
9295
9296 /* If the output is to overwrite 'a', we have a no-op, as it's
9297 * already in 'a' */
9298 if (*output == a) {
9299 return;
9300 }
9301
9302 /* Here, '*output' is to be overwritten by 'a' */
9303 u = invlist_clone(a, NULL);
9304 invlist_replace_list_destroys_src(*output, u);
9305 SvREFCNT_dec_NN(u);
9306
9307 return;
9308 }
9309
9310 /* Here 'b' is not empty. See about 'a' */
9311
9312 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9313
9314 /* Here, 'a' is empty (and b is not). That means the union will come
9315 * entirely from 'b'. If '*output' is NULL, we can directly return a
9316 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9317 * the clone */
9318
9319 SV ** dest = (*output == NULL) ? output : &u;
9320 *dest = invlist_clone(b, NULL);
9321 if (complement_b) {
9322 _invlist_invert(*dest);
9323 }
9324
9325 if (dest == &u) {
9326 invlist_replace_list_destroys_src(*output, u);
9327 SvREFCNT_dec_NN(u);
9328 }
9329
9330 return;
9331 }
9332
9333 /* Here both lists exist and are non-empty */
9334 array_a = invlist_array(a);
9335 array_b = invlist_array(b);
9336
9337 /* If are to take the union of 'a' with the complement of b, set it
9338 * up so are looking at b's complement. */
9339 if (complement_b) {
9340
9341 /* To complement, we invert: if the first element is 0, remove it. To
9342 * do this, we just pretend the array starts one later */
9343 if (array_b[0] == 0) {
9344 array_b++;
9345 len_b--;
9346 }
9347 else {
9348
9349 /* But if the first element is not zero, we pretend the list starts
9350 * at the 0 that is always stored immediately before the array. */
9351 array_b--;
9352 len_b++;
9353 }
9354 }
9355
9356 /* Size the union for the worst case: that the sets are completely
9357 * disjoint */
9358 u = _new_invlist(len_a + len_b);
9359
9360 /* Will contain U+0000 if either component does */
9361 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9362 || (len_b > 0 && array_b[0] == 0));
9363
9364 /* Go through each input list item by item, stopping when have exhausted
9365 * one of them */
9366 while (i_a < len_a && i_b < len_b) {
9367 UV cp; /* The element to potentially add to the union's array */
9368 bool cp_in_set; /* is it in the the input list's set or not */
9369
9370 /* We need to take one or the other of the two inputs for the union.
9371 * Since we are merging two sorted lists, we take the smaller of the
9372 * next items. In case of a tie, we take first the one that is in its
9373 * set. If we first took the one not in its set, it would decrement
9374 * the count, possibly to 0 which would cause it to be output as ending
9375 * the range, and the next time through we would take the same number,
9376 * and output it again as beginning the next range. By doing it the
9377 * opposite way, there is no possibility that the count will be
9378 * momentarily decremented to 0, and thus the two adjoining ranges will
9379 * be seamlessly merged. (In a tie and both are in the set or both not
9380 * in the set, it doesn't matter which we take first.) */
9381 if ( array_a[i_a] < array_b[i_b]
9382 || ( array_a[i_a] == array_b[i_b]
9383 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9384 {
9385 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9386 cp = array_a[i_a++];
9387 }
9388 else {
9389 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9390 cp = array_b[i_b++];
9391 }
9392
9393 /* Here, have chosen which of the two inputs to look at. Only output
9394 * if the running count changes to/from 0, which marks the
9395 * beginning/end of a range that's in the set */
9396 if (cp_in_set) {
9397 if (count == 0) {
9398 array_u[i_u++] = cp;
9399 }
9400 count++;
9401 }
9402 else {
9403 count--;
9404 if (count == 0) {
9405 array_u[i_u++] = cp;
9406 }
9407 }
9408 }
9409
9410
9411 /* The loop above increments the index into exactly one of the input lists
9412 * each iteration, and ends when either index gets to its list end. That
9413 * means the other index is lower than its end, and so something is
9414 * remaining in that one. We decrement 'count', as explained below, if
9415 * that list is in its set. (i_a and i_b each currently index the element
9416 * beyond the one we care about.) */
9417 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9418 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9419 {
9420 count--;
9421 }
9422
9423 /* Above we decremented 'count' if the list that had unexamined elements in
9424 * it was in its set. This has made it so that 'count' being non-zero
9425 * means there isn't anything left to output; and 'count' equal to 0 means
9426 * that what is left to output is precisely that which is left in the
9427 * non-exhausted input list.
9428 *
9429 * To see why, note first that the exhausted input obviously has nothing
9430 * left to add to the union. If it was in its set at its end, that means
9431 * the set extends from here to the platform's infinity, and hence so does
9432 * the union and the non-exhausted set is irrelevant. The exhausted set
9433 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9434 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9435 * 'count' remains at 1. This is consistent with the decremented 'count'
9436 * != 0 meaning there's nothing left to add to the union.
9437 *
9438 * But if the exhausted input wasn't in its set, it contributed 0 to
9439 * 'count', and the rest of the union will be whatever the other input is.
9440 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9441 * otherwise it gets decremented to 0. This is consistent with 'count'
9442 * == 0 meaning the remainder of the union is whatever is left in the
9443 * non-exhausted list. */
9444 if (count != 0) {
9445 len_u = i_u;
9446 }
9447 else {
9448 IV copy_count = len_a - i_a;
9449 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9450 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9451 }
9452 else { /* The non-exhausted input is b */
9453 copy_count = len_b - i_b;
9454 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9455 }
9456 len_u = i_u + copy_count;
9457 }
9458
9459 /* Set the result to the final length, which can change the pointer to
9460 * array_u, so re-find it. (Note that it is unlikely that this will
9461 * change, as we are shrinking the space, not enlarging it) */
9462 if (len_u != _invlist_len(u)) {
9463 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9464 invlist_trim(u);
9465 array_u = invlist_array(u);
9466 }
9467
9468 if (*output == NULL) { /* Simply return the new inversion list */
9469 *output = u;
9470 }
9471 else {
9472 /* Otherwise, overwrite the inversion list that was in '*output'. We
9473 * could instead free '*output', and then set it to 'u', but experience
9474 * has shown [perl #127392] that if the input is a mortal, we can get a
9475 * huge build-up of these during regex compilation before they get
9476 * freed. */
9477 invlist_replace_list_destroys_src(*output, u);
9478 SvREFCNT_dec_NN(u);
9479 }
9480
9481 return;
9482}
9483
9484void
9485Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9486 const bool complement_b, SV** i)
9487{
9488 /* Take the intersection of two inversion lists and point '*i' to it. On
9489 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9490 * even 'a' or 'b'). If to an inversion list, the contents of the original
9491 * list will be replaced by the intersection. The first list, 'a', may be
9492 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9493 * TRUE, the result will be the intersection of 'a' and the complement (or
9494 * inversion) of 'b' instead of 'b' directly.
9495 *
9496 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9497 * Richard Gillam, published by Addison-Wesley, and explained at some
9498 * length there. The preface says to incorporate its examples into your
9499 * code at your own risk. In fact, it had bugs
9500 *
9501 * The algorithm is like a merge sort, and is essentially the same as the
9502 * union above
9503 */
9504
9505 const UV* array_a; /* a's array */
9506 const UV* array_b;
9507 UV len_a; /* length of a's array */
9508 UV len_b;
9509
9510 SV* r; /* the resulting intersection */
9511 UV* array_r;
9512 UV len_r = 0;
9513
9514 UV i_a = 0; /* current index into a's array */
9515 UV i_b = 0;
9516 UV i_r = 0;
9517
9518 /* running count of how many of the two inputs are postitioned at ranges
9519 * that are in their sets. As explained in the algorithm source book,
9520 * items are stopped accumulating and are output when the count changes
9521 * to/from 2. The count is incremented when we start a range that's in an
9522 * input's set, and decremented when we start a range that's not in a set.
9523 * Only when it is 2 are we in the intersection. */
9524 UV count = 0;
9525
9526 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9527 assert(a != b);
9528 assert(*i == NULL || is_invlist(*i));
9529
9530 /* Special case if either one is empty */
9531 len_a = (a == NULL) ? 0 : _invlist_len(a);
9532 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9533 if (len_a != 0 && complement_b) {
9534
9535 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9536 * must be empty. Here, also we are using 'b's complement, which
9537 * hence must be every possible code point. Thus the intersection
9538 * is simply 'a'. */
9539
9540 if (*i == a) { /* No-op */
9541 return;
9542 }
9543
9544 if (*i == NULL) {
9545 *i = invlist_clone(a, NULL);
9546 return;
9547 }
9548
9549 r = invlist_clone(a, NULL);
9550 invlist_replace_list_destroys_src(*i, r);
9551 SvREFCNT_dec_NN(r);
9552 return;
9553 }
9554
9555 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9556 * intersection must be empty */
9557 if (*i == NULL) {
9558 *i = _new_invlist(0);
9559 return;
9560 }
9561
9562 invlist_clear(*i);
9563 return;
9564 }
9565
9566 /* Here both lists exist and are non-empty */
9567 array_a = invlist_array(a);
9568 array_b = invlist_array(b);
9569
9570 /* If are to take the intersection of 'a' with the complement of b, set it
9571 * up so are looking at b's complement. */
9572 if (complement_b) {
9573
9574 /* To complement, we invert: if the first element is 0, remove it. To
9575 * do this, we just pretend the array starts one later */
9576 if (array_b[0] == 0) {
9577 array_b++;
9578 len_b--;
9579 }
9580 else {
9581
9582 /* But if the first element is not zero, we pretend the list starts
9583 * at the 0 that is always stored immediately before the array. */
9584 array_b--;
9585 len_b++;
9586 }
9587 }
9588
9589 /* Size the intersection for the worst case: that the intersection ends up
9590 * fragmenting everything to be completely disjoint */
9591 r= _new_invlist(len_a + len_b);
9592
9593 /* Will contain U+0000 iff both components do */
9594 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9595 && len_b > 0 && array_b[0] == 0);
9596
9597 /* Go through each list item by item, stopping when have exhausted one of
9598 * them */
9599 while (i_a < len_a && i_b < len_b) {
9600 UV cp; /* The element to potentially add to the intersection's
9601 array */
9602 bool cp_in_set; /* Is it in the input list's set or not */
9603
9604 /* We need to take one or the other of the two inputs for the
9605 * intersection. Since we are merging two sorted lists, we take the
9606 * smaller of the next items. In case of a tie, we take first the one
9607 * that is not in its set (a difference from the union algorithm). If
9608 * we first took the one in its set, it would increment the count,
9609 * possibly to 2 which would cause it to be output as starting a range
9610 * in the intersection, and the next time through we would take that
9611 * same number, and output it again as ending the set. By doing the
9612 * opposite of this, there is no possibility that the count will be
9613 * momentarily incremented to 2. (In a tie and both are in the set or
9614 * both not in the set, it doesn't matter which we take first.) */
9615 if ( array_a[i_a] < array_b[i_b]
9616 || ( array_a[i_a] == array_b[i_b]
9617 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9618 {
9619 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9620 cp = array_a[i_a++];
9621 }
9622 else {
9623 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9624 cp= array_b[i_b++];
9625 }
9626
9627 /* Here, have chosen which of the two inputs to look at. Only output
9628 * if the running count changes to/from 2, which marks the
9629 * beginning/end of a range that's in the intersection */
9630 if (cp_in_set) {
9631 count++;
9632 if (count == 2) {
9633 array_r[i_r++] = cp;
9634 }
9635 }
9636 else {
9637 if (count == 2) {
9638 array_r[i_r++] = cp;
9639 }
9640 count--;
9641 }
9642
9643 }
9644
9645 /* The loop above increments the index into exactly one of the input lists
9646 * each iteration, and ends when either index gets to its list end. That
9647 * means the other index is lower than its end, and so something is
9648 * remaining in that one. We increment 'count', as explained below, if the
9649 * exhausted list was in its set. (i_a and i_b each currently index the
9650 * element beyond the one we care about.) */
9651 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9652 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9653 {
9654 count++;
9655 }
9656
9657 /* Above we incremented 'count' if the exhausted list was in its set. This
9658 * has made it so that 'count' being below 2 means there is nothing left to
9659 * output; otheriwse what's left to add to the intersection is precisely
9660 * that which is left in the non-exhausted input list.
9661 *
9662 * To see why, note first that the exhausted input obviously has nothing
9663 * left to affect the intersection. If it was in its set at its end, that
9664 * means the set extends from here to the platform's infinity, and hence
9665 * anything in the non-exhausted's list will be in the intersection, and
9666 * anything not in it won't be. Hence, the rest of the intersection is
9667 * precisely what's in the non-exhausted list The exhausted set also
9668 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
9669 * it means 'count' is now at least 2. This is consistent with the
9670 * incremented 'count' being >= 2 means to add the non-exhausted list to
9671 * the intersection.
9672 *
9673 * But if the exhausted input wasn't in its set, it contributed 0 to
9674 * 'count', and the intersection can't include anything further; the
9675 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
9676 * incremented. This is consistent with 'count' being < 2 meaning nothing
9677 * further to add to the intersection. */
9678 if (count < 2) { /* Nothing left to put in the intersection. */
9679 len_r = i_r;
9680 }
9681 else { /* copy the non-exhausted list, unchanged. */
9682 IV copy_count = len_a - i_a;
9683 if (copy_count > 0) { /* a is the one with stuff left */
9684 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9685 }
9686 else { /* b is the one with stuff left */
9687 copy_count = len_b - i_b;
9688 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9689 }
9690 len_r = i_r + copy_count;
9691 }
9692
9693 /* Set the result to the final length, which can change the pointer to
9694 * array_r, so re-find it. (Note that it is unlikely that this will
9695 * change, as we are shrinking the space, not enlarging it) */
9696 if (len_r != _invlist_len(r)) {
9697 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9698 invlist_trim(r);
9699 array_r = invlist_array(r);
9700 }
9701
9702 if (*i == NULL) { /* Simply return the calculated intersection */
9703 *i = r;
9704 }
9705 else { /* Otherwise, replace the existing inversion list in '*i'. We could
9706 instead free '*i', and then set it to 'r', but experience has
9707 shown [perl #127392] that if the input is a mortal, we can get a
9708 huge build-up of these during regex compilation before they get
9709 freed. */
9710 if (len_r) {
9711 invlist_replace_list_destroys_src(*i, r);
9712 }
9713 else {
9714 invlist_clear(*i);
9715 }
9716 SvREFCNT_dec_NN(r);
9717 }
9718
9719 return;
9720}
9721
9722SV*
9723Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9724{
9725 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9726 * set. A pointer to the inversion list is returned. This may actually be
9727 * a new list, in which case the passed in one has been destroyed. The
9728 * passed-in inversion list can be NULL, in which case a new one is created
9729 * with just the one range in it. The new list is not necessarily
9730 * NUL-terminated. Space is not freed if the inversion list shrinks as a
9731 * result of this function. The gain would not be large, and in many
9732 * cases, this is called multiple times on a single inversion list, so
9733 * anything freed may almost immediately be needed again.
9734 *
9735 * This used to mostly call the 'union' routine, but that is much more
9736 * heavyweight than really needed for a single range addition */
9737
9738 UV* array; /* The array implementing the inversion list */
9739 UV len; /* How many elements in 'array' */
9740 SSize_t i_s; /* index into the invlist array where 'start'
9741 should go */
9742 SSize_t i_e = 0; /* And the index where 'end' should go */
9743 UV cur_highest; /* The highest code point in the inversion list
9744 upon entry to this function */
9745
9746 /* This range becomes the whole inversion list if none already existed */
9747 if (invlist == NULL) {
9748 invlist = _new_invlist(2);
9749 _append_range_to_invlist(invlist, start, end);
9750 return invlist;
9751 }
9752
9753 /* Likewise, if the inversion list is currently empty */
9754 len = _invlist_len(invlist);
9755 if (len == 0) {
9756 _append_range_to_invlist(invlist, start, end);
9757 return invlist;
9758 }
9759
9760 /* Starting here, we have to know the internals of the list */
9761 array = invlist_array(invlist);
9762
9763 /* If the new range ends higher than the current highest ... */
9764 cur_highest = invlist_highest(invlist);
9765 if (end > cur_highest) {
9766
9767 /* If the whole range is higher, we can just append it */
9768 if (start > cur_highest) {
9769 _append_range_to_invlist(invlist, start, end);
9770 return invlist;
9771 }
9772
9773 /* Otherwise, add the portion that is higher ... */
9774 _append_range_to_invlist(invlist, cur_highest + 1, end);
9775
9776 /* ... and continue on below to handle the rest. As a result of the
9777 * above append, we know that the index of the end of the range is the
9778 * final even numbered one of the array. Recall that the final element
9779 * always starts a range that extends to infinity. If that range is in
9780 * the set (meaning the set goes from here to infinity), it will be an
9781 * even index, but if it isn't in the set, it's odd, and the final
9782 * range in the set is one less, which is even. */
9783 if (end == UV_MAX) {
9784 i_e = len;
9785 }
9786 else {
9787 i_e = len - 2;
9788 }
9789 }
9790
9791 /* We have dealt with appending, now see about prepending. If the new
9792 * range starts lower than the current lowest ... */
9793 if (start < array[0]) {
9794
9795 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9796 * Let the union code handle it, rather than having to know the
9797 * trickiness in two code places. */
9798 if (UNLIKELY(start == 0)) {
9799 SV* range_invlist;
9800
9801 range_invlist = _new_invlist(2);
9802 _append_range_to_invlist(range_invlist, start, end);
9803
9804 _invlist_union(invlist, range_invlist, &invlist);
9805
9806 SvREFCNT_dec_NN(range_invlist);
9807
9808 return invlist;
9809 }
9810
9811 /* If the whole new range comes before the first entry, and doesn't
9812 * extend it, we have to insert it as an additional range */
9813 if (end < array[0] - 1) {
9814 i_s = i_e = -1;
9815 goto splice_in_new_range;
9816 }
9817
9818 /* Here the new range adjoins the existing first range, extending it
9819 * downwards. */
9820 array[0] = start;
9821
9822 /* And continue on below to handle the rest. We know that the index of
9823 * the beginning of the range is the first one of the array */
9824 i_s = 0;
9825 }
9826 else { /* Not prepending any part of the new range to the existing list.
9827 * Find where in the list it should go. This finds i_s, such that:
9828 * invlist[i_s] <= start < array[i_s+1]
9829 */
9830 i_s = _invlist_search(invlist, start);
9831 }
9832
9833 /* At this point, any extending before the beginning of the inversion list
9834 * and/or after the end has been done. This has made it so that, in the
9835 * code below, each endpoint of the new range is either in a range that is
9836 * in the set, or is in a gap between two ranges that are. This means we
9837 * don't have to worry about exceeding the array bounds.
9838 *
9839 * Find where in the list the new range ends (but we can skip this if we
9840 * have already determined what it is, or if it will be the same as i_s,
9841 * which we already have computed) */
9842 if (i_e == 0) {
9843 i_e = (start == end)
9844 ? i_s
9845 : _invlist_search(invlist, end);
9846 }
9847
9848 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
9849 * is a range that goes to infinity there is no element at invlist[i_e+1],
9850 * so only the first relation holds. */
9851
9852 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9853
9854 /* Here, the ranges on either side of the beginning of the new range
9855 * are in the set, and this range starts in the gap between them.
9856 *
9857 * The new range extends the range above it downwards if the new range
9858 * ends at or above that range's start */
9859 const bool extends_the_range_above = ( end == UV_MAX
9860 || end + 1 >= array[i_s+1]);
9861
9862 /* The new range extends the range below it upwards if it begins just
9863 * after where that range ends */
9864 if (start == array[i_s]) {
9865
9866 /* If the new range fills the entire gap between the other ranges,
9867 * they will get merged together. Other ranges may also get
9868 * merged, depending on how many of them the new range spans. In
9869 * the general case, we do the merge later, just once, after we
9870 * figure out how many to merge. But in the case where the new
9871 * range exactly spans just this one gap (possibly extending into
9872 * the one above), we do the merge here, and an early exit. This
9873 * is done here to avoid having to special case later. */
9874 if (i_e - i_s <= 1) {
9875
9876 /* If i_e - i_s == 1, it means that the new range terminates
9877 * within the range above, and hence 'extends_the_range_above'
9878 * must be true. (If the range above it extends to infinity,
9879 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9880 * will be 0, so no harm done.) */
9881 if (extends_the_range_above) {
9882 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9883 invlist_set_len(invlist,
9884 len - 2,
9885 *(get_invlist_offset_addr(invlist)));
9886 return invlist;
9887 }
9888
9889 /* Here, i_e must == i_s. We keep them in sync, as they apply
9890 * to the same range, and below we are about to decrement i_s
9891 * */
9892 i_e--;
9893 }
9894
9895 /* Here, the new range is adjacent to the one below. (It may also
9896 * span beyond the range above, but that will get resolved later.)
9897 * Extend the range below to include this one. */
9898 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
9899 i_s--;
9900 start = array[i_s];
9901 }
9902 else if (extends_the_range_above) {
9903
9904 /* Here the new range only extends the range above it, but not the
9905 * one below. It merges with the one above. Again, we keep i_e
9906 * and i_s in sync if they point to the same range */
9907 if (i_e == i_s) {
9908 i_e++;
9909 }
9910 i_s++;
9911 array[i_s] = start;
9912 }
9913 }
9914
9915 /* Here, we've dealt with the new range start extending any adjoining
9916 * existing ranges.
9917 *
9918 * If the new range extends to infinity, it is now the final one,
9919 * regardless of what was there before */
9920 if (UNLIKELY(end == UV_MAX)) {
9921 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
9922 return invlist;
9923 }
9924
9925 /* If i_e started as == i_s, it has also been dealt with,
9926 * and been updated to the new i_s, which will fail the following if */
9927 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
9928
9929 /* Here, the ranges on either side of the end of the new range are in
9930 * the set, and this range ends in the gap between them.
9931 *
9932 * If this range is adjacent to (hence extends) the range above it, it
9933 * becomes part of that range; likewise if it extends the range below,
9934 * it becomes part of that range */
9935 if (end + 1 == array[i_e+1]) {
9936 i_e++;
9937 array[i_e] = start;
9938 }
9939 else if (start <= array[i_e]) {
9940 array[i_e] = end + 1;
9941 i_e--;
9942 }
9943 }
9944
9945 if (i_s == i_e) {
9946
9947 /* If the range fits entirely in an existing range (as possibly already
9948 * extended above), it doesn't add anything new */
9949 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9950 return invlist;
9951 }
9952
9953 /* Here, no part of the range is in the list. Must add it. It will
9954 * occupy 2 more slots */
9955 splice_in_new_range:
9956
9957 invlist_extend(invlist, len + 2);
9958 array = invlist_array(invlist);
9959 /* Move the rest of the array down two slots. Don't include any
9960 * trailing NUL */
9961 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
9962
9963 /* Do the actual splice */
9964 array[i_e+1] = start;
9965 array[i_e+2] = end + 1;
9966 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
9967 return invlist;
9968 }
9969
9970 /* Here the new range crossed the boundaries of a pre-existing range. The
9971 * code above has adjusted things so that both ends are in ranges that are
9972 * in the set. This means everything in between must also be in the set.
9973 * Just squash things together */
9974 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
9975 invlist_set_len(invlist,
9976 len - i_e + i_s,
9977 *(get_invlist_offset_addr(invlist)));
9978
9979 return invlist;
9980}
9981
9982SV*
9983Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9984 UV** other_elements_ptr)
9985{
9986 /* Create and return an inversion list whose contents are to be populated
9987 * by the caller. The caller gives the number of elements (in 'size') and
9988 * the very first element ('element0'). This function will set
9989 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9990 * are to be placed.
9991 *
9992 * Obviously there is some trust involved that the caller will properly
9993 * fill in the other elements of the array.
9994 *
9995 * (The first element needs to be passed in, as the underlying code does
9996 * things differently depending on whether it is zero or non-zero) */
9997
9998 SV* invlist = _new_invlist(size);
9999 bool offset;
10000
10001 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10002
10003 invlist = add_cp_to_invlist(invlist, element0);
10004 offset = *get_invlist_offset_addr(invlist);
10005
10006 invlist_set_len(invlist, size, offset);
10007 *other_elements_ptr = invlist_array(invlist) + 1;
10008 return invlist;
10009}
10010
10011#endif
10012
10013PERL_STATIC_INLINE SV*
10014S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10015 return _add_range_to_invlist(invlist, cp, cp);
10016}
10017
10018#ifndef PERL_IN_XSUB_RE
10019void
10020Perl__invlist_invert(pTHX_ SV* const invlist)
10021{
10022 /* Complement the input inversion list. This adds a 0 if the list didn't
10023 * have a zero; removes it otherwise. As described above, the data
10024 * structure is set up so that this is very efficient */
10025
10026 PERL_ARGS_ASSERT__INVLIST_INVERT;
10027
10028 assert(! invlist_is_iterating(invlist));
10029
10030 /* The inverse of matching nothing is matching everything */
10031 if (_invlist_len(invlist) == 0) {
10032 _append_range_to_invlist(invlist, 0, UV_MAX);
10033 return;
10034 }
10035
10036 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10037}
10038
10039SV*
10040Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10041{
10042
10043 /* Return a new inversion list that is a copy of the input one, which is
10044 * unchanged. The new list will not be mortal even if the old one was. */
10045
10046 const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */
10047 const STRLEN physical_length = SvCUR(invlist);
10048 const bool offset = *(get_invlist_offset_addr(invlist));
10049
10050 PERL_ARGS_ASSERT_INVLIST_CLONE;
10051
10052 /* Need to allocate extra space to accommodate Perl's addition of a
10053 * trailing NUL to SvPV's, since it thinks they are always strings */
10054 if (new_invlist == NULL) {
10055 new_invlist = _new_invlist(nominal_length);
10056 }
10057 else {
10058 sv_upgrade(new_invlist, SVt_INVLIST);
10059 initialize_invlist_guts(new_invlist, nominal_length);
10060 }
10061
10062 *(get_invlist_offset_addr(new_invlist)) = offset;
10063 invlist_set_len(new_invlist, nominal_length, offset);
10064 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10065
10066 return new_invlist;
10067}
10068
10069#endif
10070
10071PERL_STATIC_INLINE STRLEN*
10072S_get_invlist_iter_addr(SV* invlist)
10073{
10074 /* Return the address of the UV that contains the current iteration
10075 * position */
10076
10077 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10078
10079 assert(is_invlist(invlist));
10080
10081 return &(((XINVLIST*) SvANY(invlist))->iterator);
10082}
10083
10084PERL_STATIC_INLINE void
10085S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10086{
10087 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10088
10089 *get_invlist_iter_addr(invlist) = 0;
10090}
10091
10092PERL_STATIC_INLINE void
10093S_invlist_iterfinish(SV* invlist)
10094{
10095 /* Terminate iterator for invlist. This is to catch development errors.
10096 * Any iteration that is interrupted before completed should call this
10097 * function. Functions that add code points anywhere else but to the end
10098 * of an inversion list assert that they are not in the middle of an
10099 * iteration. If they were, the addition would make the iteration
10100 * problematical: if the iteration hadn't reached the place where things
10101 * were being added, it would be ok */
10102
10103 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10104
10105 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10106}
10107
10108STATIC bool
10109S_invlist_iternext(SV* invlist, UV* start, UV* end)
10110{
10111 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10112 * This call sets in <*start> and <*end>, the next range in <invlist>.
10113 * Returns <TRUE> if successful and the next call will return the next
10114 * range; <FALSE> if was already at the end of the list. If the latter,
10115 * <*start> and <*end> are unchanged, and the next call to this function
10116 * will start over at the beginning of the list */
10117
10118 STRLEN* pos = get_invlist_iter_addr(invlist);
10119 UV len = _invlist_len(invlist);
10120 UV *array;
10121
10122 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10123
10124 if (*pos >= len) {
10125 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10126 return FALSE;
10127 }
10128
10129 array = invlist_array(invlist);
10130
10131 *start = array[(*pos)++];
10132
10133 if (*pos >= len) {
10134 *end = UV_MAX;
10135 }
10136 else {
10137 *end = array[(*pos)++] - 1;
10138 }
10139
10140 return TRUE;
10141}
10142
10143PERL_STATIC_INLINE UV
10144S_invlist_highest(SV* const invlist)
10145{
10146 /* Returns the highest code point that matches an inversion list. This API
10147 * has an ambiguity, as it returns 0 under either the highest is actually
10148 * 0, or if the list is empty. If this distinction matters to you, check
10149 * for emptiness before calling this function */
10150
10151 UV len = _invlist_len(invlist);
10152 UV *array;
10153
10154 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10155
10156 if (len == 0) {
10157 return 0;
10158 }
10159
10160 array = invlist_array(invlist);
10161
10162 /* The last element in the array in the inversion list always starts a
10163 * range that goes to infinity. That range may be for code points that are
10164 * matched in the inversion list, or it may be for ones that aren't
10165 * matched. In the latter case, the highest code point in the set is one
10166 * less than the beginning of this range; otherwise it is the final element
10167 * of this range: infinity */
10168 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10169 ? UV_MAX
10170 : array[len - 1] - 1;
10171}
10172
10173STATIC SV *
10174S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10175{
10176 /* Get the contents of an inversion list into a string SV so that they can
10177 * be printed out. If 'traditional_style' is TRUE, it uses the format
10178 * traditionally done for debug tracing; otherwise it uses a format
10179 * suitable for just copying to the output, with blanks between ranges and
10180 * a dash between range components */
10181
10182 UV start, end;
10183 SV* output;
10184 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10185 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10186
10187 if (traditional_style) {
10188 output = newSVpvs("\n");
10189 }
10190 else {
10191 output = newSVpvs("");
10192 }
10193
10194 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10195
10196 assert(! invlist_is_iterating(invlist));
10197
10198 invlist_iterinit(invlist);
10199 while (invlist_iternext(invlist, &start, &end)) {
10200 if (end == UV_MAX) {
10201 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
10202 start, intra_range_delimiter,
10203 inter_range_delimiter);
10204 }
10205 else if (end != start) {
10206 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10207 start,
10208 intra_range_delimiter,
10209 end, inter_range_delimiter);
10210 }
10211 else {
10212 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10213 start, inter_range_delimiter);
10214 }
10215 }
10216
10217 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10218 SvCUR_set(output, SvCUR(output) - 1);
10219 }
10220
10221 return output;
10222}
10223
10224#ifndef PERL_IN_XSUB_RE
10225void
10226Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10227 const char * const indent, SV* const invlist)
10228{
10229 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10230 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10231 * the string 'indent'. The output looks like this:
10232 [0] 0x000A .. 0x000D
10233 [2] 0x0085
10234 [4] 0x2028 .. 0x2029
10235 [6] 0x3104 .. INFINITY
10236 * This means that the first range of code points matched by the list are
10237 * 0xA through 0xD; the second range contains only the single code point
10238 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10239 * are used to define each range (except if the final range extends to
10240 * infinity, only a single element is needed). The array index of the
10241 * first element for the corresponding range is given in brackets. */
10242
10243 UV start, end;
10244 STRLEN count = 0;
10245
10246 PERL_ARGS_ASSERT__INVLIST_DUMP;
10247
10248 if (invlist_is_iterating(invlist)) {
10249 Perl_dump_indent(aTHX_ level, file,
10250 "%sCan't dump inversion list because is in middle of iterating\n",
10251 indent);
10252 return;
10253 }
10254
10255 invlist_iterinit(invlist);
10256 while (invlist_iternext(invlist, &start, &end)) {
10257 if (end == UV_MAX) {
10258 Perl_dump_indent(aTHX_ level, file,
10259 "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
10260 indent, (UV)count, start);
10261 }
10262 else if (end != start) {
10263 Perl_dump_indent(aTHX_ level, file,
10264 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10265 indent, (UV)count, start, end);
10266 }
10267 else {
10268 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10269 indent, (UV)count, start);
10270 }
10271 count += 2;
10272 }
10273}
10274
10275#endif
10276
10277#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10278bool
10279Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10280{
10281 /* Return a boolean as to if the two passed in inversion lists are
10282 * identical. The final argument, if TRUE, says to take the complement of
10283 * the second inversion list before doing the comparison */
10284
10285 const UV* array_a = invlist_array(a);
10286 const UV* array_b = invlist_array(b);
10287 UV len_a = _invlist_len(a);
10288 UV len_b = _invlist_len(b);
10289
10290 PERL_ARGS_ASSERT__INVLISTEQ;
10291
10292 /* If are to compare 'a' with the complement of b, set it
10293 * up so are looking at b's complement. */
10294 if (complement_b) {
10295
10296 /* The complement of nothing is everything, so <a> would have to have
10297 * just one element, starting at zero (ending at infinity) */
10298 if (len_b == 0) {
10299 return (len_a == 1 && array_a[0] == 0);
10300 }
10301 else if (array_b[0] == 0) {
10302
10303 /* Otherwise, to complement, we invert. Here, the first element is
10304 * 0, just remove it. To do this, we just pretend the array starts
10305 * one later */
10306
10307 array_b++;
10308 len_b--;
10309 }
10310 else {
10311
10312 /* But if the first element is not zero, we pretend the list starts
10313 * at the 0 that is always stored immediately before the array. */
10314 array_b--;
10315 len_b++;
10316 }
10317 }
10318
10319 return len_a == len_b
10320 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10321
10322}
10323#endif
10324
10325/*
10326 * As best we can, determine the characters that can match the start of
10327 * the given EXACTF-ish node.
10328 *
10329 * Returns the invlist as a new SV*; it is the caller's responsibility to
10330 * call SvREFCNT_dec() when done with it.
10331 */
10332STATIC SV*
10333S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10334{
10335 const U8 * s = (U8*)STRING(node);
10336 SSize_t bytelen = STR_LEN(node);
10337 UV uc;
10338 /* Start out big enough for 2 separate code points */
10339 SV* invlist = _new_invlist(4);
10340
10341 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10342
10343 if (! UTF) {
10344 uc = *s;
10345
10346 /* We punt and assume can match anything if the node begins
10347 * with a multi-character fold. Things are complicated. For
10348 * example, /ffi/i could match any of:
10349 * "\N{LATIN SMALL LIGATURE FFI}"
10350 * "\N{LATIN SMALL LIGATURE FF}I"
10351 * "F\N{LATIN SMALL LIGATURE FI}"
10352 * plus several other things; and making sure we have all the
10353 * possibilities is hard. */
10354 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10355 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10356 }
10357 else {
10358 /* Any Latin1 range character can potentially match any
10359 * other depending on the locale */
10360 if (OP(node) == EXACTFL) {
10361 _invlist_union(invlist, PL_Latin1, &invlist);
10362 }
10363 else {
10364 /* But otherwise, it matches at least itself. We can
10365 * quickly tell if it has a distinct fold, and if so,
10366 * it matches that as well */
10367 invlist = add_cp_to_invlist(invlist, uc);
10368 if (IS_IN_SOME_FOLD_L1(uc))
10369 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10370 }
10371
10372 /* Some characters match above-Latin1 ones under /i. This
10373 * is true of EXACTFL ones when the locale is UTF-8 */
10374 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10375 && (! isASCII(uc) || (OP(node) != EXACTFAA
10376 && OP(node) != EXACTFAA_NO_TRIE)))
10377 {
10378 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10379 }
10380 }
10381 }
10382 else { /* Pattern is UTF-8 */
10383 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10384 const U8* e = s + bytelen;
10385 IV fc;
10386
10387 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10388
10389 /* The only code points that aren't folded in a UTF EXACTFish
10390 * node are are the problematic ones in EXACTFL nodes */
10391 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10392 /* We need to check for the possibility that this EXACTFL
10393 * node begins with a multi-char fold. Therefore we fold
10394 * the first few characters of it so that we can make that
10395 * check */
10396 U8 *d = folded;
10397 int i;
10398
10399 fc = -1;
10400 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10401 if (isASCII(*s)) {
10402 *(d++) = (U8) toFOLD(*s);
10403 if (fc < 0) { /* Save the first fold */
10404 fc = *(d-1);
10405 }
10406 s++;
10407 }
10408 else {
10409 STRLEN len;
10410 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10411 if (fc < 0) { /* Save the first fold */
10412 fc = fold;
10413 }
10414 d += len;
10415 s += UTF8SKIP(s);
10416 }
10417 }
10418
10419 /* And set up so the code below that looks in this folded
10420 * buffer instead of the node's string */
10421 e = d;
10422 s = folded;
10423 }
10424
10425 /* When we reach here 's' points to the fold of the first
10426 * character(s) of the node; and 'e' points to far enough along
10427 * the folded string to be just past any possible multi-char
10428 * fold.
10429 *
10430 * Unlike the non-UTF-8 case, the macro for determining if a
10431 * string is a multi-char fold requires all the characters to
10432 * already be folded. This is because of all the complications
10433 * if not. Note that they are folded anyway, except in EXACTFL
10434 * nodes. Like the non-UTF case above, we punt if the node
10435 * begins with a multi-char fold */
10436
10437 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10438 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10439 }
10440 else { /* Single char fold */
10441 unsigned int k;
10442 unsigned int first_folds_to;
10443 const unsigned int * remaining_folds_to_list;
10444 Size_t folds_to_count;
10445
10446 /* It matches itself */
10447 invlist = add_cp_to_invlist(invlist, fc);
10448
10449 /* ... plus all the things that fold to it, which are found in
10450 * PL_utf8_foldclosures */
10451 folds_to_count = _inverse_folds(fc, &first_folds_to,
10452 &remaining_folds_to_list);
10453 for (k = 0; k < folds_to_count; k++) {
10454 UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
10455
10456 /* /aa doesn't allow folds between ASCII and non- */
10457 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10458 && isASCII(c) != isASCII(fc))
10459 {
10460 continue;
10461 }
10462
10463 invlist = add_cp_to_invlist(invlist, c);
10464 }
10465 }
10466 }
10467
10468 return invlist;
10469}
10470
10471#undef HEADER_LENGTH
10472#undef TO_INTERNAL_SIZE
10473#undef FROM_INTERNAL_SIZE
10474#undef INVLIST_VERSION_ID
10475
10476/* End of inversion list object */
10477
10478STATIC void
10479S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10480{
10481 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10482 * constructs, and updates RExC_flags with them. On input, RExC_parse
10483 * should point to the first flag; it is updated on output to point to the
10484 * final ')' or ':'. There needs to be at least one flag, or this will
10485 * abort */
10486
10487 /* for (?g), (?gc), and (?o) warnings; warning
10488 about (?c) will warn about (?g) -- japhy */
10489
10490#define WASTED_O 0x01
10491#define WASTED_G 0x02
10492#define WASTED_C 0x04
10493#define WASTED_GC (WASTED_G|WASTED_C)
10494 I32 wastedflags = 0x00;
10495 U32 posflags = 0, negflags = 0;
10496 U32 *flagsp = &posflags;
10497 char has_charset_modifier = '\0';
10498 regex_charset cs;
10499 bool has_use_defaults = FALSE;
10500 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10501 int x_mod_count = 0;
10502
10503 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10504
10505 /* '^' as an initial flag sets certain defaults */
10506 if (UCHARAT(RExC_parse) == '^') {
10507 RExC_parse++;
10508 has_use_defaults = TRUE;
10509 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10510 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
10511 ? REGEX_UNICODE_CHARSET
10512 : REGEX_DEPENDS_CHARSET);
10513 }
10514
10515 cs = get_regex_charset(RExC_flags);
10516 if (cs == REGEX_DEPENDS_CHARSET
10517 && (RExC_utf8 || RExC_uni_semantics))
10518 {
10519 cs = REGEX_UNICODE_CHARSET;
10520 }
10521
10522 while (RExC_parse < RExC_end) {
10523 /* && strchr("iogcmsx", *RExC_parse) */
10524 /* (?g), (?gc) and (?o) are useless here
10525 and must be globally applied -- japhy */
10526 switch (*RExC_parse) {
10527
10528 /* Code for the imsxn flags */
10529 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10530
10531 case LOCALE_PAT_MOD:
10532 if (has_charset_modifier) {
10533 goto excess_modifier;
10534 }
10535 else if (flagsp == &negflags) {
10536 goto neg_modifier;
10537 }
10538 cs = REGEX_LOCALE_CHARSET;
10539 has_charset_modifier = LOCALE_PAT_MOD;
10540 break;
10541 case UNICODE_PAT_MOD:
10542 if (has_charset_modifier) {
10543 goto excess_modifier;
10544 }
10545 else if (flagsp == &negflags) {
10546 goto neg_modifier;
10547 }
10548 cs = REGEX_UNICODE_CHARSET;
10549 has_charset_modifier = UNICODE_PAT_MOD;
10550 break;
10551 case ASCII_RESTRICT_PAT_MOD:
10552 if (flagsp == &negflags) {
10553 goto neg_modifier;
10554 }
10555 if (has_charset_modifier) {
10556 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10557 goto excess_modifier;
10558 }
10559 /* Doubled modifier implies more restricted */
10560 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10561 }
10562 else {
10563 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10564 }
10565 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10566 break;
10567 case DEPENDS_PAT_MOD:
10568 if (has_use_defaults) {
10569 goto fail_modifiers;
10570 }
10571 else if (flagsp == &negflags) {
10572 goto neg_modifier;
10573 }
10574 else if (has_charset_modifier) {
10575 goto excess_modifier;
10576 }
10577
10578 /* The dual charset means unicode semantics if the
10579 * pattern (or target, not known until runtime) are
10580 * utf8, or something in the pattern indicates unicode
10581 * semantics */
10582 cs = (RExC_utf8 || RExC_uni_semantics)
10583 ? REGEX_UNICODE_CHARSET
10584 : REGEX_DEPENDS_CHARSET;
10585 has_charset_modifier = DEPENDS_PAT_MOD;
10586 break;
10587 excess_modifier:
10588 RExC_parse++;
10589 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10590 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10591 }
10592 else if (has_charset_modifier == *(RExC_parse - 1)) {
10593 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10594 *(RExC_parse - 1));
10595 }
10596 else {
10597 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10598 }
10599 NOT_REACHED; /*NOTREACHED*/
10600 neg_modifier:
10601 RExC_parse++;
10602 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10603 *(RExC_parse - 1));
10604 NOT_REACHED; /*NOTREACHED*/
10605 case ONCE_PAT_MOD: /* 'o' */
10606 case GLOBAL_PAT_MOD: /* 'g' */
10607 if (PASS2 && ckWARN(WARN_REGEXP)) {
10608 const I32 wflagbit = *RExC_parse == 'o'
10609 ? WASTED_O
10610 : WASTED_G;
10611 if (! (wastedflags & wflagbit) ) {
10612 wastedflags |= wflagbit;
10613 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10614 vWARN5(
10615 RExC_parse + 1,
10616 "Useless (%s%c) - %suse /%c modifier",
10617 flagsp == &negflags ? "?-" : "?",
10618 *RExC_parse,
10619 flagsp == &negflags ? "don't " : "",
10620 *RExC_parse
10621 );
10622 }
10623 }
10624 break;
10625
10626 case CONTINUE_PAT_MOD: /* 'c' */
10627 if (PASS2 && ckWARN(WARN_REGEXP)) {
10628 if (! (wastedflags & WASTED_C) ) {
10629 wastedflags |= WASTED_GC;
10630 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10631 vWARN3(
10632 RExC_parse + 1,
10633 "Useless (%sc) - %suse /gc modifier",
10634 flagsp == &negflags ? "?-" : "?",
10635 flagsp == &negflags ? "don't " : ""
10636 );
10637 }
10638 }
10639 break;
10640 case KEEPCOPY_PAT_MOD: /* 'p' */
10641 if (flagsp == &negflags) {
10642 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10643 } else {
10644 *flagsp |= RXf_PMf_KEEPCOPY;
10645 }
10646 break;
10647 case '-':
10648 /* A flag is a default iff it is following a minus, so
10649 * if there is a minus, it means will be trying to
10650 * re-specify a default which is an error */
10651 if (has_use_defaults || flagsp == &negflags) {
10652 goto fail_modifiers;
10653 }
10654 flagsp = &negflags;
10655 wastedflags = 0; /* reset so (?g-c) warns twice */
10656 x_mod_count = 0;
10657 break;
10658 case ':':
10659 case ')':
10660
10661 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10662 negflags |= RXf_PMf_EXTENDED_MORE;
10663 }
10664 RExC_flags |= posflags;
10665
10666 if (negflags & RXf_PMf_EXTENDED) {
10667 negflags |= RXf_PMf_EXTENDED_MORE;
10668 }
10669 RExC_flags &= ~negflags;
10670 set_regex_charset(&RExC_flags, cs);
10671
10672 return;
10673 default:
10674 fail_modifiers:
10675 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10676 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10677 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10678 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10679 NOT_REACHED; /*NOTREACHED*/
10680 }
10681
10682 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10683 }
10684
10685 vFAIL("Sequence (?... not terminated");
10686}
10687
10688/*
10689 - reg - regular expression, i.e. main body or parenthesized thing
10690 *
10691 * Caller must absorb opening parenthesis.
10692 *
10693 * Combining parenthesis handling with the base level of regular expression
10694 * is a trifle forced, but the need to tie the tails of the branches to what
10695 * follows makes it hard to avoid.
10696 */
10697#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10698#ifdef DEBUGGING
10699#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10700#else
10701#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10702#endif
10703
10704PERL_STATIC_INLINE regnode_offset
10705S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10706 I32 *flagp,
10707 char * parse_start,
10708 char ch
10709 )
10710{
10711 regnode_offset ret;
10712 char* name_start = RExC_parse;
10713 U32 num = 0;
10714 SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
10715 ? REG_RSN_RETURN_NULL
10716 : REG_RSN_RETURN_DATA);
10717 GET_RE_DEBUG_FLAGS_DECL;
10718
10719 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10720
10721 if (RExC_parse == name_start || *RExC_parse != ch) {
10722 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10723 vFAIL2("Sequence %.3s... not terminated", parse_start);
10724 }
10725
10726 if (sv_dat) {
10727 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10728 RExC_rxi->data->data[num]=(void*)sv_dat;
10729 SvREFCNT_inc_simple_void_NN(sv_dat);
10730 }
10731 RExC_sawback = 1;
10732 ret = reganode(pRExC_state,
10733 ((! FOLD)
10734 ? NREF
10735 : (ASCII_FOLD_RESTRICTED)
10736 ? NREFFA
10737 : (AT_LEAST_UNI_SEMANTICS)
10738 ? NREFFU
10739 : (LOC)
10740 ? NREFFL
10741 : NREFF),
10742 num);
10743 *flagp |= HASWIDTH;
10744
10745 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
10746 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
10747
10748 nextchar(pRExC_state);
10749 return ret;
10750}
10751
10752/* On success, returns the offset at which any next node should be placed into
10753 * the regex engine program being compiled.
10754 *
10755 * Returns 0 otherwise, with *flagp set to indicate why:
10756 * TRYAGAIN at the end of (?) that only sets flags.
10757 * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
10758 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
10759 * Otherwise would only return 0 if regbranch() returns 0, which cannot
10760 * happen. */
10761STATIC regnode_offset
10762S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
10763 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10764 * 2 is like 1, but indicates that nextchar() has been called to advance
10765 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
10766 * this flag alerts us to the need to check for that */
10767{
10768 regnode_offset ret = 0; /* Will be the head of the group. */
10769 regnode_offset br;
10770 regnode_offset lastbr;
10771 regnode_offset ender = 0;
10772 I32 parno = 0;
10773 I32 flags;
10774 U32 oregflags = RExC_flags;
10775 bool have_branch = 0;
10776 bool is_open = 0;
10777 I32 freeze_paren = 0;
10778 I32 after_freeze = 0;
10779 I32 num; /* numeric backreferences */
10780
10781 char * parse_start = RExC_parse; /* MJD */
10782 char * const oregcomp_parse = RExC_parse;
10783
10784 GET_RE_DEBUG_FLAGS_DECL;
10785
10786 PERL_ARGS_ASSERT_REG;
10787 DEBUG_PARSE("reg ");
10788
10789 *flagp = 0; /* Tentatively. */
10790
10791 /* Having this true makes it feasible to have a lot fewer tests for the
10792 * parse pointer being in scope. For example, we can write
10793 * while(isFOO(*RExC_parse)) RExC_parse++;
10794 * instead of
10795 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10796 */
10797 assert(*RExC_end == '\0');
10798
10799 /* Make an OPEN node, if parenthesized. */
10800 if (paren) {
10801
10802 /* Under /x, space and comments can be gobbled up between the '(' and
10803 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
10804 * intervening space, as the sequence is a token, and a token should be
10805 * indivisible */
10806 bool has_intervening_patws = (paren == 2)
10807 && *(RExC_parse - 1) != '(';
10808
10809 if (RExC_parse >= RExC_end) {
10810 vFAIL("Unmatched (");
10811 }
10812
10813 if (paren == 'r') { /* Atomic script run */
10814 paren = '>';
10815 goto parse_rest;
10816 }
10817 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
10818 char *start_verb = RExC_parse + 1;
10819 STRLEN verb_len;
10820 char *start_arg = NULL;
10821 unsigned char op = 0;
10822 int arg_required = 0;
10823 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10824 bool has_upper = FALSE;
10825
10826 if (has_intervening_patws) {
10827 RExC_parse++; /* past the '*' */
10828
10829 /* For strict backwards compatibility, don't change the message
10830 * now that we also have lowercase operands */
10831 if (isUPPER(*RExC_parse)) {
10832 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10833 }
10834 else {
10835 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
10836 }
10837 }
10838 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10839 if ( *RExC_parse == ':' ) {
10840 start_arg = RExC_parse + 1;
10841 break;
10842 }
10843 else if (! UTF) {
10844 if (isUPPER(*RExC_parse)) {
10845 has_upper = TRUE;
10846 }
10847 RExC_parse++;
10848 }
10849 else {
10850 RExC_parse += UTF8SKIP(RExC_parse);
10851 }
10852 }
10853 verb_len = RExC_parse - start_verb;
10854 if ( start_arg ) {
10855 if (RExC_parse >= RExC_end) {
10856 goto unterminated_verb_pattern;
10857 }
10858
10859 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10860 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
10861 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10862 }
10863 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
10864 unterminated_verb_pattern:
10865 if (has_upper) {
10866 vFAIL("Unterminated verb pattern argument");
10867 }
10868 else {
10869 vFAIL("Unterminated '(*...' argument");
10870 }
10871 }
10872 } else {
10873 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
10874 if (has_upper) {
10875 vFAIL("Unterminated verb pattern");
10876 }
10877 else {
10878 vFAIL("Unterminated '(*...' construct");
10879 }
10880 }
10881 }
10882
10883 /* Here, we know that RExC_parse < RExC_end */
10884
10885 switch ( *start_verb ) {
10886 case 'A': /* (*ACCEPT) */
10887 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
10888 op = ACCEPT;
10889 internal_argval = RExC_nestroot;
10890 }
10891 break;
10892 case 'C': /* (*COMMIT) */
10893 if ( memEQs(start_verb, verb_len,"COMMIT") )
10894 op = COMMIT;
10895 break;
10896 case 'F': /* (*FAIL) */
10897 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
10898 op = OPFAIL;
10899 }
10900 break;
10901 case ':': /* (*:NAME) */
10902 case 'M': /* (*MARK:NAME) */
10903 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
10904 op = MARKPOINT;
10905 arg_required = 1;
10906 }
10907 break;
10908 case 'P': /* (*PRUNE) */
10909 if ( memEQs(start_verb, verb_len,"PRUNE") )
10910 op = PRUNE;
10911 break;
10912 case 'S': /* (*SKIP) */
10913 if ( memEQs(start_verb, verb_len,"SKIP") )
10914 op = SKIP;
10915 break;
10916 case 'T': /* (*THEN) */
10917 /* [19:06] <TimToady> :: is then */
10918 if ( memEQs(start_verb, verb_len,"THEN") ) {
10919 op = CUTGROUP;
10920 RExC_seen |= REG_CUTGROUP_SEEN;
10921 }
10922 break;
10923 case 'a':
10924 if ( memEQs(start_verb, verb_len, "asr")
10925 || memEQs(start_verb, verb_len, "atomic_script_run"))
10926 {
10927 paren = 'r'; /* Mnemonic: recursed run */
10928 goto script_run;
10929 }
10930 else if (memEQs(start_verb, verb_len, "atomic")) {
10931 paren = 't'; /* AtOMIC */
10932 goto alpha_assertions;
10933 }
10934 break;
10935 case 'p':
10936 if ( memEQs(start_verb, verb_len, "plb")
10937 || memEQs(start_verb, verb_len, "positive_lookbehind"))
10938 {
10939 paren = 'b';
10940 goto lookbehind_alpha_assertions;
10941 }
10942 else if ( memEQs(start_verb, verb_len, "pla")
10943 || memEQs(start_verb, verb_len, "positive_lookahead"))
10944 {
10945 paren = 'a';
10946 goto alpha_assertions;
10947 }
10948 break;
10949 case 'n':
10950 if ( memEQs(start_verb, verb_len, "nlb")
10951 || memEQs(start_verb, verb_len, "negative_lookbehind"))
10952 {
10953 paren = 'B';
10954 goto lookbehind_alpha_assertions;
10955 }
10956 else if ( memEQs(start_verb, verb_len, "nla")
10957 || memEQs(start_verb, verb_len, "negative_lookahead"))
10958 {
10959 paren = 'A';
10960 goto alpha_assertions;
10961 }
10962 break;
10963 case 's':
10964 if ( memEQs(start_verb, verb_len, "sr")
10965 || memEQs(start_verb, verb_len, "script_run"))
10966 {
10967 regnode_offset atomic;
10968
10969 paren = 's';
10970
10971 script_run:
10972
10973 /* This indicates Unicode rules. */
10974 REQUIRE_UNI_RULES(flagp, 0);
10975
10976 if (! start_arg) {
10977 goto no_colon;
10978 }
10979
10980 RExC_parse = start_arg;
10981
10982 if (RExC_in_script_run) {
10983
10984 /* Nested script runs are treated as no-ops, because
10985 * if the nested one fails, the outer one must as
10986 * well. It could fail sooner, and avoid (??{} with
10987 * side effects, but that is explicitly documented as
10988 * undefined behavior. */
10989
10990 ret = 0;
10991
10992 if (paren == 's') {
10993 paren = ':';
10994 goto parse_rest;
10995 }
10996
10997 /* But, the atomic part of a nested atomic script run
10998 * isn't a no-op, but can be treated just like a '(?>'
10999 * */
11000 paren = '>';
11001 goto parse_rest;
11002 }
11003
11004 /* By doing this here, we avoid extra warnings for nested
11005 * script runs */
11006 ckWARNexperimental(RExC_parse,
11007 WARN_EXPERIMENTAL__SCRIPT_RUN,
11008 "The script_run feature is experimental");
11009
11010 if (paren == 's') {
11011 /* Here, we're starting a new regular script run */
11012 ret = reg_node(pRExC_state, SROPEN);
11013 RExC_in_script_run = 1;
11014 is_open = 1;
11015 goto parse_rest;
11016 }
11017
11018 /* Here, we are starting an atomic script run. This is
11019 * handled by recursing to deal with the atomic portion
11020 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11021
11022 ret = reg_node(pRExC_state, SROPEN);
11023
11024 RExC_in_script_run = 1;
11025
11026 atomic = reg(pRExC_state, 'r', &flags, depth);
11027 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11028 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11029 return 0;
11030 }
11031
11032 REGTAIL(pRExC_state, ret, atomic);
11033
11034 REGTAIL(pRExC_state, atomic,
11035 reg_node(pRExC_state, SRCLOSE));
11036
11037 RExC_in_script_run = 0;
11038 return ret;
11039 }
11040
11041 break;
11042
11043 lookbehind_alpha_assertions:
11044 RExC_seen |= REG_LOOKBEHIND_SEEN;
11045 RExC_in_lookbehind++;
11046 /*FALLTHROUGH*/
11047
11048 alpha_assertions:
11049 ckWARNexperimental(RExC_parse,
11050 WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11051 "The alpha_assertions feature is experimental");
11052
11053 RExC_seen_zerolen++;
11054
11055 if (! start_arg) {
11056 goto no_colon;
11057 }
11058
11059 /* An empty negative lookahead assertion simply is failure */
11060 if (paren == 'A' && RExC_parse == start_arg) {
11061 ret=reganode(pRExC_state, OPFAIL, 0);
11062 nextchar(pRExC_state);
11063 return ret;
11064 }
11065
11066 RExC_parse = start_arg;
11067 goto parse_rest;
11068
11069 no_colon:
11070 vFAIL2utf8f(
11071 "'(*%" UTF8f "' requires a terminating ':'",
11072 UTF8fARG(UTF, verb_len, start_verb));
11073 NOT_REACHED; /*NOTREACHED*/
11074
11075 } /* End of switch */
11076 if ( ! op ) {
11077 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11078 if (has_upper || verb_len == 0) {
11079 vFAIL2utf8f(
11080 "Unknown verb pattern '%" UTF8f "'",
11081 UTF8fARG(UTF, verb_len, start_verb));
11082 }
11083 else {
11084 vFAIL2utf8f(
11085 "Unknown '(*...)' construct '%" UTF8f "'",
11086 UTF8fARG(UTF, verb_len, start_verb));
11087 }
11088 }
11089 if ( RExC_parse == start_arg ) {
11090 start_arg = NULL;
11091 }
11092 if ( arg_required && !start_arg ) {
11093 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11094 verb_len, start_verb);
11095 }
11096 if (internal_argval == -1) {
11097 ret = reganode(pRExC_state, op, 0);
11098 } else {
11099 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11100 }
11101 RExC_seen |= REG_VERBARG_SEEN;
11102 if ( ! SIZE_ONLY ) {
11103 if (start_arg) {
11104 SV *sv = newSVpvn( start_arg,
11105 RExC_parse - start_arg);
11106 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11107 STR_WITH_LEN("S"));
11108 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11109 FLAGS(REGNODE_p(ret)) = 1;
11110 } else {
11111 FLAGS(REGNODE_p(ret)) = 0;
11112 }
11113 if ( internal_argval != -1 )
11114 ARG2L_SET(REGNODE_p(ret), internal_argval);
11115 }
11116 nextchar(pRExC_state);
11117 return ret;
11118 }
11119 else if (*RExC_parse == '?') { /* (?...) */
11120 bool is_logical = 0;
11121 const char * const seqstart = RExC_parse;
11122 const char * endptr;
11123 if (has_intervening_patws) {
11124 RExC_parse++;
11125 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11126 }
11127
11128 RExC_parse++; /* past the '?' */
11129 paren = *RExC_parse; /* might be a trailing NUL, if not
11130 well-formed */
11131 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11132 if (RExC_parse > RExC_end) {
11133 paren = '\0';
11134 }
11135 ret = 0; /* For look-ahead/behind. */
11136 switch (paren) {
11137
11138 case 'P': /* (?P...) variants for those used to PCRE/Python */
11139 paren = *RExC_parse;
11140 if ( paren == '<') { /* (?P<...>) named capture */
11141 RExC_parse++;
11142 if (RExC_parse >= RExC_end) {
11143 vFAIL("Sequence (?P<... not terminated");
11144 }
11145 goto named_capture;
11146 }
11147 else if (paren == '>') { /* (?P>name) named recursion */
11148 RExC_parse++;
11149 if (RExC_parse >= RExC_end) {
11150 vFAIL("Sequence (?P>... not terminated");
11151 }
11152 goto named_recursion;
11153 }
11154 else if (paren == '=') { /* (?P=...) named backref */
11155 RExC_parse++;
11156 return handle_named_backref(pRExC_state, flagp,
11157 parse_start, ')');
11158 }
11159 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11160 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11161 vFAIL3("Sequence (%.*s...) not recognized",
11162 RExC_parse-seqstart, seqstart);
11163 NOT_REACHED; /*NOTREACHED*/
11164 case '<': /* (?<...) */
11165 if (*RExC_parse == '!')
11166 paren = ',';
11167 else if (*RExC_parse != '=')
11168 named_capture:
11169 { /* (?<...>) */
11170 char *name_start;
11171 SV *svname;
11172 paren= '>';
11173 /* FALLTHROUGH */
11174 case '\'': /* (?'...') */
11175 name_start = RExC_parse;
11176 svname = reg_scan_name(pRExC_state,
11177 SIZE_ONLY /* reverse test from the others */
11178 ? REG_RSN_RETURN_NAME
11179 : REG_RSN_RETURN_NULL);
11180 if ( RExC_parse == name_start
11181 || RExC_parse >= RExC_end
11182 || *RExC_parse != paren)
11183 {
11184 vFAIL2("Sequence (?%c... not terminated",
11185 paren=='>' ? '<' : paren);
11186 }
11187 if (SIZE_ONLY) {
11188 HE *he_str;
11189 SV *sv_dat = NULL;
11190 if (!svname) /* shouldn't happen */
11191 Perl_croak(aTHX_
11192 "panic: reg_scan_name returned NULL");
11193 if (!RExC_paren_names) {
11194 RExC_paren_names= newHV();
11195 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11196#ifdef DEBUGGING
11197 RExC_paren_name_list= newAV();
11198 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11199#endif
11200 }
11201 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11202 if ( he_str )
11203 sv_dat = HeVAL(he_str);
11204 if ( ! sv_dat ) {
11205 /* croak baby croak */
11206 Perl_croak(aTHX_
11207 "panic: paren_name hash element allocation failed");
11208 } else if ( SvPOK(sv_dat) ) {
11209 /* (?|...) can mean we have dupes so scan to check
11210 its already been stored. Maybe a flag indicating
11211 we are inside such a construct would be useful,
11212 but the arrays are likely to be quite small, so
11213 for now we punt -- dmq */
11214 IV count = SvIV(sv_dat);
11215 I32 *pv = (I32*)SvPVX(sv_dat);
11216 IV i;
11217 for ( i = 0 ; i < count ; i++ ) {
11218 if ( pv[i] == RExC_npar ) {
11219 count = 0;
11220 break;
11221 }
11222 }
11223 if ( count ) {
11224 pv = (I32*)SvGROW(sv_dat,
11225 SvCUR(sv_dat) + sizeof(I32)+1);
11226 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11227 pv[count] = RExC_npar;
11228 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11229 }
11230 } else {
11231 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11232 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11233 sizeof(I32));
11234 SvIOK_on(sv_dat);
11235 SvIV_set(sv_dat, 1);
11236 }
11237#ifdef DEBUGGING
11238 /* Yes this does cause a memory leak in debugging Perls
11239 * */
11240 if (!av_store(RExC_paren_name_list,
11241 RExC_npar, SvREFCNT_inc_NN(svname)))
11242 SvREFCNT_dec_NN(svname);
11243#endif
11244
11245 /*sv_dump(sv_dat);*/
11246 }
11247 nextchar(pRExC_state);
11248 paren = 1;
11249 goto capturing_parens;
11250 }
11251
11252 RExC_seen |= REG_LOOKBEHIND_SEEN;
11253 RExC_in_lookbehind++;
11254 RExC_parse++;
11255 if (RExC_parse >= RExC_end) {
11256 vFAIL("Sequence (?... not terminated");
11257 }
11258
11259 /* FALLTHROUGH */
11260 case '=': /* (?=...) */
11261 RExC_seen_zerolen++;
11262 break;
11263 case '!': /* (?!...) */
11264 RExC_seen_zerolen++;
11265 /* check if we're really just a "FAIL" assertion */
11266 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11267 FALSE /* Don't force to /x */ );
11268 if (*RExC_parse == ')') {
11269 ret=reganode(pRExC_state, OPFAIL, 0);
11270 nextchar(pRExC_state);
11271 return ret;
11272 }
11273 break;
11274 case '|': /* (?|...) */
11275 /* branch reset, behave like a (?:...) except that
11276 buffers in alternations share the same numbers */
11277 paren = ':';
11278 after_freeze = freeze_paren = RExC_npar;
11279 break;
11280 case ':': /* (?:...) */
11281 case '>': /* (?>...) */
11282 break;
11283 case '$': /* (?$...) */
11284 case '@': /* (?@...) */
11285 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11286 break;
11287 case '0' : /* (?0) */
11288 case 'R' : /* (?R) */
11289 if (RExC_parse == RExC_end || *RExC_parse != ')')
11290 FAIL("Sequence (?R) not terminated");
11291 num = 0;
11292 RExC_seen |= REG_RECURSE_SEEN;
11293 *flagp |= POSTPONED;
11294 goto gen_recurse_regop;
11295 /*notreached*/
11296 /* named and numeric backreferences */
11297 case '&': /* (?&NAME) */
11298 parse_start = RExC_parse - 1;
11299 named_recursion:
11300 {
11301 SV *sv_dat = reg_scan_name(pRExC_state,
11302 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11303 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11304 }
11305 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11306 vFAIL("Sequence (?&... not terminated");
11307 goto gen_recurse_regop;
11308 /* NOTREACHED */
11309 case '+':
11310 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11311 RExC_parse++;
11312 vFAIL("Illegal pattern");
11313 }
11314 goto parse_recursion;
11315 /* NOTREACHED*/
11316 case '-': /* (?-1) */
11317 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11318 RExC_parse--; /* rewind to let it be handled later */
11319 goto parse_flags;
11320 }
11321 /* FALLTHROUGH */
11322 case '1': case '2': case '3': case '4': /* (?1) */
11323 case '5': case '6': case '7': case '8': case '9':
11324 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11325 parse_recursion:
11326 {
11327 bool is_neg = FALSE;
11328 UV unum;
11329 parse_start = RExC_parse - 1; /* MJD */
11330 if (*RExC_parse == '-') {
11331 RExC_parse++;
11332 is_neg = TRUE;
11333 }
11334 endptr = RExC_end;
11335 if (grok_atoUV(RExC_parse, &unum, &endptr)
11336 && unum <= I32_MAX
11337 ) {
11338 num = (I32)unum;
11339 RExC_parse = (char*)endptr;
11340 } else
11341 num = I32_MAX;
11342 if (is_neg) {
11343 /* Some limit for num? */
11344 num = -num;
11345 }
11346 }
11347 if (*RExC_parse!=')')
11348 vFAIL("Expecting close bracket");
11349
11350 gen_recurse_regop:
11351 if ( paren == '-' ) {
11352 /*
11353 Diagram of capture buffer numbering.
11354 Top line is the normal capture buffer numbers
11355 Bottom line is the negative indexing as from
11356 the X (the (?-2))
11357
11358 + 1 2 3 4 5 X 6 7
11359 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11360 - 5 4 3 2 1 X x x
11361
11362 */
11363 num = RExC_npar + num;
11364 if (num < 1) {
11365 RExC_parse++;
11366 vFAIL("Reference to nonexistent group");
11367 }
11368 } else if ( paren == '+' ) {
11369 num = RExC_npar + num - 1;
11370 }
11371 /* We keep track how many GOSUB items we have produced.
11372 To start off the ARG2L() of the GOSUB holds its "id",
11373 which is used later in conjunction with RExC_recurse
11374 to calculate the offset we need to jump for the GOSUB,
11375 which it will store in the final representation.
11376 We have to defer the actual calculation until much later
11377 as the regop may move.
11378 */
11379
11380 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11381 if (!SIZE_ONLY) {
11382 if (num > (I32)RExC_rx->nparens) {
11383 RExC_parse++;
11384 vFAIL("Reference to nonexistent group");
11385 }
11386 RExC_recurse_count++;
11387 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11388 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11389 22, "| |", (int)(depth * 2 + 1), "",
11390 (UV)ARG(REGNODE_p(ret)),
11391 (IV)ARG2L(REGNODE_p(ret))));
11392 }
11393 RExC_seen |= REG_RECURSE_SEEN;
11394
11395 Set_Node_Length(REGNODE_p(ret),
11396 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11397 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11398
11399 *flagp |= POSTPONED;
11400 assert(*RExC_parse == ')');
11401 nextchar(pRExC_state);
11402 return ret;
11403
11404 /* NOTREACHED */
11405
11406 case '?': /* (??...) */
11407 is_logical = 1;
11408 if (*RExC_parse != '{') {
11409 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11410 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11411 vFAIL2utf8f(
11412 "Sequence (%" UTF8f "...) not recognized",
11413 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11414 NOT_REACHED; /*NOTREACHED*/
11415 }
11416 *flagp |= POSTPONED;
11417 paren = '{';
11418 RExC_parse++;
11419 /* FALLTHROUGH */
11420 case '{': /* (?{...}) */
11421 {
11422 U32 n = 0;
11423 struct reg_code_block *cb;
11424 OP * o;
11425
11426 RExC_seen_zerolen++;
11427
11428 if ( !pRExC_state->code_blocks
11429 || pRExC_state->code_index
11430 >= pRExC_state->code_blocks->count
11431 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11432 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11433 - RExC_start)
11434 ) {
11435 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11436 FAIL("panic: Sequence (?{...}): no code block found\n");
11437 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11438 }
11439 /* this is a pre-compiled code block (?{...}) */
11440 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11441 RExC_parse = RExC_start + cb->end;
11442 if (!SIZE_ONLY) {
11443 o = cb->block;
11444 if (cb->src_regex) {
11445 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11446 RExC_rxi->data->data[n] =
11447 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11448 RExC_rxi->data->data[n+1] = (void*)o;
11449 }
11450 else {
11451 n = add_data(pRExC_state,
11452 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11453 RExC_rxi->data->data[n] = (void*)o;
11454 }
11455 }
11456 pRExC_state->code_index++;
11457 nextchar(pRExC_state);
11458
11459 if (is_logical) {
11460 regnode_offset eval;
11461 ret = reg_node(pRExC_state, LOGICAL);
11462
11463 eval = reg2Lanode(pRExC_state, EVAL,
11464 n,
11465
11466 /* for later propagation into (??{})
11467 * return value */
11468 RExC_flags & RXf_PMf_COMPILETIME
11469 );
11470 if (!SIZE_ONLY) {
11471 FLAGS(REGNODE_p(ret)) = 2;
11472 }
11473 REGTAIL(pRExC_state, ret, eval);
11474 /* deal with the length of this later - MJD */
11475 return ret;
11476 }
11477 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11478 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11479 Set_Node_Offset(REGNODE_p(ret), parse_start);
11480 return ret;
11481 }
11482 case '(': /* (?(?{...})...) and (?(?=...)...) */
11483 {
11484 int is_define= 0;
11485 const int DEFINE_len = sizeof("DEFINE") - 1;
11486 if ( RExC_parse < RExC_end - 1
11487 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11488 && ( RExC_parse[1] == '='
11489 || RExC_parse[1] == '!'
11490 || RExC_parse[1] == '<'
11491 || RExC_parse[1] == '{'))
11492 || ( RExC_parse[0] == '*' /* (?(*...)) */
11493 && ( memBEGINs(RExC_parse + 1,
11494 (Size_t) (RExC_end - (RExC_parse + 1)),
11495 "pla:")
11496 || memBEGINs(RExC_parse + 1,
11497 (Size_t) (RExC_end - (RExC_parse + 1)),
11498 "plb:")
11499 || memBEGINs(RExC_parse + 1,
11500 (Size_t) (RExC_end - (RExC_parse + 1)),
11501 "nla:")
11502 || memBEGINs(RExC_parse + 1,
11503 (Size_t) (RExC_end - (RExC_parse + 1)),
11504 "nlb:")
11505 || memBEGINs(RExC_parse + 1,
11506 (Size_t) (RExC_end - (RExC_parse + 1)),
11507 "positive_lookahead:")
11508 || memBEGINs(RExC_parse + 1,
11509 (Size_t) (RExC_end - (RExC_parse + 1)),
11510 "positive_lookbehind:")
11511 || memBEGINs(RExC_parse + 1,
11512 (Size_t) (RExC_end - (RExC_parse + 1)),
11513 "negative_lookahead:")
11514 || memBEGINs(RExC_parse + 1,
11515 (Size_t) (RExC_end - (RExC_parse + 1)),
11516 "negative_lookbehind:"))))
11517 ) { /* Lookahead or eval. */
11518 I32 flag;
11519 regnode_offset tail;
11520
11521 ret = reg_node(pRExC_state, LOGICAL);
11522 if (!SIZE_ONLY)
11523 FLAGS(REGNODE_p(ret)) = 1;
11524
11525 tail = reg(pRExC_state, 1, &flag, depth+1);
11526 RETURN_FAIL_ON_RESTART(flag, flagp);
11527 REGTAIL(pRExC_state, ret, tail);
11528 goto insert_if;
11529 }
11530 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11531 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11532 {
11533 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11534 char *name_start= RExC_parse++;
11535 U32 num = 0;
11536 SV *sv_dat=reg_scan_name(pRExC_state,
11537 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11538 if ( RExC_parse == name_start
11539 || RExC_parse >= RExC_end
11540 || *RExC_parse != ch)
11541 {
11542 vFAIL2("Sequence (?(%c... not terminated",
11543 (ch == '>' ? '<' : ch));
11544 }
11545 RExC_parse++;
11546 if (sv_dat) {
11547 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11548 RExC_rxi->data->data[num]=(void*)sv_dat;
11549 SvREFCNT_inc_simple_void_NN(sv_dat);
11550 }
11551 ret = reganode(pRExC_state, NGROUPP, num);
11552 goto insert_if_check_paren;
11553 }
11554 else if (memBEGINs(RExC_parse,
11555 (STRLEN) (RExC_end - RExC_parse),
11556 "DEFINE"))
11557 {
11558 ret = reganode(pRExC_state, DEFINEP, 0);
11559 RExC_parse += DEFINE_len;
11560 is_define = 1;
11561 goto insert_if_check_paren;
11562 }
11563 else if (RExC_parse[0] == 'R') {
11564 RExC_parse++;
11565 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11566 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11567 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11568 */
11569 parno = 0;
11570 if (RExC_parse[0] == '0') {
11571 parno = 1;
11572 RExC_parse++;
11573 }
11574 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11575 UV uv;
11576 endptr = RExC_end;
11577 if (grok_atoUV(RExC_parse, &uv, &endptr)
11578 && uv <= I32_MAX
11579 ) {
11580 parno = (I32)uv + 1;
11581 RExC_parse = (char*)endptr;
11582 }
11583 /* else "Switch condition not recognized" below */
11584 } else if (RExC_parse[0] == '&') {
11585 SV *sv_dat;
11586 RExC_parse++;
11587 sv_dat = reg_scan_name(pRExC_state,
11588 SIZE_ONLY
11589 ? REG_RSN_RETURN_NULL
11590 : REG_RSN_RETURN_DATA);
11591
11592 /* we should only have a false sv_dat when
11593 * SIZE_ONLY is true, and we always have false
11594 * sv_dat when SIZE_ONLY is true.
11595 * reg_scan_name() will VFAIL() if the name is
11596 * unknown when SIZE_ONLY is false, and otherwise
11597 * will return something, and when SIZE_ONLY is
11598 * true, reg_scan_name() just parses the string,
11599 * and doesnt return anything. (in theory) */
11600 assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
11601
11602 if (sv_dat)
11603 parno = 1 + *((I32 *)SvPVX(sv_dat));
11604 }
11605 ret = reganode(pRExC_state, INSUBP, parno);
11606 goto insert_if_check_paren;
11607 }
11608 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11609 /* (?(1)...) */
11610 char c;
11611 UV uv;
11612 endptr = RExC_end;
11613 if (grok_atoUV(RExC_parse, &uv, &endptr)
11614 && uv <= I32_MAX
11615 ) {
11616 parno = (I32)uv;
11617 RExC_parse = (char*)endptr;
11618 }
11619 else {
11620 vFAIL("panic: grok_atoUV returned FALSE");
11621 }
11622 ret = reganode(pRExC_state, GROUPP, parno);
11623
11624 insert_if_check_paren:
11625 if (UCHARAT(RExC_parse) != ')') {
11626 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11627 vFAIL("Switch condition not recognized");
11628 }
11629 nextchar(pRExC_state);
11630 insert_if:
11631 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11632 br = regbranch(pRExC_state, &flags, 1, depth+1);
11633 if (br == 0) {
11634 RETURN_FAIL_ON_RESTART(flags,flagp);
11635 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11636 (UV) flags);
11637 } else
11638 REGTAIL(pRExC_state, br, reganode(pRExC_state,
11639 LONGJMP, 0));
11640 c = UCHARAT(RExC_parse);
11641 nextchar(pRExC_state);
11642 if (flags&HASWIDTH)
11643 *flagp |= HASWIDTH;
11644 if (c == '|') {
11645 if (is_define)
11646 vFAIL("(?(DEFINE)....) does not allow branches");
11647
11648 /* Fake one for optimizer. */
11649 lastbr = reganode(pRExC_state, IFTHEN, 0);
11650
11651 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11652 RETURN_FAIL_ON_RESTART(flags, flagp);
11653 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11654 (UV) flags);
11655 }
11656 REGTAIL(pRExC_state, ret, lastbr);
11657 if (flags&HASWIDTH)
11658 *flagp |= HASWIDTH;
11659 c = UCHARAT(RExC_parse);
11660 nextchar(pRExC_state);
11661 }
11662 else
11663 lastbr = 0;
11664 if (c != ')') {
11665 if (RExC_parse >= RExC_end)
11666 vFAIL("Switch (?(condition)... not terminated");
11667 else
11668 vFAIL("Switch (?(condition)... contains too many branches");
11669 }
11670 ender = reg_node(pRExC_state, TAIL);
11671 REGTAIL(pRExC_state, br, ender);
11672 if (lastbr) {
11673 REGTAIL(pRExC_state, lastbr, ender);
11674 REGTAIL(pRExC_state, REGNODE_OFFSET(
11675 NEXTOPER(
11676 NEXTOPER(REGNODE_p(lastbr)))),
11677 ender);
11678 }
11679 else
11680 REGTAIL(pRExC_state, ret, ender);
11681 RExC_size++; /* XXX WHY do we need this?!!
11682 For large programs it seems to be required
11683 but I can't figure out why. -- dmq*/
11684 return ret;
11685 }
11686 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11687 vFAIL("Unknown switch condition (?(...))");
11688 }
11689 case '[': /* (?[ ... ]) */
11690 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11691 oregcomp_parse);
11692 case 0: /* A NUL */
11693 RExC_parse--; /* for vFAIL to print correctly */
11694 vFAIL("Sequence (? incomplete");
11695 break;
11696 default: /* e.g., (?i) */
11697 RExC_parse = (char *) seqstart + 1;
11698 parse_flags:
11699 parse_lparen_question_flags(pRExC_state);
11700 if (UCHARAT(RExC_parse) != ':') {
11701 if (RExC_parse < RExC_end)
11702 nextchar(pRExC_state);
11703 *flagp = TRYAGAIN;
11704 return 0;
11705 }
11706 paren = ':';
11707 nextchar(pRExC_state);
11708 ret = 0;
11709 goto parse_rest;
11710 } /* end switch */
11711 }
11712 else {
11713 if (*RExC_parse == '{') {
11714 ckWARNregdep(RExC_parse + 1,
11715 "Unescaped left brace in regex is "
11716 "deprecated here (and will be fatal "
11717 "in Perl 5.32), passed through");
11718 }
11719 /* Not bothering to indent here, as the above 'else' is temporary
11720 * */
11721 if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
11722 capturing_parens:
11723 parno = RExC_npar;
11724 RExC_npar++;
11725
11726 ret = reganode(pRExC_state, OPEN, parno);
11727 if (!SIZE_ONLY ){
11728 if (!RExC_nestroot)
11729 RExC_nestroot = parno;
11730 if (RExC_open_parens && !RExC_open_parens[parno])
11731 {
11732 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11733 "%*s%*s Setting open paren #%" IVdf " to %d\n",
11734 22, "| |", (int)(depth * 2 + 1), "",
11735 (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
11736 RExC_open_parens[parno]= ret;
11737 }
11738 }
11739
11740 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
11741 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
11742 is_open = 1;
11743 } else {
11744 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11745 paren = ':';
11746 ret = 0;
11747 }
11748 }
11749 }
11750 else /* ! paren */
11751 ret = 0;
11752
11753 parse_rest:
11754 /* Pick up the branches, linking them together. */
11755 parse_start = RExC_parse; /* MJD */
11756 br = regbranch(pRExC_state, &flags, 1, depth+1);
11757
11758 /* branch_len = (paren != 0); */
11759
11760 if (br == 0) {
11761 RETURN_FAIL_ON_RESTART(flags, flagp);
11762 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
11763 }
11764 if (*RExC_parse == '|') {
11765 if (RExC_use_BRANCHJ) {
11766 reginsert(pRExC_state, BRANCHJ, br, depth+1);
11767 }
11768 else { /* MJD */
11769 reginsert(pRExC_state, BRANCH, br, depth+1);
11770 Set_Node_Length(REGNODE_p(br), paren != 0);
11771 Set_Node_Offset_To_R(br, parse_start-RExC_start);
11772 }
11773 have_branch = 1;
11774 if (SIZE_ONLY)
11775 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
11776 }
11777 else if (paren == ':') {
11778 *flagp |= flags&SIMPLE;
11779 }
11780 if (is_open) { /* Starts with OPEN. */
11781 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
11782 }
11783 else if (paren != '?') /* Not Conditional */
11784 ret = br;
11785 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11786 lastbr = br;
11787 while (*RExC_parse == '|') {
11788 if (RExC_use_BRANCHJ) {
11789 ender = reganode(pRExC_state, LONGJMP, 0);
11790
11791 /* Append to the previous. */
11792 REGTAIL(pRExC_state,
11793 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
11794 ender);
11795 }
11796 if (SIZE_ONLY)
11797 RExC_extralen += 2; /* Account for LONGJMP. */
11798 nextchar(pRExC_state);
11799 if (freeze_paren) {
11800 if (RExC_npar > after_freeze)
11801 after_freeze = RExC_npar;
11802 RExC_npar = freeze_paren;
11803 }
11804 br = regbranch(pRExC_state, &flags, 0, depth+1);
11805
11806 if (br == 0) {
11807 RETURN_FAIL_ON_RESTART(flags, flagp);
11808 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
11809 }
11810 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
11811 lastbr = br;
11812 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11813 }
11814
11815 if (have_branch || paren != ':') {
11816 regnode * br;
11817
11818 /* Make a closing node, and hook it on the end. */
11819 switch (paren) {
11820 case ':':
11821 ender = reg_node(pRExC_state, TAIL);
11822 break;
11823 case 1: case 2:
11824 ender = reganode(pRExC_state, CLOSE, parno);
11825 if ( RExC_close_parens ) {
11826 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11827 "%*s%*s Setting close paren #%" IVdf " to %d\n",
11828 22, "| |", (int)(depth * 2 + 1), "",
11829 (IV)parno, REG_NODE_NUM(REGNODE_p(ender))));
11830 RExC_close_parens[parno]= ender;
11831 if (RExC_nestroot == parno)
11832 RExC_nestroot = 0;
11833 }
11834 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
11835 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
11836 break;
11837 case 's':
11838 ender = reg_node(pRExC_state, SRCLOSE);
11839 RExC_in_script_run = 0;
11840 break;
11841 case '<':
11842 case 'a':
11843 case 'A':
11844 case 'b':
11845 case 'B':
11846 case ',':
11847 case '=':
11848 case '!':
11849 *flagp &= ~HASWIDTH;
11850 /* FALLTHROUGH */
11851 case 't': /* aTomic */
11852 case '>':
11853 ender = reg_node(pRExC_state, SUCCEED);
11854 break;
11855 case 0:
11856 ender = reg_node(pRExC_state, END);
11857 if (!SIZE_ONLY) {
11858 assert(!RExC_end_op); /* there can only be one! */
11859 RExC_end_op = REGNODE_p(ender);
11860 if (RExC_close_parens) {
11861 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11862 "%*s%*s Setting close paren #0 (END) to %d\n",
11863 22, "| |", (int)(depth * 2 + 1), "",
11864 REG_NODE_NUM(REGNODE_p(ender))));
11865
11866 RExC_close_parens[0]= ender;
11867 }
11868 }
11869 break;
11870 }
11871 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11872 DEBUG_PARSE_MSG("lsbr");
11873 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
11874 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
11875 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11876 SvPV_nolen_const(RExC_mysv1),
11877 (IV)REG_NODE_NUM(REGNODE_p(lastbr)),
11878 SvPV_nolen_const(RExC_mysv2),
11879 (IV)REG_NODE_NUM(REGNODE_p(ender)),
11880 (IV)(ender - lastbr)
11881 );
11882 });
11883 REGTAIL(pRExC_state, lastbr, ender);
11884
11885 if (have_branch && !SIZE_ONLY) {
11886 char is_nothing= 1;
11887 if (depth==1)
11888 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
11889
11890 /* Hook the tails of the branches to the closing node. */
11891 for (br = REGNODE_p(ret); br; br = regnext(br)) {
11892 const U8 op = PL_regkind[OP(br)];
11893 if (op == BRANCH) {
11894 REGTAIL_STUDY(pRExC_state,
11895 REGNODE_OFFSET(NEXTOPER(br)),
11896 ender);
11897 if ( OP(NEXTOPER(br)) != NOTHING
11898 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
11899 is_nothing= 0;
11900 }
11901 else if (op == BRANCHJ) {
11902 REGTAIL_STUDY(pRExC_state,
11903 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
11904 ender);
11905 /* for now we always disable this optimisation * /
11906 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
11907 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
11908 */
11909 is_nothing= 0;
11910 }
11911 }
11912 if (is_nothing) {
11913 regnode * ret_as_regnode = REGNODE_p(ret);
11914 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
11915 ? regnext(ret_as_regnode)
11916 : ret_as_regnode;
11917 DEBUG_PARSE_r(if (!SIZE_ONLY) {
11918 DEBUG_PARSE_MSG("NADA");
11919 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
11920 NULL, pRExC_state);
11921 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
11922 NULL, pRExC_state);
11923 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
11924 SvPV_nolen_const(RExC_mysv1),
11925 (IV)REG_NODE_NUM(ret_as_regnode),
11926 SvPV_nolen_const(RExC_mysv2),
11927 (IV)REG_NODE_NUM(REGNODE_p(ender)),
11928 (IV)(ender - ret)
11929 );
11930 });
11931 OP(br)= NOTHING;
11932 if (OP(REGNODE_p(ender)) == TAIL) {
11933 NEXT_OFF(br)= 0;
11934 RExC_emit= REGNODE_OFFSET(br) + 1;
11935 } else {
11936 regnode *opt;
11937 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
11938 OP(opt)= OPTIMIZED;
11939 NEXT_OFF(br)= REGNODE_p(ender) - br;
11940 }
11941 }
11942 }
11943 }
11944
11945 {
11946 const char *p;
11947 /* Even/odd or x=don't care: 010101x10x */
11948 static const char parens[] = "=!aA<,>Bbt";
11949 /* flag below is set to 0 up through 'A'; 1 for larger */
11950
11951 if (paren && (p = strchr(parens, paren))) {
11952 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
11953 int flag = (p - parens) > 3;
11954
11955 if (paren == '>' || paren == 't') {
11956 node = SUSPEND, flag = 0;
11957 }
11958
11959 reginsert(pRExC_state, node, ret, depth+1);
11960 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11961 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
11962 FLAGS(REGNODE_p(ret)) = flag;
11963 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
11964 }
11965 }
11966
11967 /* Check for proper termination. */
11968 if (paren) {
11969 /* restore original flags, but keep (?p) and, if we've changed from /d
11970 * rules to /u, keep the /u */
11971 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
11972 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
11973 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
11974 }
11975 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
11976 RExC_parse = oregcomp_parse;
11977 vFAIL("Unmatched (");
11978 }
11979 nextchar(pRExC_state);
11980 }
11981 else if (!paren && RExC_parse < RExC_end) {
11982 if (*RExC_parse == ')') {
11983 RExC_parse++;
11984 vFAIL("Unmatched )");
11985 }
11986 else
11987 FAIL("Junk on end of regexp"); /* "Can't happen". */
11988 NOT_REACHED; /* NOTREACHED */
11989 }
11990
11991 if (RExC_in_lookbehind) {
11992 RExC_in_lookbehind--;
11993 }
11994 if (after_freeze > RExC_npar)
11995 RExC_npar = after_freeze;
11996 return(ret);
11997}
11998
11999/*
12000 - regbranch - one alternative of an | operator
12001 *
12002 * Implements the concatenation operator.
12003 *
12004 * On success, returns the offset at which any next node should be placed into
12005 * the regex engine program being compiled.
12006 *
12007 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
12008 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12009 * UTF-8
12010 */
12011STATIC regnode_offset
12012S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12013{
12014 regnode_offset ret;
12015 regnode_offset chain = 0;
12016 regnode_offset latest;
12017 I32 flags = 0, c = 0;
12018 GET_RE_DEBUG_FLAGS_DECL;
12019
12020 PERL_ARGS_ASSERT_REGBRANCH;
12021
12022 DEBUG_PARSE("brnc");
12023
12024 if (first)
12025 ret = 0;
12026 else {
12027 if (RExC_use_BRANCHJ)
12028 ret = reganode(pRExC_state, BRANCHJ, 0);
12029 else {
12030 ret = reg_node(pRExC_state, BRANCH);
12031 Set_Node_Length(REGNODE_p(ret), 1);
12032 }
12033 }
12034
12035 if (!first && SIZE_ONLY)
12036 RExC_extralen += 1; /* BRANCHJ */
12037
12038 *flagp = WORST; /* Tentatively. */
12039
12040 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12041 FALSE /* Don't force to /x */ );
12042 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12043 flags &= ~TRYAGAIN;
12044 latest = regpiece(pRExC_state, &flags, depth+1);
12045 if (latest == 0) {
12046 if (flags & TRYAGAIN)
12047 continue;
12048 RETURN_FAIL_ON_RESTART(flags, flagp);
12049 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12050 }
12051 else if (ret == 0)
12052 ret = latest;
12053 *flagp |= flags&(HASWIDTH|POSTPONED);
12054 if (chain == 0) /* First piece. */
12055 *flagp |= flags&SPSTART;
12056 else {
12057 /* FIXME adding one for every branch after the first is probably
12058 * excessive now we have TRIE support. (hv) */
12059 MARK_NAUGHTY(1);
12060 REGTAIL(pRExC_state, chain, latest);
12061 }
12062 chain = latest;
12063 c++;
12064 }
12065 if (chain == 0) { /* Loop ran zero times. */
12066 chain = reg_node(pRExC_state, NOTHING);
12067 if (ret == 0)
12068 ret = chain;
12069 }
12070 if (c == 1) {
12071 *flagp |= flags&SIMPLE;
12072 }
12073
12074 return ret;
12075}
12076
12077/*
12078 - regpiece - something followed by possible quantifier * + ? {n,m}
12079 *
12080 * Note that the branching code sequences used for ? and the general cases
12081 * of * and + are somewhat optimized: they use the same NOTHING node as
12082 * both the endmarker for their branch list and the body of the last branch.
12083 * It might seem that this node could be dispensed with entirely, but the
12084 * endmarker role is not redundant.
12085 *
12086 * On success, returns the offset at which any next node should be placed into
12087 * the regex engine program being compiled.
12088 *
12089 * Returns 0 otherwise, with *flagp set to indicate why:
12090 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12091 * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
12092 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12093 */
12094STATIC regnode_offset
12095S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12096{
12097 regnode_offset ret;
12098 char op;
12099 char *next;
12100 I32 flags;
12101 const char * const origparse = RExC_parse;
12102 I32 min;
12103 I32 max = REG_INFTY;
12104#ifdef RE_TRACK_PATTERN_OFFSETS
12105 char *parse_start;
12106#endif
12107 const char *maxpos = NULL;
12108 UV uv;
12109
12110 /* Save the original in case we change the emitted regop to a FAIL. */
12111 const regnode_offset orig_emit = RExC_emit;
12112
12113 GET_RE_DEBUG_FLAGS_DECL;
12114
12115 PERL_ARGS_ASSERT_REGPIECE;
12116
12117 DEBUG_PARSE("piec");
12118
12119 ret = regatom(pRExC_state, &flags, depth+1);
12120 if (ret == 0) {
12121 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12122 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12123 }
12124
12125 op = *RExC_parse;
12126
12127 if (op == '{' && regcurly(RExC_parse)) {
12128 maxpos = NULL;
12129#ifdef RE_TRACK_PATTERN_OFFSETS
12130 parse_start = RExC_parse; /* MJD */
12131#endif
12132 next = RExC_parse + 1;
12133 while (isDIGIT(*next) || *next == ',') {
12134 if (*next == ',') {
12135 if (maxpos)
12136 break;
12137 else
12138 maxpos = next;
12139 }
12140 next++;
12141 }
12142 if (*next == '}') { /* got one */
12143 const char* endptr;
12144 if (!maxpos)
12145 maxpos = next;
12146 RExC_parse++;
12147 if (isDIGIT(*RExC_parse)) {
12148 endptr = RExC_end;
12149 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12150 vFAIL("Invalid quantifier in {,}");
12151 if (uv >= REG_INFTY)
12152 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12153 min = (I32)uv;
12154 } else {
12155 min = 0;
12156 }
12157 if (*maxpos == ',')
12158 maxpos++;
12159 else
12160 maxpos = RExC_parse;
12161 if (isDIGIT(*maxpos)) {
12162 endptr = RExC_end;
12163 if (!grok_atoUV(maxpos, &uv, &endptr))
12164 vFAIL("Invalid quantifier in {,}");
12165 if (uv >= REG_INFTY)
12166 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12167 max = (I32)uv;
12168 } else {
12169 max = REG_INFTY; /* meaning "infinity" */
12170 }
12171 RExC_parse = next;
12172 nextchar(pRExC_state);
12173 if (max < min) { /* If can't match, warn and optimize to fail
12174 unconditionally */
12175 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12176 if (PASS2) {
12177 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12178 NEXT_OFF(REGNODE_p(orig_emit)) =
12179 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12180 }
12181 return ret;
12182 }
12183 else if (min == max && *RExC_parse == '?')
12184 {
12185 ckWARN2reg(RExC_parse + 1,
12186 "Useless use of greediness modifier '%c'",
12187 *RExC_parse);
12188 }
12189
12190 do_curly:
12191 if ((flags&SIMPLE)) {
12192 if (min == 0 && max == REG_INFTY) {
12193 reginsert(pRExC_state, STAR, ret, depth+1);
12194 MARK_NAUGHTY(4);
12195 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12196 goto nest_check;
12197 }
12198 if (min == 1 && max == REG_INFTY) {
12199 reginsert(pRExC_state, PLUS, ret, depth+1);
12200 MARK_NAUGHTY(3);
12201 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12202 goto nest_check;
12203 }
12204 MARK_NAUGHTY_EXP(2, 2);
12205 reginsert(pRExC_state, CURLY, ret, depth+1);
12206 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12207 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12208 }
12209 else {
12210 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12211
12212 FLAGS(REGNODE_p(w)) = 0;
12213 REGTAIL(pRExC_state, ret, w);
12214 if (RExC_use_BRANCHJ) {
12215 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12216 reginsert(pRExC_state, NOTHING, ret, depth+1);
12217 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12218 }
12219 reginsert(pRExC_state, CURLYX, ret, depth+1);
12220 /* MJD hk */
12221 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12222 Set_Node_Length(REGNODE_p(ret),
12223 op == '{' ? (RExC_parse - parse_start) : 1);
12224
12225 if (RExC_use_BRANCHJ)
12226 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12227 LONGJMP. */
12228 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
12229 if (SIZE_ONLY)
12230 RExC_whilem_seen++, RExC_extralen += 3;
12231 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12232 }
12233 FLAGS(REGNODE_p(ret)) = 0;
12234
12235 if (min > 0)
12236 *flagp = WORST;
12237 if (max > 0)
12238 *flagp |= HASWIDTH;
12239 if (!SIZE_ONLY) {
12240 ARG1_SET(REGNODE_p(ret), (U16)min);
12241 ARG2_SET(REGNODE_p(ret), (U16)max);
12242 }
12243 if (max == REG_INFTY)
12244 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12245
12246 goto nest_check;
12247 }
12248 }
12249
12250 if (!ISMULT1(op)) {
12251 *flagp = flags;
12252 return(ret);
12253 }
12254
12255#if 0 /* Now runtime fix should be reliable. */
12256
12257 /* if this is reinstated, don't forget to put this back into perldiag:
12258
12259 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12260
12261 (F) The part of the regexp subject to either the * or + quantifier
12262 could match an empty string. The {#} shows in the regular
12263 expression about where the problem was discovered.
12264
12265 */
12266
12267 if (!(flags&HASWIDTH) && op != '?')
12268 vFAIL("Regexp *+ operand could be empty");
12269#endif
12270
12271#ifdef RE_TRACK_PATTERN_OFFSETS
12272 parse_start = RExC_parse;
12273#endif
12274 nextchar(pRExC_state);
12275
12276 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12277
12278 if (op == '*') {
12279 min = 0;
12280 goto do_curly;
12281 }
12282 else if (op == '+') {
12283 min = 1;
12284 goto do_curly;
12285 }
12286 else if (op == '?') {
12287 min = 0; max = 1;
12288 goto do_curly;
12289 }
12290 nest_check:
12291 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12292 ckWARN2reg(RExC_parse,
12293 "%" UTF8f " matches null string many times",
12294 UTF8fARG(UTF, (RExC_parse >= origparse
12295 ? RExC_parse - origparse
12296 : 0),
12297 origparse));
12298 }
12299
12300 if (*RExC_parse == '?') {
12301 nextchar(pRExC_state);
12302 reginsert(pRExC_state, MINMOD, ret, depth+1);
12303 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
12304 }
12305 else if (*RExC_parse == '+') {
12306 regnode_offset ender;
12307 nextchar(pRExC_state);
12308 ender = reg_node(pRExC_state, SUCCEED);
12309 REGTAIL(pRExC_state, ret, ender);
12310 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12311 ender = reg_node(pRExC_state, TAIL);
12312 REGTAIL(pRExC_state, ret, ender);
12313 }
12314
12315 if (ISMULT2(RExC_parse)) {
12316 RExC_parse++;
12317 vFAIL("Nested quantifiers");
12318 }
12319
12320 return(ret);
12321}
12322
12323STATIC bool
12324S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12325 regnode_offset * node_p,
12326 UV * code_point_p,
12327 int * cp_count,
12328 I32 * flagp,
12329 const bool strict,
12330 const U32 depth
12331 )
12332{
12333 /* This routine teases apart the various meanings of \N and returns
12334 * accordingly. The input parameters constrain which meaning(s) is/are valid
12335 * in the current context.
12336 *
12337 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12338 *
12339 * If <code_point_p> is not NULL, the context is expecting the result to be a
12340 * single code point. If this \N instance turns out to a single code point,
12341 * the function returns TRUE and sets *code_point_p to that code point.
12342 *
12343 * If <node_p> is not NULL, the context is expecting the result to be one of
12344 * the things representable by a regnode. If this \N instance turns out to be
12345 * one such, the function generates the regnode, returns TRUE and sets *node_p
12346 * to point to the offset of that regnode into the regex engine program being
12347 * compiled.
12348 *
12349 * If this instance of \N isn't legal in any context, this function will
12350 * generate a fatal error and not return.
12351 *
12352 * On input, RExC_parse should point to the first char following the \N at the
12353 * time of the call. On successful return, RExC_parse will have been updated
12354 * to point to just after the sequence identified by this routine. Also
12355 * *flagp has been updated as needed.
12356 *
12357 * When there is some problem with the current context and this \N instance,
12358 * the function returns FALSE, without advancing RExC_parse, nor setting
12359 * *node_p, nor *code_point_p, nor *flagp.
12360 *
12361 * If <cp_count> is not NULL, the caller wants to know the length (in code
12362 * points) that this \N sequence matches. This is set, and the input is
12363 * parsed for errors, even if the function returns FALSE, as detailed below.
12364 *
12365 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12366 *
12367 * Probably the most common case is for the \N to specify a single code point.
12368 * *cp_count will be set to 1, and *code_point_p will be set to that code
12369 * point.
12370 *
12371 * Another possibility is for the input to be an empty \N{}, which for
12372 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
12373 * will be set to a generated NOTHING node.
12374 *
12375 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12376 * set to 0. *node_p will be set to a generated REG_ANY node.
12377 *
12378 * The fourth possibility is that \N resolves to a sequence of more than one
12379 * code points. *cp_count will be set to the number of code points in the
12380 * sequence. *node_p will be set to a generated node returned by this
12381 * function calling S_reg().
12382 *
12383 * The final possibility is that it is premature to be calling this function;
12384 * that pass1 needs to be restarted. This can happen when this changes from
12385 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12386 * latter occurs only when the fourth possibility would otherwise be in
12387 * effect, and is because one of those code points requires the pattern to be
12388 * recompiled as UTF-8. The function returns FALSE, and sets the
12389 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12390 * happens, the caller needs to desist from continuing parsing, and return
12391 * this information to its caller. This is not set for when there is only one
12392 * code point, as this can be called as part of an ANYOF node, and they can
12393 * store above-Latin1 code points without the pattern having to be in UTF-8.
12394 *
12395 * For non-single-quoted regexes, the tokenizer has resolved character and
12396 * sequence names inside \N{...} into their Unicode values, normalizing the
12397 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12398 * hex-represented code points in the sequence. This is done there because
12399 * the names can vary based on what charnames pragma is in scope at the time,
12400 * so we need a way to take a snapshot of what they resolve to at the time of
12401 * the original parse. [perl #56444].
12402 *
12403 * That parsing is skipped for single-quoted regexes, so we may here get
12404 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
12405 * parser. But if the single-quoted regex is something like '\N{U+41}', that
12406 * is legal and handled here. The code point is Unicode, and has to be
12407 * translated into the native character set for non-ASCII platforms.
12408 */
12409
12410 char * endbrace; /* points to '}' following the name */
12411 char* p = RExC_parse; /* Temporary */
12412
12413 SV * substitute_parse = NULL;
12414 char *orig_end;
12415 char *save_start;
12416 I32 flags;
12417 Size_t count = 0; /* code point count kept internally by this function */
12418
12419 GET_RE_DEBUG_FLAGS_DECL;
12420
12421 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12422
12423 GET_RE_DEBUG_FLAGS;
12424
12425 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12426 assert(! (node_p && cp_count)); /* At most 1 should be set */
12427
12428 if (cp_count) { /* Initialize return for the most common case */
12429 *cp_count = 1;
12430 }
12431
12432 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12433 * modifier. The other meanings do not, so use a temporary until we find
12434 * out which we are being called with */
12435 skip_to_be_ignored_text(pRExC_state, &p,
12436 FALSE /* Don't force to /x */ );
12437
12438 /* Disambiguate between \N meaning a named character versus \N meaning
12439 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12440 * quantifier, or there is no '{' at all */
12441 if (*p != '{' || regcurly(p)) {
12442 RExC_parse = p;
12443 if (cp_count) {
12444 *cp_count = -1;
12445 }
12446
12447 if (! node_p) {
12448 return FALSE;
12449 }
12450
12451 *node_p = reg_node(pRExC_state, REG_ANY);
12452 *flagp |= HASWIDTH|SIMPLE;
12453 MARK_NAUGHTY(1);
12454 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12455 return TRUE;
12456 }
12457
12458 /* The test above made sure that the next real character is a '{', but
12459 * under the /x modifier, it could be separated by space (or a comment and
12460 * \n) and this is not allowed (for consistency with \x{...} and the
12461 * tokenizer handling of \N{NAME}). */
12462 if (*RExC_parse != '{') {
12463 vFAIL("Missing braces on \\N{}");
12464 }
12465
12466 RExC_parse++; /* Skip past the '{' */
12467
12468 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12469 if (! endbrace) { /* no trailing brace */
12470 vFAIL2("Missing right brace on \\%c{}", 'N');
12471 }
12472
12473 /* Here, we have decided it should be a named character or sequence */
12474 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12475 semantics */
12476
12477 if (endbrace == RExC_parse) { /* empty: \N{} */
12478 if (strict) {
12479 RExC_parse++; /* Position after the "}" */
12480 vFAIL("Zero length \\N{}");
12481 }
12482 if (cp_count) {
12483 *cp_count = 0;
12484 }
12485 nextchar(pRExC_state);
12486 if (! node_p) {
12487 return FALSE;
12488 }
12489
12490 *node_p = reg_node(pRExC_state, NOTHING);
12491 return TRUE;
12492 }
12493
12494 /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
12495 if ( endbrace - RExC_parse < 2
12496 || strnNE(RExC_parse, "U+", 2))
12497 {
12498 RExC_parse = endbrace; /* position msg's '<--HERE' */
12499 vFAIL("\\N{NAME} must be resolved by the lexer");
12500 }
12501
12502 /* This code purposely indented below because of future changes coming */
12503
12504 /* We can get to here when the input is \N{U+...} or when toke.c has
12505 * converted a name to the \N{U+...} form. This include changing a
12506 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12507
12508 RExC_parse += 2; /* Skip past the 'U+' */
12509
12510 /* Code points are separated by dots. The '}' terminates the whole
12511 * thing. */
12512
12513 do { /* Loop until the ending brace */
12514 UV cp = 0;
12515 char * start_digit; /* The first of the current code point */
12516 if (! isXDIGIT(*RExC_parse)) {
12517 RExC_parse++;
12518 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12519 }
12520
12521 start_digit = RExC_parse;
12522 count++;
12523
12524 /* Loop through the hex digits of the current code point */
12525 do {
12526 /* Adding this digit will shift the result 4 bits. If that
12527 * result would be above the legal max, it's overflow */
12528 if (cp > MAX_LEGAL_CP >> 4) {
12529
12530 /* Find the end of the code point */
12531 do {
12532 RExC_parse ++;
12533 } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12534
12535 /* Be sure to synchronize this message with the similar one
12536 * in utf8.c */
12537 vFAIL4("Use of code point 0x%.*s is not allowed; the"
12538 " permissible max is 0x%" UVxf,
12539 (int) (RExC_parse - start_digit), start_digit,
12540 MAX_LEGAL_CP);
12541 }
12542
12543 /* Accumulate this (valid) digit into the running total */
12544 cp = (cp << 4) + READ_XDIGIT(RExC_parse);
12545
12546 /* READ_XDIGIT advanced the input pointer. Ignore a single
12547 * underscore separator */
12548 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12549 RExC_parse++;
12550 }
12551 } while (isXDIGIT(*RExC_parse));
12552
12553 /* Here, have accumulated the next code point */
12554 if (RExC_parse >= endbrace) { /* If done ... */
12555 if (count != 1) {
12556 goto do_concat;
12557 }
12558
12559 /* Here, is a single code point; fail if doesn't want that */
12560 if (! code_point_p) {
12561 RExC_parse = p;
12562 return FALSE;
12563 }
12564
12565 /* A single code point is easy to handle; just return it */
12566 *code_point_p = UNI_TO_NATIVE(cp);
12567 RExC_parse = endbrace;
12568 nextchar(pRExC_state);
12569 return TRUE;
12570 }
12571
12572 /* Here, the only legal thing would be a multiple character
12573 * sequence (of the form "\N{U+c1.c2. ... }". So the next
12574 * character must be a dot (and the one after that can't be the
12575 * endbrace, or we'd have something like \N{U+100.} ) */
12576 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12577 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
12578 ? UTF8SKIP(RExC_parse)
12579 : 1;
12580 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
12581 RExC_parse = endbrace;
12582 }
12583 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12584 }
12585
12586 /* Here, looks like its really a multiple character sequence. Fail
12587 * if that's not what the caller wants. But continue with counting
12588 * and error checking if they still want a count */
12589 if (! node_p && ! cp_count) {
12590 return FALSE;
12591 }
12592
12593 /* What is done here is to convert this to a sub-pattern of the
12594 * form \x{char1}\x{char2}... and then call reg recursively to
12595 * parse it (enclosing in "(?: ... )" ). That way, it retains its
12596 * atomicness, while not having to worry about special handling
12597 * that some code points may have. We don't create a subpattern,
12598 * but go through the motions of code point counting and error
12599 * checking, if the caller doesn't want a node returned. */
12600
12601 if (node_p && count == 1) {
12602 substitute_parse = newSVpvs("?:");
12603 }
12604
12605 do_concat:
12606
12607 if (node_p) {
12608 /* Convert to notation the rest of the code understands */
12609 sv_catpvs(substitute_parse, "\\x{");
12610 sv_catpvn(substitute_parse, start_digit,
12611 RExC_parse - start_digit);
12612 sv_catpvs(substitute_parse, "}");
12613 }
12614
12615 /* Move to after the dot (or ending brace the final time through.)
12616 * */
12617 RExC_parse++;
12618 count++;
12619
12620 } while (RExC_parse < endbrace);
12621
12622 if (! node_p) { /* Doesn't want the node */
12623 assert (cp_count);
12624
12625 *cp_count = count;
12626 return FALSE;
12627 }
12628
12629 sv_catpvs(substitute_parse, ")");
12630
12631#ifdef EBCDIC
12632 /* The values are Unicode, and therefore have to be converted to native
12633 * on a non-Unicode (meaning non-ASCII) platform. */
12634 RExC_recode_x_to_native = 1;
12635#endif
12636
12637 /* Here, we have the string the name evaluates to, ready to be parsed,
12638 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
12639 * constructs. This can be called from within a substitute parse already.
12640 * The error reporting mechanism doesn't work for 2 levels of this, but the
12641 * code above has validated this new construct, so there should be no
12642 * errors generated by the below. And this isn' an exact copy, so the
12643 * mechanism to seamlessly deal with this won't work, so turn off warnings
12644 * during it */
12645 save_start = RExC_start;
12646 orig_end = RExC_end;
12647
12648 RExC_parse = RExC_start = SvPVX(substitute_parse);
12649 RExC_end = RExC_parse + SvCUR(substitute_parse);
12650 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
12651
12652 *node_p = reg(pRExC_state, 1, &flags, depth+1);
12653
12654 /* Restore the saved values */
12655 RESTORE_WARNINGS;
12656 RExC_start = save_start;
12657 RExC_parse = endbrace;
12658 RExC_end = orig_end;
12659#ifdef EBCDIC
12660 RExC_recode_x_to_native = 0;
12661#endif
12662
12663 SvREFCNT_dec_NN(substitute_parse);
12664
12665 if (! *node_p) {
12666 RETURN_X_ON_RESTART(FALSE, flags, flagp);
12667 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
12668 (UV) flags);
12669 }
12670 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12671
12672 nextchar(pRExC_state);
12673
12674 return TRUE;
12675}
12676
12677
12678PERL_STATIC_INLINE U8
12679S_compute_EXACTish(RExC_state_t *pRExC_state)
12680{
12681 U8 op;
12682
12683 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12684
12685 if (! FOLD) {
12686 return (LOC)
12687 ? EXACTL
12688 : EXACT;
12689 }
12690
12691 op = get_regex_charset(RExC_flags);
12692 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12693 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12694 been, so there is no hole */
12695 }
12696
12697 return op + EXACTF;
12698}
12699
12700PERL_STATIC_INLINE void
12701S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12702 regnode_offset node, I32* flagp, STRLEN len,
12703 UV code_point, bool downgradable)
12704{
12705 /* This knows the details about sizing an EXACTish node, setting flags for
12706 * it (by setting <*flagp>, and potentially populating it with a single
12707 * character.
12708 *
12709 * If <len> (the length in bytes) is non-zero, this function assumes that
12710 * the node has already been populated, and just does the sizing. In this
12711 * case <code_point> should be the final code point that has already been
12712 * placed into the node. This value will be ignored except that under some
12713 * circumstances <*flagp> is set based on it.
12714 *
12715 * If <len> is zero, the function assumes that the node is to contain only
12716 * the single character given by <code_point> and calculates what <len>
12717 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
12718 * additionally will populate the node's STRING with <code_point> or its
12719 * fold if folding.
12720 *
12721 * In both cases <*flagp> is appropriately set
12722 *
12723 * It knows that under FOLD, the Latin Sharp S and UTF characters above
12724 * 255, must be folded (the former only when the rules indicate it can
12725 * match 'ss')
12726 *
12727 * When it does the populating, it looks at the flag 'downgradable'. If
12728 * true with a node that folds, it checks if the single code point
12729 * participates in a fold, and if not downgrades the node to an EXACT.
12730 * This helps the optimizer */
12731
12732 bool len_passed_in = cBOOL(len != 0);
12733 U8 character[UTF8_MAXBYTES_CASE+1];
12734
12735 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12736
12737 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
12738 * sizing difference, and is extra work that is thrown away */
12739 if (downgradable && ! PASS2) {
12740 downgradable = FALSE;
12741 }
12742
12743 if (! len_passed_in) {
12744 if (UTF) {
12745 if (UVCHR_IS_INVARIANT(code_point)) {
12746 if (LOC || ! FOLD) { /* /l defers folding until runtime */
12747 *character = (U8) code_point;
12748 }
12749 else { /* Here is /i and not /l. (toFOLD() is defined on just
12750 ASCII, which isn't the same thing as INVARIANT on
12751 EBCDIC, but it works there, as the extra invariants
12752 fold to themselves) */
12753 *character = toFOLD((U8) code_point);
12754
12755 /* We can downgrade to an EXACT node if this character
12756 * isn't a folding one. Note that this assumes that
12757 * nothing above Latin1 folds to some other invariant than
12758 * one of these alphabetics; otherwise we would also have
12759 * to check:
12760 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12761 * || ASCII_FOLD_RESTRICTED))
12762 */
12763 if (downgradable && PL_fold[code_point] == code_point) {
12764 OP(REGNODE_p(node)) = EXACT;
12765 }
12766 }
12767 len = 1;
12768 }
12769 else if (FOLD && (! LOC
12770 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12771 { /* Folding, and ok to do so now */
12772 UV folded = _to_uni_fold_flags(
12773 code_point,
12774 character,
12775 &len,
12776 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12777 ? FOLD_FLAGS_NOMIX_ASCII
12778 : 0));
12779 if (downgradable
12780 && folded == code_point /* This quickly rules out many
12781 cases, avoiding the
12782 _invlist_contains_cp() overhead
12783 for those. */
12784 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12785 {
12786 OP(REGNODE_p(node)) = (LOC)
12787 ? EXACTL
12788 : EXACT;
12789 }
12790 }
12791 else if (code_point <= MAX_UTF8_TWO_BYTE) {
12792
12793 /* Not folding this cp, and can output it directly */
12794 *character = UTF8_TWO_BYTE_HI(code_point);
12795 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12796 len = 2;
12797 }
12798 else {
12799 uvchr_to_utf8( character, code_point);
12800 len = UTF8SKIP(character);
12801 }
12802 } /* Else pattern isn't UTF8. */
12803 else if (! FOLD) {
12804 *character = (U8) code_point;
12805 len = 1;
12806 } /* Else is folded non-UTF8 */
12807#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12808 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12809 || UNICODE_DOT_DOT_VERSION > 0)
12810 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12811#else
12812 else if (1) {
12813#endif
12814 /* We don't fold any non-UTF8 except possibly the Sharp s (see
12815 * comments at join_exact()); */
12816 *character = (U8) code_point;
12817 len = 1;
12818
12819 /* Can turn into an EXACT node if we know the fold at compile time,
12820 * and it folds to itself and doesn't particpate in other folds */
12821 if (downgradable
12822 && ! LOC
12823 && PL_fold_latin1[code_point] == code_point
12824 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12825 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12826 {
12827 OP(REGNODE_p(node)) = EXACT;
12828 }
12829 } /* else is Sharp s. May need to fold it */
12830 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12831 *character = 's';
12832 *(character + 1) = 's';
12833 len = 2;
12834 }
12835 else {
12836 *character = LATIN_SMALL_LETTER_SHARP_S;
12837 len = 1;
12838 }
12839 }
12840
12841 if (SIZE_ONLY) {
12842 RExC_size += STR_SZ(len);
12843 }
12844 else {
12845 RExC_emit += STR_SZ(len);
12846 STR_LEN(REGNODE_p(node)) = len;
12847 if (! len_passed_in) {
12848 Copy((char *) character, STRING(REGNODE_p(node)), len, char);
12849 }
12850 }
12851
12852 *flagp |= HASWIDTH;
12853
12854 /* A single character node is SIMPLE, except for the special-cased SHARP S
12855 * under /di. */
12856 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12857#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12858 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12859 || UNICODE_DOT_DOT_VERSION > 0)
12860 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12861 || ! FOLD || ! DEPENDS_SEMANTICS)
12862#endif
12863 ) {
12864 *flagp |= SIMPLE;
12865 }
12866
12867 /* The OP may not be well defined in PASS1 */
12868 if (PASS2 && OP(REGNODE_p(node)) == EXACTFL) {
12869 RExC_contains_locale = 1;
12870 }
12871}
12872
12873STATIC bool
12874S_new_regcurly(const char *s, const char *e)
12875{
12876 /* This is a temporary function designed to match the most lenient form of
12877 * a {m,n} quantifier we ever envision, with either number omitted, and
12878 * spaces anywhere between/before/after them.
12879 *
12880 * If this function fails, then the string it matches is very unlikely to
12881 * ever be considered a valid quantifier, so we can allow the '{' that
12882 * begins it to be considered as a literal */
12883
12884 bool has_min = FALSE;
12885 bool has_max = FALSE;
12886
12887 PERL_ARGS_ASSERT_NEW_REGCURLY;
12888
12889 if (s >= e || *s++ != '{')
12890 return FALSE;
12891
12892 while (s < e && isSPACE(*s)) {
12893 s++;
12894 }
12895 while (s < e && isDIGIT(*s)) {
12896 has_min = TRUE;
12897 s++;
12898 }
12899 while (s < e && isSPACE(*s)) {
12900 s++;
12901 }
12902
12903 if (*s == ',') {
12904 s++;
12905 while (s < e && isSPACE(*s)) {
12906 s++;
12907 }
12908 while (s < e && isDIGIT(*s)) {
12909 has_max = TRUE;
12910 s++;
12911 }
12912 while (s < e && isSPACE(*s)) {
12913 s++;
12914 }
12915 }
12916
12917 return s < e && *s == '}' && (has_min || has_max);
12918}
12919
12920/* Parse backref decimal value, unless it's too big to sensibly be a backref,
12921 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
12922
12923static I32
12924S_backref_value(char *p, char *e)
12925{
12926 const char* endptr = e;
12927 UV val;
12928 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
12929 return (I32)val;
12930 return I32_MAX;
12931}
12932
12933
12934/*
12935 - regatom - the lowest level
12936
12937 Try to identify anything special at the start of the current parse position.
12938 If there is, then handle it as required. This may involve generating a
12939 single regop, such as for an assertion; or it may involve recursing, such as
12940 to handle a () structure.
12941
12942 If the string doesn't start with something special then we gobble up
12943 as much literal text as we can. If we encounter a quantifier, we have to
12944 back off the final literal character, as that quantifier applies to just it
12945 and not to the whole string of literals.
12946
12947 Once we have been able to handle whatever type of thing started the
12948 sequence, we return the offset into the regex engine program being compiled
12949 at which any next regnode should be placed.
12950
12951 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
12952 Returns 0, setting *flagp to RESTART_PARSE if the sizing scan needs to be
12953 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
12954 Otherwise does not return 0.
12955
12956 Note: we have to be careful with escapes, as they can be both literal
12957 and special, and in the case of \10 and friends, context determines which.
12958
12959 A summary of the code structure is:
12960
12961 switch (first_byte) {
12962 cases for each special:
12963 handle this special;
12964 break;
12965 case '\\':
12966 switch (2nd byte) {
12967 cases for each unambiguous special:
12968 handle this special;
12969 break;
12970 cases for each ambigous special/literal:
12971 disambiguate;
12972 if (special) handle here
12973 else goto defchar;
12974 default: // unambiguously literal:
12975 goto defchar;
12976 }
12977 default: // is a literal char
12978 // FALL THROUGH
12979 defchar:
12980 create EXACTish node for literal;
12981 while (more input and node isn't full) {
12982 switch (input_byte) {
12983 cases for each special;
12984 make sure parse pointer is set so that the next call to
12985 regatom will see this special first
12986 goto loopdone; // EXACTish node terminated by prev. char
12987 default:
12988 append char to EXACTISH node;
12989 }
12990 get next input byte;
12991 }
12992 loopdone:
12993 }
12994 return the generated node;
12995
12996 Specifically there are two separate switches for handling
12997 escape sequences, with the one for handling literal escapes requiring
12998 a dummy entry for all of the special escapes that are actually handled
12999 by the other.
13000
13001*/
13002
13003STATIC regnode_offset
13004S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13005{
13006 regnode_offset ret = 0;
13007 I32 flags = 0;
13008 char *parse_start;
13009 U8 op;
13010 int invert = 0;
13011 U8 arg;
13012
13013 GET_RE_DEBUG_FLAGS_DECL;
13014
13015 *flagp = WORST; /* Tentatively. */
13016
13017 DEBUG_PARSE("atom");
13018
13019 PERL_ARGS_ASSERT_REGATOM;
13020
13021 tryagain:
13022 parse_start = RExC_parse;
13023 assert(RExC_parse < RExC_end);
13024 switch ((U8)*RExC_parse) {
13025 case '^':
13026 RExC_seen_zerolen++;
13027 nextchar(pRExC_state);
13028 if (RExC_flags & RXf_PMf_MULTILINE)
13029 ret = reg_node(pRExC_state, MBOL);
13030 else
13031 ret = reg_node(pRExC_state, SBOL);
13032 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13033 break;
13034 case '$':
13035 nextchar(pRExC_state);
13036 if (*RExC_parse)
13037 RExC_seen_zerolen++;
13038 if (RExC_flags & RXf_PMf_MULTILINE)
13039 ret = reg_node(pRExC_state, MEOL);
13040 else
13041 ret = reg_node(pRExC_state, SEOL);
13042 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13043 break;
13044 case '.':
13045 nextchar(pRExC_state);
13046 if (RExC_flags & RXf_PMf_SINGLELINE)
13047 ret = reg_node(pRExC_state, SANY);
13048 else
13049 ret = reg_node(pRExC_state, REG_ANY);
13050 *flagp |= HASWIDTH|SIMPLE;
13051 MARK_NAUGHTY(1);
13052 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13053 break;
13054 case '[':
13055 {
13056 char * const oregcomp_parse = ++RExC_parse;
13057 ret = regclass(pRExC_state, flagp, depth+1,
13058 FALSE, /* means parse the whole char class */
13059 TRUE, /* allow multi-char folds */
13060 FALSE, /* don't silence non-portable warnings. */
13061 (bool) RExC_strict,
13062 TRUE, /* Allow an optimized regnode result */
13063 NULL,
13064 NULL);
13065 if (ret == 0) {
13066 RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, NEED_UTF8);
13067 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13068 (UV) *flagp);
13069 }
13070 if (*RExC_parse != ']') {
13071 RExC_parse = oregcomp_parse;
13072 vFAIL("Unmatched [");
13073 }
13074 nextchar(pRExC_state);
13075 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13076 break;
13077 }
13078 case '(':
13079 nextchar(pRExC_state);
13080 ret = reg(pRExC_state, 2, &flags, depth+1);
13081 if (ret == 0) {
13082 if (flags & TRYAGAIN) {
13083 if (RExC_parse >= RExC_end) {
13084 /* Make parent create an empty node if needed. */
13085 *flagp |= TRYAGAIN;
13086 return(0);
13087 }
13088 goto tryagain;
13089 }
13090 RETURN_FAIL_ON_RESTART(flags, flagp);
13091 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13092 (UV) flags);
13093 }
13094 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13095 break;
13096 case '|':
13097 case ')':
13098 if (flags & TRYAGAIN) {
13099 *flagp |= TRYAGAIN;
13100 return 0;
13101 }
13102 vFAIL("Internal urp");
13103 /* Supposed to be caught earlier. */
13104 break;
13105 case '?':
13106 case '+':
13107 case '*':
13108 RExC_parse++;
13109 vFAIL("Quantifier follows nothing");
13110 break;
13111 case '\\':
13112 /* Special Escapes
13113
13114 This switch handles escape sequences that resolve to some kind
13115 of special regop and not to literal text. Escape sequnces that
13116 resolve to literal text are handled below in the switch marked
13117 "Literal Escapes".
13118
13119 Every entry in this switch *must* have a corresponding entry
13120 in the literal escape switch. However, the opposite is not
13121 required, as the default for this switch is to jump to the
13122 literal text handling code.
13123 */
13124 RExC_parse++;
13125 switch ((U8)*RExC_parse) {
13126 /* Special Escapes */
13127 case 'A':
13128 RExC_seen_zerolen++;
13129 ret = reg_node(pRExC_state, SBOL);
13130 /* SBOL is shared with /^/ so we set the flags so we can tell
13131 * /\A/ from /^/ in split. We check ret because first pass we
13132 * have no regop struct to set the flags on. */
13133 if (PASS2)
13134 FLAGS(REGNODE_p(ret)) = 1;
13135 *flagp |= SIMPLE;
13136 goto finish_meta_pat;
13137 case 'G':
13138 ret = reg_node(pRExC_state, GPOS);
13139 RExC_seen |= REG_GPOS_SEEN;
13140 *flagp |= SIMPLE;
13141 goto finish_meta_pat;
13142 case 'K':
13143 RExC_seen_zerolen++;
13144 ret = reg_node(pRExC_state, KEEPS);
13145 *flagp |= SIMPLE;
13146 /* XXX:dmq : disabling in-place substitution seems to
13147 * be necessary here to avoid cases of memory corruption, as
13148 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13149 */
13150 RExC_seen |= REG_LOOKBEHIND_SEEN;
13151 goto finish_meta_pat;
13152 case 'Z':
13153 ret = reg_node(pRExC_state, SEOL);
13154 *flagp |= SIMPLE;
13155 RExC_seen_zerolen++; /* Do not optimize RE away */
13156 goto finish_meta_pat;
13157 case 'z':
13158 ret = reg_node(pRExC_state, EOS);
13159 *flagp |= SIMPLE;
13160 RExC_seen_zerolen++; /* Do not optimize RE away */
13161 goto finish_meta_pat;
13162 case 'C':
13163 vFAIL("\\C no longer supported");
13164 case 'X':
13165 ret = reg_node(pRExC_state, CLUMP);
13166 *flagp |= HASWIDTH;
13167 goto finish_meta_pat;
13168
13169 case 'W':
13170 invert = 1;
13171 /* FALLTHROUGH */
13172 case 'w':
13173 arg = ANYOF_WORDCHAR;
13174 goto join_posix;
13175
13176 case 'B':
13177 invert = 1;
13178 /* FALLTHROUGH */
13179 case 'b':
13180 {
13181 regex_charset charset = get_regex_charset(RExC_flags);
13182
13183 RExC_seen_zerolen++;
13184 RExC_seen |= REG_LOOKBEHIND_SEEN;
13185 op = BOUND + charset;
13186
13187 if (op == BOUNDL) {
13188 RExC_contains_locale = 1;
13189 }
13190
13191 ret = reg_node(pRExC_state, op);
13192 *flagp |= SIMPLE;
13193 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13194 FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
13195 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
13196 OP(REGNODE_p(ret)) = BOUNDA;
13197 }
13198 }
13199 else {
13200 STRLEN length;
13201 char name = *RExC_parse;
13202 char * endbrace = NULL;
13203 RExC_parse += 2;
13204 if (RExC_parse < RExC_end) {
13205 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13206 }
13207
13208 if (! endbrace) {
13209 vFAIL2("Missing right brace on \\%c{}", name);
13210 }
13211 /* XXX Need to decide whether to take spaces or not. Should be
13212 * consistent with \p{}, but that currently is SPACE, which
13213 * means vertical too, which seems wrong
13214 * while (isBLANK(*RExC_parse)) {
13215 RExC_parse++;
13216 }*/
13217 if (endbrace == RExC_parse) {
13218 RExC_parse++; /* After the '}' */
13219 vFAIL2("Empty \\%c{}", name);
13220 }
13221 length = endbrace - RExC_parse;
13222 /*while (isBLANK(*(RExC_parse + length - 1))) {
13223 length--;
13224 }*/
13225 switch (*RExC_parse) {
13226 case 'g':
13227 if ( length != 1
13228 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13229 {
13230 goto bad_bound_type;
13231 }
13232 FLAGS(REGNODE_p(ret)) = GCB_BOUND;
13233 break;
13234 case 'l':
13235 if (length != 2 || *(RExC_parse + 1) != 'b') {
13236 goto bad_bound_type;
13237 }
13238 FLAGS(REGNODE_p(ret)) = LB_BOUND;
13239 break;
13240 case 's':
13241 if (length != 2 || *(RExC_parse + 1) != 'b') {
13242 goto bad_bound_type;
13243 }
13244 FLAGS(REGNODE_p(ret)) = SB_BOUND;
13245 break;
13246 case 'w':
13247 if (length != 2 || *(RExC_parse + 1) != 'b') {
13248 goto bad_bound_type;
13249 }
13250 FLAGS(REGNODE_p(ret)) = WB_BOUND;
13251 break;
13252 default:
13253 bad_bound_type:
13254 RExC_parse = endbrace;
13255 vFAIL2utf8f(
13256 "'%" UTF8f "' is an unknown bound type",
13257 UTF8fARG(UTF, length, endbrace - length));
13258 NOT_REACHED; /*NOTREACHED*/
13259 }
13260 RExC_parse = endbrace;
13261 REQUIRE_UNI_RULES(flagp, 0);
13262
13263 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
13264 OP(REGNODE_p(ret)) = BOUNDU;
13265 length += 4;
13266
13267 /* Don't have to worry about UTF-8, in this message because
13268 * to get here the contents of the \b must be ASCII */
13269 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13270 "Using /u for '%.*s' instead of /%s",
13271 (unsigned) length,
13272 endbrace - length + 1,
13273 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13274 ? ASCII_RESTRICT_PAT_MODS
13275 : ASCII_MORE_RESTRICT_PAT_MODS);
13276 }
13277 }
13278
13279 if (PASS2 && invert) {
13280 OP(REGNODE_p(ret)) += NBOUND - BOUND;
13281 }
13282 goto finish_meta_pat;
13283 }
13284
13285 case 'D':
13286 invert = 1;
13287 /* FALLTHROUGH */
13288 case 'd':
13289 arg = ANYOF_DIGIT;
13290 if (! DEPENDS_SEMANTICS) {
13291 goto join_posix;
13292 }
13293
13294 /* \d doesn't have any matches in the upper Latin1 range, hence /d
13295 * is equivalent to /u. Changing to /u saves some branches at
13296 * runtime */
13297 op = POSIXU;
13298 goto join_posix_op_known;
13299
13300 case 'R':
13301 ret = reg_node(pRExC_state, LNBREAK);
13302 *flagp |= HASWIDTH|SIMPLE;
13303 goto finish_meta_pat;
13304
13305 case 'H':
13306 invert = 1;
13307 /* FALLTHROUGH */
13308 case 'h':
13309 arg = ANYOF_BLANK;
13310 op = POSIXU;
13311 goto join_posix_op_known;
13312
13313 case 'V':
13314 invert = 1;
13315 /* FALLTHROUGH */
13316 case 'v':
13317 arg = ANYOF_VERTWS;
13318 op = POSIXU;
13319 goto join_posix_op_known;
13320
13321 case 'S':
13322 invert = 1;
13323 /* FALLTHROUGH */
13324 case 's':
13325 arg = ANYOF_SPACE;
13326
13327 join_posix:
13328
13329 op = POSIXD + get_regex_charset(RExC_flags);
13330 if (op > POSIXA) { /* /aa is same as /a */
13331 op = POSIXA;
13332 }
13333 else if (op == POSIXL) {
13334 RExC_contains_locale = 1;
13335 }
13336
13337 join_posix_op_known:
13338
13339 if (invert) {
13340 op += NPOSIXD - POSIXD;
13341 }
13342
13343 ret = reg_node(pRExC_state, op);
13344 if (! SIZE_ONLY) {
13345 FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13346 }
13347
13348 *flagp |= HASWIDTH|SIMPLE;
13349 /* FALLTHROUGH */
13350
13351 finish_meta_pat:
13352 if ( UCHARAT(RExC_parse + 1) == '{'
13353 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13354 {
13355 RExC_parse += 2;
13356 vFAIL("Unescaped left brace in regex is illegal here");
13357 }
13358 nextchar(pRExC_state);
13359 Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13360 break;
13361 case 'p':
13362 case 'P':
13363 RExC_parse--;
13364
13365 ret = regclass(pRExC_state, flagp, depth+1,
13366 TRUE, /* means just parse this element */
13367 FALSE, /* don't allow multi-char folds */
13368 FALSE, /* don't silence non-portable warnings. It
13369 would be a bug if these returned
13370 non-portables */
13371 (bool) RExC_strict,
13372 TRUE, /* Allow an optimized regnode result */
13373 NULL,
13374 NULL);
13375 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13376 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13377 * multi-char folds are allowed. */
13378 if (!ret)
13379 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13380 (UV) *flagp);
13381
13382 RExC_parse--;
13383
13384 Set_Node_Offset(REGNODE_p(ret), parse_start);
13385 Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13386 nextchar(pRExC_state);
13387 break;
13388 case 'N':
13389 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13390 * \N{...} evaluates to a sequence of more than one code points).
13391 * The function call below returns a regnode, which is our result.
13392 * The parameters cause it to fail if the \N{} evaluates to a
13393 * single code point; we handle those like any other literal. The
13394 * reason that the multicharacter case is handled here and not as
13395 * part of the EXACtish code is because of quantifiers. In
13396 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13397 * this way makes that Just Happen. dmq.
13398 * join_exact() will join this up with adjacent EXACTish nodes
13399 * later on, if appropriate. */
13400 ++RExC_parse;
13401 if (grok_bslash_N(pRExC_state,
13402 &ret, /* Want a regnode returned */
13403 NULL, /* Fail if evaluates to a single code
13404 point */
13405 NULL, /* Don't need a count of how many code
13406 points */
13407 flagp,
13408 RExC_strict,
13409 depth)
13410 ) {
13411 break;
13412 }
13413
13414 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13415
13416 /* Here, evaluates to a single code point. Go get that */
13417 RExC_parse = parse_start;
13418 goto defchar;
13419
13420 case 'k': /* Handle \k<NAME> and \k'NAME' */
13421 parse_named_seq:
13422 {
13423 char ch;
13424 if ( RExC_parse >= RExC_end - 1
13425 || (( ch = RExC_parse[1]) != '<'
13426 && ch != '\''
13427 && ch != '{'))
13428 {
13429 RExC_parse++;
13430 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13431 vFAIL2("Sequence %.2s... not terminated", parse_start);
13432 } else {
13433 RExC_parse += 2;
13434 ret = handle_named_backref(pRExC_state,
13435 flagp,
13436 parse_start,
13437 (ch == '<')
13438 ? '>'
13439 : (ch == '{')
13440 ? '}'
13441 : '\'');
13442 }
13443 break;
13444 }
13445 case 'g':
13446 case '1': case '2': case '3': case '4':
13447 case '5': case '6': case '7': case '8': case '9':
13448 {
13449 I32 num;
13450 bool hasbrace = 0;
13451
13452 if (*RExC_parse == 'g') {
13453 bool isrel = 0;
13454
13455 RExC_parse++;
13456 if (*RExC_parse == '{') {
13457 RExC_parse++;
13458 hasbrace = 1;
13459 }
13460 if (*RExC_parse == '-') {
13461 RExC_parse++;
13462 isrel = 1;
13463 }
13464 if (hasbrace && !isDIGIT(*RExC_parse)) {
13465 if (isrel) RExC_parse--;
13466 RExC_parse -= 2;
13467 goto parse_named_seq;
13468 }
13469
13470 if (RExC_parse >= RExC_end) {
13471 goto unterminated_g;
13472 }
13473 num = S_backref_value(RExC_parse, RExC_end);
13474 if (num == 0)
13475 vFAIL("Reference to invalid group 0");
13476 else if (num == I32_MAX) {
13477 if (isDIGIT(*RExC_parse))
13478 vFAIL("Reference to nonexistent group");
13479 else
13480 unterminated_g:
13481 vFAIL("Unterminated \\g... pattern");
13482 }
13483
13484 if (isrel) {
13485 num = RExC_npar - num;
13486 if (num < 1)
13487 vFAIL("Reference to nonexistent or unclosed group");
13488 }
13489 }
13490 else {
13491 num = S_backref_value(RExC_parse, RExC_end);
13492 /* bare \NNN might be backref or octal - if it is larger
13493 * than or equal RExC_npar then it is assumed to be an
13494 * octal escape. Note RExC_npar is +1 from the actual
13495 * number of parens. */
13496 /* Note we do NOT check if num == I32_MAX here, as that is
13497 * handled by the RExC_npar check */
13498
13499 if (
13500 /* any numeric escape < 10 is always a backref */
13501 num > 9
13502 /* any numeric escape < RExC_npar is a backref */
13503 && num >= RExC_npar
13504 /* cannot be an octal escape if it starts with 8 */
13505 && *RExC_parse != '8'
13506 /* cannot be an octal escape it it starts with 9 */
13507 && *RExC_parse != '9'
13508 ) {
13509 /* Probably not meant to be a backref, instead likely
13510 * to be an octal character escape, e.g. \35 or \777.
13511 * The above logic should make it obvious why using
13512 * octal escapes in patterns is problematic. - Yves */
13513 RExC_parse = parse_start;
13514 goto defchar;
13515 }
13516 }
13517
13518 /* At this point RExC_parse points at a numeric escape like
13519 * \12 or \88 or something similar, which we should NOT treat
13520 * as an octal escape. It may or may not be a valid backref
13521 * escape. For instance \88888888 is unlikely to be a valid
13522 * backref. */
13523 while (isDIGIT(*RExC_parse))
13524 RExC_parse++;
13525 if (hasbrace) {
13526 if (*RExC_parse != '}')
13527 vFAIL("Unterminated \\g{...} pattern");
13528 RExC_parse++;
13529 }
13530 if (!SIZE_ONLY) {
13531 if (num > (I32)RExC_rx->nparens)
13532 vFAIL("Reference to nonexistent group");
13533 }
13534 RExC_sawback = 1;
13535 ret = reganode(pRExC_state,
13536 ((! FOLD)
13537 ? REF
13538 : (ASCII_FOLD_RESTRICTED)
13539 ? REFFA
13540 : (AT_LEAST_UNI_SEMANTICS)
13541 ? REFFU
13542 : (LOC)
13543 ? REFFL
13544 : REFF),
13545 num);
13546 *flagp |= HASWIDTH;
13547
13548 /* override incorrect value set in reganode MJD */
13549 Set_Node_Offset(REGNODE_p(ret), parse_start);
13550 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13551 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13552 FALSE /* Don't force to /x */ );
13553 }
13554 break;
13555 case '\0':
13556 if (RExC_parse >= RExC_end)
13557 FAIL("Trailing \\");
13558 /* FALLTHROUGH */
13559 default:
13560 /* Do not generate "unrecognized" warnings here, we fall
13561 back into the quick-grab loop below */
13562 RExC_parse = parse_start;
13563 goto defchar;
13564 } /* end of switch on a \foo sequence */
13565 break;
13566
13567 case '#':
13568
13569 /* '#' comments should have been spaced over before this function was
13570 * called */
13571 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13572 /*
13573 if (RExC_flags & RXf_PMf_EXTENDED) {
13574 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13575 if (RExC_parse < RExC_end)
13576 goto tryagain;
13577 }
13578 */
13579
13580 /* FALLTHROUGH */
13581
13582 default:
13583 defchar: {
13584
13585 /* Here, we have determined that the next thing is probably a
13586 * literal character. RExC_parse points to the first byte of its
13587 * definition. (It still may be an escape sequence that evaluates
13588 * to a single character) */
13589
13590 STRLEN len = 0;
13591 UV ender = 0;
13592 char *p;
13593 char *s;
13594
13595/* This allows us to fill a node with just enough spare so that if the final
13596 * character folds, its expansion is guaranteed to fit */
13597#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13598 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1];
13599
13600 char *s0;
13601 U8 upper_parse = MAX_NODE_STRING_SIZE;
13602
13603 /* We start out as an EXACT node, even if under /i, until we find a
13604 * character which is in a fold. The algorithm now segregates into
13605 * separate nodes, characters that fold from those that don't under
13606 * /i. (This hopefull will create nodes that are fixed strings
13607 * even under /i, giving the optimizer something to grab onto to.)
13608 * So, if a node has something in it and the next character is in
13609 * the opposite category, that node is closed up, and the function
13610 * returns. Then regatom is called again, and a new node is
13611 * created for the new category. */
13612 U8 node_type = EXACT;
13613
13614 bool next_is_quantifier;
13615 char * oldp = NULL;
13616
13617 /* We can convert EXACTF nodes to EXACTFU if they contain only
13618 * characters that match identically regardless of the target
13619 * string's UTF8ness. The reason to do this is that EXACTF is not
13620 * trie-able, EXACTFU is.
13621 *
13622 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13623 * contain only above-Latin1 characters (hence must be in UTF8),
13624 * which don't participate in folds with Latin1-range characters,
13625 * as the latter's folds aren't known until runtime. (We don't
13626 * need to figure this out until pass 2) */
13627 bool maybe_exactfu = PASS2;
13628
13629 /* To see if RExC_uni_semantics changes during parsing of the node.
13630 * */
13631 bool uni_semantics_at_node_start;
13632
13633 /* The node_type may change below, but since the size of the node
13634 * doesn't change, it works */
13635 ret = reg_node(pRExC_state, node_type);
13636
13637 /* In pass1, folded, we use a temporary buffer instead of the
13638 * actual node, as the node doesn't exist yet */
13639 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(REGNODE_p(ret));
13640
13641 s0 = s;
13642
13643 reparse:
13644
13645 /* This breaks under rare circumstances. If folding, we do not
13646 * want to split a node at a character that is a non-final in a
13647 * multi-char fold, as an input string could just happen to want to
13648 * match across the node boundary. The code at the end of the loop
13649 * looks for this, and backs off until it finds not such a
13650 * character, but it is possible (though extremely, extremely
13651 * unlikely) for all characters in the node to be non-final fold
13652 * ones, in which case we just leave the node fully filled, and
13653 * hope that it doesn't match the string in just the wrong place */
13654
13655 assert( ! UTF /* Is at the beginning of a character */
13656 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13657 || UTF8_IS_START(UCHARAT(RExC_parse)));
13658
13659 uni_semantics_at_node_start = cBOOL(RExC_uni_semantics);
13660
13661 /* Here, we have a literal character. Find the maximal string of
13662 * them in the input that we can fit into a single EXACTish node.
13663 * We quit at the first non-literal or when the node gets full, or
13664 * under /i the categorization of folding/non-folding character
13665 * changes */
13666 for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
13667
13668 /* In most cases each iteration adds one byte to the output.
13669 * The exceptions override this */
13670 Size_t added_len = 1;
13671
13672 oldp = p;
13673
13674 /* White space has already been ignored */
13675 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13676 || ! is_PATWS_safe((p), RExC_end, UTF));
13677
13678 switch ((U8)*p) {
13679 case '^':
13680 case '$':
13681 case '.':
13682 case '[':
13683 case '(':
13684 case ')':
13685 case '|':
13686 goto loopdone;
13687 case '\\':
13688 /* Literal Escapes Switch
13689
13690 This switch is meant to handle escape sequences that
13691 resolve to a literal character.
13692
13693 Every escape sequence that represents something
13694 else, like an assertion or a char class, is handled
13695 in the switch marked 'Special Escapes' above in this
13696 routine, but also has an entry here as anything that
13697 isn't explicitly mentioned here will be treated as
13698 an unescaped equivalent literal.
13699 */
13700
13701 switch ((U8)*++p) {
13702
13703 /* These are all the special escapes. */
13704 case 'A': /* Start assertion */
13705 case 'b': case 'B': /* Word-boundary assertion*/
13706 case 'C': /* Single char !DANGEROUS! */
13707 case 'd': case 'D': /* digit class */
13708 case 'g': case 'G': /* generic-backref, pos assertion */
13709 case 'h': case 'H': /* HORIZWS */
13710 case 'k': case 'K': /* named backref, keep marker */
13711 case 'p': case 'P': /* Unicode property */
13712 case 'R': /* LNBREAK */
13713 case 's': case 'S': /* space class */
13714 case 'v': case 'V': /* VERTWS */
13715 case 'w': case 'W': /* word class */
13716 case 'X': /* eXtended Unicode "combining
13717 character sequence" */
13718 case 'z': case 'Z': /* End of line/string assertion */
13719 --p;
13720 goto loopdone;
13721
13722 /* Anything after here is an escape that resolves to a
13723 literal. (Except digits, which may or may not)
13724 */
13725 case 'n':
13726 ender = '\n';
13727 p++;
13728 break;
13729 case 'N': /* Handle a single-code point named character. */
13730 RExC_parse = p + 1;
13731 if (! grok_bslash_N(pRExC_state,
13732 NULL, /* Fail if evaluates to
13733 anything other than a
13734 single code point */
13735 &ender, /* The returned single code
13736 point */
13737 NULL, /* Don't need a count of
13738 how many code points */
13739 flagp,
13740 RExC_strict,
13741 depth)
13742 ) {
13743 if (*flagp & NEED_UTF8)
13744 FAIL("panic: grok_bslash_N set NEED_UTF8");
13745 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13746
13747 /* Here, it wasn't a single code point. Go close
13748 * up this EXACTish node. The switch() prior to
13749 * this switch handles the other cases */
13750 RExC_parse = p = oldp;
13751 goto loopdone;
13752 }
13753 p = RExC_parse;
13754 RExC_parse = parse_start;
13755 if (ender > 0xff) {
13756 REQUIRE_UTF8(flagp);
13757 }
13758 break;
13759 case 'r':
13760 ender = '\r';
13761 p++;
13762 break;
13763 case 't':
13764 ender = '\t';
13765 p++;
13766 break;
13767 case 'f':
13768 ender = '\f';
13769 p++;
13770 break;
13771 case 'e':
13772 ender = ESC_NATIVE;
13773 p++;
13774 break;
13775 case 'a':
13776 ender = '\a';
13777 p++;
13778 break;
13779 case 'o':
13780 {
13781 UV result;
13782 const char* error_msg;
13783
13784 bool valid = grok_bslash_o(&p,
13785 RExC_end,
13786 &result,
13787 &error_msg,
13788 TO_OUTPUT_WARNINGS(p),
13789 (bool) RExC_strict,
13790 TRUE, /* Output warnings
13791 for non-
13792 portables */
13793 UTF);
13794 if (! valid) {
13795 RExC_parse = p; /* going to die anyway; point
13796 to exact spot of failure */
13797 vFAIL(error_msg);
13798 }
13799 UPDATE_WARNINGS_LOC(p - 1);
13800 ender = result;
13801 if (ender > 0xff) {
13802 REQUIRE_UTF8(flagp);
13803 }
13804 break;
13805 }
13806 case 'x':
13807 {
13808 UV result = UV_MAX; /* initialize to erroneous
13809 value */
13810 const char* error_msg;
13811
13812 bool valid = grok_bslash_x(&p,
13813 RExC_end,
13814 &result,
13815 &error_msg,
13816 TO_OUTPUT_WARNINGS(p),
13817 (bool) RExC_strict,
13818 TRUE, /* Silence warnings
13819 for non-
13820 portables */
13821 UTF);
13822 if (! valid) {
13823 RExC_parse = p; /* going to die anyway; point
13824 to exact spot of failure */
13825 vFAIL(error_msg);
13826 }
13827 UPDATE_WARNINGS_LOC(p - 1);
13828 ender = result;
13829
13830 if (ender < 0x100) {
13831#ifdef EBCDIC
13832 if (RExC_recode_x_to_native) {
13833 ender = LATIN1_TO_NATIVE(ender);
13834 }
13835#endif
13836 }
13837 else {
13838 REQUIRE_UTF8(flagp);
13839 }
13840 break;
13841 }
13842 case 'c':
13843 p++;
13844 ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
13845 UPDATE_WARNINGS_LOC(p);
13846 p++;
13847 break;
13848 case '8': case '9': /* must be a backreference */
13849 --p;
13850 /* we have an escape like \8 which cannot be an octal escape
13851 * so we exit the loop, and let the outer loop handle this
13852 * escape which may or may not be a legitimate backref. */
13853 goto loopdone;
13854 case '1': case '2': case '3':case '4':
13855 case '5': case '6': case '7':
13856 /* When we parse backslash escapes there is ambiguity
13857 * between backreferences and octal escapes. Any escape
13858 * from \1 - \9 is a backreference, any multi-digit
13859 * escape which does not start with 0 and which when
13860 * evaluated as decimal could refer to an already
13861 * parsed capture buffer is a back reference. Anything
13862 * else is octal.
13863 *
13864 * Note this implies that \118 could be interpreted as
13865 * 118 OR as "\11" . "8" depending on whether there
13866 * were 118 capture buffers defined already in the
13867 * pattern. */
13868
13869 /* NOTE, RExC_npar is 1 more than the actual number of
13870 * parens we have seen so far, hence the "<" as opposed
13871 * to "<=" */
13872 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
13873 { /* Not to be treated as an octal constant, go
13874 find backref */
13875 --p;
13876 goto loopdone;
13877 }
13878 /* FALLTHROUGH */
13879 case '0':
13880 {
13881 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13882 STRLEN numlen = 3;
13883 ender = grok_oct(p, &numlen, &flags, NULL);
13884 if (ender > 0xff) {
13885 REQUIRE_UTF8(flagp);
13886 }
13887 p += numlen;
13888 if ( isDIGIT(*p) /* like \08, \178 */
13889 && ckWARN(WARN_REGEXP)
13890 && numlen < 3)
13891 {
13892 reg_warn_non_literal_string(
13893 p + 1,
13894 form_short_octal_warning(p, numlen));
13895 }
13896 }
13897 break;
13898 case '\0':
13899 if (p >= RExC_end)
13900 FAIL("Trailing \\");
13901 /* FALLTHROUGH */
13902 default:
13903 if (isALPHANUMERIC(*p)) {
13904 /* An alpha followed by '{' is going to fail next
13905 * iteration, so don't output this warning in that
13906 * case */
13907 if (! isALPHA(*p) || *(p + 1) != '{') {
13908 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
13909 " passed through", p);
13910 }
13911 }
13912 goto normal_default;
13913 } /* End of switch on '\' */
13914 break;
13915 case '{':
13916 /* Trying to gain new uses for '{' without breaking too
13917 * much existing code is hard. The solution currently
13918 * adopted is:
13919 * 1) If there is no ambiguity that a '{' should always
13920 * be taken literally, at the start of a construct, we
13921 * just do so.
13922 * 2) If the literal '{' conflicts with our desired use
13923 * of it as a metacharacter, we die. The deprecation
13924 * cycles for this have come and gone.
13925 * 3) If there is ambiguity, we raise a simple warning.
13926 * This could happen, for example, if the user
13927 * intended it to introduce a quantifier, but slightly
13928 * misspelled the quantifier. Without this warning,
13929 * the quantifier would silently be taken as a literal
13930 * string of characters instead of a meta construct */
13931 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
13932 if ( RExC_strict
13933 || ( p > parse_start + 1
13934 && isALPHA_A(*(p - 1))
13935 && *(p - 2) == '\\')
13936 || new_regcurly(p, RExC_end))
13937 {
13938 RExC_parse = p + 1;
13939 vFAIL("Unescaped left brace in regex is "
13940 "illegal here");
13941 }
13942 ckWARNreg(p + 1, "Unescaped left brace in regex is"
13943 " passed through");
13944 }
13945 goto normal_default;
13946 case '}':
13947 case ']':
13948 if (p > RExC_parse && RExC_strict) {
13949 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
13950 }
13951 /*FALLTHROUGH*/
13952 default: /* A literal character */
13953 normal_default:
13954 if (! UTF8_IS_INVARIANT(*p) && UTF) {
13955 STRLEN numlen;
13956 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
13957 &numlen, UTF8_ALLOW_DEFAULT);
13958 p += numlen;
13959 }
13960 else
13961 ender = (U8) *p++;
13962 break;
13963 } /* End of switch on the literal */
13964
13965 /* Here, have looked at the literal character, and <ender>
13966 * contains its ordinal; <p> points to the character after it.
13967 * We need to check if the next non-ignored thing is a
13968 * quantifier. Move <p> to after anything that should be
13969 * ignored, which, as a side effect, positions <p> for the next
13970 * loop iteration */
13971 skip_to_be_ignored_text(pRExC_state, &p,
13972 FALSE /* Don't force to /x */ );
13973
13974 /* If the next thing is a quantifier, it applies to this
13975 * character only, which means that this character has to be in
13976 * its own node and can't just be appended to the string in an
13977 * existing node, so if there are already other characters in
13978 * the node, close the node with just them, and set up to do
13979 * this character again next time through, when it will be the
13980 * only thing in its new node */
13981
13982 next_is_quantifier = LIKELY(p < RExC_end)
13983 && UNLIKELY(ISMULT2(p));
13984
13985 if (next_is_quantifier && LIKELY(len)) {
13986 p = oldp;
13987 goto loopdone;
13988 }
13989
13990 /* Ready to add 'ender' to the node */
13991
13992 if (! FOLD) { /* The simple case, just append the literal */
13993
13994 /* In the sizing pass, we need only the size of the
13995 * character we are appending, hence we can delay getting
13996 * its representation until PASS2. */
13997 if (SIZE_ONLY) {
13998 if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
13999 const STRLEN unilen = UVCHR_SKIP(ender);
14000 s += unilen;
14001 added_len = unilen;
14002 }
14003 else {
14004 s++;
14005 }
14006 } else { /* PASS2 */
14007 not_fold_common:
14008 if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
14009 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14010 added_len = (char *) new_s - s;
14011 s = (char *) new_s;
14012 }
14013 else {
14014 *(s++) = (char) ender;
14015 }
14016 }
14017 }
14018 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14019
14020 /* Here are folding under /l, and the code point is
14021 * problematic. If this is the first character in the
14022 * node, change the node type to folding. Otherwise, if
14023 * this is the first problematic character, close up the
14024 * existing node, so can start a new node with this one */
14025 if (! len) {
14026 node_type = EXACTFL;
14027 }
14028 else if (node_type == EXACT) {
14029 p = oldp;
14030 goto loopdone;
14031 }
14032
14033 /* This code point means we can't simplify things */
14034 maybe_exactfu = FALSE;
14035
14036 /* A problematic code point in this context means that its
14037 * fold isn't known until runtime, so we can't fold it now.
14038 * (The non-problematic code points are the above-Latin1
14039 * ones that fold to also all above-Latin1. Their folds
14040 * don't vary no matter what the locale is.) But here we
14041 * have characters whose fold depends on the locale.
14042 * Unlike the non-folding case above, we have to keep track
14043 * of these in the sizing pass, so that we can make sure we
14044 * don't split too-long nodes in the middle of a potential
14045 * multi-char fold. And unlike the regular fold case
14046 * handled in the else clauses below, we don't actually
14047 * fold and don't have special cases to consider. What we
14048 * do for both passes is the PASS2 code for non-folding */
14049 goto not_fold_common;
14050 }
14051 else /* A regular FOLD code point */
14052 if (! UTF)
14053 {
14054 /* Here, are folding and are not UTF-8 encoded; therefore
14055 * the character must be in the range 0-255, and is not /l.
14056 * (Not /l because we already handled these under /l in
14057 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
14058 if (! IS_IN_SOME_FOLD_L1(ender)) {
14059
14060 /* Start a new node for this non-folding character if
14061 * previous ones in the node were folded */
14062 if (len && node_type != EXACT) {
14063 p = oldp;
14064 goto loopdone;
14065 }
14066
14067 *(s++) = (char) ender;
14068 }
14069 else { /* Here, does participate in some fold */
14070
14071 /* if this is the first character in the node, change
14072 * its type to folding. Otherwise, if this is the
14073 * first folding character in the node, close up the
14074 * existing node, so can start a new node with this
14075 * one. */
14076 if (! len) {
14077 node_type = compute_EXACTish(pRExC_state);
14078 }
14079 else if (node_type == EXACT) {
14080 p = oldp;
14081 goto loopdone;
14082 }
14083
14084 /* See if the character's fold differs between /d and
14085 * /u. On non-ancient Unicode versions, this includes
14086 * the multi-char fold SHARP S to 'ss' */
14087
14088#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14089 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14090 || UNICODE_DOT_DOT_VERSION > 0)
14091
14092 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14093
14094 /* See comments for join_exact() as to why we fold
14095 * this non-UTF at compile time */
14096 if (node_type == EXACTFU) {
14097 *(s++) = 's';
14098
14099 /* Let the code below add in the extra 's' */
14100 ender = 's';
14101 added_len = 2;
14102 }
14103 else if ( uni_semantics_at_node_start
14104 != RExC_uni_semantics)
14105 {
14106 /* Here, we are supossed to be using Unicode
14107 * rules, but this folding node is not. This
14108 * happens during pass 1 when the node started
14109 * out not under Unicode rules, but a \N{} was
14110 * encountered during the processing of it,
14111 * causing Unicode rules to be switched into.
14112 * Pass 1 continues uninterrupted, as by the
14113 * time we get to pass 2, we will know enough
14114 * to generate the correct folds. Except in
14115 * this one case, we need to restart the node,
14116 * because the fold of the sharp s requires 2
14117 * characters, and the sizing needs to account
14118 * for that. */
14119 p = oldp;
14120 goto loopdone;
14121 }
14122 else {
14123 RExC_seen_unfolded_sharp_s = 1;
14124 maybe_exactfu = FALSE;
14125 }
14126 }
14127 else if ( len
14128 && isALPHA_FOLD_EQ(ender, 's')
14129 && isALPHA_FOLD_EQ(*(s-1), 's'))
14130 {
14131 maybe_exactfu = FALSE;
14132 }
14133 else
14134#endif
14135
14136 if (PL_fold[ender] != PL_fold_latin1[ender]) {
14137 maybe_exactfu = FALSE;
14138 }
14139
14140 /* Even when folding, we store just the input
14141 * character, as we have an array that finds its fold
14142 * quickly */
14143 *(s++) = (char) ender;
14144 }
14145 }
14146 else { /* FOLD, and UTF */
14147 /* Unlike the non-fold case, we do actually have to
14148 * calculate the fold in pass 1. This is for two reasons,
14149 * the folded length may be longer than the unfolded, and
14150 * we have to calculate how many EXACTish nodes it will
14151 * take; and we may run out of room in a node in the middle
14152 * of a potential multi-char fold, and have to back off
14153 * accordingly. */
14154
14155 if (isASCII_uni(ender)) {
14156
14157 /* As above, we close up and start a new node if the
14158 * previous characters don't match the fold/non-fold
14159 * state of this one. And if this is the first
14160 * character in the node, and it folds, we change the
14161 * node away from being EXACT */
14162 if (! IS_IN_SOME_FOLD_L1(ender)) {
14163 if (len && node_type != EXACT) {
14164 p = oldp;
14165 goto loopdone;
14166 }
14167
14168 *(s)++ = (U8) ender;
14169 }
14170 else { /* Is in a fold */
14171
14172 if (! len) {
14173 node_type = compute_EXACTish(pRExC_state);
14174 }
14175 else if (node_type == EXACT) {
14176 p = oldp;
14177 goto loopdone;
14178 }
14179
14180 *(s)++ = (U8) toFOLD(ender);
14181 }
14182 }
14183 else { /* Not ASCII */
14184 STRLEN foldlen;
14185
14186 /* As above, we close up and start a new node if the
14187 * previous characters don't match the fold/non-fold
14188 * state of this one. And if this is the first
14189 * character in the node, and it folds, we change the
14190 * node away from being EXACT */
14191 if (! _invlist_contains_cp(PL_utf8_foldable, ender)) {
14192 if (len && node_type != EXACT) {
14193 p = oldp;
14194 goto loopdone;
14195 }
14196
14197 s = (char *) uvchr_to_utf8((U8 *) s, ender);
14198 added_len = UVCHR_SKIP(ender);
14199 }
14200 else {
14201
14202 if (! len) {
14203 node_type = compute_EXACTish(pRExC_state);
14204 }
14205 else if (node_type == EXACT) {
14206 p = oldp;
14207 goto loopdone;
14208 }
14209
14210 ender = _to_uni_fold_flags(
14211 ender,
14212 (U8 *) s,
14213 &foldlen,
14214 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14215 ? FOLD_FLAGS_NOMIX_ASCII
14216 : 0));
14217 s += foldlen;
14218 added_len = foldlen;
14219 }
14220 }
14221 }
14222
14223 len += added_len;
14224
14225 if (next_is_quantifier) {
14226
14227 /* Here, the next input is a quantifier, and to get here,
14228 * the current character is the only one in the node. */
14229 goto loopdone;
14230 }
14231
14232 } /* End of loop through literal characters */
14233
14234 /* Here we have either exhausted the input or ran out of room in
14235 * the node. (If we encountered a character that can't be in the
14236 * node, transfer is made directly to <loopdone>, and so we
14237 * wouldn't have fallen off the end of the loop.) In the latter
14238 * case, we artificially have to split the node into two, because
14239 * we just don't have enough space to hold everything. This
14240 * creates a problem if the final character participates in a
14241 * multi-character fold in the non-final position, as a match that
14242 * should have occurred won't, due to the way nodes are matched,
14243 * and our artificial boundary. So back off until we find a non-
14244 * problematic character -- one that isn't at the beginning or
14245 * middle of such a fold. (Either it doesn't participate in any
14246 * folds, or appears only in the final position of all the folds it
14247 * does participate in.) A better solution with far fewer false
14248 * positives, and that would fill the nodes more completely, would
14249 * be to actually have available all the multi-character folds to
14250 * test against, and to back-off only far enough to be sure that
14251 * this node isn't ending with a partial one. <upper_parse> is set
14252 * further below (if we need to reparse the node) to include just
14253 * up through that final non-problematic character that this code
14254 * identifies, so when it is set to less than the full node, we can
14255 * skip the rest of this */
14256 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14257
14258 const STRLEN full_len = len;
14259
14260 assert(len >= MAX_NODE_STRING_SIZE);
14261
14262 /* Here, <s> points to the final byte of the final character.
14263 * Look backwards through the string until find a non-
14264 * problematic character */
14265
14266 if (! UTF) {
14267
14268 /* This has no multi-char folds to non-UTF characters */
14269 if (ASCII_FOLD_RESTRICTED) {
14270 goto loopdone;
14271 }
14272
14273 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
14274 len = s - s0 + 1;
14275 }
14276 else {
14277
14278 /* Point to the first byte of the final character */
14279 s = (char *) utf8_hop((U8 *) s, -1);
14280
14281 while (s >= s0) { /* Search backwards until find
14282 a non-problematic char */
14283 if (UTF8_IS_INVARIANT(*s)) {
14284
14285 /* There are no ascii characters that participate
14286 * in multi-char folds under /aa. In EBCDIC, the
14287 * non-ascii invariants are all control characters,
14288 * so don't ever participate in any folds. */
14289 if (ASCII_FOLD_RESTRICTED
14290 || ! IS_NON_FINAL_FOLD(*s))
14291 {
14292 break;
14293 }
14294 }
14295 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14296 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14297 *s, *(s+1))))
14298 {
14299 break;
14300 }
14301 }
14302 else if (! _invlist_contains_cp(
14303 PL_NonL1NonFinalFold,
14304 valid_utf8_to_uvchr((U8 *) s, NULL)))
14305 {
14306 break;
14307 }
14308
14309 /* Here, the current character is problematic in that
14310 * it does occur in the non-final position of some
14311 * fold, so try the character before it, but have to
14312 * special case the very first byte in the string, so
14313 * we don't read outside the string */
14314 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14315 } /* End of loop backwards through the string */
14316
14317 /* If there were only problematic characters in the string,
14318 * <s> will point to before s0, in which case the length
14319 * should be 0, otherwise include the length of the
14320 * non-problematic character just found */
14321 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14322 }
14323
14324 /* Here, have found the final character, if any, that is
14325 * non-problematic as far as ending the node without splitting
14326 * it across a potential multi-char fold. <len> contains the
14327 * number of bytes in the node up-to and including that
14328 * character, or is 0 if there is no such character, meaning
14329 * the whole node contains only problematic characters. In
14330 * this case, give up and just take the node as-is. We can't
14331 * do any better */
14332 if (len == 0) {
14333 len = full_len;
14334
14335 /* If the node ends in an 's' we make sure it stays EXACTF,
14336 * as if it turns into an EXACTFU, it could later get
14337 * joined with another 's' that would then wrongly match
14338 * the sharp s */
14339 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
14340 {
14341 maybe_exactfu = FALSE;
14342 }
14343 } else {
14344
14345 /* Here, the node does contain some characters that aren't
14346 * problematic. If one such is the final character in the
14347 * node, we are done */
14348 if (len == full_len) {
14349 goto loopdone;
14350 }
14351 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
14352
14353 /* If the final character is problematic, but the
14354 * penultimate is not, back-off that last character to
14355 * later start a new node with it */
14356 p = oldp;
14357 goto loopdone;
14358 }
14359
14360 /* Here, the final non-problematic character is earlier
14361 * in the input than the penultimate character. What we do
14362 * is reparse from the beginning, going up only as far as
14363 * this final ok one, thus guaranteeing that the node ends
14364 * in an acceptable character. The reason we reparse is
14365 * that we know how far in the character is, but we don't
14366 * know how to correlate its position with the input parse.
14367 * An alternate implementation would be to build that
14368 * correlation as we go along during the original parse,
14369 * but that would entail extra work for every node, whereas
14370 * this code gets executed only when the string is too
14371 * large for the node, and the final two characters are
14372 * problematic, an infrequent occurrence. Yet another
14373 * possible strategy would be to save the tail of the
14374 * string, and the next time regatom is called, initialize
14375 * with that. The problem with this is that unless you
14376 * back off one more character, you won't be guaranteed
14377 * regatom will get called again, unless regbranch,
14378 * regpiece ... are also changed. If you do back off that
14379 * extra character, so that there is input guaranteed to
14380 * force calling regatom, you can't handle the case where
14381 * just the first character in the node is acceptable. I
14382 * (khw) decided to try this method which doesn't have that
14383 * pitfall; if performance issues are found, we can do a
14384 * combination of the current approach plus that one */
14385 upper_parse = len;
14386 len = 0;
14387 s = s0;
14388 goto reparse;
14389 }
14390 } /* End of verifying node ends with an appropriate char */
14391
14392 loopdone: /* Jumped to when encounters something that shouldn't be
14393 in the node */
14394
14395 /* I (khw) don't know if you can get here with zero length, but the
14396 * old code handled this situation by creating a zero-length EXACT
14397 * node. Might as well be NOTHING instead */
14398 if (len == 0) {
14399 OP(REGNODE_p(ret)) = NOTHING;
14400 }
14401 else {
14402 OP(REGNODE_p(ret)) = node_type;
14403
14404 /* If the node type is EXACT here, check to see if it
14405 * should be EXACTL. */
14406 if (node_type == EXACT) {
14407 if (LOC) {
14408 OP(REGNODE_p(ret)) = EXACTL;
14409 }
14410 }
14411
14412 if (FOLD) {
14413 /* If 'maybe_exactfu' is set, then there are no code points
14414 * that match differently depending on UTF8ness of the
14415 * target string (for /u), or depending on locale for /l */
14416 if (maybe_exactfu) {
14417 if (node_type == EXACTF) {
14418 OP(REGNODE_p(ret)) = EXACTFU;
14419 }
14420 else if (node_type == EXACTFL) {
14421 OP(REGNODE_p(ret)) = EXACTFLU8;
14422 }
14423 }
14424 }
14425
14426 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
14427 FALSE /* Don't look to see if could
14428 be turned into an EXACT
14429 node, as we have already
14430 computed that */
14431 );
14432 }
14433
14434 RExC_parse = p - 1;
14435 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
14436 RExC_parse = p;
14437 {
14438 /* len is STRLEN which is unsigned, need to copy to signed */
14439 IV iv = len;
14440 if (iv < 0)
14441 vFAIL("Internal disaster");
14442 }
14443
14444 } /* End of label 'defchar:' */
14445 break;
14446 } /* End of giant switch on input character */
14447
14448 /* Position parse to next real character */
14449 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14450 FALSE /* Don't force to /x */ );
14451 if ( PASS2 && *RExC_parse == '{'
14452 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14453 {
14454 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14455 RExC_parse++;
14456 vFAIL("Unescaped left brace in regex is illegal here");
14457 }
14458 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14459 " passed through");
14460 }
14461
14462 return(ret);
14463}
14464
14465
14466STATIC void
14467S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14468{
14469 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
14470 * sets up the bitmap and any flags, removing those code points from the
14471 * inversion list, setting it to NULL should it become completely empty */
14472
14473 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14474 assert(PL_regkind[OP(node)] == ANYOF);
14475
14476 ANYOF_BITMAP_ZERO(node);
14477 if (*invlist_ptr) {
14478
14479 /* This gets set if we actually need to modify things */
14480 bool change_invlist = FALSE;
14481
14482 UV start, end;
14483
14484 /* Start looking through *invlist_ptr */
14485 invlist_iterinit(*invlist_ptr);
14486 while (invlist_iternext(*invlist_ptr, &start, &end)) {
14487 UV high;
14488 int i;
14489
14490 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14491 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14492 }
14493
14494 /* Quit if are above what we should change */
14495 if (start >= NUM_ANYOF_CODE_POINTS) {
14496 break;
14497 }
14498
14499 change_invlist = TRUE;
14500
14501 /* Set all the bits in the range, up to the max that we are doing */
14502 high = (end < NUM_ANYOF_CODE_POINTS - 1)
14503 ? end
14504 : NUM_ANYOF_CODE_POINTS - 1;
14505 for (i = start; i <= (int) high; i++) {
14506 if (! ANYOF_BITMAP_TEST(node, i)) {
14507 ANYOF_BITMAP_SET(node, i);
14508 }
14509 }
14510 }
14511 invlist_iterfinish(*invlist_ptr);
14512
14513 /* Done with loop; remove any code points that are in the bitmap from
14514 * *invlist_ptr; similarly for code points above the bitmap if we have
14515 * a flag to match all of them anyways */
14516 if (change_invlist) {
14517 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14518 }
14519 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14520 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14521 }
14522
14523 /* If have completely emptied it, remove it completely */
14524 if (_invlist_len(*invlist_ptr) == 0) {
14525 SvREFCNT_dec_NN(*invlist_ptr);
14526 *invlist_ptr = NULL;
14527 }
14528 }
14529}
14530
14531/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14532 Character classes ([:foo:]) can also be negated ([:^foo:]).
14533 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14534 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14535 but trigger failures because they are currently unimplemented. */
14536
14537#define POSIXCC_DONE(c) ((c) == ':')
14538#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14539#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14540#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14541
14542#define WARNING_PREFIX "Assuming NOT a POSIX class since "
14543#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
14544#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
14545
14546#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14547
14548/* 'posix_warnings' and 'warn_text' are names of variables in the following
14549 * routine. q.v. */
14550#define ADD_POSIX_WARNING(p, text) STMT_START { \
14551 if (posix_warnings) { \
14552 if (! RExC_warn_text ) RExC_warn_text = \
14553 (AV *) sv_2mortal((SV *) newAV()); \
14554 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
14555 WARNING_PREFIX \
14556 text \
14557 REPORT_LOCATION, \
14558 REPORT_LOCATION_ARGS(p))); \
14559 } \
14560 } STMT_END
14561#define CLEAR_POSIX_WARNINGS() \
14562 STMT_START { \
14563 if (posix_warnings && RExC_warn_text) \
14564 av_clear(RExC_warn_text); \
14565 } STMT_END
14566
14567#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
14568 STMT_START { \
14569 CLEAR_POSIX_WARNINGS(); \
14570 return ret; \
14571 } STMT_END
14572
14573STATIC int
14574S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14575
14576 const char * const s, /* Where the putative posix class begins.
14577 Normally, this is one past the '['. This
14578 parameter exists so it can be somewhere
14579 besides RExC_parse. */
14580 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14581 NULL */
14582 AV ** posix_warnings, /* Where to place any generated warnings, or
14583 NULL */
14584 const bool check_only /* Don't die if error */
14585)
14586{
14587 /* This parses what the caller thinks may be one of the three POSIX
14588 * constructs:
14589 * 1) a character class, like [:blank:]
14590 * 2) a collating symbol, like [. .]
14591 * 3) an equivalence class, like [= =]
14592 * In the latter two cases, it croaks if it finds a syntactically legal
14593 * one, as these are not handled by Perl.
14594 *
14595 * The main purpose is to look for a POSIX character class. It returns:
14596 * a) the class number
14597 * if it is a completely syntactically and semantically legal class.
14598 * 'updated_parse_ptr', if not NULL, is set to point to just after the
14599 * closing ']' of the class
14600 * b) OOB_NAMEDCLASS
14601 * if it appears that one of the three POSIX constructs was meant, but
14602 * its specification was somehow defective. 'updated_parse_ptr', if
14603 * not NULL, is set to point to the character just after the end
14604 * character of the class. See below for handling of warnings.
14605 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14606 * if it doesn't appear that a POSIX construct was intended.
14607 * 'updated_parse_ptr' is not changed. No warnings nor errors are
14608 * raised.
14609 *
14610 * In b) there may be errors or warnings generated. If 'check_only' is
14611 * TRUE, then any errors are discarded. Warnings are returned to the
14612 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
14613 * instead it is NULL, warnings are suppressed. This is done in all
14614 * passes. The reason for this is that the rest of the parsing is heavily
14615 * dependent on whether this routine found a valid posix class or not. If
14616 * it did, the closing ']' is absorbed as part of the class. If no class,
14617 * or an invalid one is found, any ']' will be considered the terminator of
14618 * the outer bracketed character class, leading to very different results.
14619 * In particular, a '(?[ ])' construct will likely have a syntax error if
14620 * the class is parsed other than intended, and this will happen in pass1,
14621 * before the warnings would normally be output. This mechanism allows the
14622 * caller to output those warnings in pass1 just before dieing, giving a
14623 * much better clue as to what is wrong.
14624 *
14625 * The reason for this function, and its complexity is that a bracketed
14626 * character class can contain just about anything. But it's easy to
14627 * mistype the very specific posix class syntax but yielding a valid
14628 * regular bracketed class, so it silently gets compiled into something
14629 * quite unintended.
14630 *
14631 * The solution adopted here maintains backward compatibility except that
14632 * it adds a warning if it looks like a posix class was intended but
14633 * improperly specified. The warning is not raised unless what is input
14634 * very closely resembles one of the 14 legal posix classes. To do this,
14635 * it uses fuzzy parsing. It calculates how many single-character edits it
14636 * would take to transform what was input into a legal posix class. Only
14637 * if that number is quite small does it think that the intention was a
14638 * posix class. Obviously these are heuristics, and there will be cases
14639 * where it errs on one side or another, and they can be tweaked as
14640 * experience informs.
14641 *
14642 * The syntax for a legal posix class is:
14643 *
14644 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14645 *
14646 * What this routine considers syntactically to be an intended posix class
14647 * is this (the comments indicate some restrictions that the pattern
14648 * doesn't show):
14649 *
14650 * qr/(?x: \[? # The left bracket, possibly
14651 * # omitted
14652 * \h* # possibly followed by blanks
14653 * (?: \^ \h* )? # possibly a misplaced caret
14654 * [:;]? # The opening class character,
14655 * # possibly omitted. A typo
14656 * # semi-colon can also be used.
14657 * \h*
14658 * \^? # possibly a correctly placed
14659 * # caret, but not if there was also
14660 * # a misplaced one
14661 * \h*
14662 * .{3,15} # The class name. If there are
14663 * # deviations from the legal syntax,
14664 * # its edit distance must be close
14665 * # to a real class name in order
14666 * # for it to be considered to be
14667 * # an intended posix class.
14668 * \h*
14669 * [[:punct:]]? # The closing class character,
14670 * # possibly omitted. If not a colon
14671 * # nor semi colon, the class name
14672 * # must be even closer to a valid
14673 * # one
14674 * \h*
14675 * \]? # The right bracket, possibly
14676 * # omitted.
14677 * )/
14678 *
14679 * In the above, \h must be ASCII-only.
14680 *
14681 * These are heuristics, and can be tweaked as field experience dictates.
14682 * There will be cases when someone didn't intend to specify a posix class
14683 * that this warns as being so. The goal is to minimize these, while
14684 * maximizing the catching of things intended to be a posix class that
14685 * aren't parsed as such.
14686 */
14687
14688 const char* p = s;
14689 const char * const e = RExC_end;
14690 unsigned complement = 0; /* If to complement the class */
14691 bool found_problem = FALSE; /* Assume OK until proven otherwise */
14692 bool has_opening_bracket = FALSE;
14693 bool has_opening_colon = FALSE;
14694 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
14695 valid class */
14696 const char * possible_end = NULL; /* used for a 2nd parse pass */
14697 const char* name_start; /* ptr to class name first char */
14698
14699 /* If the number of single-character typos the input name is away from a
14700 * legal name is no more than this number, it is considered to have meant
14701 * the legal name */
14702 int max_distance = 2;
14703
14704 /* to store the name. The size determines the maximum length before we
14705 * decide that no posix class was intended. Should be at least
14706 * sizeof("alphanumeric") */
14707 UV input_text[15];
14708 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14709
14710 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14711
14712 CLEAR_POSIX_WARNINGS();
14713
14714 if (p >= e) {
14715 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14716 }
14717
14718 if (*(p - 1) != '[') {
14719 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14720 found_problem = TRUE;
14721 }
14722 else {
14723 has_opening_bracket = TRUE;
14724 }
14725
14726 /* They could be confused and think you can put spaces between the
14727 * components */
14728 if (isBLANK(*p)) {
14729 found_problem = TRUE;
14730
14731 do {
14732 p++;
14733 } while (p < e && isBLANK(*p));
14734
14735 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14736 }
14737
14738 /* For [. .] and [= =]. These are quite different internally from [: :],
14739 * so they are handled separately. */
14740 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14741 and 1 for at least one char in it
14742 */
14743 {
14744 const char open_char = *p;
14745 const char * temp_ptr = p + 1;
14746
14747 /* These two constructs are not handled by perl, and if we find a
14748 * syntactically valid one, we croak. khw, who wrote this code, finds
14749 * this explanation of them very unclear:
14750 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14751 * And searching the rest of the internet wasn't very helpful either.
14752 * It looks like just about any byte can be in these constructs,
14753 * depending on the locale. But unless the pattern is being compiled
14754 * under /l, which is very rare, Perl runs under the C or POSIX locale.
14755 * In that case, it looks like [= =] isn't allowed at all, and that
14756 * [. .] could be any single code point, but for longer strings the
14757 * constituent characters would have to be the ASCII alphabetics plus
14758 * the minus-hyphen. Any sensible locale definition would limit itself
14759 * to these. And any portable one definitely should. Trying to parse
14760 * the general case is a nightmare (see [perl #127604]). So, this code
14761 * looks only for interiors of these constructs that match:
14762 * qr/.|[-\w]{2,}/
14763 * Using \w relaxes the apparent rules a little, without adding much
14764 * danger of mistaking something else for one of these constructs.
14765 *
14766 * [. .] in some implementations described on the internet is usable to
14767 * escape a character that otherwise is special in bracketed character
14768 * classes. For example [.].] means a literal right bracket instead of
14769 * the ending of the class
14770 *
14771 * [= =] can legitimately contain a [. .] construct, but we don't
14772 * handle this case, as that [. .] construct will later get parsed
14773 * itself and croak then. And [= =] is checked for even when not under
14774 * /l, as Perl has long done so.
14775 *
14776 * The code below relies on there being a trailing NUL, so it doesn't
14777 * have to keep checking if the parse ptr < e.
14778 */
14779 if (temp_ptr[1] == open_char) {
14780 temp_ptr++;
14781 }
14782 else while ( temp_ptr < e
14783 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14784 {
14785 temp_ptr++;
14786 }
14787
14788 if (*temp_ptr == open_char) {
14789 temp_ptr++;
14790 if (*temp_ptr == ']') {
14791 temp_ptr++;
14792 if (! found_problem && ! check_only) {
14793 RExC_parse = (char *) temp_ptr;
14794 vFAIL3("POSIX syntax [%c %c] is reserved for future "
14795 "extensions", open_char, open_char);
14796 }
14797
14798 /* Here, the syntax wasn't completely valid, or else the call
14799 * is to check-only */
14800 if (updated_parse_ptr) {
14801 *updated_parse_ptr = (char *) temp_ptr;
14802 }
14803
14804 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14805 }
14806 }
14807
14808 /* If we find something that started out to look like one of these
14809 * constructs, but isn't, we continue below so that it can be checked
14810 * for being a class name with a typo of '.' or '=' instead of a colon.
14811 * */
14812 }
14813
14814 /* Here, we think there is a possibility that a [: :] class was meant, and
14815 * we have the first real character. It could be they think the '^' comes
14816 * first */
14817 if (*p == '^') {
14818 found_problem = TRUE;
14819 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14820 complement = 1;
14821 p++;
14822
14823 if (isBLANK(*p)) {
14824 found_problem = TRUE;
14825
14826 do {
14827 p++;
14828 } while (p < e && isBLANK(*p));
14829
14830 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14831 }
14832 }
14833
14834 /* But the first character should be a colon, which they could have easily
14835 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14836 * distinguish from a colon, so treat that as a colon). */
14837 if (*p == ':') {
14838 p++;
14839 has_opening_colon = TRUE;
14840 }
14841 else if (*p == ';') {
14842 found_problem = TRUE;
14843 p++;
14844 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14845 has_opening_colon = TRUE;
14846 }
14847 else {
14848 found_problem = TRUE;
14849 ADD_POSIX_WARNING(p, "there must be a starting ':'");
14850
14851 /* Consider an initial punctuation (not one of the recognized ones) to
14852 * be a left terminator */
14853 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14854 p++;
14855 }
14856 }
14857
14858 /* They may think that you can put spaces between the components */
14859 if (isBLANK(*p)) {
14860 found_problem = TRUE;
14861
14862 do {
14863 p++;
14864 } while (p < e && isBLANK(*p));
14865
14866 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14867 }
14868
14869 if (*p == '^') {
14870
14871 /* We consider something like [^:^alnum:]] to not have been intended to
14872 * be a posix class, but XXX maybe we should */
14873 if (complement) {
14874 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14875 }
14876
14877 complement = 1;
14878 p++;
14879 }
14880
14881 /* Again, they may think that you can put spaces between the components */
14882 if (isBLANK(*p)) {
14883 found_problem = TRUE;
14884
14885 do {
14886 p++;
14887 } while (p < e && isBLANK(*p));
14888
14889 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14890 }
14891
14892 if (*p == ']') {
14893
14894 /* XXX This ']' may be a typo, and something else was meant. But
14895 * treating it as such creates enough complications, that that
14896 * possibility isn't currently considered here. So we assume that the
14897 * ']' is what is intended, and if we've already found an initial '[',
14898 * this leaves this construct looking like [:] or [:^], which almost
14899 * certainly weren't intended to be posix classes */
14900 if (has_opening_bracket) {
14901 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14902 }
14903
14904 /* But this function can be called when we parse the colon for
14905 * something like qr/[alpha:]]/, so we back up to look for the
14906 * beginning */
14907 p--;
14908
14909 if (*p == ';') {
14910 found_problem = TRUE;
14911 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14912 }
14913 else if (*p != ':') {
14914
14915 /* XXX We are currently very restrictive here, so this code doesn't
14916 * consider the possibility that, say, /[alpha.]]/ was intended to
14917 * be a posix class. */
14918 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14919 }
14920
14921 /* Here we have something like 'foo:]'. There was no initial colon,
14922 * and we back up over 'foo. XXX Unlike the going forward case, we
14923 * don't handle typos of non-word chars in the middle */
14924 has_opening_colon = FALSE;
14925 p--;
14926
14927 while (p > RExC_start && isWORDCHAR(*p)) {
14928 p--;
14929 }
14930 p++;
14931
14932 /* Here, we have positioned ourselves to where we think the first
14933 * character in the potential class is */
14934 }
14935
14936 /* Now the interior really starts. There are certain key characters that
14937 * can end the interior, or these could just be typos. To catch both
14938 * cases, we may have to do two passes. In the first pass, we keep on
14939 * going unless we come to a sequence that matches
14940 * qr/ [[:punct:]] [[:blank:]]* \] /xa
14941 * This means it takes a sequence to end the pass, so two typos in a row if
14942 * that wasn't what was intended. If the class is perfectly formed, just
14943 * this one pass is needed. We also stop if there are too many characters
14944 * being accumulated, but this number is deliberately set higher than any
14945 * real class. It is set high enough so that someone who thinks that
14946 * 'alphanumeric' is a correct name would get warned that it wasn't.
14947 * While doing the pass, we keep track of where the key characters were in
14948 * it. If we don't find an end to the class, and one of the key characters
14949 * was found, we redo the pass, but stop when we get to that character.
14950 * Thus the key character was considered a typo in the first pass, but a
14951 * terminator in the second. If two key characters are found, we stop at
14952 * the second one in the first pass. Again this can miss two typos, but
14953 * catches a single one
14954 *
14955 * In the first pass, 'possible_end' starts as NULL, and then gets set to
14956 * point to the first key character. For the second pass, it starts as -1.
14957 * */
14958
14959 name_start = p;
14960 parse_name:
14961 {
14962 bool has_blank = FALSE;
14963 bool has_upper = FALSE;
14964 bool has_terminating_colon = FALSE;
14965 bool has_terminating_bracket = FALSE;
14966 bool has_semi_colon = FALSE;
14967 unsigned int name_len = 0;
14968 int punct_count = 0;
14969
14970 while (p < e) {
14971
14972 /* Squeeze out blanks when looking up the class name below */
14973 if (isBLANK(*p) ) {
14974 has_blank = TRUE;
14975 found_problem = TRUE;
14976 p++;
14977 continue;
14978 }
14979
14980 /* The name will end with a punctuation */
14981 if (isPUNCT(*p)) {
14982 const char * peek = p + 1;
14983
14984 /* Treat any non-']' punctuation followed by a ']' (possibly
14985 * with intervening blanks) as trying to terminate the class.
14986 * ']]' is very likely to mean a class was intended (but
14987 * missing the colon), but the warning message that gets
14988 * generated shows the error position better if we exit the
14989 * loop at the bottom (eventually), so skip it here. */
14990 if (*p != ']') {
14991 if (peek < e && isBLANK(*peek)) {
14992 has_blank = TRUE;
14993 found_problem = TRUE;
14994 do {
14995 peek++;
14996 } while (peek < e && isBLANK(*peek));
14997 }
14998
14999 if (peek < e && *peek == ']') {
15000 has_terminating_bracket = TRUE;
15001 if (*p == ':') {
15002 has_terminating_colon = TRUE;
15003 }
15004 else if (*p == ';') {
15005 has_semi_colon = TRUE;
15006 has_terminating_colon = TRUE;
15007 }
15008 else {
15009 found_problem = TRUE;
15010 }
15011 p = peek + 1;
15012 goto try_posix;
15013 }
15014 }
15015
15016 /* Here we have punctuation we thought didn't end the class.
15017 * Keep track of the position of the key characters that are
15018 * more likely to have been class-enders */
15019 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15020
15021 /* Allow just one such possible class-ender not actually
15022 * ending the class. */
15023 if (possible_end) {
15024 break;
15025 }
15026 possible_end = p;
15027 }
15028
15029 /* If we have too many punctuation characters, no use in
15030 * keeping going */
15031 if (++punct_count > max_distance) {
15032 break;
15033 }
15034
15035 /* Treat the punctuation as a typo. */
15036 input_text[name_len++] = *p;
15037 p++;
15038 }
15039 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15040 input_text[name_len++] = toLOWER(*p);
15041 has_upper = TRUE;
15042 found_problem = TRUE;
15043 p++;
15044 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15045 input_text[name_len++] = *p;
15046 p++;
15047 }
15048 else {
15049 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15050 p+= UTF8SKIP(p);
15051 }
15052
15053 /* The declaration of 'input_text' is how long we allow a potential
15054 * class name to be, before saying they didn't mean a class name at
15055 * all */
15056 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15057 break;
15058 }
15059 }
15060
15061 /* We get to here when the possible class name hasn't been properly
15062 * terminated before:
15063 * 1) we ran off the end of the pattern; or
15064 * 2) found two characters, each of which might have been intended to
15065 * be the name's terminator
15066 * 3) found so many punctuation characters in the purported name,
15067 * that the edit distance to a valid one is exceeded
15068 * 4) we decided it was more characters than anyone could have
15069 * intended to be one. */
15070
15071 found_problem = TRUE;
15072
15073 /* In the final two cases, we know that looking up what we've
15074 * accumulated won't lead to a match, even a fuzzy one. */
15075 if ( name_len >= C_ARRAY_LENGTH(input_text)
15076 || punct_count > max_distance)
15077 {
15078 /* If there was an intermediate key character that could have been
15079 * an intended end, redo the parse, but stop there */
15080 if (possible_end && possible_end != (char *) -1) {
15081 possible_end = (char *) -1; /* Special signal value to say
15082 we've done a first pass */
15083 p = name_start;
15084 goto parse_name;
15085 }
15086
15087 /* Otherwise, it can't have meant to have been a class */
15088 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15089 }
15090
15091 /* If we ran off the end, and the final character was a punctuation
15092 * one, back up one, to look at that final one just below. Later, we
15093 * will restore the parse pointer if appropriate */
15094 if (name_len && p == e && isPUNCT(*(p-1))) {
15095 p--;
15096 name_len--;
15097 }
15098
15099 if (p < e && isPUNCT(*p)) {
15100 if (*p == ']') {
15101 has_terminating_bracket = TRUE;
15102
15103 /* If this is a 2nd ']', and the first one is just below this
15104 * one, consider that to be the real terminator. This gives a
15105 * uniform and better positioning for the warning message */
15106 if ( possible_end
15107 && possible_end != (char *) -1
15108 && *possible_end == ']'
15109 && name_len && input_text[name_len - 1] == ']')
15110 {
15111 name_len--;
15112 p = possible_end;
15113
15114 /* And this is actually equivalent to having done the 2nd
15115 * pass now, so set it to not try again */
15116 possible_end = (char *) -1;
15117 }
15118 }
15119 else {
15120 if (*p == ':') {
15121 has_terminating_colon = TRUE;
15122 }
15123 else if (*p == ';') {
15124 has_semi_colon = TRUE;
15125 has_terminating_colon = TRUE;
15126 }
15127 p++;
15128 }
15129 }
15130
15131 try_posix:
15132
15133 /* Here, we have a class name to look up. We can short circuit the
15134 * stuff below for short names that can't possibly be meant to be a
15135 * class name. (We can do this on the first pass, as any second pass
15136 * will yield an even shorter name) */
15137 if (name_len < 3) {
15138 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15139 }
15140
15141 /* Find which class it is. Initially switch on the length of the name.
15142 * */
15143 switch (name_len) {
15144 case 4:
15145 if (memEQs(name_start, 4, "word")) {
15146 /* this is not POSIX, this is the Perl \w */
15147 class_number = ANYOF_WORDCHAR;
15148 }
15149 break;
15150 case 5:
15151 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15152 * graph lower print punct space upper
15153 * Offset 4 gives the best switch position. */
15154 switch (name_start[4]) {
15155 case 'a':
15156 if (memBEGINs(name_start, 5, "alph")) /* alpha */
15157 class_number = ANYOF_ALPHA;
15158 break;
15159 case 'e':
15160 if (memBEGINs(name_start, 5, "spac")) /* space */
15161 class_number = ANYOF_SPACE;
15162 break;
15163 case 'h':
15164 if (memBEGINs(name_start, 5, "grap")) /* graph */
15165 class_number = ANYOF_GRAPH;
15166 break;
15167 case 'i':
15168 if (memBEGINs(name_start, 5, "asci")) /* ascii */
15169 class_number = ANYOF_ASCII;
15170 break;
15171 case 'k':
15172 if (memBEGINs(name_start, 5, "blan")) /* blank */
15173 class_number = ANYOF_BLANK;
15174 break;
15175 case 'l':
15176 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15177 class_number = ANYOF_CNTRL;
15178 break;
15179 case 'm':
15180 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15181 class_number = ANYOF_ALPHANUMERIC;
15182 break;
15183 case 'r':
15184 if (memBEGINs(name_start, 5, "lowe")) /* lower */
15185 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15186 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15187 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15188 break;
15189 case 't':
15190 if (memBEGINs(name_start, 5, "digi")) /* digit */
15191 class_number = ANYOF_DIGIT;
15192 else if (memBEGINs(name_start, 5, "prin")) /* print */
15193 class_number = ANYOF_PRINT;
15194 else if (memBEGINs(name_start, 5, "punc")) /* punct */
15195 class_number = ANYOF_PUNCT;
15196 break;
15197 }
15198 break;
15199 case 6:
15200 if (memEQs(name_start, 6, "xdigit"))
15201 class_number = ANYOF_XDIGIT;
15202 break;
15203 }
15204
15205 /* If the name exactly matches a posix class name the class number will
15206 * here be set to it, and the input almost certainly was meant to be a
15207 * posix class, so we can skip further checking. If instead the syntax
15208 * is exactly correct, but the name isn't one of the legal ones, we
15209 * will return that as an error below. But if neither of these apply,
15210 * it could be that no posix class was intended at all, or that one
15211 * was, but there was a typo. We tease these apart by doing fuzzy
15212 * matching on the name */
15213 if (class_number == OOB_NAMEDCLASS && found_problem) {
15214 const UV posix_names[][6] = {
15215 { 'a', 'l', 'n', 'u', 'm' },
15216 { 'a', 'l', 'p', 'h', 'a' },
15217 { 'a', 's', 'c', 'i', 'i' },
15218 { 'b', 'l', 'a', 'n', 'k' },
15219 { 'c', 'n', 't', 'r', 'l' },
15220 { 'd', 'i', 'g', 'i', 't' },
15221 { 'g', 'r', 'a', 'p', 'h' },
15222 { 'l', 'o', 'w', 'e', 'r' },
15223 { 'p', 'r', 'i', 'n', 't' },
15224 { 'p', 'u', 'n', 'c', 't' },
15225 { 's', 'p', 'a', 'c', 'e' },
15226 { 'u', 'p', 'p', 'e', 'r' },
15227 { 'w', 'o', 'r', 'd' },
15228 { 'x', 'd', 'i', 'g', 'i', 't' }
15229 };
15230 /* The names of the above all have added NULs to make them the same
15231 * size, so we need to also have the real lengths */
15232 const UV posix_name_lengths[] = {
15233 sizeof("alnum") - 1,
15234 sizeof("alpha") - 1,
15235 sizeof("ascii") - 1,
15236 sizeof("blank") - 1,
15237 sizeof("cntrl") - 1,
15238 sizeof("digit") - 1,
15239 sizeof("graph") - 1,
15240 sizeof("lower") - 1,
15241 sizeof("print") - 1,
15242 sizeof("punct") - 1,
15243 sizeof("space") - 1,
15244 sizeof("upper") - 1,
15245 sizeof("word") - 1,
15246 sizeof("xdigit")- 1
15247 };
15248 unsigned int i;
15249 int temp_max = max_distance; /* Use a temporary, so if we
15250 reparse, we haven't changed the
15251 outer one */
15252
15253 /* Use a smaller max edit distance if we are missing one of the
15254 * delimiters */
15255 if ( has_opening_bracket + has_opening_colon < 2
15256 || has_terminating_bracket + has_terminating_colon < 2)
15257 {
15258 temp_max--;
15259 }
15260
15261 /* See if the input name is close to a legal one */
15262 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15263
15264 /* Short circuit call if the lengths are too far apart to be
15265 * able to match */
15266 if (abs( (int) (name_len - posix_name_lengths[i]))
15267 > temp_max)
15268 {
15269 continue;
15270 }
15271
15272 if (edit_distance(input_text,
15273 posix_names[i],
15274 name_len,
15275 posix_name_lengths[i],
15276 temp_max
15277 )
15278 > -1)
15279 { /* If it is close, it probably was intended to be a class */
15280 goto probably_meant_to_be;
15281 }
15282 }
15283
15284 /* Here the input name is not close enough to a valid class name
15285 * for us to consider it to be intended to be a posix class. If
15286 * we haven't already done so, and the parse found a character that
15287 * could have been terminators for the name, but which we absorbed
15288 * as typos during the first pass, repeat the parse, signalling it
15289 * to stop at that character */
15290 if (possible_end && possible_end != (char *) -1) {
15291 possible_end = (char *) -1;
15292 p = name_start;
15293 goto parse_name;
15294 }
15295
15296 /* Here neither pass found a close-enough class name */
15297 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15298 }
15299
15300 probably_meant_to_be:
15301
15302 /* Here we think that a posix specification was intended. Update any
15303 * parse pointer */
15304 if (updated_parse_ptr) {
15305 *updated_parse_ptr = (char *) p;
15306 }
15307
15308 /* If a posix class name was intended but incorrectly specified, we
15309 * output or return the warnings */
15310 if (found_problem) {
15311
15312 /* We set flags for these issues in the parse loop above instead of
15313 * adding them to the list of warnings, because we can parse it
15314 * twice, and we only want one warning instance */
15315 if (has_upper) {
15316 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15317 }
15318 if (has_blank) {
15319 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15320 }
15321 if (has_semi_colon) {
15322 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15323 }
15324 else if (! has_terminating_colon) {
15325 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15326 }
15327 if (! has_terminating_bracket) {
15328 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15329 }
15330
15331 if ( posix_warnings
15332 && RExC_warn_text
15333 && av_top_index(RExC_warn_text) > -1)
15334 {
15335 *posix_warnings = RExC_warn_text;
15336 }
15337 }
15338 else if (class_number != OOB_NAMEDCLASS) {
15339 /* If it is a known class, return the class. The class number
15340 * #defines are structured so each complement is +1 to the normal
15341 * one */
15342 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15343 }
15344 else if (! check_only) {
15345
15346 /* Here, it is an unrecognized class. This is an error (unless the
15347 * call is to check only, which we've already handled above) */
15348 const char * const complement_string = (complement)
15349 ? "^"
15350 : "";
15351 RExC_parse = (char *) p;
15352 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15353 complement_string,
15354 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15355 }
15356 }
15357
15358 return OOB_NAMEDCLASS;
15359}
15360#undef ADD_POSIX_WARNING
15361
15362STATIC unsigned int
15363S_regex_set_precedence(const U8 my_operator) {
15364
15365 /* Returns the precedence in the (?[...]) construct of the input operator,
15366 * specified by its character representation. The precedence follows
15367 * general Perl rules, but it extends this so that ')' and ']' have (low)
15368 * precedence even though they aren't really operators */
15369
15370 switch (my_operator) {
15371 case '!':
15372 return 5;
15373 case '&':
15374 return 4;
15375 case '^':
15376 case '|':
15377 case '+':
15378 case '-':
15379 return 3;
15380 case ')':
15381 return 2;
15382 case ']':
15383 return 1;
15384 }
15385
15386 NOT_REACHED; /* NOTREACHED */
15387 return 0; /* Silence compiler warning */
15388}
15389
15390STATIC regnode_offset
15391S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15392 I32 *flagp, U32 depth,
15393 char * const oregcomp_parse)
15394{
15395 /* Handle the (?[...]) construct to do set operations */
15396
15397 U8 curchar; /* Current character being parsed */
15398 UV start, end; /* End points of code point ranges */
15399 SV* final = NULL; /* The end result inversion list */
15400 SV* result_string; /* 'final' stringified */
15401 AV* stack; /* stack of operators and operands not yet
15402 resolved */
15403 AV* fence_stack = NULL; /* A stack containing the positions in
15404 'stack' of where the undealt-with left
15405 parens would be if they were actually
15406 put there */
15407 /* The 'volatile' is a workaround for an optimiser bug
15408 * in Solaris Studio 12.3. See RT #127455 */
15409 volatile IV fence = 0; /* Position of where most recent undealt-
15410 with left paren in stack is; -1 if none.
15411 */
15412 STRLEN len; /* Temporary */
15413 regnode_offset node; /* Temporary, and final regnode returned by
15414 this function */
15415 const bool save_fold = FOLD; /* Temporary */
15416 char *save_end, *save_parse; /* Temporaries */
15417 const bool in_locale = LOC; /* we turn off /l during processing */
15418 AV* posix_warnings = NULL;
15419
15420 GET_RE_DEBUG_FLAGS_DECL;
15421
15422 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15423
15424 DEBUG_PARSE("xcls");
15425
15426 if (in_locale) {
15427 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15428 }
15429
15430 /* The use of this operator implies /u. This is required so that the
15431 * compile time values are valid in all runtime cases */
15432 REQUIRE_UNI_RULES(flagp, 0);
15433
15434 /* This will return only an ANYOF regnode, or (unlikely) something smaller
15435 * (such as EXACT). Thus we can skip most everything if just sizing. We
15436 * call regclass to handle '[]' so as to not have to reinvent its parsing
15437 * rules here (throwing away the size it computes each time). And, we exit
15438 * upon an unescaped ']' that isn't one ending a regclass. To do both
15439 * these things, we need to realize that something preceded by a backslash
15440 * is escaped, so we have to keep track of backslashes */
15441 if (SIZE_ONLY) {
15442 UV nest_depth = 0; /* how many nested (?[...]) constructs */
15443
15444 while (RExC_parse < RExC_end) {
15445 SV* current = NULL;
15446
15447 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15448 TRUE /* Force /x */ );
15449
15450 switch (*RExC_parse) {
15451 case '(':
15452 if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
15453 nest_depth++, RExC_parse+=2;
15454 /* FALLTHROUGH */
15455 default:
15456 break;
15457 case '\\':
15458 /* Skip past this, so the next character gets skipped, after
15459 * the switch */
15460 RExC_parse++;
15461 if (*RExC_parse == 'c') {
15462 /* Skip the \cX notation for control characters */
15463 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15464 }
15465 break;
15466
15467 case '[':
15468 {
15469 /* See if this is a [:posix:] class. */
15470 bool is_posix_class = (OOB_NAMEDCLASS
15471 < handle_possible_posix(pRExC_state,
15472 RExC_parse + 1,
15473 NULL,
15474 NULL,
15475 TRUE /* checking only */));
15476 /* If it is a posix class, leave the parse pointer at the
15477 * '[' to fool regclass() into thinking it is part of a
15478 * '[[:posix:]]'. */
15479 if (! is_posix_class) {
15480 RExC_parse++;
15481 }
15482
15483 /* regclass() can only return RESTART_PARSE and NEED_UTF8
15484 * if multi-char folds are allowed. */
15485 if (!regclass(pRExC_state, flagp, depth+1,
15486 is_posix_class, /* parse the whole char
15487 class only if not a
15488 posix class */
15489 FALSE, /* don't allow multi-char folds */
15490 TRUE, /* silence non-portable warnings. */
15491 TRUE, /* strict */
15492 FALSE, /* Require return to be an ANYOF */
15493 &current,
15494 &posix_warnings
15495 ))
15496 FAIL2("panic: regclass returned failure to handle_sets, "
15497 "flags=%#" UVxf, (UV) *flagp);
15498
15499 /* function call leaves parse pointing to the ']', except
15500 * if we faked it */
15501 if (is_posix_class) {
15502 RExC_parse--;
15503 }
15504
15505 SvREFCNT_dec(current); /* In case it returned something */
15506 break;
15507 }
15508
15509 case ']':
15510 if (RExC_parse[1] == ')') {
15511 RExC_parse++;
15512 if (nest_depth--) break;
15513 node = reganode(pRExC_state, ANYOF, 0);
15514 nextchar(pRExC_state);
15515 Set_Node_Length(REGNODE_p(node),
15516 RExC_parse - oregcomp_parse + 1); /* MJD */
15517 if (in_locale) {
15518 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
15519 }
15520
15521 return node;
15522 }
15523 /* We output the messages even if warnings are off, because we'll fail
15524 * the very next thing, and these give a likely diagnosis for that */
15525 if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15526 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15527 }
15528 RExC_parse++;
15529 vFAIL("Unexpected ']' with no following ')' in (?[...");
15530 }
15531
15532 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
15533 }
15534
15535 /* We output the messages even if warnings are off, because we'll fail
15536 * the very next thing, and these give a likely diagnosis for that */
15537 if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
15538 output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
15539 }
15540
15541 vFAIL("Syntax error in (?[...])");
15542 }
15543
15544 /* Pass 2 only after this. */
15545 ckWARNexperimental(RExC_parse,
15546 WARN_EXPERIMENTAL__REGEX_SETS,
15547 "The regex_sets feature is experimental");
15548
15549 /* Everything in this construct is a metacharacter. Operands begin with
15550 * either a '\' (for an escape sequence), or a '[' for a bracketed
15551 * character class. Any other character should be an operator, or
15552 * parenthesis for grouping. Both types of operands are handled by calling
15553 * regclass() to parse them. It is called with a parameter to indicate to
15554 * return the computed inversion list. The parsing here is implemented via
15555 * a stack. Each entry on the stack is a single character representing one
15556 * of the operators; or else a pointer to an operand inversion list. */
15557
15558#define IS_OPERATOR(a) SvIOK(a)
15559#define IS_OPERAND(a) (! IS_OPERATOR(a))
15560
15561 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
15562 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15563 * with pronouncing it called it Reverse Polish instead, but now that YOU
15564 * know how to pronounce it you can use the correct term, thus giving due
15565 * credit to the person who invented it, and impressing your geek friends.
15566 * Wikipedia says that the pronounciation of "Ł" has been changing so that
15567 * it is now more like an English initial W (as in wonk) than an L.)
15568 *
15569 * This means that, for example, 'a | b & c' is stored on the stack as
15570 *
15571 * c [4]
15572 * b [3]
15573 * & [2]
15574 * a [1]
15575 * | [0]
15576 *
15577 * where the numbers in brackets give the stack [array] element number.
15578 * In this implementation, parentheses are not stored on the stack.
15579 * Instead a '(' creates a "fence" so that the part of the stack below the
15580 * fence is invisible except to the corresponding ')' (this allows us to
15581 * replace testing for parens, by using instead subtraction of the fence
15582 * position). As new operands are processed they are pushed onto the stack
15583 * (except as noted in the next paragraph). New operators of higher
15584 * precedence than the current final one are inserted on the stack before
15585 * the lhs operand (so that when the rhs is pushed next, everything will be
15586 * in the correct positions shown above. When an operator of equal or
15587 * lower precedence is encountered in parsing, all the stacked operations
15588 * of equal or higher precedence are evaluated, leaving the result as the
15589 * top entry on the stack. This makes higher precedence operations
15590 * evaluate before lower precedence ones, and causes operations of equal
15591 * precedence to left associate.
15592 *
15593 * The only unary operator '!' is immediately pushed onto the stack when
15594 * encountered. When an operand is encountered, if the top of the stack is
15595 * a '!", the complement is immediately performed, and the '!' popped. The
15596 * resulting value is treated as a new operand, and the logic in the
15597 * previous paragraph is executed. Thus in the expression
15598 * [a] + ! [b]
15599 * the stack looks like
15600 *
15601 * !
15602 * a
15603 * +
15604 *
15605 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15606 * becomes
15607 *
15608 * !b
15609 * a
15610 * +
15611 *
15612 * A ')' is treated as an operator with lower precedence than all the
15613 * aforementioned ones, which causes all operations on the stack above the
15614 * corresponding '(' to be evaluated down to a single resultant operand.
15615 * Then the fence for the '(' is removed, and the operand goes through the
15616 * algorithm above, without the fence.
15617 *
15618 * A separate stack is kept of the fence positions, so that the position of
15619 * the latest so-far unbalanced '(' is at the top of it.
15620 *
15621 * The ']' ending the construct is treated as the lowest operator of all,
15622 * so that everything gets evaluated down to a single operand, which is the
15623 * result */
15624
15625 sv_2mortal((SV *)(stack = newAV()));
15626 sv_2mortal((SV *)(fence_stack = newAV()));
15627
15628 while (RExC_parse < RExC_end) {
15629 I32 top_index; /* Index of top-most element in 'stack' */
15630 SV** top_ptr; /* Pointer to top 'stack' element */
15631 SV* current = NULL; /* To contain the current inversion list
15632 operand */
15633 SV* only_to_avoid_leaks;
15634
15635 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15636 TRUE /* Force /x */ );
15637 if (RExC_parse >= RExC_end) {
15638 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
15639 }
15640
15641 curchar = UCHARAT(RExC_parse);
15642
15643redo_curchar:
15644
15645#ifdef ENABLE_REGEX_SETS_DEBUGGING
15646 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15647 DEBUG_U(dump_regex_sets_structures(pRExC_state,
15648 stack, fence, fence_stack));
15649#endif
15650
15651 top_index = av_tindex_skip_len_mg(stack);
15652
15653 switch (curchar) {
15654 SV** stacked_ptr; /* Ptr to something already on 'stack' */
15655 char stacked_operator; /* The topmost operator on the 'stack'. */
15656 SV* lhs; /* Operand to the left of the operator */
15657 SV* rhs; /* Operand to the right of the operator */
15658 SV* fence_ptr; /* Pointer to top element of the fence
15659 stack */
15660
15661 case '(':
15662
15663 if ( RExC_parse < RExC_end - 1
15664 && (UCHARAT(RExC_parse + 1) == '?'))
15665 {
15666 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
15667 * This happens when we have some thing like
15668 *
15669 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15670 * ...
15671 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
15672 *
15673 * Here we would be handling the interpolated
15674 * '$thai_or_lao'. We handle this by a recursive call to
15675 * ourselves which returns the inversion list the
15676 * interpolated expression evaluates to. We use the flags
15677 * from the interpolated pattern. */
15678 U32 save_flags = RExC_flags;
15679 const char * save_parse;
15680
15681 RExC_parse += 2; /* Skip past the '(?' */
15682 save_parse = RExC_parse;
15683
15684 /* Parse any flags for the '(?' */
15685 parse_lparen_question_flags(pRExC_state);
15686
15687 if (RExC_parse == save_parse /* Makes sure there was at
15688 least one flag (or else
15689 this embedding wasn't
15690 compiled) */
15691 || RExC_parse >= RExC_end - 4
15692 || UCHARAT(RExC_parse) != ':'
15693 || UCHARAT(++RExC_parse) != '('
15694 || UCHARAT(++RExC_parse) != '?'
15695 || UCHARAT(++RExC_parse) != '[')
15696 {
15697
15698 /* In combination with the above, this moves the
15699 * pointer to the point just after the first erroneous
15700 * character (or if there are no flags, to where they
15701 * should have been) */
15702 if (RExC_parse >= RExC_end - 4) {
15703 RExC_parse = RExC_end;
15704 }
15705 else if (RExC_parse != save_parse) {
15706 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15707 }
15708 vFAIL("Expecting '(?flags:(?[...'");
15709 }
15710
15711 /* Recurse, with the meat of the embedded expression */
15712 RExC_parse++;
15713 (void) handle_regex_sets(pRExC_state, &current, flagp,
15714 depth+1, oregcomp_parse);
15715
15716 /* Here, 'current' contains the embedded expression's
15717 * inversion list, and RExC_parse points to the trailing
15718 * ']'; the next character should be the ')' */
15719 RExC_parse++;
15720 if (UCHARAT(RExC_parse) != ')')
15721 vFAIL("Expecting close paren for nested extended charclass");
15722
15723 /* Then the ')' matching the original '(' handled by this
15724 * case: statement */
15725 RExC_parse++;
15726 if (UCHARAT(RExC_parse) != ')')
15727 vFAIL("Expecting close paren for wrapper for nested extended charclass");
15728
15729 RExC_parse++;
15730 RExC_flags = save_flags;
15731 goto handle_operand;
15732 }
15733
15734 /* A regular '('. Look behind for illegal syntax */
15735 if (top_index - fence >= 0) {
15736 /* If the top entry on the stack is an operator, it had
15737 * better be a '!', otherwise the entry below the top
15738 * operand should be an operator */
15739 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
15740 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15741 || ( IS_OPERAND(*top_ptr)
15742 && ( top_index - fence < 1
15743 || ! (stacked_ptr = av_fetch(stack,
15744 top_index - 1,
15745 FALSE))
15746 || ! IS_OPERATOR(*stacked_ptr))))
15747 {
15748 RExC_parse++;
15749 vFAIL("Unexpected '(' with no preceding operator");
15750 }
15751 }
15752
15753 /* Stack the position of this undealt-with left paren */
15754 av_push(fence_stack, newSViv(fence));
15755 fence = top_index + 1;
15756 break;
15757
15758 case '\\':
15759 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15760 * multi-char folds are allowed. */
15761 if (!regclass(pRExC_state, flagp, depth+1,
15762 TRUE, /* means parse just the next thing */
15763 FALSE, /* don't allow multi-char folds */
15764 FALSE, /* don't silence non-portable warnings. */
15765 TRUE, /* strict */
15766 FALSE, /* Require return to be an ANYOF */
15767 &current,
15768 NULL))
15769 {
15770 FAIL2("panic: regclass returned failure to handle_sets, "
15771 "flags=%#" UVxf, (UV) *flagp);
15772 }
15773
15774 /* regclass() will return with parsing just the \ sequence,
15775 * leaving the parse pointer at the next thing to parse */
15776 RExC_parse--;
15777 goto handle_operand;
15778
15779 case '[': /* Is a bracketed character class */
15780 {
15781 /* See if this is a [:posix:] class. */
15782 bool is_posix_class = (OOB_NAMEDCLASS
15783 < handle_possible_posix(pRExC_state,
15784 RExC_parse + 1,
15785 NULL,
15786 NULL,
15787 TRUE /* checking only */));
15788 /* If it is a posix class, leave the parse pointer at the '['
15789 * to fool regclass() into thinking it is part of a
15790 * '[[:posix:]]'. */
15791 if (! is_posix_class) {
15792 RExC_parse++;
15793 }
15794
15795 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15796 * multi-char folds are allowed. */
15797 if (!regclass(pRExC_state, flagp, depth+1,
15798 is_posix_class, /* parse the whole char
15799 class only if not a
15800 posix class */
15801 FALSE, /* don't allow multi-char folds */
15802 TRUE, /* silence non-portable warnings. */
15803 TRUE, /* strict */
15804 FALSE, /* Require return to be an ANYOF */
15805 &current,
15806 NULL
15807 ))
15808 {
15809 FAIL2("panic: regclass returned failure to handle_sets, "
15810 "flags=%#" UVxf, (UV) *flagp);
15811 }
15812
15813 /* function call leaves parse pointing to the ']', except if we
15814 * faked it */
15815 if (is_posix_class) {
15816 RExC_parse--;
15817 }
15818
15819 goto handle_operand;
15820 }
15821
15822 case ']':
15823 if (top_index >= 1) {
15824 goto join_operators;
15825 }
15826
15827 /* Only a single operand on the stack: are done */
15828 goto done;
15829
15830 case ')':
15831 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15832 RExC_parse++;
15833 vFAIL("Unexpected ')'");
15834 }
15835
15836 /* If nothing after the fence, is missing an operand */
15837 if (top_index - fence < 0) {
15838 RExC_parse++;
15839 goto bad_syntax;
15840 }
15841 /* If at least two things on the stack, treat this as an
15842 * operator */
15843 if (top_index - fence >= 1) {
15844 goto join_operators;
15845 }
15846
15847 /* Here only a single thing on the fenced stack, and there is a
15848 * fence. Get rid of it */
15849 fence_ptr = av_pop(fence_stack);
15850 assert(fence_ptr);
15851 fence = SvIV(fence_ptr);
15852 SvREFCNT_dec_NN(fence_ptr);
15853 fence_ptr = NULL;
15854
15855 if (fence < 0) {
15856 fence = 0;
15857 }
15858
15859 /* Having gotten rid of the fence, we pop the operand at the
15860 * stack top and process it as a newly encountered operand */
15861 current = av_pop(stack);
15862 if (IS_OPERAND(current)) {
15863 goto handle_operand;
15864 }
15865
15866 RExC_parse++;
15867 goto bad_syntax;
15868
15869 case '&':
15870 case '|':
15871 case '+':
15872 case '-':
15873 case '^':
15874
15875 /* These binary operators should have a left operand already
15876 * parsed */
15877 if ( top_index - fence < 0
15878 || top_index - fence == 1
15879 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15880 || ! IS_OPERAND(*top_ptr))
15881 {
15882 goto unexpected_binary;
15883 }
15884
15885 /* If only the one operand is on the part of the stack visible
15886 * to us, we just place this operator in the proper position */
15887 if (top_index - fence < 2) {
15888
15889 /* Place the operator before the operand */
15890
15891 SV* lhs = av_pop(stack);
15892 av_push(stack, newSVuv(curchar));
15893 av_push(stack, lhs);
15894 break;
15895 }
15896
15897 /* But if there is something else on the stack, we need to
15898 * process it before this new operator if and only if the
15899 * stacked operation has equal or higher precedence than the
15900 * new one */
15901
15902 join_operators:
15903
15904 /* The operator on the stack is supposed to be below both its
15905 * operands */
15906 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15907 || IS_OPERAND(*stacked_ptr))
15908 {
15909 /* But if not, it's legal and indicates we are completely
15910 * done if and only if we're currently processing a ']',
15911 * which should be the final thing in the expression */
15912 if (curchar == ']') {
15913 goto done;
15914 }
15915
15916 unexpected_binary:
15917 RExC_parse++;
15918 vFAIL2("Unexpected binary operator '%c' with no "
15919 "preceding operand", curchar);
15920 }
15921 stacked_operator = (char) SvUV(*stacked_ptr);
15922
15923 if (regex_set_precedence(curchar)
15924 > regex_set_precedence(stacked_operator))
15925 {
15926 /* Here, the new operator has higher precedence than the
15927 * stacked one. This means we need to add the new one to
15928 * the stack to await its rhs operand (and maybe more
15929 * stuff). We put it before the lhs operand, leaving
15930 * untouched the stacked operator and everything below it
15931 * */
15932 lhs = av_pop(stack);
15933 assert(IS_OPERAND(lhs));
15934
15935 av_push(stack, newSVuv(curchar));
15936 av_push(stack, lhs);
15937 break;
15938 }
15939
15940 /* Here, the new operator has equal or lower precedence than
15941 * what's already there. This means the operation already
15942 * there should be performed now, before the new one. */
15943
15944 rhs = av_pop(stack);
15945 if (! IS_OPERAND(rhs)) {
15946
15947 /* This can happen when a ! is not followed by an operand,
15948 * like in /(?[\t &!])/ */
15949 goto bad_syntax;
15950 }
15951
15952 lhs = av_pop(stack);
15953
15954 if (! IS_OPERAND(lhs)) {
15955
15956 /* This can happen when there is an empty (), like in
15957 * /(?[[0]+()+])/ */
15958 goto bad_syntax;
15959 }
15960
15961 switch (stacked_operator) {
15962 case '&':
15963 _invlist_intersection(lhs, rhs, &rhs);
15964 break;
15965
15966 case '|':
15967 case '+':
15968 _invlist_union(lhs, rhs, &rhs);
15969 break;
15970
15971 case '-':
15972 _invlist_subtract(lhs, rhs, &rhs);
15973 break;
15974
15975 case '^': /* The union minus the intersection */
15976 {
15977 SV* i = NULL;
15978 SV* u = NULL;
15979
15980 _invlist_union(lhs, rhs, &u);
15981 _invlist_intersection(lhs, rhs, &i);
15982 _invlist_subtract(u, i, &rhs);
15983 SvREFCNT_dec_NN(i);
15984 SvREFCNT_dec_NN(u);
15985 break;
15986 }
15987 }
15988 SvREFCNT_dec(lhs);
15989
15990 /* Here, the higher precedence operation has been done, and the
15991 * result is in 'rhs'. We overwrite the stacked operator with
15992 * the result. Then we redo this code to either push the new
15993 * operator onto the stack or perform any higher precedence
15994 * stacked operation */
15995 only_to_avoid_leaks = av_pop(stack);
15996 SvREFCNT_dec(only_to_avoid_leaks);
15997 av_push(stack, rhs);
15998 goto redo_curchar;
15999
16000 case '!': /* Highest priority, right associative */
16001
16002 /* If what's already at the top of the stack is another '!",
16003 * they just cancel each other out */
16004 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16005 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16006 {
16007 only_to_avoid_leaks = av_pop(stack);
16008 SvREFCNT_dec(only_to_avoid_leaks);
16009 }
16010 else { /* Otherwise, since it's right associative, just push
16011 onto the stack */
16012 av_push(stack, newSVuv(curchar));
16013 }
16014 break;
16015
16016 default:
16017 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16018 vFAIL("Unexpected character");
16019
16020 handle_operand:
16021
16022 /* Here 'current' is the operand. If something is already on the
16023 * stack, we have to check if it is a !. But first, the code above
16024 * may have altered the stack in the time since we earlier set
16025 * 'top_index'. */
16026
16027 top_index = av_tindex_skip_len_mg(stack);
16028 if (top_index - fence >= 0) {
16029 /* If the top entry on the stack is an operator, it had better
16030 * be a '!', otherwise the entry below the top operand should
16031 * be an operator */
16032 top_ptr = av_fetch(stack, top_index, FALSE);
16033 assert(top_ptr);
16034 if (IS_OPERATOR(*top_ptr)) {
16035
16036 /* The only permissible operator at the top of the stack is
16037 * '!', which is applied immediately to this operand. */
16038 curchar = (char) SvUV(*top_ptr);
16039 if (curchar != '!') {
16040 SvREFCNT_dec(current);
16041 vFAIL2("Unexpected binary operator '%c' with no "
16042 "preceding operand", curchar);
16043 }
16044
16045 _invlist_invert(current);
16046
16047 only_to_avoid_leaks = av_pop(stack);
16048 SvREFCNT_dec(only_to_avoid_leaks);
16049
16050 /* And we redo with the inverted operand. This allows
16051 * handling multiple ! in a row */
16052 goto handle_operand;
16053 }
16054 /* Single operand is ok only for the non-binary ')'
16055 * operator */
16056 else if ((top_index - fence == 0 && curchar != ')')
16057 || (top_index - fence > 0
16058 && (! (stacked_ptr = av_fetch(stack,
16059 top_index - 1,
16060 FALSE))
16061 || IS_OPERAND(*stacked_ptr))))
16062 {
16063 SvREFCNT_dec(current);
16064 vFAIL("Operand with no preceding operator");
16065 }
16066 }
16067
16068 /* Here there was nothing on the stack or the top element was
16069 * another operand. Just add this new one */
16070 av_push(stack, current);
16071
16072 } /* End of switch on next parse token */
16073
16074 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16075 } /* End of loop parsing through the construct */
16076
16077 done:
16078 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16079 vFAIL("Unmatched (");
16080 }
16081
16082 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16083 || ((final = av_pop(stack)) == NULL)
16084 || ! IS_OPERAND(final)
16085 || ! is_invlist(final)
16086 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16087 {
16088 bad_syntax:
16089 SvREFCNT_dec(final);
16090 vFAIL("Incomplete expression within '(?[ ])'");
16091 }
16092
16093 /* Here, 'final' is the resultant inversion list from evaluating the
16094 * expression. Return it if so requested */
16095 if (return_invlist) {
16096 *return_invlist = final;
16097 return END;
16098 }
16099
16100 /* Otherwise generate a resultant node, based on 'final'. regclass() is
16101 * expecting a string of ranges and individual code points */
16102 invlist_iterinit(final);
16103 result_string = newSVpvs("");
16104 while (invlist_iternext(final, &start, &end)) {
16105 if (start == end) {
16106 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16107 }
16108 else {
16109 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16110 start, end);
16111 }
16112 }
16113
16114 /* About to generate an ANYOF (or similar) node from the inversion list we
16115 * have calculated */
16116 save_parse = RExC_parse;
16117 RExC_parse = SvPV(result_string, len);
16118 save_end = RExC_end;
16119 RExC_end = RExC_parse + len;
16120 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16121
16122 /* We turn off folding around the call, as the class we have constructed
16123 * already has all folding taken into consideration, and we don't want
16124 * regclass() to add to that */
16125 RExC_flags &= ~RXf_PMf_FOLD;
16126 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16127 * folds are allowed. */
16128 node = regclass(pRExC_state, flagp, depth+1,
16129 FALSE, /* means parse the whole char class */
16130 FALSE, /* don't allow multi-char folds */
16131 TRUE, /* silence non-portable warnings. The above may very
16132 well have generated non-portable code points, but
16133 they're valid on this machine */
16134 FALSE, /* similarly, no need for strict */
16135 FALSE, /* Require return to be an ANYOF */
16136 NULL,
16137 NULL
16138 );
16139
16140 RESTORE_WARNINGS;
16141 RExC_parse = save_parse + 1;
16142 RExC_end = save_end;
16143 SvREFCNT_dec_NN(final);
16144 SvREFCNT_dec_NN(result_string);
16145
16146 if (save_fold) {
16147 RExC_flags |= RXf_PMf_FOLD;
16148 }
16149
16150 if (!node)
16151 FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
16152 PTR2UV(flagp));
16153
16154 /* Fix up the node type if we are in locale. (We have pretended we are
16155 * under /u for the purposes of regclass(), as this construct will only
16156 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
16157 * as to cause any warnings about bad locales to be output in regexec.c),
16158 * and add the flag that indicates to check if not in a UTF-8 locale. The
16159 * reason we above forbid optimization into something other than an ANYOF
16160 * node is simply to minimize the number of code changes in regexec.c.
16161 * Otherwise we would have to create new EXACTish node types and deal with
16162 * them. This decision could be revisited should this construct become
16163 * popular.
16164 *
16165 * (One might think we could look at the resulting ANYOF node and suppress
16166 * the flag if everything is above 255, as those would be UTF-8 only,
16167 * but this isn't true, as the components that led to that result could
16168 * have been locale-affected, and just happen to cancel each other out
16169 * under UTF-8 locales.) */
16170 if (in_locale) {
16171 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16172
16173 assert(OP(REGNODE_p(node)) == ANYOF);
16174
16175 OP(REGNODE_p(node)) = ANYOFL;
16176 ANYOF_FLAGS(REGNODE_p(node))
16177 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16178 }
16179
16180 nextchar(pRExC_state);
16181 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16182 return node;
16183}
16184
16185#ifdef ENABLE_REGEX_SETS_DEBUGGING
16186
16187STATIC void
16188S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16189 AV * stack, const IV fence, AV * fence_stack)
16190{ /* Dumps the stacks in handle_regex_sets() */
16191
16192 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16193 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16194 SSize_t i;
16195
16196 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16197
16198 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16199
16200 if (stack_top < 0) {
16201 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16202 }
16203 else {
16204 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16205 for (i = stack_top; i >= 0; i--) {
16206 SV ** element_ptr = av_fetch(stack, i, FALSE);
16207 if (! element_ptr) {
16208 }
16209
16210 if (IS_OPERATOR(*element_ptr)) {
16211 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16212 (int) i, (int) SvIV(*element_ptr));
16213 }
16214 else {
16215 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16216 sv_dump(*element_ptr);
16217 }
16218 }
16219 }
16220
16221 if (fence_stack_top < 0) {
16222 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16223 }
16224 else {
16225 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16226 for (i = fence_stack_top; i >= 0; i--) {
16227 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16228 if (! element_ptr) {
16229 }
16230
16231 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16232 (int) i, (int) SvIV(*element_ptr));
16233 }
16234 }
16235}
16236
16237#endif
16238
16239#undef IS_OPERATOR
16240#undef IS_OPERAND
16241
16242STATIC void
16243S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16244{
16245 /* This adds the Latin1/above-Latin1 folding rules.
16246 *
16247 * This should be called only for a Latin1-range code points, cp, which is
16248 * known to be involved in a simple fold with other code points above
16249 * Latin1. It would give false results if /aa has been specified.
16250 * Multi-char folds are outside the scope of this, and must be handled
16251 * specially. */
16252
16253 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16254
16255 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16256
16257 /* The rules that are valid for all Unicode versions are hard-coded in */
16258 switch (cp) {
16259 case 'k':
16260 case 'K':
16261 *invlist =
16262 add_cp_to_invlist(*invlist, KELVIN_SIGN);
16263 break;
16264 case 's':
16265 case 'S':
16266 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16267 break;
16268 case MICRO_SIGN:
16269 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16270 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16271 break;
16272 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16273 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16274 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16275 break;
16276 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16277 *invlist = add_cp_to_invlist(*invlist,
16278 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16279 break;
16280
16281 default: /* Other code points are checked against the data for the
16282 current Unicode version */
16283 {
16284 Size_t folds_to_count;
16285 unsigned int first_folds_to;
16286 const unsigned int * remaining_folds_to_list;
16287 UV folded_cp;
16288
16289 if (isASCII(cp)) {
16290 folded_cp = toFOLD(cp);
16291 }
16292 else {
16293 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16294 Size_t dummy_len;
16295 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16296 }
16297
16298 if (folded_cp > 255) {
16299 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16300 }
16301
16302 folds_to_count = _inverse_folds(folded_cp, &first_folds_to,
16303 &remaining_folds_to_list);
16304 if (folds_to_count == 0) {
16305
16306 /* Use deprecated warning to increase the chances of this being
16307 * output */
16308 ckWARN2reg_d(RExC_parse,
16309 "Perl folding rules are not up-to-date for 0x%02X;"
16310 " please use the perlbug utility to report;", cp);
16311 }
16312 else {
16313 unsigned int i;
16314
16315 if (first_folds_to > 255) {
16316 *invlist = add_cp_to_invlist(*invlist, first_folds_to);
16317 }
16318 for (i = 0; i < folds_to_count - 1; i++) {
16319 if (remaining_folds_to_list[i] > 255) {
16320 *invlist = add_cp_to_invlist(*invlist,
16321 remaining_folds_to_list[i]);
16322 }
16323 }
16324 }
16325 break;
16326 }
16327 }
16328}
16329
16330STATIC void
16331S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
16332{
16333 /* If the final parameter is NULL, output the elements of the array given
16334 * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
16335 * pushed onto it, (creating if necessary) */
16336
16337 SV * msg;
16338 const bool first_is_fatal = ! return_posix_warnings
16339 && ckDEAD(packWARN(WARN_REGEXP));
16340
16341 PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
16342
16343 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16344 if (return_posix_warnings) {
16345 if (! *return_posix_warnings) { /* mortalize to not leak if
16346 warnings are fatal */
16347 *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
16348 }
16349 av_push(*return_posix_warnings, msg);
16350 }
16351 else {
16352 if (first_is_fatal) { /* Avoid leaking this */
16353 av_undef(posix_warnings); /* This isn't necessary if the
16354 array is mortal, but is a
16355 fail-safe */
16356 (void) sv_2mortal(msg);
16357 if (ckDEAD(packWARN(WARN_REGEXP))) {
16358 PREPARE_TO_DIE;
16359 }
16360 }
16361 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s",
16362 SvPVX(msg));
16363 SvREFCNT_dec_NN(msg);
16364 }
16365 }
16366
16367 if (! return_posix_warnings) {
16368 UPDATE_WARNINGS_LOC(RExC_parse);
16369 }
16370}
16371
16372STATIC AV *
16373S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16374{
16375 /* This adds the string scalar <multi_string> to the array
16376 * <multi_char_matches>. <multi_string> is known to have exactly
16377 * <cp_count> code points in it. This is used when constructing a
16378 * bracketed character class and we find something that needs to match more
16379 * than a single character.
16380 *
16381 * <multi_char_matches> is actually an array of arrays. Each top-level
16382 * element is an array that contains all the strings known so far that are
16383 * the same length. And that length (in number of code points) is the same
16384 * as the index of the top-level array. Hence, the [2] element is an
16385 * array, each element thereof is a string containing TWO code points;
16386 * while element [3] is for strings of THREE characters, and so on. Since
16387 * this is for multi-char strings there can never be a [0] nor [1] element.
16388 *
16389 * When we rewrite the character class below, we will do so such that the
16390 * longest strings are written first, so that it prefers the longest
16391 * matching strings first. This is done even if it turns out that any
16392 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
16393 * Christiansen has agreed that this is ok. This makes the test for the
16394 * ligature 'ffi' come before the test for 'ff', for example */
16395
16396 AV* this_array;
16397 AV** this_array_ptr;
16398
16399 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16400
16401 if (! multi_char_matches) {
16402 multi_char_matches = newAV();
16403 }
16404
16405 if (av_exists(multi_char_matches, cp_count)) {
16406 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16407 this_array = *this_array_ptr;
16408 }
16409 else {
16410 this_array = newAV();
16411 av_store(multi_char_matches, cp_count,
16412 (SV*) this_array);
16413 }
16414 av_push(this_array, multi_string);
16415
16416 return multi_char_matches;
16417}
16418
16419/* The names of properties whose definitions are not known at compile time are
16420 * stored in this SV, after a constant heading. So if the length has been
16421 * changed since initialization, then there is a run-time definition. */
16422#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
16423 (SvCUR(listsv) != initial_listsv_len)
16424
16425/* There is a restricted set of white space characters that are legal when
16426 * ignoring white space in a bracketed character class. This generates the
16427 * code to skip them.
16428 *
16429 * There is a line below that uses the same white space criteria but is outside
16430 * this macro. Both here and there must use the same definition */
16431#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
16432 STMT_START { \
16433 if (do_skip) { \
16434 while (isBLANK_A(UCHARAT(p))) \
16435 { \
16436 p++; \
16437 } \
16438 } \
16439 } STMT_END
16440
16441STATIC regnode_offset
16442S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16443 const bool stop_at_1, /* Just parse the next thing, don't
16444 look for a full character class */
16445 bool allow_multi_folds,
16446 const bool silence_non_portable, /* Don't output warnings
16447 about too large
16448 characters */
16449 const bool strict,
16450 bool optimizable, /* ? Allow a non-ANYOF return
16451 node */
16452 SV** ret_invlist, /* Return an inversion list, not a node */
16453 AV** return_posix_warnings
16454 )
16455{
16456 /* parse a bracketed class specification. Most of these will produce an
16457 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16458 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
16459 * under /i with multi-character folds: it will be rewritten following the
16460 * paradigm of this example, where the <multi-fold>s are characters which
16461 * fold to multiple character sequences:
16462 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16463 * gets effectively rewritten as:
16464 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16465 * reg() gets called (recursively) on the rewritten version, and this
16466 * function will return what it constructs. (Actually the <multi-fold>s
16467 * aren't physically removed from the [abcdefghi], it's just that they are
16468 * ignored in the recursion by means of a flag:
16469 * <RExC_in_multi_char_class>.)
16470 *
16471 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16472 * characters, with the corresponding bit set if that character is in the
16473 * list. For characters above this, a range list or swash is used. There
16474 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16475 * determinable at compile time
16476 *
16477 * On success, returns the offset at which any next node should be placed
16478 * into the regex engine program being compiled.
16479 *
16480 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
16481 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16482 * UTF-8
16483 */
16484
16485 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16486 IV range = 0;
16487 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16488 regnode_offset ret;
16489 STRLEN numlen;
16490 int namedclass = OOB_NAMEDCLASS;
16491 char *rangebegin = NULL;
16492 bool need_class = 0;
16493 SV *listsv = NULL;
16494 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16495 than just initialized. */
16496 SV* properties = NULL; /* Code points that match \p{} \P{} */
16497 SV* posixes = NULL; /* Code points that match classes like [:word:],
16498 extended beyond the Latin1 range. These have to
16499 be kept separate from other code points for much
16500 of this function because their handling is
16501 different under /i, and for most classes under
16502 /d as well */
16503 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
16504 separate for a while from the non-complemented
16505 versions because of complications with /d
16506 matching */
16507 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16508 treated more simply than the general case,
16509 leading to less compilation and execution
16510 work */
16511 UV element_count = 0; /* Number of distinct elements in the class.
16512 Optimizations may be possible if this is tiny */
16513 AV * multi_char_matches = NULL; /* Code points that fold to more than one
16514 character; used under /i */
16515 UV n;
16516 char * stop_ptr = RExC_end; /* where to stop parsing */
16517
16518 /* ignore unescaped whitespace? */
16519 const bool skip_white = cBOOL( ret_invlist
16520 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16521
16522 /* Unicode properties are stored in a swash; this holds the current one
16523 * being parsed. If this swash is the only above-latin1 component of the
16524 * character class, an optimization is to pass it directly on to the
16525 * execution engine. Otherwise, it is set to NULL to indicate that there
16526 * are other things in the class that have to be dealt with at execution
16527 * time */
16528 SV* swash = NULL; /* Code points that match \p{} \P{} */
16529
16530 /* Set if a component of this character class is user-defined; just passed
16531 * on to the engine */
16532 bool has_user_defined_property = FALSE;
16533
16534 /* inversion list of code points this node matches only when the target
16535 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
16536 * /d) */
16537 SV* has_upper_latin1_only_utf8_matches = NULL;
16538
16539 /* Inversion list of code points this node matches regardless of things
16540 * like locale, folding, utf8ness of the target string */
16541 SV* cp_list = NULL;
16542
16543 /* Like cp_list, but code points on this list need to be checked for things
16544 * that fold to/from them under /i */
16545 SV* cp_foldable_list = NULL;
16546
16547 /* Like cp_list, but code points on this list are valid only when the
16548 * runtime locale is UTF-8 */
16549 SV* only_utf8_locale_list = NULL;
16550
16551 /* In a range, if one of the endpoints is non-character-set portable,
16552 * meaning that it hard-codes a code point that may mean a different
16553 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16554 * mnemonic '\t' which each mean the same character no matter which
16555 * character set the platform is on. */
16556 unsigned int non_portable_endpoint = 0;
16557
16558 /* Is the range unicode? which means on a platform that isn't 1-1 native
16559 * to Unicode (i.e. non-ASCII), each code point in it should be considered
16560 * to be a Unicode value. */
16561 bool unicode_range = FALSE;
16562 bool invert = FALSE; /* Is this class to be complemented */
16563
16564 bool warn_super = ALWAYS_WARN_SUPER;
16565
16566 const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
16567 case we need to change the emitted regop to an EXACT. */
16568 const char * orig_parse = RExC_parse;
16569 const SSize_t orig_size = RExC_size;
16570 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
16571
16572 /* This variable is used to mark where the end in the input is of something
16573 * that looks like a POSIX construct but isn't. During the parse, when
16574 * something looks like it could be such a construct is encountered, it is
16575 * checked for being one, but not if we've already checked this area of the
16576 * input. Only after this position is reached do we check again */
16577 char *not_posix_region_end = RExC_parse - 1;
16578
16579 AV* posix_warnings = NULL;
16580 const bool do_posix_warnings = return_posix_warnings
16581 || (PASS2 && ckWARN(WARN_REGEXP));
16582 U8 op = END; /* The returned node-type, initialized to an impossible
16583 one. */
16584 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
16585 U32 posixl = 0; /* bit field of posix classes matched under /l */
16586 bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
16587
16588 GET_RE_DEBUG_FLAGS_DECL;
16589
16590 PERL_ARGS_ASSERT_REGCLASS;
16591#ifndef DEBUGGING
16592 PERL_UNUSED_ARG(depth);
16593#endif
16594
16595 DEBUG_PARSE("clas");
16596
16597#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
16598 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
16599 && UNICODE_DOT_DOT_VERSION == 0)
16600 allow_multi_folds = FALSE;
16601#endif
16602
16603 if (SIZE_ONLY) {
16604 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
16605 }
16606 else {
16607 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16608 initial_listsv_len = SvCUR(listsv);
16609 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
16610 }
16611
16612 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16613
16614 assert(RExC_parse <= RExC_end);
16615
16616 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
16617 RExC_parse++;
16618 invert = TRUE;
16619 allow_multi_folds = FALSE;
16620 MARK_NAUGHTY(1);
16621 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16622 }
16623
16624 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16625 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16626 int maybe_class = handle_possible_posix(pRExC_state,
16627 RExC_parse,
16628 &not_posix_region_end,
16629 NULL,
16630 TRUE /* checking only */);
16631 if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16632 ckWARN4reg(not_posix_region_end,
16633 "POSIX syntax [%c %c] belongs inside character classes%s",
16634 *RExC_parse, *RExC_parse,
16635 (maybe_class == OOB_NAMEDCLASS)
16636 ? ((POSIXCC_NOTYET(*RExC_parse))
16637 ? " (but this one isn't implemented)"
16638 : " (but this one isn't fully valid)")
16639 : ""
16640 );
16641 }
16642 }
16643
16644 /* If the caller wants us to just parse a single element, accomplish this
16645 * by faking the loop ending condition */
16646 if (stop_at_1 && RExC_end > RExC_parse) {
16647 stop_ptr = RExC_parse + 1;
16648 }
16649
16650 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16651 if (UCHARAT(RExC_parse) == ']')
16652 goto charclassloop;
16653
16654 while (1) {
16655
16656 if ( posix_warnings
16657 && av_tindex_skip_len_mg(posix_warnings) >= 0
16658 && RExC_parse > not_posix_region_end)
16659 {
16660 /* Warnings about posix class issues are considered tentative until
16661 * we are far enough along in the parse that we can no longer
16662 * change our mind, at which point we either output them or add
16663 * them, if it has so specified, to what gets returned to the
16664 * caller. This is done each time through the loop so that a later
16665 * class won't zap them before they have been dealt with. */
16666 output_or_return_posix_warnings(pRExC_state, posix_warnings,
16667 return_posix_warnings);
16668 }
16669
16670 if (RExC_parse >= stop_ptr) {
16671 break;
16672 }
16673
16674 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16675
16676 if (UCHARAT(RExC_parse) == ']') {
16677 break;
16678 }
16679
16680 charclassloop:
16681
16682 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16683 save_value = value;
16684 save_prevvalue = prevvalue;
16685
16686 if (!range) {
16687 rangebegin = RExC_parse;
16688 element_count++;
16689 non_portable_endpoint = 0;
16690 }
16691 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16692 value = utf8n_to_uvchr((U8*)RExC_parse,
16693 RExC_end - RExC_parse,
16694 &numlen, UTF8_ALLOW_DEFAULT);
16695 RExC_parse += numlen;
16696 }
16697 else
16698 value = UCHARAT(RExC_parse++);
16699
16700 if (value == '[') {
16701 char * posix_class_end;
16702 namedclass = handle_possible_posix(pRExC_state,
16703 RExC_parse,
16704 &posix_class_end,
16705 do_posix_warnings ? &posix_warnings : NULL,
16706 FALSE /* die if error */);
16707 if (namedclass > OOB_NAMEDCLASS) {
16708
16709 /* If there was an earlier attempt to parse this particular
16710 * posix class, and it failed, it was a false alarm, as this
16711 * successful one proves */
16712 if ( posix_warnings
16713 && av_tindex_skip_len_mg(posix_warnings) >= 0
16714 && not_posix_region_end >= RExC_parse
16715 && not_posix_region_end <= posix_class_end)
16716 {
16717 av_undef(posix_warnings);
16718 }
16719
16720 RExC_parse = posix_class_end;
16721 }
16722 else if (namedclass == OOB_NAMEDCLASS) {
16723 not_posix_region_end = posix_class_end;
16724 }
16725 else {
16726 namedclass = OOB_NAMEDCLASS;
16727 }
16728 }
16729 else if ( RExC_parse - 1 > not_posix_region_end
16730 && MAYBE_POSIXCC(value))
16731 {
16732 (void) handle_possible_posix(
16733 pRExC_state,
16734 RExC_parse - 1, /* -1 because parse has already been
16735 advanced */
16736 &not_posix_region_end,
16737 do_posix_warnings ? &posix_warnings : NULL,
16738 TRUE /* checking only */);
16739 }
16740 else if ( strict && ! skip_white
16741 && ( _generic_isCC(value, _CC_VERTSPACE)
16742 || is_VERTWS_cp_high(value)))
16743 {
16744 vFAIL("Literal vertical space in [] is illegal except under /x");
16745 }
16746 else if (value == '\\') {
16747 /* Is a backslash; get the code point of the char after it */
16748
16749 if (RExC_parse >= RExC_end) {
16750 vFAIL("Unmatched [");
16751 }
16752
16753 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16754 value = utf8n_to_uvchr((U8*)RExC_parse,
16755 RExC_end - RExC_parse,
16756 &numlen, UTF8_ALLOW_DEFAULT);
16757 RExC_parse += numlen;
16758 }
16759 else
16760 value = UCHARAT(RExC_parse++);
16761
16762 /* Some compilers cannot handle switching on 64-bit integer
16763 * values, therefore value cannot be an UV. Yes, this will
16764 * be a problem later if we want switch on Unicode.
16765 * A similar issue a little bit later when switching on
16766 * namedclass. --jhi */
16767
16768 /* If the \ is escaping white space when white space is being
16769 * skipped, it means that that white space is wanted literally, and
16770 * is already in 'value'. Otherwise, need to translate the escape
16771 * into what it signifies. */
16772 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16773
16774 case 'w': namedclass = ANYOF_WORDCHAR; break;
16775 case 'W': namedclass = ANYOF_NWORDCHAR; break;
16776 case 's': namedclass = ANYOF_SPACE; break;
16777 case 'S': namedclass = ANYOF_NSPACE; break;
16778 case 'd': namedclass = ANYOF_DIGIT; break;
16779 case 'D': namedclass = ANYOF_NDIGIT; break;
16780 case 'v': namedclass = ANYOF_VERTWS; break;
16781 case 'V': namedclass = ANYOF_NVERTWS; break;
16782 case 'h': namedclass = ANYOF_HORIZWS; break;
16783 case 'H': namedclass = ANYOF_NHORIZWS; break;
16784 case 'N': /* Handle \N{NAME} in class */
16785 {
16786 const char * const backslash_N_beg = RExC_parse - 2;
16787 int cp_count;
16788
16789 if (! grok_bslash_N(pRExC_state,
16790 NULL, /* No regnode */
16791 &value, /* Yes single value */
16792 &cp_count, /* Multiple code pt count */
16793 flagp,
16794 strict,
16795 depth)
16796 ) {
16797
16798 if (*flagp & NEED_UTF8)
16799 FAIL("panic: grok_bslash_N set NEED_UTF8");
16800
16801 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
16802
16803 if (cp_count < 0) {
16804 vFAIL("\\N in a character class must be a named character: \\N{...}");
16805 }
16806 else if (cp_count == 0) {
16807 ckWARNreg(RExC_parse,
16808 "Ignoring zero length \\N{} in character class");
16809 }
16810 else { /* cp_count > 1 */
16811 if (! RExC_in_multi_char_class) {
16812 if (invert || range || *RExC_parse == '-') {
16813 if (strict) {
16814 RExC_parse--;
16815 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16816 }
16817 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16818 break; /* <value> contains the first code
16819 point. Drop out of the switch to
16820 process it */
16821 }
16822 else {
16823 SV * multi_char_N = newSVpvn(backslash_N_beg,
16824 RExC_parse - backslash_N_beg);
16825 multi_char_matches
16826 = add_multi_match(multi_char_matches,
16827 multi_char_N,
16828 cp_count);
16829 }
16830 }
16831 } /* End of cp_count != 1 */
16832
16833 /* This element should not be processed further in this
16834 * class */
16835 element_count--;
16836 value = save_value;
16837 prevvalue = save_prevvalue;
16838 continue; /* Back to top of loop to get next char */
16839 }
16840
16841 /* Here, is a single code point, and <value> contains it */
16842 unicode_range = TRUE; /* \N{} are Unicode */
16843 }
16844 break;
16845 case 'p':
16846 case 'P':
16847 {
16848 char *e;
16849 char *i;
16850
16851 /* We will handle any undefined properties ourselves */
16852 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16853 /* And we actually would prefer to get
16854 * the straight inversion list of the
16855 * swash, since we will be accessing it
16856 * anyway, to save a little time */
16857 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16858
16859 SvREFCNT_dec(swash); /* Free any left-overs */
16860 if (RExC_parse >= RExC_end)
16861 vFAIL2("Empty \\%c", (U8)value);
16862 if (*RExC_parse == '{') {
16863 const U8 c = (U8)value;
16864 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16865 if (!e) {
16866 RExC_parse++;
16867 vFAIL2("Missing right brace on \\%c{}", c);
16868 }
16869
16870 RExC_parse++;
16871
16872 /* White space is allowed adjacent to the braces and after
16873 * any '^', even when not under /x */
16874 while (isSPACE(*RExC_parse)) {
16875 RExC_parse++;
16876 }
16877
16878 if (UCHARAT(RExC_parse) == '^') {
16879
16880 /* toggle. (The rhs xor gets the single bit that
16881 * differs between P and p; the other xor inverts just
16882 * that bit) */
16883 value ^= 'P' ^ 'p';
16884
16885 RExC_parse++;
16886 while (isSPACE(*RExC_parse)) {
16887 RExC_parse++;
16888 }
16889 }
16890
16891 if (e == RExC_parse)
16892 vFAIL2("Empty \\%c{}", c);
16893
16894 n = e - RExC_parse;
16895 while (isSPACE(*(RExC_parse + n - 1)))
16896 n--;
16897
16898 } /* The \p isn't immediately followed by a '{' */
16899 else if (! isALPHA(*RExC_parse)) {
16900 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16901 vFAIL2("Character following \\%c must be '{' or a "
16902 "single-character Unicode property name",
16903 (U8) value);
16904 }
16905 else {
16906 e = RExC_parse;
16907 n = 1;
16908 }
16909 if (!SIZE_ONLY) {
16910 char* name = RExC_parse;
16911 char* base_name; /* name after any packages are stripped */
16912 char* lookup_name = NULL;
16913 const char * const colon_colon = "::";
16914 bool invert;
16915
16916 SV* invlist;
16917
16918 /* Temporary workaround for [perl #133136]. For this
16919 * precise input that is in the .t that is failing, load
16920 * utf8.pm, which is what the test wants, so that that
16921 * .t passes */
16922 if ( memEQs(RExC_start, e + 1 - RExC_start,
16923 "foo\\p{Alnum}")
16924 && ! hv_common(GvHVn(PL_incgv),
16925 NULL,
16926 "utf8.pm", sizeof("utf8.pm") - 1,
16927 0, HV_FETCH_ISEXISTS, NULL, 0))
16928 {
16929 require_pv("utf8.pm");
16930 }
16931 invlist = parse_uniprop_string(name, n, FOLD, &invert);
16932 if (invlist) {
16933 if (invert) {
16934 value ^= 'P' ^ 'p';
16935 }
16936 }
16937 else {
16938
16939 /* Try to get the definition of the property into
16940 * <invlist>. If /i is in effect, the effective property
16941 * will have its name be <__NAME_i>. The design is
16942 * discussed in commit
16943 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16944 name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16945 SAVEFREEPV(name);
16946
16947 for (i = RExC_parse; i < RExC_parse + n; i++) {
16948 if (isCNTRL(*i) && *i != '\t') {
16949 RExC_parse = e + 1;
16950 vFAIL2("Can't find Unicode property definition \"%s\"", name);
16951 }
16952 }
16953
16954 if (FOLD) {
16955 lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16956
16957 /* The function call just below that uses this can fail
16958 * to return, leaking memory if we don't do this */
16959 SAVEFREEPV(lookup_name);
16960 }
16961
16962 /* Look up the property name, and get its swash and
16963 * inversion list, if the property is found */
16964 swash = _core_swash_init("utf8",
16965 (lookup_name)
16966 ? lookup_name
16967 : name,
16968 &PL_sv_undef,
16969 1, /* binary */
16970 0, /* not tr/// */
16971 NULL, /* No inversion list */
16972 &swash_init_flags
16973 );
16974 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16975 HV* curpkg = (IN_PERL_COMPILETIME)
16976 ? PL_curstash
16977 : CopSTASH(PL_curcop);
16978 UV final_n = n;
16979 bool has_pkg;
16980
16981 if (swash) { /* Got a swash but no inversion list.
16982 Something is likely wrong that will
16983 be sorted-out later */
16984 SvREFCNT_dec_NN(swash);
16985 swash = NULL;
16986 }
16987
16988 /* Here didn't find it. It could be a an error (like a
16989 * typo) in specifying a Unicode property, or it could
16990 * be a user-defined property that will be available at
16991 * run-time. The names of these must begin with 'In'
16992 * or 'Is' (after any packages are stripped off). So
16993 * if not one of those, or if we accept only
16994 * compile-time properties, is an error; otherwise add
16995 * it to the list for run-time look up. */
16996 if ((base_name = rninstr(name, name + n,
16997 colon_colon, colon_colon + 2)))
16998 { /* Has ::. We know this must be a user-defined
16999 property */
17000 base_name += 2;
17001 final_n -= base_name - name;
17002 has_pkg = TRUE;
17003 }
17004 else {
17005 base_name = name;
17006 has_pkg = FALSE;
17007 }
17008
17009 if ( final_n < 3
17010 || base_name[0] != 'I'
17011 || (base_name[1] != 's' && base_name[1] != 'n')
17012 || ret_invlist)
17013 {
17014 const char * const msg
17015 = (has_pkg)
17016 ? "Illegal user-defined property name"
17017 : "Can't find Unicode property definition";
17018 RExC_parse = e + 1;
17019
17020 /* diag_listed_as: Can't find Unicode property definition "%s" */
17021 vFAIL3utf8f("%s \"%" UTF8f "\"",
17022 msg, UTF8fARG(UTF, n, name));
17023 }
17024
17025 /* If the property name doesn't already have a package
17026 * name, add the current one to it so that it can be
17027 * referred to outside it. [perl #121777] */
17028 if (! has_pkg && curpkg) {
17029 char* pkgname = HvNAME(curpkg);
17030 if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
17031 char* full_name = Perl_form(aTHX_
17032 "%s::%s",
17033 pkgname,
17034 name);
17035 n = strlen(full_name);
17036 name = savepvn(full_name, n);
17037 SAVEFREEPV(name);
17038 }
17039 }
17040 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
17041 (value == 'p' ? '+' : '!'),
17042 (FOLD) ? "__" : "",
17043 UTF8fARG(UTF, n, name),
17044 (FOLD) ? "_i" : "");
17045 has_user_defined_property = TRUE;
17046 optimizable = FALSE; /* Will have to leave this an
17047 ANYOF node */
17048
17049 /* We don't know yet what this matches, so have to flag
17050 * it */
17051 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17052 }
17053 else {
17054
17055 /* Here, did get the swash and its inversion list. If
17056 * the swash is from a user-defined property, then this
17057 * whole character class should be regarded as such */
17058 if (swash_init_flags
17059 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
17060 {
17061 has_user_defined_property = TRUE;
17062 }
17063 }
17064 }
17065 if (invlist) {
17066 if (! has_user_defined_property &&
17067 /* We warn on matching an above-Unicode code point
17068 * if the match would return true, except don't
17069 * warn for \p{All}, which has exactly one element
17070 * = 0 */
17071 (_invlist_contains_cp(invlist, 0x110000)
17072 && (! (_invlist_len(invlist) == 1
17073 && *invlist_array(invlist) == 0))))
17074 {
17075 warn_super = TRUE;
17076 }
17077
17078 /* Invert if asking for the complement */
17079 if (value == 'P') {
17080 _invlist_union_complement_2nd(properties,
17081 invlist,
17082 &properties);
17083
17084 /* The swash can't be used as-is, because we've
17085 * inverted things; delay removing it to here after
17086 * have copied its invlist above */
17087 if (! swash) {
17088 SvREFCNT_dec_NN(invlist);
17089 }
17090 SvREFCNT_dec(swash);
17091 swash = NULL;
17092 }
17093 else {
17094 _invlist_union(properties, invlist, &properties);
17095 if (! swash) {
17096 SvREFCNT_dec_NN(invlist);
17097 }
17098 }
17099 }
17100 } /* End of actually getting the values in pass 2 */
17101
17102 RExC_parse = e + 1;
17103 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17104 named */
17105
17106 /* \p means they want Unicode semantics */
17107 REQUIRE_UNI_RULES(flagp, 0);
17108 }
17109 break;
17110 case 'n': value = '\n'; break;
17111 case 'r': value = '\r'; break;
17112 case 't': value = '\t'; break;
17113 case 'f': value = '\f'; break;
17114 case 'b': value = '\b'; break;
17115 case 'e': value = ESC_NATIVE; break;
17116 case 'a': value = '\a'; break;
17117 case 'o':
17118 RExC_parse--; /* function expects to be pointed at the 'o' */
17119 {
17120 const char* error_msg;
17121 bool valid = grok_bslash_o(&RExC_parse,
17122 RExC_end,
17123 &value,
17124 &error_msg,
17125 TO_OUTPUT_WARNINGS(RExC_parse),
17126 strict,
17127 silence_non_portable,
17128 UTF);
17129 if (! valid) {
17130 vFAIL(error_msg);
17131 }
17132 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17133 }
17134 non_portable_endpoint++;
17135 break;
17136 case 'x':
17137 RExC_parse--; /* function expects to be pointed at the 'x' */
17138 {
17139 const char* error_msg;
17140 bool valid = grok_bslash_x(&RExC_parse,
17141 RExC_end,
17142 &value,
17143 &error_msg,
17144 TO_OUTPUT_WARNINGS(RExC_parse),
17145 strict,
17146 silence_non_portable,
17147 UTF);
17148 if (! valid) {
17149 vFAIL(error_msg);
17150 }
17151 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17152 }
17153 non_portable_endpoint++;
17154 break;
17155 case 'c':
17156 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17157 UPDATE_WARNINGS_LOC(RExC_parse);
17158 RExC_parse++;
17159 non_portable_endpoint++;
17160 break;
17161 case '0': case '1': case '2': case '3': case '4':
17162 case '5': case '6': case '7':
17163 {
17164 /* Take 1-3 octal digits */
17165 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17166 numlen = (strict) ? 4 : 3;
17167 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17168 RExC_parse += numlen;
17169 if (numlen != 3) {
17170 if (strict) {
17171 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
17172 vFAIL("Need exactly 3 octal digits");
17173 }
17174 else if (! SIZE_ONLY /* like \08, \178 */
17175 && numlen < 3
17176 && RExC_parse < RExC_end
17177 && isDIGIT(*RExC_parse)
17178 && ckWARN(WARN_REGEXP))
17179 {
17180 reg_warn_non_literal_string(
17181 RExC_parse + 1,
17182 form_short_octal_warning(RExC_parse, numlen));
17183 }
17184 }
17185 non_portable_endpoint++;
17186 break;
17187 }
17188 default:
17189 /* Allow \_ to not give an error */
17190 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
17191 if (strict) {
17192 vFAIL2("Unrecognized escape \\%c in character class",
17193 (int)value);
17194 }
17195 else {
17196 ckWARN2reg(RExC_parse,
17197 "Unrecognized escape \\%c in character class passed through",
17198 (int)value);
17199 }
17200 }
17201 break;
17202 } /* End of switch on char following backslash */
17203 } /* end of handling backslash escape sequences */
17204
17205 /* Here, we have the current token in 'value' */
17206
17207 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17208 U8 classnum;
17209
17210 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17211 * literal, as is the character that began the false range, i.e.
17212 * the 'a' in the examples */
17213 if (range) {
17214 if (!SIZE_ONLY) {
17215 const int w = (RExC_parse >= rangebegin)
17216 ? RExC_parse - rangebegin
17217 : 0;
17218 if (strict) {
17219 vFAIL2utf8f(
17220 "False [] range \"%" UTF8f "\"",
17221 UTF8fARG(UTF, w, rangebegin));
17222 }
17223 else {
17224 ckWARN2reg(RExC_parse,
17225 "False [] range \"%" UTF8f "\"",
17226 UTF8fARG(UTF, w, rangebegin));
17227 cp_list = add_cp_to_invlist(cp_list, '-');
17228 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17229 prevvalue);
17230 }
17231 }
17232
17233 range = 0; /* this was not a true range */
17234 element_count += 2; /* So counts for three values */
17235 }
17236
17237 classnum = namedclass_to_classnum(namedclass);
17238
17239 if (LOC && namedclass < ANYOF_POSIXL_MAX
17240#ifndef HAS_ISASCII
17241 && classnum != _CC_ASCII
17242#endif
17243 ) {
17244 SV* scratch_list = NULL;
17245
17246 /* What the Posix classes (like \w, [:space:]) match in locale
17247 * isn't knowable under locale until actual match time. Room
17248 * must be reserved (one time per outer bracketed class) to
17249 * store such classes. The space will contain a bit for each
17250 * named class that is to be matched against. This isn't
17251 * needed for \p{} and pseudo-classes, as they are not affected
17252 * by locale, and hence are dealt with separately */
17253 if (! need_class) {
17254 need_class = 1;
17255 anyof_flags |= ANYOF_MATCHES_POSIXL;
17256
17257 /* We can't change this into some other type of node
17258 * (unless this is the only element, in which case there
17259 * are nodes that mean exactly this) as has runtime
17260 * dependencies */
17261 optimizable = FALSE;
17262 }
17263
17264 /* Coverity thinks it is possible for this to be negative; both
17265 * jhi and khw think it's not, but be safer */
17266 assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
17267 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
17268
17269 /* See if it already matches the complement of this POSIX
17270 * class */
17271 if ( (anyof_flags & ANYOF_MATCHES_POSIXL)
17272 && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
17273 ? -1
17274 : 1)))
17275 {
17276 posixl_matches_all = TRUE;
17277 break; /* No need to continue. Since it matches both
17278 e.g., \w and \W, it matches everything, and the
17279 bracketed class can be optimized into qr/./s */
17280 }
17281
17282 /* Add this class to those that should be checked at runtime */
17283 POSIXL_SET(posixl, namedclass);
17284
17285 /* The above-Latin1 characters are not subject to locale rules.
17286 * Just add them, in the second pass, to the
17287 * unconditionally-matched list */
17288 if (! SIZE_ONLY) {
17289
17290 /* Get the list of the above-Latin1 code points this
17291 * matches */
17292 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17293 PL_XPosix_ptrs[classnum],
17294
17295 /* Odd numbers are complements, like
17296 * NDIGIT, NASCII, ... */
17297 namedclass % 2 != 0,
17298 &scratch_list);
17299 /* Checking if 'cp_list' is NULL first saves an extra
17300 * clone. Its reference count will be decremented at the
17301 * next union, etc, or if this is the only instance, at the
17302 * end of the routine */
17303 if (! cp_list) {
17304 cp_list = scratch_list;
17305 }
17306 else {
17307 _invlist_union(cp_list, scratch_list, &cp_list);
17308 SvREFCNT_dec_NN(scratch_list);
17309 }
17310 continue; /* Go get next character */
17311 }
17312 }
17313 else if (! SIZE_ONLY) {
17314
17315 /* Here, not in pass1 (in that pass we skip calculating the
17316 * contents of this class), and is not /l, or is a POSIX class
17317 * for which /l doesn't matter (or is a Unicode property, which
17318 * is skipped here). */
17319 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
17320 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17321
17322 /* Here, should be \h, \H, \v, or \V. None of /d, /i
17323 * nor /l make a difference in what these match,
17324 * therefore we just add what they match to cp_list. */
17325 if (classnum != _CC_VERTSPACE) {
17326 assert( namedclass == ANYOF_HORIZWS
17327 || namedclass == ANYOF_NHORIZWS);
17328
17329 /* It turns out that \h is just a synonym for
17330 * XPosixBlank */
17331 classnum = _CC_BLANK;
17332 }
17333
17334 _invlist_union_maybe_complement_2nd(
17335 cp_list,
17336 PL_XPosix_ptrs[classnum],
17337 namedclass % 2 != 0, /* Complement if odd
17338 (NHORIZWS, NVERTWS)
17339 */
17340 &cp_list);
17341 }
17342 }
17343 else if ( UNI_SEMANTICS
17344 || AT_LEAST_ASCII_RESTRICTED
17345 || classnum == _CC_ASCII
17346 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
17347 || classnum == _CC_XDIGIT)))
17348 {
17349 /* We usually have to worry about /d affecting what POSIX
17350 * classes match, with special code needed because we won't
17351 * know until runtime what all matches. But there is no
17352 * extra work needed under /u and /a; and [:ascii:] is
17353 * unaffected by /d; and :digit: and :xdigit: don't have
17354 * runtime differences under /d. So we can special case
17355 * these, and avoid some extra work below, and at runtime.
17356 * */
17357 _invlist_union_maybe_complement_2nd(
17358 simple_posixes,
17359 ((AT_LEAST_ASCII_RESTRICTED)
17360 ? PL_Posix_ptrs[classnum]
17361 : PL_XPosix_ptrs[classnum]),
17362 namedclass % 2 != 0,
17363 &simple_posixes);
17364 }
17365 else { /* Garden variety class. If is NUPPER, NALPHA, ...
17366 complement and use nposixes */
17367 SV** posixes_ptr = namedclass % 2 == 0
17368 ? &posixes
17369 : &nposixes;
17370 _invlist_union_maybe_complement_2nd(
17371 *posixes_ptr,
17372 PL_XPosix_ptrs[classnum],
17373 namedclass % 2 != 0,
17374 posixes_ptr);
17375 }
17376 }
17377 } /* end of namedclass \blah */
17378
17379 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17380
17381 /* If 'range' is set, 'value' is the ending of a range--check its
17382 * validity. (If value isn't a single code point in the case of a
17383 * range, we should have figured that out above in the code that
17384 * catches false ranges). Later, we will handle each individual code
17385 * point in the range. If 'range' isn't set, this could be the
17386 * beginning of a range, so check for that by looking ahead to see if
17387 * the next real character to be processed is the range indicator--the
17388 * minus sign */
17389
17390 if (range) {
17391#ifdef EBCDIC
17392 /* For unicode ranges, we have to test that the Unicode as opposed
17393 * to the native values are not decreasing. (Above 255, there is
17394 * no difference between native and Unicode) */
17395 if (unicode_range && prevvalue < 255 && value < 255) {
17396 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17397 goto backwards_range;
17398 }
17399 }
17400 else
17401#endif
17402 if (prevvalue > value) /* b-a */ {
17403 int w;
17404#ifdef EBCDIC
17405 backwards_range:
17406#endif
17407 w = RExC_parse - rangebegin;
17408 vFAIL2utf8f(
17409 "Invalid [] range \"%" UTF8f "\"",
17410 UTF8fARG(UTF, w, rangebegin));
17411 NOT_REACHED; /* NOTREACHED */
17412 }
17413 }
17414 else {
17415 prevvalue = value; /* save the beginning of the potential range */
17416 if (! stop_at_1 /* Can't be a range if parsing just one thing */
17417 && *RExC_parse == '-')
17418 {
17419 char* next_char_ptr = RExC_parse + 1;
17420
17421 /* Get the next real char after the '-' */
17422 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17423
17424 /* If the '-' is at the end of the class (just before the ']',
17425 * it is a literal minus; otherwise it is a range */
17426 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17427 RExC_parse = next_char_ptr;
17428
17429 /* a bad range like \w-, [:word:]- ? */
17430 if (namedclass > OOB_NAMEDCLASS) {
17431 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
17432 const int w = RExC_parse >= rangebegin
17433 ? RExC_parse - rangebegin
17434 : 0;
17435 if (strict) {
17436 vFAIL4("False [] range \"%*.*s\"",
17437 w, w, rangebegin);
17438 }
17439 else {
17440 vWARN4(RExC_parse,
17441 "False [] range \"%*.*s\"",
17442 w, w, rangebegin);
17443 }
17444 }
17445 if (!SIZE_ONLY) {
17446 cp_list = add_cp_to_invlist(cp_list, '-');
17447 }
17448 element_count++;
17449 } else
17450 range = 1; /* yeah, it's a range! */
17451 continue; /* but do it the next time */
17452 }
17453 }
17454 }
17455
17456 if (namedclass > OOB_NAMEDCLASS) {
17457 continue;
17458 }
17459
17460 /* Here, we have a single value this time through the loop, and
17461 * <prevvalue> is the beginning of the range, if any; or <value> if
17462 * not. */
17463
17464 /* non-Latin1 code point implies unicode semantics. Must be set in
17465 * pass1 so is there for the whole of pass 2 */
17466 if (value > 255) {
17467 REQUIRE_UNI_RULES(flagp, 0);
17468 }
17469
17470 /* Ready to process either the single value, or the completed range.
17471 * For single-valued non-inverted ranges, we consider the possibility
17472 * of multi-char folds. (We made a conscious decision to not do this
17473 * for the other cases because it can often lead to non-intuitive
17474 * results. For example, you have the peculiar case that:
17475 * "s s" =~ /^[^\xDF]+$/i => Y
17476 * "ss" =~ /^[^\xDF]+$/i => N
17477 *
17478 * See [perl #89750] */
17479 if (FOLD && allow_multi_folds && value == prevvalue) {
17480 if (value == LATIN_SMALL_LETTER_SHARP_S
17481 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17482 value)))
17483 {
17484 /* Here <value> is indeed a multi-char fold. Get what it is */
17485
17486 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17487 STRLEN foldlen;
17488
17489 UV folded = _to_uni_fold_flags(
17490 value,
17491 foldbuf,
17492 &foldlen,
17493 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17494 ? FOLD_FLAGS_NOMIX_ASCII
17495 : 0)
17496 );
17497
17498 /* Here, <folded> should be the first character of the
17499 * multi-char fold of <value>, with <foldbuf> containing the
17500 * whole thing. But, if this fold is not allowed (because of
17501 * the flags), <fold> will be the same as <value>, and should
17502 * be processed like any other character, so skip the special
17503 * handling */
17504 if (folded != value) {
17505
17506 /* Skip if we are recursed, currently parsing the class
17507 * again. Otherwise add this character to the list of
17508 * multi-char folds. */
17509 if (! RExC_in_multi_char_class) {
17510 STRLEN cp_count = utf8_length(foldbuf,
17511 foldbuf + foldlen);
17512 SV* multi_fold = sv_2mortal(newSVpvs(""));
17513
17514 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17515
17516 multi_char_matches
17517 = add_multi_match(multi_char_matches,
17518 multi_fold,
17519 cp_count);
17520
17521 }
17522
17523 /* This element should not be processed further in this
17524 * class */
17525 element_count--;
17526 value = save_value;
17527 prevvalue = save_prevvalue;
17528 continue;
17529 }
17530 }
17531 }
17532
17533 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
17534 if (range) {
17535
17536 /* If the range starts above 255, everything is portable and
17537 * likely to be so for any forseeable character set, so don't
17538 * warn. */
17539 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17540 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17541 }
17542 else if (prevvalue != value) {
17543
17544 /* Under strict, ranges that stop and/or end in an ASCII
17545 * printable should have each end point be a portable value
17546 * for it (preferably like 'A', but we don't warn if it is
17547 * a (portable) Unicode name or code point), and the range
17548 * must be be all digits or all letters of the same case.
17549 * Otherwise, the range is non-portable and unclear as to
17550 * what it contains */
17551 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
17552 && ( non_portable_endpoint
17553 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17554 || (isLOWER_A(prevvalue) && isLOWER_A(value))
17555 || (isUPPER_A(prevvalue) && isUPPER_A(value))
17556 ))) {
17557 vWARN(RExC_parse, "Ranges of ASCII printables should"
17558 " be some subset of \"0-9\","
17559 " \"A-Z\", or \"a-z\"");
17560 }
17561 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17562 SSize_t index_start;
17563 SSize_t index_final;
17564
17565 /* But the nature of Unicode and languages mean we
17566 * can't do the same checks for above-ASCII ranges,
17567 * except in the case of digit ones. These should
17568 * contain only digits from the same group of 10. The
17569 * ASCII case is handled just above. Hence here, the
17570 * range could be a range of digits. First some
17571 * unlikely special cases. Grandfather in that a range
17572 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17573 * if its starting value is one of the 10 digits prior
17574 * to it. This is because it is an alternate way of
17575 * writing 19D1, and some people may expect it to be in
17576 * that group. But it is bad, because it won't give
17577 * the expected results. In Unicode 5.2 it was
17578 * considered to be in that group (of 11, hence), but
17579 * this was fixed in the next version */
17580
17581 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17582 goto warn_bad_digit_range;
17583 }
17584 else if (UNLIKELY( prevvalue >= 0x1D7CE
17585 && value <= 0x1D7FF))
17586 {
17587 /* This is the only other case currently in Unicode
17588 * where the algorithm below fails. The code
17589 * points just above are the end points of a single
17590 * range containing only decimal digits. It is 5
17591 * different series of 0-9. All other ranges of
17592 * digits currently in Unicode are just a single
17593 * series. (And mktables will notify us if a later
17594 * Unicode version breaks this.)
17595 *
17596 * If the range being checked is at most 9 long,
17597 * and the digit values represented are in
17598 * numerical order, they are from the same series.
17599 * */
17600 if ( value - prevvalue > 9
17601 || ((( value - 0x1D7CE) % 10)
17602 <= (prevvalue - 0x1D7CE) % 10))
17603 {
17604 goto warn_bad_digit_range;
17605 }
17606 }
17607 else {
17608
17609 /* For all other ranges of digits in Unicode, the
17610 * algorithm is just to check if both end points
17611 * are in the same series, which is the same range.
17612 * */
17613 index_start = _invlist_search(
17614 PL_XPosix_ptrs[_CC_DIGIT],
17615 prevvalue);
17616
17617 /* Warn if the range starts and ends with a digit,
17618 * and they are not in the same group of 10. */
17619 if ( index_start >= 0
17620 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17621 && (index_final =
17622 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17623 value)) != index_start
17624 && index_final >= 0
17625 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17626 {
17627 warn_bad_digit_range:
17628 vWARN(RExC_parse, "Ranges of digits should be"
17629 " from the same group of"
17630 " 10");
17631 }
17632 }
17633 }
17634 }
17635 }
17636 if ((! range || prevvalue == value) && non_portable_endpoint) {
17637 if (isPRINT_A(value)) {
17638 char literal[3];
17639 unsigned d = 0;
17640 if (isBACKSLASHED_PUNCT(value)) {
17641 literal[d++] = '\\';
17642 }
17643 literal[d++] = (char) value;
17644 literal[d++] = '\0';
17645
17646 vWARN4(RExC_parse,
17647 "\"%.*s\" is more clearly written simply as \"%s\"",
17648 (int) (RExC_parse - rangebegin),
17649 rangebegin,
17650 literal
17651 );
17652 }
17653 else if isMNEMONIC_CNTRL(value) {
17654 vWARN4(RExC_parse,
17655 "\"%.*s\" is more clearly written simply as \"%s\"",
17656 (int) (RExC_parse - rangebegin),
17657 rangebegin,
17658 cntrl_to_mnemonic((U8) value)
17659 );
17660 }
17661 }
17662 }
17663
17664 /* Deal with this element of the class */
17665 if (! SIZE_ONLY) {
17666
17667#ifndef EBCDIC
17668 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17669 prevvalue, value);
17670#else
17671 /* On non-ASCII platforms, for ranges that span all of 0..255, and
17672 * ones that don't require special handling, we can just add the
17673 * range like we do for ASCII platforms */
17674 if ((UNLIKELY(prevvalue == 0) && value >= 255)
17675 || ! (prevvalue < 256
17676 && (unicode_range
17677 || (! non_portable_endpoint
17678 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17679 || (isUPPER_A(prevvalue)
17680 && isUPPER_A(value)))))))
17681 {
17682 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17683 prevvalue, value);
17684 }
17685 else {
17686 /* Here, requires special handling. This can be because it is
17687 * a range whose code points are considered to be Unicode, and
17688 * so must be individually translated into native, or because
17689 * its a subrange of 'A-Z' or 'a-z' which each aren't
17690 * contiguous in EBCDIC, but we have defined them to include
17691 * only the "expected" upper or lower case ASCII alphabetics.
17692 * Subranges above 255 are the same in native and Unicode, so
17693 * can be added as a range */
17694 U8 start = NATIVE_TO_LATIN1(prevvalue);
17695 unsigned j;
17696 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17697 for (j = start; j <= end; j++) {
17698 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17699 }
17700 if (value > 255) {
17701 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17702 256, value);
17703 }
17704 }
17705#endif
17706 }
17707
17708 range = 0; /* this range (if it was one) is done now */
17709 } /* End of loop through all the text within the brackets */
17710
17711
17712 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17713 output_or_return_posix_warnings(pRExC_state, posix_warnings,
17714 return_posix_warnings);
17715 }
17716
17717 /* If anything in the class expands to more than one character, we have to
17718 * deal with them by building up a substitute parse string, and recursively
17719 * calling reg() on it, instead of proceeding */
17720 if (multi_char_matches) {
17721 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17722 I32 cp_count;
17723 STRLEN len;
17724 char *save_end = RExC_end;
17725 char *save_parse = RExC_parse;
17726 char *save_start = RExC_start;
17727 Size_t constructed_prefix_len = 0; /* This gives the length of the
17728 constructed portion of the
17729 substitute parse. */
17730 bool first_time = TRUE; /* First multi-char occurrence doesn't get
17731 a "|" */
17732 I32 reg_flags;
17733
17734 assert(! invert);
17735 /* Only one level of recursion allowed */
17736 assert(RExC_copy_start_in_constructed == RExC_precomp);
17737
17738#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
17739 because too confusing */
17740 if (invert) {
17741 sv_catpvs(substitute_parse, "(?:");
17742 }
17743#endif
17744
17745 /* Look at the longest folds first */
17746 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17747 cp_count > 0;
17748 cp_count--)
17749 {
17750
17751 if (av_exists(multi_char_matches, cp_count)) {
17752 AV** this_array_ptr;
17753 SV* this_sequence;
17754
17755 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17756 cp_count, FALSE);
17757 while ((this_sequence = av_pop(*this_array_ptr)) !=
17758 &PL_sv_undef)
17759 {
17760 if (! first_time) {
17761 sv_catpvs(substitute_parse, "|");
17762 }
17763 first_time = FALSE;
17764
17765 sv_catpv(substitute_parse, SvPVX(this_sequence));
17766 }
17767 }
17768 }
17769
17770 /* If the character class contains anything else besides these
17771 * multi-character folds, have to include it in recursive parsing */
17772 if (element_count) {
17773 sv_catpvs(substitute_parse, "|[");
17774 constructed_prefix_len = SvCUR(substitute_parse);
17775 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17776
17777 /* Put in a closing ']' only if not going off the end, as otherwise
17778 * we are adding something that really isn't there */
17779 if (RExC_parse < RExC_end) {
17780 sv_catpvs(substitute_parse, "]");
17781 }
17782 }
17783
17784 sv_catpvs(substitute_parse, ")");
17785#if 0
17786 if (invert) {
17787 /* This is a way to get the parse to skip forward a whole named
17788 * sequence instead of matching the 2nd character when it fails the
17789 * first */
17790 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17791 }
17792#endif
17793
17794 /* Set up the data structure so that any errors will be properly
17795 * reported. See the comments at the definition of
17796 * REPORT_LOCATION_ARGS for details */
17797 RExC_copy_start_in_input = (char *) orig_parse;
17798 RExC_start = RExC_parse = SvPV(substitute_parse, len);
17799 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17800 RExC_end = RExC_parse + len;
17801 RExC_in_multi_char_class = 1;
17802
17803 ret = reg(pRExC_state, 1, &reg_flags, depth+1);
17804
17805 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17806
17807 /* And restore so can parse the rest of the pattern */
17808 RExC_parse = save_parse;
17809 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17810 RExC_end = save_end;
17811 RExC_in_multi_char_class = 0;
17812 SvREFCNT_dec_NN(multi_char_matches);
17813 return ret;
17814 }
17815
17816 /* Here, we've gone through the entire class and dealt with multi-char
17817 * folds. We are now in a position that we can do some checks to see if we
17818 * can optimize this ANYOF node into a simpler one, even in Pass 1.
17819 * Currently we only do two checks:
17820 * 1) is in the unlikely event that the user has specified both, eg. \w and
17821 * \W under /l, then the class matches everything. (This optimization
17822 * is done only to make the optimizer code run later work.)
17823 * 2) if the character class contains only a single element (including a
17824 * single range), we see if there is an equivalent node for it.
17825 * Other checks are possible */
17826 if ( optimizable
17827 && ! ret_invlist /* Can't optimize if returning the constructed
17828 inversion list */
17829 && (UNLIKELY(posixl_matches_all) || element_count == 1))
17830 {
17831 U8 op = END;
17832 U8 arg = 0;
17833
17834 if (UNLIKELY(posixl_matches_all)) {
17835 op = SANY;
17836 }
17837 else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
17838 class, like \w or [:digit:]
17839 or \p{foo} */
17840
17841 /* All named classes are mapped into POSIXish nodes, with its FLAG
17842 * argument giving which class it is */
17843 switch ((I32)namedclass) {
17844 case ANYOF_UNIPROP:
17845 break;
17846
17847 /* These don't depend on the charset modifiers. They always
17848 * match under /u rules */
17849 case ANYOF_NHORIZWS:
17850 case ANYOF_HORIZWS:
17851 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
17852 /* FALLTHROUGH */
17853
17854 case ANYOF_NVERTWS:
17855 case ANYOF_VERTWS:
17856 op = POSIXU;
17857 goto join_posix;
17858
17859 /* The actual POSIXish node for all the rest depends on the
17860 * charset modifier. The ones in the first set depend only on
17861 * ASCII or, if available on this platform, also locale */
17862
17863 case ANYOF_ASCII:
17864 case ANYOF_NASCII:
17865
17866#ifdef HAS_ISASCII
17867 if (LOC) {
17868 op = POSIXL;
17869 goto join_posix;
17870 }
17871#endif
17872 /* (named_class - ANYOF_ASCII) is 0 or 1. xor'ing with
17873 * invert converts that to 1 or 0 */
17874 op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
17875 break;
17876
17877 /* The following don't have any matches in the upper Latin1
17878 * range, hence /d is equivalent to /u for them. Making it /u
17879 * saves some branches at runtime */
17880 case ANYOF_DIGIT:
17881 case ANYOF_NDIGIT:
17882 case ANYOF_XDIGIT:
17883 case ANYOF_NXDIGIT:
17884 if (! DEPENDS_SEMANTICS) {
17885 goto treat_as_default;
17886 }
17887
17888 op = POSIXU;
17889 goto join_posix;
17890
17891 /* The following change to CASED under /i */
17892 case ANYOF_LOWER:
17893 case ANYOF_NLOWER:
17894 case ANYOF_UPPER:
17895 case ANYOF_NUPPER:
17896 if (FOLD) {
17897 namedclass = ANYOF_CASED + (namedclass % 2);
17898 }
17899 /* FALLTHROUGH */
17900
17901 /* The rest have more possibilities depending on the charset.
17902 * We take advantage of the enum ordering of the charset
17903 * modifiers to get the exact node type, */
17904 default:
17905 treat_as_default:
17906 op = POSIXD + get_regex_charset(RExC_flags);
17907 if (op > POSIXA) { /* /aa is same as /a */
17908 op = POSIXA;
17909 }
17910
17911 join_posix:
17912 /* The odd numbered ones are the complements of the
17913 * next-lower even number one */
17914 if (namedclass % 2 == 1) {
17915 invert = ! invert;
17916 namedclass--;
17917 }
17918 arg = namedclass_to_classnum(namedclass);
17919 break;
17920 }
17921 }
17922 else if (value == prevvalue) {
17923
17924 /* Here, the class consists of just a single code point */
17925
17926 if (invert) {
17927 if (! LOC && value == '\n') {
17928 op = REG_ANY; /* Optimize [^\n] */
17929 *flagp |= HASWIDTH|SIMPLE;
17930 MARK_NAUGHTY(1);
17931 }
17932 }
17933 else if (value < 256 || UTF) {
17934
17935 /* Optimize a single value into an EXACTish node, but not if it
17936 * would require converting the pattern to UTF-8. */
17937 op = compute_EXACTish(pRExC_state);
17938 }
17939 } /* Otherwise is a range */
17940 else if (! LOC) { /* locale could vary these */
17941 if (prevvalue == '0') {
17942 if (value == '9') {
17943 arg = _CC_DIGIT;
17944 op = POSIXA;
17945 }
17946 }
17947 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
17948 /* We can optimize A-Z or a-z, but not if they could match
17949 * something like the KELVIN SIGN under /i. */
17950 if (prevvalue == 'A') {
17951 if (value == 'Z'
17952#ifdef EBCDIC
17953 && ! non_portable_endpoint
17954#endif
17955 ) {
17956 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
17957 op = POSIXA;
17958 }
17959 }
17960 else if (prevvalue == 'a') {
17961 if (value == 'z'
17962#ifdef EBCDIC
17963 && ! non_portable_endpoint
17964#endif
17965 ) {
17966 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
17967 op = POSIXA;
17968 }
17969 }
17970 }
17971 }
17972
17973 /* Here, we have changed <op> away from its initial value iff we found
17974 * an optimization */
17975 if (op != END) {
17976
17977 /* Emit the calculated regnode,
17978 * which should correspond to the beginning, not current, state of
17979 * the parse */
17980 const char * cur_parse = RExC_parse;
17981 RExC_parse = (char *)orig_parse;
17982 if (PL_regkind[op] == POSIXD) {
17983 if (op == POSIXL) {
17984 RExC_contains_locale = 1;
17985 }
17986 if (invert) {
17987 op += NPOSIXD - POSIXD;
17988 }
17989 }
17990
17991 ret = reg_node(pRExC_state, op);
17992
17993 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
17994 if (! SIZE_ONLY) {
17995 FLAGS(REGNODE_p(ret)) = arg;
17996 }
17997 *flagp |= HASWIDTH|SIMPLE;
17998 }
17999 else if (PL_regkind[op] == EXACT) {
18000 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
18001 TRUE /* downgradable to EXACT */
18002 );
18003 }
18004 else {
18005 *flagp |= HASWIDTH|SIMPLE;
18006 }
18007
18008 RExC_parse = (char *) cur_parse;
18009
18010 SvREFCNT_dec(posixes);
18011 SvREFCNT_dec(nposixes);
18012 SvREFCNT_dec(simple_posixes);
18013 SvREFCNT_dec(cp_list);
18014 SvREFCNT_dec(cp_foldable_list);
18015 return ret;
18016 }
18017 }
18018
18019 /* Assume we are going to generate an ANYOF-type node. */
18020 op = (posixl)
18021 ? ANYOFPOSIXL
18022 : (LOC)
18023 ? ANYOFL
18024 : ANYOF;
18025 ret = reganode(pRExC_state, op, 0);
18026
18027 if (SIZE_ONLY) {
18028 return ret;
18029 }
18030
18031 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
18032
18033 /* If folding, we calculate all characters that could fold to or from the
18034 * ones already on the list */
18035 if (cp_foldable_list) {
18036 if (FOLD) {
18037 UV start, end; /* End points of code point ranges */
18038
18039 SV* fold_intersection = NULL;
18040 SV** use_list;
18041
18042 /* Our calculated list will be for Unicode rules. For locale
18043 * matching, we have to keep a separate list that is consulted at
18044 * runtime only when the locale indicates Unicode rules. For
18045 * non-locale, we just use the general list */
18046 if (LOC) {
18047 use_list = &only_utf8_locale_list;
18048 }
18049 else {
18050 use_list = &cp_list;
18051 }
18052
18053 /* Only the characters in this class that participate in folds need
18054 * be checked. Get the intersection of this class and all the
18055 * possible characters that are foldable. This can quickly narrow
18056 * down a large class */
18057 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
18058 &fold_intersection);
18059
18060 /* Now look at the foldable characters in this class individually */
18061 invlist_iterinit(fold_intersection);
18062 while (invlist_iternext(fold_intersection, &start, &end)) {
18063 UV j;
18064 UV folded;
18065
18066 /* Look at every character in the range */
18067 for (j = start; j <= end; j++) {
18068 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18069 STRLEN foldlen;
18070 unsigned int k;
18071 Size_t folds_to_count;
18072 unsigned int first_folds_to;
18073 const unsigned int * remaining_folds_to_list;
18074
18075 if (j < 256) {
18076
18077 if (IS_IN_SOME_FOLD_L1(j)) {
18078
18079 /* ASCII is always matched; non-ASCII is matched
18080 * only under Unicode rules (which could happen
18081 * under /l if the locale is a UTF-8 one */
18082 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18083 *use_list = add_cp_to_invlist(*use_list,
18084 PL_fold_latin1[j]);
18085 }
18086 else {
18087 has_upper_latin1_only_utf8_matches
18088 = add_cp_to_invlist(
18089 has_upper_latin1_only_utf8_matches,
18090 PL_fold_latin1[j]);
18091 }
18092 }
18093
18094 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18095 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18096 {
18097 add_above_Latin1_folds(pRExC_state,
18098 (U8) j,
18099 use_list);
18100 }
18101 continue;
18102 }
18103
18104 /* Here is an above Latin1 character. We don't have the
18105 * rules hard-coded for it. First, get its fold. This is
18106 * the simple fold, as the multi-character folds have been
18107 * handled earlier and separated out */
18108 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18109 (ASCII_FOLD_RESTRICTED)
18110 ? FOLD_FLAGS_NOMIX_ASCII
18111 : 0);
18112
18113 /* Single character fold of above Latin1. Add everything
18114 * in its fold closure to the list that this node should
18115 * match. */
18116 folds_to_count = _inverse_folds(folded, &first_folds_to,
18117 &remaining_folds_to_list);
18118 for (k = 0; k <= folds_to_count; k++) {
18119 UV c = (k == 0) /* First time through use itself */
18120 ? folded
18121 : (k == 1) /* 2nd time use, the first fold */
18122 ? first_folds_to
18123
18124 /* Then the remaining ones */
18125 : remaining_folds_to_list[k-2];
18126
18127 /* /aa doesn't allow folds between ASCII and non- */
18128 if (( ASCII_FOLD_RESTRICTED
18129 && (isASCII(c) != isASCII(j))))
18130 {
18131 continue;
18132 }
18133
18134 /* Folds under /l which cross the 255/256 boundary are
18135 * added to a separate list. (These are valid only
18136 * when the locale is UTF-8.) */
18137 if (c < 256 && LOC) {
18138 *use_list = add_cp_to_invlist(*use_list, c);
18139 continue;
18140 }
18141
18142 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18143 {
18144 cp_list = add_cp_to_invlist(cp_list, c);
18145 }
18146 else {
18147 /* Similarly folds involving non-ascii Latin1
18148 * characters under /d are added to their list */
18149 has_upper_latin1_only_utf8_matches
18150 = add_cp_to_invlist(
18151 has_upper_latin1_only_utf8_matches,
18152 c);
18153 }
18154 }
18155 }
18156 }
18157 SvREFCNT_dec_NN(fold_intersection);
18158 }
18159
18160 /* Now that we have finished adding all the folds, there is no reason
18161 * to keep the foldable list separate */
18162 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18163 SvREFCNT_dec_NN(cp_foldable_list);
18164 }
18165
18166 /* And combine the result (if any) with any inversion lists from posix
18167 * classes. The lists are kept separate up to now because we don't want to
18168 * fold the classes (folding of those is automatically handled by the swash
18169 * fetching code) */
18170 if (simple_posixes) { /* These are the classes known to be unaffected by
18171 /a, /aa, and /d */
18172 if (cp_list) {
18173 _invlist_union(cp_list, simple_posixes, &cp_list);
18174 SvREFCNT_dec_NN(simple_posixes);
18175 }
18176 else {
18177 cp_list = simple_posixes;
18178 }
18179 }
18180 if (posixes || nposixes) {
18181 if (! DEPENDS_SEMANTICS) {
18182
18183 /* For everything but /d, we can just add the current 'posixes' and
18184 * 'nposixes' to the main list */
18185 if (posixes) {
18186 if (cp_list) {
18187 _invlist_union(cp_list, posixes, &cp_list);
18188 SvREFCNT_dec_NN(posixes);
18189 }
18190 else {
18191 cp_list = posixes;
18192 }
18193 }
18194 if (nposixes) {
18195 if (cp_list) {
18196 _invlist_union(cp_list, nposixes, &cp_list);
18197 SvREFCNT_dec_NN(nposixes);
18198 }
18199 else {
18200 cp_list = nposixes;
18201 }
18202 }
18203 }
18204 else {
18205 /* Under /d, things like \w match upper Latin1 characters only if
18206 * the target string is in UTF-8. But things like \W match all the
18207 * upper Latin1 characters if the target string is not in UTF-8.
18208 *
18209 * Handle the case where there something like \W separately */
18210 if (nposixes) {
18211 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18212
18213 /* A complemented posix class matches all upper Latin1
18214 * characters if not in UTF-8. And it matches just certain
18215 * ones when in UTF-8. That means those certain ones are
18216 * matched regardless, so can just be added to the
18217 * unconditional list */
18218 if (cp_list) {
18219 _invlist_union(cp_list, nposixes, &cp_list);
18220 SvREFCNT_dec_NN(nposixes);
18221 nposixes = NULL;
18222 }
18223 else {
18224 cp_list = nposixes;
18225 }
18226
18227 /* Likewise for 'posixes' */
18228 _invlist_union(posixes, cp_list, &cp_list);
18229
18230 /* Likewise for anything else in the range that matched only
18231 * under UTF-8 */
18232 if (has_upper_latin1_only_utf8_matches) {
18233 _invlist_union(cp_list,
18234 has_upper_latin1_only_utf8_matches,
18235 &cp_list);
18236 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18237 has_upper_latin1_only_utf8_matches = NULL;
18238 }
18239
18240 /* If we don't match all the upper Latin1 characters regardless
18241 * of UTF-8ness, we have to set a flag to match the rest when
18242 * not in UTF-8 */
18243 _invlist_subtract(only_non_utf8_list, cp_list,
18244 &only_non_utf8_list);
18245 if (_invlist_len(only_non_utf8_list) != 0) {
18246 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18247 }
18248 SvREFCNT_dec_NN(only_non_utf8_list);
18249 }
18250 else {
18251 /* Here there were no complemented posix classes. That means
18252 * the upper Latin1 characters in 'posixes' match only when the
18253 * target string is in UTF-8. So we have to add them to the
18254 * list of those types of code points, while adding the
18255 * remainder to the unconditional list.
18256 *
18257 * First calculate what they are */
18258 SV* nonascii_but_latin1_properties = NULL;
18259 _invlist_intersection(posixes, PL_UpperLatin1,
18260 &nonascii_but_latin1_properties);
18261
18262 /* And add them to the final list of such characters. */
18263 _invlist_union(has_upper_latin1_only_utf8_matches,
18264 nonascii_but_latin1_properties,
18265 &has_upper_latin1_only_utf8_matches);
18266
18267 /* Remove them from what now becomes the unconditional list */
18268 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18269 &posixes);
18270
18271 /* And add those unconditional ones to the final list */
18272 if (cp_list) {
18273 _invlist_union(cp_list, posixes, &cp_list);
18274 SvREFCNT_dec_NN(posixes);
18275 posixes = NULL;
18276 }
18277 else {
18278 cp_list = posixes;
18279 }
18280
18281 SvREFCNT_dec(nonascii_but_latin1_properties);
18282
18283 /* Get rid of any characters that we now know are matched
18284 * unconditionally from the conditional list, which may make
18285 * that list empty */
18286 _invlist_subtract(has_upper_latin1_only_utf8_matches,
18287 cp_list,
18288 &has_upper_latin1_only_utf8_matches);
18289 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
18290 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18291 has_upper_latin1_only_utf8_matches = NULL;
18292 }
18293 }
18294 }
18295 }
18296
18297 /* And combine the result (if any) with any inversion list from properties.
18298 * The lists are kept separate up to now so that we can distinguish the two
18299 * in regards to matching above-Unicode. A run-time warning is generated
18300 * if a Unicode property is matched against a non-Unicode code point. But,
18301 * we allow user-defined properties to match anything, without any warning,
18302 * and we also suppress the warning if there is a portion of the character
18303 * class that isn't a Unicode property, and which matches above Unicode, \W
18304 * or [\x{110000}] for example.
18305 * (Note that in this case, unlike the Posix one above, there is no
18306 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
18307 * forces Unicode semantics */
18308 if (properties) {
18309 if (cp_list) {
18310
18311 /* If it matters to the final outcome, see if a non-property
18312 * component of the class matches above Unicode. If so, the
18313 * warning gets suppressed. This is true even if just a single
18314 * such code point is specified, as, though not strictly correct if
18315 * another such code point is matched against, the fact that they
18316 * are using above-Unicode code points indicates they should know
18317 * the issues involved */
18318 if (warn_super) {
18319 warn_super = ! (invert
18320 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18321 }
18322
18323 _invlist_union(properties, cp_list, &cp_list);
18324 SvREFCNT_dec_NN(properties);
18325 }
18326 else {
18327 cp_list = properties;
18328 }
18329
18330 if (warn_super) {
18331 anyof_flags
18332 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18333
18334 /* Because an ANYOF node is the only one that warns, this node
18335 * can't be optimized into something else */
18336 optimizable = FALSE;
18337 }
18338 }
18339
18340 /* Here, we have calculated what code points should be in the character
18341 * class.
18342 *
18343 * Now we can see about various optimizations. Fold calculation (which we
18344 * did above) needs to take place before inversion. Otherwise /[^k]/i
18345 * would invert to include K, which under /i would match k, which it
18346 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18347 * folded until runtime */
18348
18349 /* If we didn't do folding, it's because some information isn't available
18350 * until runtime; set the run-time fold flag for these. (We don't have to
18351 * worry about properties folding, as that is taken care of by the swash
18352 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
18353 * locales, or the class matches at least one 0-255 range code point */
18354 if (LOC && FOLD) {
18355
18356 /* Some things on the list might be unconditionally included because of
18357 * other components. Remove them, and clean up the list if it goes to
18358 * 0 elements */
18359 if (only_utf8_locale_list && cp_list) {
18360 _invlist_subtract(only_utf8_locale_list, cp_list,
18361 &only_utf8_locale_list);
18362
18363 if (_invlist_len(only_utf8_locale_list) == 0) {
18364 SvREFCNT_dec_NN(only_utf8_locale_list);
18365 only_utf8_locale_list = NULL;
18366 }
18367 }
18368 if (only_utf8_locale_list) {
18369 anyof_flags
18370 |= ANYOFL_FOLD
18371 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18372 }
18373 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18374 UV start, end;
18375 invlist_iterinit(cp_list);
18376 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18377 anyof_flags |= ANYOFL_FOLD;
18378 }
18379 invlist_iterfinish(cp_list);
18380 }
18381 }
18382 else if ( DEPENDS_SEMANTICS
18383 && ( has_upper_latin1_only_utf8_matches
18384 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18385 {
18386 use_anyofd = TRUE;
18387 optimizable = FALSE;
18388 }
18389
18390
18391 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
18392 * at compile time. Besides not inverting folded locale now, we can't
18393 * invert if there are things such as \w, which aren't known until runtime
18394 * */
18395 if ( cp_list
18396 && invert
18397 && ! use_anyofd
18398 && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
18399 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18400 {
18401 _invlist_invert(cp_list);
18402
18403 /* Any swash can't be used as-is, because we've inverted things */
18404 if (swash) {
18405 SvREFCNT_dec_NN(swash);
18406 swash = NULL;
18407 }
18408
18409 /* Clear the invert flag since have just done it here */
18410 invert = FALSE;
18411 }
18412
18413 if (ret_invlist) {
18414 assert(cp_list);
18415
18416 *ret_invlist = cp_list;
18417 SvREFCNT_dec(swash);
18418
18419 /* Discard the generated node */
18420 if (SIZE_ONLY) {
18421 RExC_size = orig_size;
18422 }
18423 else {
18424 RExC_emit = orig_emit;
18425 }
18426 return orig_emit;
18427 }
18428
18429 /* Some character classes are equivalent to other nodes. Such nodes take
18430 * up less room and generally fewer operations to execute than ANYOF nodes.
18431 * Above, we checked for and optimized into some such equivalents for
18432 * certain common classes that are easy to test. Getting to this point in
18433 * the code means that the class didn't get optimized there. Since this
18434 * code is only executed in Pass 2, it is too late to save space--it has
18435 * been allocated in Pass 1, and currently isn't given back. XXX Why not?
18436 * But turning things into an EXACTish node can allow the optimizer to join
18437 * it to any adjacent such nodes. And if the class is equivalent to things
18438 * like /./, expensive run-time swashes can be avoided. Now that we have
18439 * more complete information, we can find things necessarily missed by the
18440 * earlier code. */
18441
18442 if (optimizable && cp_list && ! invert) {
18443 UV start, end;
18444 U8 op = END; /* The optimzation node-type */
18445 int posix_class = -1; /* Illegal value */
18446 const char * cur_parse= RExC_parse;
18447 U8 ANYOFM_mask = 0xFF;
18448 U32 anode_arg = 0;
18449
18450 invlist_iterinit(cp_list);
18451 if (! invlist_iternext(cp_list, &start, &end)) {
18452
18453 /* Here, the list is empty. This happens, for example, when a
18454 * Unicode property that doesn't match anything is the only element
18455 * in the character class (perluniprops.pod notes such properties).
18456 * */
18457 op = OPFAIL;
18458 *flagp |= HASWIDTH|SIMPLE;
18459 }
18460 else if (start == end) { /* The range is a single code point */
18461 if (! invlist_iternext(cp_list, &start, &end)
18462
18463 /* Don't do this optimization if it would require changing
18464 * the pattern to UTF-8 */
18465 && (start < 256 || UTF))
18466 {
18467 /* Here, the list contains a single code point. Can optimize
18468 * into an EXACTish node */
18469
18470 value = start;
18471
18472 if (! FOLD) {
18473 op = (LOC)
18474 ? EXACTL
18475 : EXACT;
18476 }
18477 else if (LOC) {
18478
18479 /* A locale node under folding with one code point can be
18480 * an EXACTFL, as its fold won't be calculated until
18481 * runtime */
18482 op = EXACTFL;
18483 }
18484 else {
18485
18486 /* Here, we are generally folding, but there is only one
18487 * code point to match. If we have to, we use an EXACT
18488 * node, but it would be better for joining with adjacent
18489 * nodes in the optimization pass if we used the same
18490 * EXACTFish node that any such are likely to be. We can
18491 * do this iff the code point doesn't participate in any
18492 * folds. For example, an EXACTF of a colon is the same as
18493 * an EXACT one, since nothing folds to or from a colon. */
18494 if (value < 256) {
18495 if (IS_IN_SOME_FOLD_L1(value)) {
18496 op = EXACT;
18497 }
18498 }
18499 else {
18500 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
18501 op = EXACT;
18502 }
18503 }
18504
18505 /* If we haven't found the node type, above, it means we
18506 * can use the prevailing one */
18507 if (op == END) {
18508 op = compute_EXACTish(pRExC_state);
18509 }
18510 }
18511 }
18512 } /* End of first range contains just a single code point */
18513 else if (start == 0) {
18514 if (end == UV_MAX) {
18515 op = SANY;
18516 *flagp |= HASWIDTH|SIMPLE;
18517 MARK_NAUGHTY(1);
18518 }
18519 else if (end == '\n' - 1
18520 && invlist_iternext(cp_list, &start, &end)
18521 && start == '\n' + 1 && end == UV_MAX)
18522 {
18523 op = REG_ANY;
18524 *flagp |= HASWIDTH|SIMPLE;
18525 MARK_NAUGHTY(1);
18526 }
18527 }
18528 invlist_iterfinish(cp_list);
18529
18530 if (op == END) {
18531
18532 /* Here, didn't find an optimization. See if this matches any of
18533 * the POSIX classes. First try ASCII */
18534
18535 if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
18536 op = ASCII;
18537 *flagp |= HASWIDTH|SIMPLE;
18538 }
18539 else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
18540 op = NASCII;
18541 *flagp |= HASWIDTH|SIMPLE;
18542 }
18543 else if (invlist_highest(cp_list) >= 0x2029) {
18544
18545 /* Then try the other POSIX classes. The POSIXA ones are about
18546 * the same speed as ANYOF ops, but the ones that have
18547 * above-Latin1 code point matches are somewhat faster than
18548 * ANYOF. So optimize those, but don't bother with the POSIXA
18549 * ones nor [:cntrl:] which has no above-Latin1 matches. If
18550 * this ANYOF node has a lower highest possible matching code
18551 * point than any of the XPosix ones, we know that it can't
18552 * possibly be the same as any of them, so we can avoid
18553 * executing this code. The 0x2029 above for the lowest max
18554 * was determined by manual inspection of the classes, and
18555 * comes from \v. Suppose Unicode in a later version adds a
18556 * higher code point to \v. All that means is that this code
18557 * can be executed unnecessarily. It will still give the
18558 * correct answer. */
18559
18560 for (posix_class = 0;
18561 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18562 posix_class++)
18563 {
18564 int try_inverted;
18565
18566 if (posix_class == _CC_CNTRL) {
18567 continue;
18568 }
18569
18570 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18571
18572 /* Check if matches normal or inverted */
18573 if (_invlistEQ(cp_list,
18574 PL_XPosix_ptrs[posix_class],
18575 try_inverted))
18576 {
18577 op = (try_inverted)
18578 ? NPOSIXU
18579 : POSIXU;
18580 *flagp |= HASWIDTH|SIMPLE;
18581 goto found_posix;
18582 }
18583 }
18584 }
18585 found_posix: ;
18586 }
18587
18588 /* If it didn't match a POSIX class, it might be able to be turned
18589 * into an ANYOFM node. Compare two different bytes, bit-by-bit.
18590 * In some positions, the bits in each will be 1; and in other
18591 * positions both will be 0; and in some positions the bit will be
18592 * 1 in one byte, and 0 in the other. Let 'n' be the number of
18593 * positions where the bits differ. We create a mask which has
18594 * exactly 'n' 0 bits, each in a position where the two bytes
18595 * differ. Now take the set of all bytes that when ANDed with the
18596 * mask yield the same result. That set has 2**n elements, and is
18597 * representable by just two 8 bit numbers: the result and the
18598 * mask. Importantly, matching the set can be vectorized by
18599 * creating a word full of the result bytes, and a word full of the
18600 * mask bytes, yielding a significant speed up. Here, see if this
18601 * node matches such a set. As a concrete example consider [01],
18602 * and the byte representing '0' which is 0x30 on ASCII machines.
18603 * It has the bits 0011 0000. Take the mask 1111 1110. If we AND
18604 * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed
18605 * yield something else. So [01], which is a common usage, is
18606 * optimizable into ANYOFM, and can benefit from the speed up. We
18607 * can only do this on UTF-8 invariant bytes, because the variance
18608 * would throw this off. */
18609 if ( op == END
18610 && invlist_highest(cp_list) <=
18611#ifdef EBCDIC
18612 0xFF
18613#else
18614 0x7F
18615#endif
18616 ) {
18617 Size_t cp_count = 0;
18618 bool first_time = TRUE;
18619 unsigned int lowest_cp = 0xFF;
18620 U8 bits_differing = 0;
18621
18622 /* Only needed on EBCDIC, as there, variants and non- are mixed
18623 * together. Could #ifdef it out on ASCII, but probably the
18624 * compiler will optimize it out */
18625 bool has_variant = FALSE;
18626
18627 /* Go through the bytes and find the bit positions that differ */
18628 invlist_iterinit(cp_list);
18629 while (invlist_iternext(cp_list, &start, &end)) {
18630 unsigned int i = start;
18631
18632 cp_count += end - start + 1;
18633
18634 if (first_time) {
18635 if (! UVCHR_IS_INVARIANT(i)) {
18636 has_variant = TRUE;
18637 continue;
18638 }
18639
18640 first_time = FALSE;
18641 lowest_cp = start;
18642
18643 i++;
18644 }
18645
18646 /* Find the bit positions that differ from the lowest code
18647 * point in the node. Keep track of all such positions by
18648 * OR'ing */
18649 for (; i <= end; i++) {
18650 if (! UVCHR_IS_INVARIANT(i)) {
18651 has_variant = TRUE;
18652 continue;
18653 }
18654
18655 bits_differing |= i ^ lowest_cp;
18656 }
18657 }
18658 invlist_iterfinish(cp_list);
18659
18660 /* At the end of the loop, we count how many bits differ from
18661 * the bits in lowest code point, call the count 'd'. If the
18662 * set we found contains 2**d elements, it is the closure of
18663 * all code points that differ only in those bit positions. To
18664 * convince yourself of that, first note that the number in the
18665 * closure must be a power of 2, which we test for. The only
18666 * way we could have that count and it be some differing set,
18667 * is if we got some code points that don't differ from the
18668 * lowest code point in any position, but do differ from each
18669 * other in some other position. That means one code point has
18670 * a 1 in that position, and another has a 0. But that would
18671 * mean that one of them differs from the lowest code point in
18672 * that position, which possibility we've already excluded. */
18673 if ( ! has_variant
18674 && cp_count == 1U << PL_bitcount[bits_differing])
18675 {
18676 assert(cp_count > 1);
18677 op = ANYOFM;
18678
18679 /* We need to make the bits that differ be 0's */
18680 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
18681
18682 /* The argument is the lowest code point */
18683 anode_arg = lowest_cp;
18684 *flagp |= HASWIDTH|SIMPLE;
18685 }
18686 }
18687 }
18688
18689 if (op != END) {
18690 RExC_parse = (char *)orig_parse;
18691 RExC_emit = orig_emit;
18692
18693 if (regarglen[op]) {
18694 ret = reganode(pRExC_state, op, anode_arg);
18695 } else {
18696 ret = reg_node(pRExC_state, op);
18697 }
18698
18699 RExC_parse = (char *)cur_parse;
18700
18701 if (PL_regkind[op] == EXACT) {
18702 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
18703 TRUE /* downgradable to EXACT */
18704 );
18705 }
18706 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
18707 FLAGS(REGNODE_p(ret)) = posix_class;
18708 }
18709 else if (PL_regkind[op] == ANYOFM) {
18710 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18711 }
18712
18713 SvREFCNT_dec_NN(cp_list);
18714 return ret;
18715 }
18716 }
18717
18718 /* It's going to be an ANYOF node. */
18719 OP(REGNODE_p(ret)) = (use_anyofd)
18720 ? ANYOFD
18721 : ((posixl)
18722 ? ANYOFPOSIXL
18723 : ((LOC)
18724 ? ANYOFL
18725 : ANYOF));
18726 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
18727
18728 /* Here, <cp_list> contains all the code points we can determine at
18729 * compile time that match under all conditions. Go through it, and
18730 * for things that belong in the bitmap, put them there, and delete from
18731 * <cp_list>. While we are at it, see if everything above 255 is in the
18732 * list, and if so, set a flag to speed up execution */
18733
18734 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
18735
18736 if (posixl) {
18737 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
18738 }
18739
18740 if (invert) {
18741 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
18742 }
18743
18744 /* Here, the bitmap has been populated with all the Latin1 code points that
18745 * always match. Can now add to the overall list those that match only
18746 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
18747 * */
18748 if (has_upper_latin1_only_utf8_matches) {
18749 if (cp_list) {
18750 _invlist_union(cp_list,
18751 has_upper_latin1_only_utf8_matches,
18752 &cp_list);
18753 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18754 }
18755 else {
18756 cp_list = has_upper_latin1_only_utf8_matches;
18757 }
18758 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
18759 }
18760
18761 /* If there is a swash and more than one element, we can't use the swash in
18762 * the optimization below. */
18763 if (swash && element_count > 1) {
18764 SvREFCNT_dec_NN(swash);
18765 swash = NULL;
18766 }
18767
18768 /* Note that the optimization of using 'swash' if it is the only thing in
18769 * the class doesn't have us change swash at all, so it can include things
18770 * that are also in the bitmap; otherwise we have purposely deleted that
18771 * duplicate information */
18772 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
18773 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18774 ? listsv : NULL,
18775 only_utf8_locale_list,
18776 swash, has_user_defined_property);
18777
18778 *flagp |= HASWIDTH|SIMPLE;
18779
18780 if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
18781 RExC_contains_locale = 1;
18782 }
18783
18784 return ret;
18785}
18786
18787#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
18788
18789STATIC void
18790S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
18791 regnode* const node,
18792 SV* const cp_list,
18793 SV* const runtime_defns,
18794 SV* const only_utf8_locale_list,
18795 SV* const swash,
18796 const bool has_user_defined_property)
18797{
18798 /* Sets the arg field of an ANYOF-type node 'node', using information about
18799 * the node passed-in. If there is nothing outside the node's bitmap, the
18800 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
18801 * the count returned by add_data(), having allocated and stored an array,
18802 * av, that that count references, as follows:
18803 * av[0] stores the character class description in its textual form.
18804 * This is used later (regexec.c:Perl_regclass_swash()) to
18805 * initialize the appropriate swash, and is also useful for dumping
18806 * the regnode. This is set to &PL_sv_undef if the textual
18807 * description is not needed at run-time (as happens if the other
18808 * elements completely define the class)
18809 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
18810 * computed from av[0]. But if no further computation need be done,
18811 * the swash is stored here now (and av[0] is &PL_sv_undef).
18812 * av[2] stores the inversion list of code points that match only if the
18813 * current locale is UTF-8
18814 * av[3] stores the cp_list inversion list for use in addition or instead
18815 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
18816 * (Otherwise everything needed is already in av[0] and av[1])
18817 * av[4] is set if any component of the class is from a user-defined
18818 * property; used only if av[3] exists */
18819
18820 UV n;
18821
18822 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
18823
18824 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
18825 assert(! (ANYOF_FLAGS(node)
18826 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
18827 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
18828 }
18829 else {
18830 AV * const av = newAV();
18831 SV *rv;
18832
18833 av_store(av, 0, (runtime_defns)
18834 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
18835 if (swash) {
18836 assert(cp_list);
18837 av_store(av, 1, swash);
18838 SvREFCNT_dec_NN(cp_list);
18839 }
18840 else {
18841 av_store(av, 1, &PL_sv_undef);
18842 if (cp_list) {
18843 av_store(av, 3, cp_list);
18844 av_store(av, 4, newSVuv(has_user_defined_property));
18845 }
18846 }
18847
18848 if (only_utf8_locale_list) {
18849 av_store(av, 2, only_utf8_locale_list);
18850 }
18851 else {
18852 av_store(av, 2, &PL_sv_undef);
18853 }
18854
18855 rv = newRV_noinc(MUTABLE_SV(av));
18856 n = add_data(pRExC_state, STR_WITH_LEN("s"));
18857 RExC_rxi->data->data[n] = (void*)rv;
18858 ARG_SET(node, n);
18859 }
18860}
18861
18862#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
18863SV *
18864Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
18865 const regnode* node,
18866 bool doinit,
18867 SV** listsvp,
18868 SV** only_utf8_locale_ptr,
18869 SV** output_invlist)
18870
18871{
18872 /* For internal core use only.
18873 * Returns the swash for the input 'node' in the regex 'prog'.
18874 * If <doinit> is 'true', will attempt to create the swash if not already
18875 * done.
18876 * If <listsvp> is non-null, will return the printable contents of the
18877 * swash. This can be used to get debugging information even before the
18878 * swash exists, by calling this function with 'doinit' set to false, in
18879 * which case the components that will be used to eventually create the
18880 * swash are returned (in a printable form).
18881 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
18882 * store an inversion list of code points that should match only if the
18883 * execution-time locale is a UTF-8 one.
18884 * If <output_invlist> is not NULL, it is where this routine is to store an
18885 * inversion list of the code points that would be instead returned in
18886 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
18887 * when this parameter is used, is just the non-code point data that
18888 * will go into creating the swash. This currently should be just
18889 * user-defined properties whose definitions were not known at compile
18890 * time. Using this parameter allows for easier manipulation of the
18891 * swash's data by the caller. It is illegal to call this function with
18892 * this parameter set, but not <listsvp>
18893 *
18894 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
18895 * that, in spite of this function's name, the swash it returns may include
18896 * the bitmap data as well */
18897
18898 SV *sw = NULL;
18899 SV *si = NULL; /* Input swash initialization string */
18900 SV* invlist = NULL;
18901
18902 RXi_GET_DECL(prog, progi);
18903 const struct reg_data * const data = prog ? progi->data : NULL;
18904
18905 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
18906 assert(! output_invlist || listsvp);
18907
18908 if (data && data->count) {
18909 const U32 n = ARG(node);
18910
18911 if (data->what[n] == 's') {
18912 SV * const rv = MUTABLE_SV(data->data[n]);
18913 AV * const av = MUTABLE_AV(SvRV(rv));
18914 SV **const ary = AvARRAY(av);
18915 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18916
18917 si = *ary; /* ary[0] = the string to initialize the swash with */
18918
18919 if (av_tindex_skip_len_mg(av) >= 2) {
18920 if (only_utf8_locale_ptr
18921 && ary[2]
18922 && ary[2] != &PL_sv_undef)
18923 {
18924 *only_utf8_locale_ptr = ary[2];
18925 }
18926 else {
18927 assert(only_utf8_locale_ptr);
18928 *only_utf8_locale_ptr = NULL;
18929 }
18930
18931 /* Elements 3 and 4 are either both present or both absent. [3]
18932 * is any inversion list generated at compile time; [4]
18933 * indicates if that inversion list has any user-defined
18934 * properties in it. */
18935 if (av_tindex_skip_len_mg(av) >= 3) {
18936 invlist = ary[3];
18937 if (SvUV(ary[4])) {
18938 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18939 }
18940 }
18941 else {
18942 invlist = NULL;
18943 }
18944 }
18945
18946 /* Element [1] is reserved for the set-up swash. If already there,
18947 * return it; if not, create it and store it there */
18948 if (ary[1] && SvROK(ary[1])) {
18949 sw = ary[1];
18950 }
18951 else if (doinit && ((si && si != &PL_sv_undef)
18952 || (invlist && invlist != &PL_sv_undef))) {
18953 assert(si);
18954 sw = _core_swash_init("utf8", /* the utf8 package */
18955 "", /* nameless */
18956 si,
18957 1, /* binary */
18958 0, /* not from tr/// */
18959 invlist,
18960 &swash_init_flags);
18961 (void)av_store(av, 1, sw);
18962 }
18963 }
18964 }
18965
18966 /* If requested, return a printable version of what this swash matches */
18967 if (listsvp) {
18968 SV* matches_string = NULL;
18969
18970 /* The swash should be used, if possible, to get the data, as it
18971 * contains the resolved data. But this function can be called at
18972 * compile-time, before everything gets resolved, in which case we
18973 * return the currently best available information, which is the string
18974 * that will eventually be used to do that resolving, 'si' */
18975 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18976 && (si && si != &PL_sv_undef))
18977 {
18978 /* Here, we only have 'si' (and possibly some passed-in data in
18979 * 'invlist', which is handled below) If the caller only wants
18980 * 'si', use that. */
18981 if (! output_invlist) {
18982 matches_string = newSVsv(si);
18983 }
18984 else {
18985 /* But if the caller wants an inversion list of the node, we
18986 * need to parse 'si' and place as much as possible in the
18987 * desired output inversion list, making 'matches_string' only
18988 * contain the currently unresolvable things */
18989 const char *si_string = SvPVX(si);
18990 STRLEN remaining = SvCUR(si);
18991 UV prev_cp = 0;
18992 U8 count = 0;
18993
18994 /* Ignore everything before the first new-line */
18995 while (*si_string != '\n' && remaining > 0) {
18996 si_string++;
18997 remaining--;
18998 }
18999 assert(remaining > 0);
19000
19001 si_string++;
19002 remaining--;
19003
19004 while (remaining > 0) {
19005
19006 /* The data consists of just strings defining user-defined
19007 * property names, but in prior incarnations, and perhaps
19008 * somehow from pluggable regex engines, it could still
19009 * hold hex code point definitions. Each component of a
19010 * range would be separated by a tab, and each range by a
19011 * new-line. If these are found, instead add them to the
19012 * inversion list */
19013 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
19014 |PERL_SCAN_SILENT_NON_PORTABLE;
19015 STRLEN len = remaining;
19016 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19017
19018 /* If the hex decode routine found something, it should go
19019 * up to the next \n */
19020 if ( *(si_string + len) == '\n') {
19021 if (count) { /* 2nd code point on line */
19022 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19023 }
19024 else {
19025 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19026 }
19027 count = 0;
19028 goto prepare_for_next_iteration;
19029 }
19030
19031 /* If the hex decode was instead for the lower range limit,
19032 * save it, and go parse the upper range limit */
19033 if (*(si_string + len) == '\t') {
19034 assert(count == 0);
19035
19036 prev_cp = cp;
19037 count = 1;
19038 prepare_for_next_iteration:
19039 si_string += len + 1;
19040 remaining -= len + 1;
19041 continue;
19042 }
19043
19044 /* Here, didn't find a legal hex number. Just add it from
19045 * here to the next \n */
19046
19047 remaining -= len;
19048 while (*(si_string + len) != '\n' && remaining > 0) {
19049 remaining--;
19050 len++;
19051 }
19052 if (*(si_string + len) == '\n') {
19053 len++;
19054 remaining--;
19055 }
19056 if (matches_string) {
19057 sv_catpvn(matches_string, si_string, len - 1);
19058 }
19059 else {
19060 matches_string = newSVpvn(si_string, len - 1);
19061 }
19062 si_string += len;
19063 sv_catpvs(matches_string, " ");
19064 } /* end of loop through the text */
19065
19066 assert(matches_string);
19067 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
19068 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19069 }
19070 } /* end of has an 'si' but no swash */
19071 }
19072
19073 /* If we have a swash in place, its equivalent inversion list was above
19074 * placed into 'invlist'. If not, this variable may contain a stored
19075 * inversion list which is information beyond what is in 'si' */
19076 if (invlist) {
19077
19078 /* Again, if the caller doesn't want the output inversion list, put
19079 * everything in 'matches-string' */
19080 if (! output_invlist) {
19081 if ( ! matches_string) {
19082 matches_string = newSVpvs("\n");
19083 }
19084 sv_catsv(matches_string, invlist_contents(invlist,
19085 TRUE /* traditional style */
19086 ));
19087 }
19088 else if (! *output_invlist) {
19089 *output_invlist = invlist_clone(invlist, NULL);
19090 }
19091 else {
19092 _invlist_union(*output_invlist, invlist, output_invlist);
19093 }
19094 }
19095
19096 *listsvp = matches_string;
19097 }
19098
19099 return sw;
19100}
19101#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19102
19103/* reg_skipcomment()
19104
19105 Absorbs an /x style # comment from the input stream,
19106 returning a pointer to the first character beyond the comment, or if the
19107 comment terminates the pattern without anything following it, this returns
19108 one past the final character of the pattern (in other words, RExC_end) and
19109 sets the REG_RUN_ON_COMMENT_SEEN flag.
19110
19111 Note it's the callers responsibility to ensure that we are
19112 actually in /x mode
19113
19114*/
19115
19116PERL_STATIC_INLINE char*
19117S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19118{
19119 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19120
19121 assert(*p == '#');
19122
19123 while (p < RExC_end) {
19124 if (*(++p) == '\n') {
19125 return p+1;
19126 }
19127 }
19128
19129 /* we ran off the end of the pattern without ending the comment, so we have
19130 * to add an \n when wrapping */
19131 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19132 return p;
19133}
19134
19135STATIC void
19136S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19137 char ** p,
19138 const bool force_to_xmod
19139 )
19140{
19141 /* If the text at the current parse position '*p' is a '(?#...)' comment,
19142 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19143 * is /x whitespace, advance '*p' so that on exit it points to the first
19144 * byte past all such white space and comments */
19145
19146 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19147
19148 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19149
19150 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19151
19152 for (;;) {
19153 if (RExC_end - (*p) >= 3
19154 && *(*p) == '('
19155 && *(*p + 1) == '?'
19156 && *(*p + 2) == '#')
19157 {
19158 while (*(*p) != ')') {
19159 if ((*p) == RExC_end)
19160 FAIL("Sequence (?#... not terminated");
19161 (*p)++;
19162 }
19163 (*p)++;
19164 continue;
19165 }
19166
19167 if (use_xmod) {
19168 const char * save_p = *p;
19169 while ((*p) < RExC_end) {
19170 STRLEN len;
19171 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19172 (*p) += len;
19173 }
19174 else if (*(*p) == '#') {
19175 (*p) = reg_skipcomment(pRExC_state, (*p));
19176 }
19177 else {
19178 break;
19179 }
19180 }
19181 if (*p != save_p) {
19182 continue;
19183 }
19184 }
19185
19186 break;
19187 }
19188
19189 return;
19190}
19191
19192/* nextchar()
19193
19194 Advances the parse position by one byte, unless that byte is the beginning
19195 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
19196 those two cases, the parse position is advanced beyond all such comments and
19197 white space.
19198
19199 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19200*/
19201
19202STATIC void
19203S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19204{
19205 PERL_ARGS_ASSERT_NEXTCHAR;
19206
19207 if (RExC_parse < RExC_end) {
19208 assert( ! UTF
19209 || UTF8_IS_INVARIANT(*RExC_parse)
19210 || UTF8_IS_START(*RExC_parse));
19211
19212 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
19213
19214 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19215 FALSE /* Don't force /x */ );
19216 }
19217}
19218
19219STATIC void
19220S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19221{
19222 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19223
19224 RExC_size += size;
19225}
19226
19227STATIC regnode_offset
19228S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19229{
19230 /* Allocate a regnode for 'op', with 'extra_size' extra space. In pass1,
19231 * it aligns and increments RExC_size; in pass2, RExC_emit
19232 *
19233 * It returns the renode's offset into the regex engine program (meaningful
19234 * only in pass2 */
19235
19236 const regnode_offset ret = RExC_emit;
19237
19238 GET_RE_DEBUG_FLAGS_DECL;
19239
19240 PERL_ARGS_ASSERT_REGNODE_GUTS;
19241
19242 assert(extra_size >= regarglen[op]);
19243
19244 if (SIZE_ONLY) {
19245 SIZE_ALIGN(RExC_size);
19246 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19247 return(ret);
19248 }
19249 if (REGNODE_p(RExC_emit) >= RExC_emit_bound)
19250 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
19251 op, (void*)REGNODE_p(RExC_emit), (void*)RExC_emit_bound);
19252
19253 NODE_ALIGN_FILL(REGNODE_p(ret));
19254#ifndef RE_TRACK_PATTERN_OFFSETS
19255 PERL_UNUSED_ARG(name);
19256#else
19257 if (RExC_offsets) { /* MJD */
19258 MJD_OFFSET_DEBUG(
19259 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19260 name, __LINE__,
19261 PL_reg_name[op],
19262 (UV)(RExC_emit) > RExC_offsets[0]
19263 ? "Overwriting end of array!\n" : "OK",
19264 (UV)(RExC_emit),
19265 (UV)(RExC_parse - RExC_start),
19266 (UV)RExC_offsets[0]));
19267 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19268 }
19269#endif
19270 return(ret);
19271}
19272
19273/*
19274- reg_node - emit a node
19275*/
19276STATIC regnode_offset /* Location. */
19277S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19278{
19279 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19280
19281 PERL_ARGS_ASSERT_REG_NODE;
19282
19283 assert(regarglen[op] == 0);
19284
19285 if (PASS2) {
19286 regnode_offset ptr = ret;
19287 FILL_ADVANCE_NODE(ptr, op);
19288 RExC_emit = ptr;
19289 }
19290 return(ret);
19291}
19292
19293/*
19294- reganode - emit a node with an argument
19295*/
19296STATIC regnode_offset /* Location. */
19297S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19298{
19299 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19300
19301 PERL_ARGS_ASSERT_REGANODE;
19302
19303 /* ANYOF are special cased to allow non-length 1 args */
19304 assert(regarglen[op] == 1 || PL_regkind[op] == ANYOF);
19305
19306 if (PASS2) {
19307 regnode_offset ptr = ret;
19308 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19309 RExC_emit = ptr;
19310 }
19311 return(ret);
19312}
19313
19314STATIC regnode_offset
19315S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19316{
19317 /* emit a node with U32 and I32 arguments */
19318
19319 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19320
19321 PERL_ARGS_ASSERT_REG2LANODE;
19322
19323 assert(regarglen[op] == 2);
19324
19325 if (PASS2) {
19326 regnode_offset ptr = ret;
19327 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19328 RExC_emit = ptr;
19329 }
19330 return(ret);
19331}
19332
19333/*
19334- reginsert - insert an operator in front of already-emitted operand
19335*
19336* That means that on exit 'operand' is the offset of the newly inserted
19337* operator, and the original operand has been relocated.
19338*
19339* IMPORTANT NOTE - it is the *callers* responsibility to correctly
19340* set up NEXT_OFF() of the inserted node if needed. Something like this:
19341*
19342* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19343* if (PASS2)
19344* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19345*
19346* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19347*/
19348STATIC void
19349S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19350 const regnode_offset operand, const U32 depth)
19351{
19352 regnode *src;
19353 regnode *dst;
19354 regnode *place;
19355 const int offset = regarglen[(U8)op];
19356 const int size = NODE_STEP_REGNODE + offset;
19357 GET_RE_DEBUG_FLAGS_DECL;
19358
19359 PERL_ARGS_ASSERT_REGINSERT;
19360 PERL_UNUSED_CONTEXT;
19361 PERL_UNUSED_ARG(depth);
19362/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19363 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19364 if (SIZE_ONLY) {
19365 RExC_size += size;
19366 return;
19367 }
19368 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19369 studying. If this is wrong then we need to adjust RExC_recurse
19370 below like we do with RExC_open_parens/RExC_close_parens. */
19371 src = REGNODE_p(RExC_emit);
19372 RExC_emit += size;
19373 dst = REGNODE_p(RExC_emit);
19374 if (RExC_open_parens) {
19375 int paren;
19376 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19377 /* remember that RExC_npar is rex->nparens + 1,
19378 * iow it is 1 more than the number of parens seen in
19379 * the pattern so far. */
19380 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19381 /* note, RExC_open_parens[0] is the start of the
19382 * regex, it can't move. RExC_close_parens[0] is the end
19383 * of the regex, it *can* move. */
19384 if ( paren && RExC_open_parens[paren] >= operand ) {
19385 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19386 RExC_open_parens[paren] += size;
19387 } else {
19388 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19389 }
19390 if ( RExC_close_parens[paren] >= operand ) {
19391 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19392 RExC_close_parens[paren] += size;
19393 } else {
19394 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19395 }
19396 }
19397 }
19398 if (RExC_end_op)
19399 RExC_end_op += size;
19400
19401 while (src > REGNODE_p(operand)) {
19402 StructCopy(--src, --dst, regnode);
19403#ifdef RE_TRACK_PATTERN_OFFSETS
19404 if (RExC_offsets) { /* MJD 20010112 */
19405 MJD_OFFSET_DEBUG(
19406 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19407 "reginsert",
19408 __LINE__,
19409 PL_reg_name[op],
19410 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19411 ? "Overwriting end of array!\n" : "OK",
19412 (UV)REGNODE_OFFSET(src),
19413 (UV)REGNODE_OFFSET(dst),
19414 (UV)RExC_offsets[0]));
19415 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19416 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19417 }
19418#endif
19419 }
19420
19421 place = REGNODE_p(operand); /* Op node, where operand used to be. */
19422#ifdef RE_TRACK_PATTERN_OFFSETS
19423 if (RExC_offsets) { /* MJD */
19424 MJD_OFFSET_DEBUG(
19425 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19426 "reginsert",
19427 __LINE__,
19428 PL_reg_name[op],
19429 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19430 ? "Overwriting end of array!\n" : "OK",
19431 (UV)REGNODE_OFFSET(place),
19432 (UV)(RExC_parse - RExC_start),
19433 (UV)RExC_offsets[0]));
19434 Set_Node_Offset(place, RExC_parse);
19435 Set_Node_Length(place, 1);
19436 }
19437#endif
19438 src = NEXTOPER(place);
19439 FLAGS(place) = 0;
19440 FILL_NODE(operand, op);
19441
19442 /* Zero out any arguments in the new node */
19443 Zero(src, offset, regnode);
19444}
19445
19446/*
19447- regtail - set the next-pointer at the end of a node chain of p to val.
19448- SEE ALSO: regtail_study
19449*/
19450STATIC void
19451S_regtail(pTHX_ RExC_state_t * pRExC_state,
19452 const regnode_offset p,
19453 const regnode_offset val,
19454 const U32 depth)
19455{
19456 regnode_offset scan;
19457 GET_RE_DEBUG_FLAGS_DECL;
19458
19459 PERL_ARGS_ASSERT_REGTAIL;
19460#ifndef DEBUGGING
19461 PERL_UNUSED_ARG(depth);
19462#endif
19463
19464 if (SIZE_ONLY)
19465 return;
19466
19467 /* Find last node. */
19468 scan = (regnode_offset) p;
19469 for (;;) {
19470 regnode * const temp = regnext(REGNODE_p(scan));
19471 DEBUG_PARSE_r({
19472 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19473 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19474 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
19475 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)),
19476 (temp == NULL ? "->" : ""),
19477 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19478 );
19479 });
19480 if (temp == NULL)
19481 break;
19482 scan = REGNODE_OFFSET(temp);
19483 }
19484
19485 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19486 ARG_SET(REGNODE_p(scan), val - scan);
19487 }
19488 else {
19489 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19490 }
19491}
19492
19493#ifdef DEBUGGING
19494/*
19495- regtail_study - set the next-pointer at the end of a node chain of p to val.
19496- Look for optimizable sequences at the same time.
19497- currently only looks for EXACT chains.
19498
19499This is experimental code. The idea is to use this routine to perform
19500in place optimizations on branches and groups as they are constructed,
19501with the long term intention of removing optimization from study_chunk so
19502that it is purely analytical.
19503
19504Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19505to control which is which.
19506
19507*/
19508/* TODO: All four parms should be const */
19509
19510STATIC U8
19511S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19512 const regnode_offset val, U32 depth)
19513{
19514 regnode_offset scan;
19515 U8 exact = PSEUDO;
19516#ifdef EXPERIMENTAL_INPLACESCAN
19517 I32 min = 0;
19518#endif
19519 GET_RE_DEBUG_FLAGS_DECL;
19520
19521 PERL_ARGS_ASSERT_REGTAIL_STUDY;
19522
19523
19524 if (SIZE_ONLY)
19525 return exact;
19526
19527 /* Find last node. */
19528
19529 scan = p;
19530 for (;;) {
19531 regnode * const temp = regnext(REGNODE_p(scan));
19532#ifdef EXPERIMENTAL_INPLACESCAN
19533 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19534 bool unfolded_multi_char; /* Unexamined in this routine */
19535 if (join_exact(pRExC_state, scan, &min,
19536 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19537 return EXACT;
19538 }
19539#endif
19540 if ( exact ) {
19541 switch (OP(REGNODE_p(scan))) {
19542 case EXACT:
19543 case EXACTL:
19544 case EXACTF:
19545 case EXACTFAA_NO_TRIE:
19546 case EXACTFAA:
19547 case EXACTFU:
19548 case EXACTFLU8:
19549 case EXACTFU_SS:
19550 case EXACTFL:
19551 if( exact == PSEUDO )
19552 exact= OP(REGNODE_p(scan));
19553 else if ( exact != OP(REGNODE_p(scan)) )
19554 exact= 0;
19555 case NOTHING:
19556 break;
19557 default:
19558 exact= 0;
19559 }
19560 }
19561 DEBUG_PARSE_r({
19562 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
19563 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19564 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
19565 SvPV_nolen_const(RExC_mysv),
19566 REG_NODE_NUM(REGNODE_p(scan)),
19567 PL_reg_name[exact]);
19568 });
19569 if (temp == NULL)
19570 break;
19571 scan = REGNODE_OFFSET(temp);
19572 }
19573 DEBUG_PARSE_r({
19574 DEBUG_PARSE_MSG("");
19575 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
19576 Perl_re_printf( aTHX_
19577 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19578 SvPV_nolen_const(RExC_mysv),
19579 (IV)REG_NODE_NUM(REGNODE_p(val)),
19580 (IV)(val - scan)
19581 );
19582 });
19583 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19584 ARG_SET(REGNODE_p(scan), val - scan);
19585 }
19586 else {
19587 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19588 }
19589
19590 return exact;
19591}
19592#endif
19593
19594STATIC SV*
19595S_get_ANYOFM_contents(pTHX_ const regnode * n) {
19596
19597 /* Returns an inversion list of all the code points matched by the ANYOFM
19598 * node 'n' */
19599
19600 SV * cp_list = _new_invlist(-1);
19601 const U8 lowest = (U8) ARG(n);
19602 unsigned int i;
19603 U8 count = 0;
19604 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
19605
19606 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
19607
19608 /* Starting with the lowest code point, any code point that ANDed with the
19609 * mask yields the lowest code point is in the set */
19610 for (i = lowest; i <= 0xFF; i++) {
19611 if ((i & FLAGS(n)) == ARG(n)) {
19612 cp_list = add_cp_to_invlist(cp_list, i);
19613 count++;
19614
19615 /* We know how many code points (a power of two) that are in the
19616 * set. No use looking once we've got that number */
19617 if (count >= needed) break;
19618 }
19619 }
19620
19621 return cp_list;
19622}
19623
19624/*
19625 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19626 */
19627#ifdef DEBUGGING
19628
19629static void
19630S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19631{
19632 int bit;
19633 int set=0;
19634
19635 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19636
19637 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19638 if (flags & (1<<bit)) {
19639 if (!set++ && lead)
19640 Perl_re_printf( aTHX_ "%s", lead);
19641 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
19642 }
19643 }
19644 if (lead) {
19645 if (set)
19646 Perl_re_printf( aTHX_ "\n");
19647 else
19648 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
19649 }
19650}
19651
19652static void
19653S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19654{
19655 int bit;
19656 int set=0;
19657 regex_charset cs;
19658
19659 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19660
19661 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19662 if (flags & (1<<bit)) {
19663 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
19664 continue;
19665 }
19666 if (!set++ && lead)
19667 Perl_re_printf( aTHX_ "%s", lead);
19668 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
19669 }
19670 }
19671 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19672 if (!set++ && lead) {
19673 Perl_re_printf( aTHX_ "%s", lead);
19674 }
19675 switch (cs) {
19676 case REGEX_UNICODE_CHARSET:
19677 Perl_re_printf( aTHX_ "UNICODE");
19678 break;
19679 case REGEX_LOCALE_CHARSET:
19680 Perl_re_printf( aTHX_ "LOCALE");
19681 break;
19682 case REGEX_ASCII_RESTRICTED_CHARSET:
19683 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
19684 break;
19685 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19686 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
19687 break;
19688 default:
19689 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
19690 break;
19691 }
19692 }
19693 if (lead) {
19694 if (set)
19695 Perl_re_printf( aTHX_ "\n");
19696 else
19697 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
19698 }
19699}
19700#endif
19701
19702void
19703Perl_regdump(pTHX_ const regexp *r)
19704{
19705#ifdef DEBUGGING
19706 int i;
19707 SV * const sv = sv_newmortal();
19708 SV *dsv= sv_newmortal();
19709 RXi_GET_DECL(r, ri);
19710 GET_RE_DEBUG_FLAGS_DECL;
19711
19712 PERL_ARGS_ASSERT_REGDUMP;
19713
19714 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19715
19716 /* Header fields of interest. */
19717 for (i = 0; i < 2; i++) {
19718 if (r->substrs->data[i].substr) {
19719 RE_PV_QUOTED_DECL(s, 0, dsv,
19720 SvPVX_const(r->substrs->data[i].substr),
19721 RE_SV_DUMPLEN(r->substrs->data[i].substr),
19722 PL_dump_re_max_len);
19723 Perl_re_printf( aTHX_
19724 "%s %s%s at %" IVdf "..%" UVuf " ",
19725 i ? "floating" : "anchored",
19726 s,
19727 RE_SV_TAIL(r->substrs->data[i].substr),
19728 (IV)r->substrs->data[i].min_offset,
19729 (UV)r->substrs->data[i].max_offset);
19730 }
19731 else if (r->substrs->data[i].utf8_substr) {
19732 RE_PV_QUOTED_DECL(s, 1, dsv,
19733 SvPVX_const(r->substrs->data[i].utf8_substr),
19734 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19735 30);
19736 Perl_re_printf( aTHX_
19737 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19738 i ? "floating" : "anchored",
19739 s,
19740 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19741 (IV)r->substrs->data[i].min_offset,
19742 (UV)r->substrs->data[i].max_offset);
19743 }
19744 }
19745
19746 if (r->check_substr || r->check_utf8)
19747 Perl_re_printf( aTHX_
19748 (const char *)
19749 ( r->check_substr == r->substrs->data[1].substr
19750 && r->check_utf8 == r->substrs->data[1].utf8_substr
19751 ? "(checking floating" : "(checking anchored"));
19752 if (r->intflags & PREGf_NOSCAN)
19753 Perl_re_printf( aTHX_ " noscan");
19754 if (r->extflags & RXf_CHECK_ALL)
19755 Perl_re_printf( aTHX_ " isall");
19756 if (r->check_substr || r->check_utf8)
19757 Perl_re_printf( aTHX_ ") ");
19758
19759 if (ri->regstclass) {
19760 regprop(r, sv, ri->regstclass, NULL, NULL);
19761 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
19762 }
19763 if (r->intflags & PREGf_ANCH) {
19764 Perl_re_printf( aTHX_ "anchored");
19765 if (r->intflags & PREGf_ANCH_MBOL)
19766 Perl_re_printf( aTHX_ "(MBOL)");
19767 if (r->intflags & PREGf_ANCH_SBOL)
19768 Perl_re_printf( aTHX_ "(SBOL)");
19769 if (r->intflags & PREGf_ANCH_GPOS)
19770 Perl_re_printf( aTHX_ "(GPOS)");
19771 Perl_re_printf( aTHX_ " ");
19772 }
19773 if (r->intflags & PREGf_GPOS_SEEN)
19774 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
19775 if (r->intflags & PREGf_SKIP)
19776 Perl_re_printf( aTHX_ "plus ");
19777 if (r->intflags & PREGf_IMPLICIT)
19778 Perl_re_printf( aTHX_ "implicit ");
19779 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
19780 if (r->extflags & RXf_EVAL_SEEN)
19781 Perl_re_printf( aTHX_ "with eval ");
19782 Perl_re_printf( aTHX_ "\n");
19783 DEBUG_FLAGS_r({
19784 regdump_extflags("r->extflags: ", r->extflags);
19785 regdump_intflags("r->intflags: ", r->intflags);
19786 });
19787#else
19788 PERL_ARGS_ASSERT_REGDUMP;
19789 PERL_UNUSED_CONTEXT;
19790 PERL_UNUSED_ARG(r);
19791#endif /* DEBUGGING */
19792}
19793
19794/* Should be synchronized with ANYOF_ #defines in regcomp.h */
19795#ifdef DEBUGGING
19796
19797# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
19798 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
19799 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
19800 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
19801 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
19802 || _CC_VERTSPACE != 15
19803# error Need to adjust order of anyofs[]
19804# endif
19805static const char * const anyofs[] = {
19806 "\\w",
19807 "\\W",
19808 "\\d",
19809 "\\D",
19810 "[:alpha:]",
19811 "[:^alpha:]",
19812 "[:lower:]",
19813 "[:^lower:]",
19814 "[:upper:]",
19815 "[:^upper:]",
19816 "[:punct:]",
19817 "[:^punct:]",
19818 "[:print:]",
19819 "[:^print:]",
19820 "[:alnum:]",
19821 "[:^alnum:]",
19822 "[:graph:]",
19823 "[:^graph:]",
19824 "[:cased:]",
19825 "[:^cased:]",
19826 "\\s",
19827 "\\S",
19828 "[:blank:]",
19829 "[:^blank:]",
19830 "[:xdigit:]",
19831 "[:^xdigit:]",
19832 "[:cntrl:]",
19833 "[:^cntrl:]",
19834 "[:ascii:]",
19835 "[:^ascii:]",
19836 "\\v",
19837 "\\V"
19838};
19839#endif
19840
19841/*
19842- regprop - printable representation of opcode, with run time support
19843*/
19844
19845void
19846Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
19847{
19848#ifdef DEBUGGING
19849 int k;
19850 RXi_GET_DECL(prog, progi);
19851 GET_RE_DEBUG_FLAGS_DECL;
19852
19853 PERL_ARGS_ASSERT_REGPROP;
19854
19855 SvPVCLEAR(sv);
19856
19857 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
19858 /* It would be nice to FAIL() here, but this may be called from
19859 regexec.c, and it would be hard to supply pRExC_state. */
19860 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19861 (int)OP(o), (int)REGNODE_MAX);
19862 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
19863
19864 k = PL_regkind[OP(o)];
19865
19866 if (k == EXACT) {
19867 sv_catpvs(sv, " ");
19868 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
19869 * is a crude hack but it may be the best for now since
19870 * we have no flag "this EXACTish node was UTF-8"
19871 * --jhi */
19872 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
19873 PL_colors[0], PL_colors[1],
19874 PERL_PV_ESCAPE_UNI_DETECT |
19875 PERL_PV_ESCAPE_NONASCII |
19876 PERL_PV_PRETTY_ELLIPSES |
19877 PERL_PV_PRETTY_LTGT |
19878 PERL_PV_PRETTY_NOCLEAR
19879 );
19880 } else if (k == TRIE) {
19881 /* print the details of the trie in dumpuntil instead, as
19882 * progi->data isn't available here */
19883 const char op = OP(o);
19884 const U32 n = ARG(o);
19885 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
19886 (reg_ac_data *)progi->data->data[n] :
19887 NULL;
19888 const reg_trie_data * const trie
19889 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
19890
19891 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
19892 DEBUG_TRIE_COMPILE_r({
19893 if (trie->jump)
19894 sv_catpvs(sv, "(JUMP)");
19895 Perl_sv_catpvf(aTHX_ sv,
19896 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
19897 (UV)trie->startstate,
19898 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
19899 (UV)trie->wordcount,
19900 (UV)trie->minlen,
19901 (UV)trie->maxlen,
19902 (UV)TRIE_CHARCOUNT(trie),
19903 (UV)trie->uniquecharcount
19904 );
19905 });
19906 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
19907 sv_catpvs(sv, "[");
19908 (void) put_charclass_bitmap_innards(sv,
19909 ((IS_ANYOF_TRIE(op))
19910 ? ANYOF_BITMAP(o)
19911 : TRIE_BITMAP(trie)),
19912 NULL,
19913 NULL,
19914 NULL,
19915 FALSE
19916 );
19917 sv_catpvs(sv, "]");
19918 }
19919 } else if (k == CURLY) {
19920 U32 lo = ARG1(o), hi = ARG2(o);
19921 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
19922 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
19923 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
19924 if (hi == REG_INFTY)
19925 sv_catpvs(sv, "INFTY");
19926 else
19927 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
19928 sv_catpvs(sv, "}");
19929 }
19930 else if (k == WHILEM && o->flags) /* Ordinal/of */
19931 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
19932 else if (k == REF || k == OPEN || k == CLOSE
19933 || k == GROUPP || OP(o)==ACCEPT)
19934 {
19935 AV *name_list= NULL;
19936 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
19937 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
19938 if ( RXp_PAREN_NAMES(prog) ) {
19939 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19940 } else if ( pRExC_state ) {
19941 name_list= RExC_paren_name_list;
19942 }
19943 if (name_list) {
19944 if ( k != REF || (OP(o) < NREF)) {
19945 SV **name= av_fetch(name_list, parno, 0 );
19946 if (name)
19947 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19948 }
19949 else {
19950 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
19951 I32 *nums=(I32*)SvPVX(sv_dat);
19952 SV **name= av_fetch(name_list, nums[0], 0 );
19953 I32 n;
19954 if (name) {
19955 for ( n=0; n<SvIVX(sv_dat); n++ ) {
19956 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
19957 (n ? "," : ""), (IV)nums[n]);
19958 }
19959 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19960 }
19961 }
19962 }
19963 if ( k == REF && reginfo) {
19964 U32 n = ARG(o); /* which paren pair */
19965 I32 ln = prog->offs[n].start;
19966 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
19967 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
19968 else if (ln == prog->offs[n].end)
19969 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19970 else {
19971 const char *s = reginfo->strbeg + ln;
19972 Perl_sv_catpvf(aTHX_ sv, ": ");
19973 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19974 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19975 }
19976 }
19977 } else if (k == GOSUB) {
19978 AV *name_list= NULL;
19979 if ( RXp_PAREN_NAMES(prog) ) {
19980 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19981 } else if ( pRExC_state ) {
19982 name_list= RExC_paren_name_list;
19983 }
19984
19985 /* Paren and offset */
19986 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19987 (int)((o + (int)ARG2L(o)) - progi->program) );
19988 if (name_list) {
19989 SV **name= av_fetch(name_list, ARG(o), 0 );
19990 if (name)
19991 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19992 }
19993 }
19994 else if (k == LOGICAL)
19995 /* 2: embedded, otherwise 1 */
19996 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19997 else if (k == ANYOF) {
19998 const U8 flags = ANYOF_FLAGS(o);
19999 bool do_sep = FALSE; /* Do we need to separate various components of
20000 the output? */
20001 /* Set if there is still an unresolved user-defined property */
20002 SV *unresolved = NULL;
20003
20004 /* Things that are ignored except when the runtime locale is UTF-8 */
20005 SV *only_utf8_locale_invlist = NULL;
20006
20007 /* Code points that don't fit in the bitmap */
20008 SV *nonbitmap_invlist = NULL;
20009
20010 /* And things that aren't in the bitmap, but are small enough to be */
20011 SV* bitmap_range_not_in_bitmap = NULL;
20012
20013 const bool inverted = flags & ANYOF_INVERT;
20014
20015 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20016 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20017 sv_catpvs(sv, "{utf8-locale-reqd}");
20018 }
20019 if (flags & ANYOFL_FOLD) {
20020 sv_catpvs(sv, "{i}");
20021 }
20022 }
20023
20024 /* If there is stuff outside the bitmap, get it */
20025 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20026 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20027 &unresolved,
20028 &only_utf8_locale_invlist,
20029 &nonbitmap_invlist);
20030 /* The non-bitmap data may contain stuff that could fit in the
20031 * bitmap. This could come from a user-defined property being
20032 * finally resolved when this call was done; or much more likely
20033 * because there are matches that require UTF-8 to be valid, and so
20034 * aren't in the bitmap. This is teased apart later */
20035 _invlist_intersection(nonbitmap_invlist,
20036 PL_InBitmap,
20037 &bitmap_range_not_in_bitmap);
20038 /* Leave just the things that don't fit into the bitmap */
20039 _invlist_subtract(nonbitmap_invlist,
20040 PL_InBitmap,
20041 &nonbitmap_invlist);
20042 }
20043
20044 /* Obey this flag to add all above-the-bitmap code points */
20045 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20046 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20047 NUM_ANYOF_CODE_POINTS,
20048 UV_MAX);
20049 }
20050
20051 /* Ready to start outputting. First, the initial left bracket */
20052 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20053
20054 /* Then all the things that could fit in the bitmap */
20055 do_sep = put_charclass_bitmap_innards(sv,
20056 ANYOF_BITMAP(o),
20057 bitmap_range_not_in_bitmap,
20058 only_utf8_locale_invlist,
20059 o,
20060
20061 /* Can't try inverting for a
20062 * better display if there are
20063 * things that haven't been
20064 * resolved */
20065 unresolved != NULL);
20066 SvREFCNT_dec(bitmap_range_not_in_bitmap);
20067
20068 /* If there are user-defined properties which haven't been defined yet,
20069 * output them. If the result is not to be inverted, it is clearest to
20070 * output them in a separate [] from the bitmap range stuff. If the
20071 * result is to be complemented, we have to show everything in one [],
20072 * as the inversion applies to the whole thing. Use {braces} to
20073 * separate them from anything in the bitmap and anything above the
20074 * bitmap. */
20075 if (unresolved) {
20076 if (inverted) {
20077 if (! do_sep) { /* If didn't output anything in the bitmap */
20078 sv_catpvs(sv, "^");
20079 }
20080 sv_catpvs(sv, "{");
20081 }
20082 else if (do_sep) {
20083 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20084 }
20085 sv_catsv(sv, unresolved);
20086 if (inverted) {
20087 sv_catpvs(sv, "}");
20088 }
20089 do_sep = ! inverted;
20090 }
20091
20092 /* And, finally, add the above-the-bitmap stuff */
20093 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20094 SV* contents;
20095
20096 /* See if truncation size is overridden */
20097 const STRLEN dump_len = (PL_dump_re_max_len > 256)
20098 ? PL_dump_re_max_len
20099 : 256;
20100
20101 /* This is output in a separate [] */
20102 if (do_sep) {
20103 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20104 }
20105
20106 /* And, for easy of understanding, it is shown in the
20107 * uncomplemented form if possible. The one exception being if
20108 * there are unresolved items, where the inversion has to be
20109 * delayed until runtime */
20110 if (inverted && ! unresolved) {
20111 _invlist_invert(nonbitmap_invlist);
20112 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20113 }
20114
20115 contents = invlist_contents(nonbitmap_invlist,
20116 FALSE /* output suitable for catsv */
20117 );
20118
20119 /* If the output is shorter than the permissible maximum, just do it. */
20120 if (SvCUR(contents) <= dump_len) {
20121 sv_catsv(sv, contents);
20122 }
20123 else {
20124 const char * contents_string = SvPVX(contents);
20125 STRLEN i = dump_len;
20126
20127 /* Otherwise, start at the permissible max and work back to the
20128 * first break possibility */
20129 while (i > 0 && contents_string[i] != ' ') {
20130 i--;
20131 }
20132 if (i == 0) { /* Fail-safe. Use the max if we couldn't
20133 find a legal break */
20134 i = dump_len;
20135 }
20136
20137 sv_catpvn(sv, contents_string, i);
20138 sv_catpvs(sv, "...");
20139 }
20140
20141 SvREFCNT_dec_NN(contents);
20142 SvREFCNT_dec_NN(nonbitmap_invlist);
20143 }
20144
20145 /* And finally the matching, closing ']' */
20146 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20147
20148 SvREFCNT_dec(unresolved);
20149 }
20150 else if (k == ANYOFM) {
20151 SV * cp_list = get_ANYOFM_contents(o);
20152
20153 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20154 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20155 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20156
20157 SvREFCNT_dec(cp_list);
20158 }
20159 else if (k == POSIXD || k == NPOSIXD) {
20160 U8 index = FLAGS(o) * 2;
20161 if (index < C_ARRAY_LENGTH(anyofs)) {
20162 if (*anyofs[index] != '[') {
20163 sv_catpvs(sv, "[");
20164 }
20165 sv_catpv(sv, anyofs[index]);
20166 if (*anyofs[index] != '[') {
20167 sv_catpvs(sv, "]");
20168 }
20169 }
20170 else {
20171 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20172 }
20173 }
20174 else if (k == BOUND || k == NBOUND) {
20175 /* Must be synced with order of 'bound_type' in regcomp.h */
20176 const char * const bounds[] = {
20177 "", /* Traditional */
20178 "{gcb}",
20179 "{lb}",
20180 "{sb}",
20181 "{wb}"
20182 };
20183 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20184 sv_catpv(sv, bounds[FLAGS(o)]);
20185 }
20186 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
20187 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
20188 else if (OP(o) == SBOL)
20189 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20190
20191 /* add on the verb argument if there is one */
20192 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20193 if ( ARG(o) )
20194 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20195 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20196 else
20197 sv_catpvs(sv, ":NULL");
20198 }
20199#else
20200 PERL_UNUSED_CONTEXT;
20201 PERL_UNUSED_ARG(sv);
20202 PERL_UNUSED_ARG(o);
20203 PERL_UNUSED_ARG(prog);
20204 PERL_UNUSED_ARG(reginfo);
20205 PERL_UNUSED_ARG(pRExC_state);
20206#endif /* DEBUGGING */
20207}
20208
20209
20210
20211SV *
20212Perl_re_intuit_string(pTHX_ REGEXP * const r)
20213{ /* Assume that RE_INTUIT is set */
20214 struct regexp *const prog = ReANY(r);
20215 GET_RE_DEBUG_FLAGS_DECL;
20216
20217 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20218 PERL_UNUSED_CONTEXT;
20219
20220 DEBUG_COMPILE_r(
20221 {
20222 const char * const s = SvPV_nolen_const(RX_UTF8(r)
20223 ? prog->check_utf8 : prog->check_substr);
20224
20225 if (!PL_colorset) reginitcolors();
20226 Perl_re_printf( aTHX_
20227 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20228 PL_colors[4],
20229 RX_UTF8(r) ? "utf8 " : "",
20230 PL_colors[5], PL_colors[0],
20231 s,
20232 PL_colors[1],
20233 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20234 } );
20235
20236 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20237 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20238}
20239
20240/*
20241 pregfree()
20242
20243 handles refcounting and freeing the perl core regexp structure. When
20244 it is necessary to actually free the structure the first thing it
20245 does is call the 'free' method of the regexp_engine associated to
20246 the regexp, allowing the handling of the void *pprivate; member
20247 first. (This routine is not overridable by extensions, which is why
20248 the extensions free is called first.)
20249
20250 See regdupe and regdupe_internal if you change anything here.
20251*/
20252#ifndef PERL_IN_XSUB_RE
20253void
20254Perl_pregfree(pTHX_ REGEXP *r)
20255{
20256 SvREFCNT_dec(r);
20257}
20258
20259void
20260Perl_pregfree2(pTHX_ REGEXP *rx)
20261{
20262 struct regexp *const r = ReANY(rx);
20263 GET_RE_DEBUG_FLAGS_DECL;
20264
20265 PERL_ARGS_ASSERT_PREGFREE2;
20266
20267 if (r->mother_re) {
20268 ReREFCNT_dec(r->mother_re);
20269 } else {
20270 CALLREGFREE_PVT(rx); /* free the private data */
20271 SvREFCNT_dec(RXp_PAREN_NAMES(r));
20272 }
20273 if (r->substrs) {
20274 int i;
20275 for (i = 0; i < 2; i++) {
20276 SvREFCNT_dec(r->substrs->data[i].substr);
20277 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20278 }
20279 Safefree(r->substrs);
20280 }
20281 RX_MATCH_COPY_FREE(rx);
20282#ifdef PERL_ANY_COW
20283 SvREFCNT_dec(r->saved_copy);
20284#endif
20285 Safefree(r->offs);
20286 SvREFCNT_dec(r->qr_anoncv);
20287 if (r->recurse_locinput)
20288 Safefree(r->recurse_locinput);
20289}
20290
20291
20292/* reg_temp_copy()
20293
20294 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20295 except that dsv will be created if NULL.
20296
20297 This function is used in two main ways. First to implement
20298 $r = qr/....; $s = $$r;
20299
20300 Secondly, it is used as a hacky workaround to the structural issue of
20301 match results
20302 being stored in the regexp structure which is in turn stored in
20303 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20304 could be PL_curpm in multiple contexts, and could require multiple
20305 result sets being associated with the pattern simultaneously, such
20306 as when doing a recursive match with (??{$qr})
20307
20308 The solution is to make a lightweight copy of the regexp structure
20309 when a qr// is returned from the code executed by (??{$qr}) this
20310 lightweight copy doesn't actually own any of its data except for
20311 the starp/end and the actual regexp structure itself.
20312
20313*/
20314
20315
20316REGEXP *
20317Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20318{
20319 struct regexp *drx;
20320 struct regexp *const srx = ReANY(ssv);
20321 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20322
20323 PERL_ARGS_ASSERT_REG_TEMP_COPY;
20324
20325 if (!dsv)
20326 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20327 else {
20328 SvOK_off((SV *)dsv);
20329 if (islv) {
20330 /* For PVLVs, the head (sv_any) points to an XPVLV, while
20331 * the LV's xpvlenu_rx will point to a regexp body, which
20332 * we allocate here */
20333 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20334 assert(!SvPVX(dsv));
20335 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20336 temp->sv_any = NULL;
20337 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20338 SvREFCNT_dec_NN(temp);
20339 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20340 ing below will not set it. */
20341 SvCUR_set(dsv, SvCUR(ssv));
20342 }
20343 }
20344 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20345 sv_force_normal(sv) is called. */
20346 SvFAKE_on(dsv);
20347 drx = ReANY(dsv);
20348
20349 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20350 SvPV_set(dsv, RX_WRAPPED(ssv));
20351 /* We share the same string buffer as the original regexp, on which we
20352 hold a reference count, incremented when mother_re is set below.
20353 The string pointer is copied here, being part of the regexp struct.
20354 */
20355 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20356 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20357 if (!islv)
20358 SvLEN_set(dsv, 0);
20359 if (srx->offs) {
20360 const I32 npar = srx->nparens+1;
20361 Newx(drx->offs, npar, regexp_paren_pair);
20362 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20363 }
20364 if (srx->substrs) {
20365 int i;
20366 Newx(drx->substrs, 1, struct reg_substr_data);
20367 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20368
20369 for (i = 0; i < 2; i++) {
20370 SvREFCNT_inc_void(drx->substrs->data[i].substr);
20371 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20372 }
20373
20374 /* check_substr and check_utf8, if non-NULL, point to either their
20375 anchored or float namesakes, and don't hold a second reference. */
20376 }
20377 RX_MATCH_COPIED_off(dsv);
20378#ifdef PERL_ANY_COW
20379 drx->saved_copy = NULL;
20380#endif
20381 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20382 SvREFCNT_inc_void(drx->qr_anoncv);
20383 if (srx->recurse_locinput)
20384 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20385
20386 return dsv;
20387}
20388#endif
20389
20390
20391/* regfree_internal()
20392
20393 Free the private data in a regexp. This is overloadable by
20394 extensions. Perl takes care of the regexp structure in pregfree(),
20395 this covers the *pprivate pointer which technically perl doesn't
20396 know about, however of course we have to handle the
20397 regexp_internal structure when no extension is in use.
20398
20399 Note this is called before freeing anything in the regexp
20400 structure.
20401 */
20402
20403void
20404Perl_regfree_internal(pTHX_ REGEXP * const rx)
20405{
20406 struct regexp *const r = ReANY(rx);
20407 RXi_GET_DECL(r, ri);
20408 GET_RE_DEBUG_FLAGS_DECL;
20409
20410 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20411
20412 DEBUG_COMPILE_r({
20413 if (!PL_colorset)
20414 reginitcolors();
20415 {
20416 SV *dsv= sv_newmortal();
20417 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20418 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20419 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20420 PL_colors[4], PL_colors[5], s);
20421 }
20422 });
20423
20424#ifdef RE_TRACK_PATTERN_OFFSETS
20425 if (ri->u.offsets)
20426 Safefree(ri->u.offsets); /* 20010421 MJD */
20427#endif
20428 if (ri->code_blocks)
20429 S_free_codeblocks(aTHX_ ri->code_blocks);
20430
20431 if (ri->data) {
20432 int n = ri->data->count;
20433
20434 while (--n >= 0) {
20435 /* If you add a ->what type here, update the comment in regcomp.h */
20436 switch (ri->data->what[n]) {
20437 case 'a':
20438 case 'r':
20439 case 's':
20440 case 'S':
20441 case 'u':
20442 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20443 break;
20444 case 'f':
20445 Safefree(ri->data->data[n]);
20446 break;
20447 case 'l':
20448 case 'L':
20449 break;
20450 case 'T':
20451 { /* Aho Corasick add-on structure for a trie node.
20452 Used in stclass optimization only */
20453 U32 refcount;
20454 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20455#ifdef USE_ITHREADS
20456 dVAR;
20457#endif
20458 OP_REFCNT_LOCK;
20459 refcount = --aho->refcount;
20460 OP_REFCNT_UNLOCK;
20461 if ( !refcount ) {
20462 PerlMemShared_free(aho->states);
20463 PerlMemShared_free(aho->fail);
20464 /* do this last!!!! */
20465 PerlMemShared_free(ri->data->data[n]);
20466 /* we should only ever get called once, so
20467 * assert as much, and also guard the free
20468 * which /might/ happen twice. At the least
20469 * it will make code anlyzers happy and it
20470 * doesn't cost much. - Yves */
20471 assert(ri->regstclass);
20472 if (ri->regstclass) {
20473 PerlMemShared_free(ri->regstclass);
20474 ri->regstclass = 0;
20475 }
20476 }
20477 }
20478 break;
20479 case 't':
20480 {
20481 /* trie structure. */
20482 U32 refcount;
20483 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
20484#ifdef USE_ITHREADS
20485 dVAR;
20486#endif
20487 OP_REFCNT_LOCK;
20488 refcount = --trie->refcount;
20489 OP_REFCNT_UNLOCK;
20490 if ( !refcount ) {
20491 PerlMemShared_free(trie->charmap);
20492 PerlMemShared_free(trie->states);
20493 PerlMemShared_free(trie->trans);
20494 if (trie->bitmap)
20495 PerlMemShared_free(trie->bitmap);
20496 if (trie->jump)
20497 PerlMemShared_free(trie->jump);
20498 PerlMemShared_free(trie->wordinfo);
20499 /* do this last!!!! */
20500 PerlMemShared_free(ri->data->data[n]);
20501 }
20502 }
20503 break;
20504 default:
20505 Perl_croak(aTHX_ "panic: regfree data code '%c'",
20506 ri->data->what[n]);
20507 }
20508 }
20509 Safefree(ri->data->what);
20510 Safefree(ri->data);
20511 }
20512
20513 Safefree(ri);
20514}
20515
20516#define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
20517#define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
20518#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
20519
20520/*
20521 re_dup_guts - duplicate a regexp.
20522
20523 This routine is expected to clone a given regexp structure. It is only
20524 compiled under USE_ITHREADS.
20525
20526 After all of the core data stored in struct regexp is duplicated
20527 the regexp_engine.dupe method is used to copy any private data
20528 stored in the *pprivate pointer. This allows extensions to handle
20529 any duplication it needs to do.
20530
20531 See pregfree() and regfree_internal() if you change anything here.
20532*/
20533#if defined(USE_ITHREADS)
20534#ifndef PERL_IN_XSUB_RE
20535void
20536Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
20537{
20538 dVAR;
20539 I32 npar;
20540 const struct regexp *r = ReANY(sstr);
20541 struct regexp *ret = ReANY(dstr);
20542
20543 PERL_ARGS_ASSERT_RE_DUP_GUTS;
20544
20545 npar = r->nparens+1;
20546 Newx(ret->offs, npar, regexp_paren_pair);
20547 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
20548
20549 if (ret->substrs) {
20550 /* Do it this way to avoid reading from *r after the StructCopy().
20551 That way, if any of the sv_dup_inc()s dislodge *r from the L1
20552 cache, it doesn't matter. */
20553 int i;
20554 const bool anchored = r->check_substr
20555 ? r->check_substr == r->substrs->data[0].substr
20556 : r->check_utf8 == r->substrs->data[0].utf8_substr;
20557 Newx(ret->substrs, 1, struct reg_substr_data);
20558 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
20559
20560 for (i = 0; i < 2; i++) {
20561 ret->substrs->data[i].substr =
20562 sv_dup_inc(ret->substrs->data[i].substr, param);
20563 ret->substrs->data[i].utf8_substr =
20564 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
20565 }
20566
20567 /* check_substr and check_utf8, if non-NULL, point to either their
20568 anchored or float namesakes, and don't hold a second reference. */
20569
20570 if (ret->check_substr) {
20571 if (anchored) {
20572 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
20573
20574 ret->check_substr = ret->substrs->data[0].substr;
20575 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20576 } else {
20577 assert(r->check_substr == r->substrs->data[1].substr);
20578 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
20579
20580 ret->check_substr = ret->substrs->data[1].substr;
20581 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20582 }
20583 } else if (ret->check_utf8) {
20584 if (anchored) {
20585 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20586 } else {
20587 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20588 }
20589 }
20590 }
20591
20592 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
20593 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
20594 if (r->recurse_locinput)
20595 Newx(ret->recurse_locinput, r->nparens + 1, char *);
20596
20597 if (ret->pprivate)
20598 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
20599
20600 if (RX_MATCH_COPIED(dstr))
20601 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
20602 else
20603 ret->subbeg = NULL;
20604#ifdef PERL_ANY_COW
20605 ret->saved_copy = NULL;
20606#endif
20607
20608 /* Whether mother_re be set or no, we need to copy the string. We
20609 cannot refrain from copying it when the storage points directly to
20610 our mother regexp, because that's
20611 1: a buffer in a different thread
20612 2: something we no longer hold a reference on
20613 so we need to copy it locally. */
20614 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20615 ret->mother_re = NULL;
20616}
20617#endif /* PERL_IN_XSUB_RE */
20618
20619/*
20620 regdupe_internal()
20621
20622 This is the internal complement to regdupe() which is used to copy
20623 the structure pointed to by the *pprivate pointer in the regexp.
20624 This is the core version of the extension overridable cloning hook.
20625 The regexp structure being duplicated will be copied by perl prior
20626 to this and will be provided as the regexp *r argument, however
20627 with the /old/ structures pprivate pointer value. Thus this routine
20628 may override any copying normally done by perl.
20629
20630 It returns a pointer to the new regexp_internal structure.
20631*/
20632
20633void *
20634Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20635{
20636 dVAR;
20637 struct regexp *const r = ReANY(rx);
20638 regexp_internal *reti;
20639 int len;
20640 RXi_GET_DECL(r, ri);
20641
20642 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20643
20644 len = ProgLen(ri);
20645
20646 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20647 char, regexp_internal);
20648 Copy(ri->program, reti->program, len+1, regnode);
20649
20650
20651 if (ri->code_blocks) {
20652 int n;
20653 Newx(reti->code_blocks, 1, struct reg_code_blocks);
20654 Newx(reti->code_blocks->cb, ri->code_blocks->count,
20655 struct reg_code_block);
20656 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20657 ri->code_blocks->count, struct reg_code_block);
20658 for (n = 0; n < ri->code_blocks->count; n++)
20659 reti->code_blocks->cb[n].src_regex = (REGEXP*)
20660 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20661 reti->code_blocks->count = ri->code_blocks->count;
20662 reti->code_blocks->refcnt = 1;
20663 }
20664 else
20665 reti->code_blocks = NULL;
20666
20667 reti->regstclass = NULL;
20668
20669 if (ri->data) {
20670 struct reg_data *d;
20671 const int count = ri->data->count;
20672 int i;
20673
20674 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20675 char, struct reg_data);
20676 Newx(d->what, count, U8);
20677
20678 d->count = count;
20679 for (i = 0; i < count; i++) {
20680 d->what[i] = ri->data->what[i];
20681 switch (d->what[i]) {
20682 /* see also regcomp.h and regfree_internal() */
20683 case 'a': /* actually an AV, but the dup function is identical.
20684 values seem to be "plain sv's" generally. */
20685 case 'r': /* a compiled regex (but still just another SV) */
20686 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20687 this use case should go away, the code could have used
20688 'a' instead - see S_set_ANYOF_arg() for array contents. */
20689 case 'S': /* actually an SV, but the dup function is identical. */
20690 case 'u': /* actually an HV, but the dup function is identical.
20691 values are "plain sv's" */
20692 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20693 break;
20694 case 'f':
20695 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20696 * patterns which could start with several different things. Pre-TRIE
20697 * this was more important than it is now, however this still helps
20698 * in some places, for instance /x?a+/ might produce a SSC equivalent
20699 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20700 * in regexec.c
20701 */
20702 /* This is cheating. */
20703 Newx(d->data[i], 1, regnode_ssc);
20704 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20705 reti->regstclass = (regnode*)d->data[i];
20706 break;
20707 case 'T':
20708 /* AHO-CORASICK fail table */
20709 /* Trie stclasses are readonly and can thus be shared
20710 * without duplication. We free the stclass in pregfree
20711 * when the corresponding reg_ac_data struct is freed.
20712 */
20713 reti->regstclass= ri->regstclass;
20714 /* FALLTHROUGH */
20715 case 't':
20716 /* TRIE transition table */
20717 OP_REFCNT_LOCK;
20718 ((reg_trie_data*)ri->data->data[i])->refcount++;
20719 OP_REFCNT_UNLOCK;
20720 /* FALLTHROUGH */
20721 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20722 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20723 is not from another regexp */
20724 d->data[i] = ri->data->data[i];
20725 break;
20726 default:
20727 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20728 ri->data->what[i]);
20729 }
20730 }
20731
20732 reti->data = d;
20733 }
20734 else
20735 reti->data = NULL;
20736
20737 reti->name_list_idx = ri->name_list_idx;
20738
20739#ifdef RE_TRACK_PATTERN_OFFSETS
20740 if (ri->u.offsets) {
20741 Newx(reti->u.offsets, 2*len+1, U32);
20742 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
20743 }
20744#else
20745 SetProgLen(reti, len);
20746#endif
20747
20748 return (void*)reti;
20749}
20750
20751#endif /* USE_ITHREADS */
20752
20753#ifndef PERL_IN_XSUB_RE
20754
20755/*
20756 - regnext - dig the "next" pointer out of a node
20757 */
20758regnode *
20759Perl_regnext(pTHX_ regnode *p)
20760{
20761 I32 offset;
20762
20763 if (!p)
20764 return(NULL);
20765
20766 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
20767 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20768 (int)OP(p), (int)REGNODE_MAX);
20769 }
20770
20771 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
20772 if (offset == 0)
20773 return(NULL);
20774
20775 return(p+offset);
20776}
20777
20778#endif
20779
20780STATIC void
20781S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
20782{
20783 va_list args;
20784 STRLEN l1 = strlen(pat1);
20785 STRLEN l2 = strlen(pat2);
20786 char buf[512];
20787 SV *msv;
20788 const char *message;
20789
20790 PERL_ARGS_ASSERT_RE_CROAK2;
20791
20792 if (l1 > 510)
20793 l1 = 510;
20794 if (l1 + l2 > 510)
20795 l2 = 510 - l1;
20796 Copy(pat1, buf, l1 , char);
20797 Copy(pat2, buf + l1, l2 , char);
20798 buf[l1 + l2] = '\n';
20799 buf[l1 + l2 + 1] = '\0';
20800 va_start(args, pat2);
20801 msv = vmess(buf, &args);
20802 va_end(args);
20803 message = SvPV_const(msv, l1);
20804 if (l1 > 512)
20805 l1 = 512;
20806 Copy(message, buf, l1 , char);
20807 /* l1-1 to avoid \n */
20808 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
20809}
20810
20811/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
20812
20813#ifndef PERL_IN_XSUB_RE
20814void
20815Perl_save_re_context(pTHX)
20816{
20817 I32 nparens = -1;
20818 I32 i;
20819
20820 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
20821
20822 if (PL_curpm) {
20823 const REGEXP * const rx = PM_GETRE(PL_curpm);
20824 if (rx)
20825 nparens = RX_NPARENS(rx);
20826 }
20827
20828 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
20829 * that PL_curpm will be null, but that utf8.pm and the modules it
20830 * loads will only use $1..$3.
20831 * The t/porting/re_context.t test file checks this assumption.
20832 */
20833 if (nparens == -1)
20834 nparens = 3;
20835
20836 for (i = 1; i <= nparens; i++) {
20837 char digits[TYPE_CHARS(long)];
20838 const STRLEN len = my_snprintf(digits, sizeof(digits),
20839 "%lu", (long)i);
20840 GV *const *const gvp
20841 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
20842
20843 if (gvp) {
20844 GV * const gv = *gvp;
20845 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
20846 save_scalar(gv);
20847 }
20848 }
20849}
20850#endif
20851
20852#ifdef DEBUGGING
20853
20854STATIC void
20855S_put_code_point(pTHX_ SV *sv, UV c)
20856{
20857 PERL_ARGS_ASSERT_PUT_CODE_POINT;
20858
20859 if (c > 255) {
20860 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
20861 }
20862 else if (isPRINT(c)) {
20863 const char string = (char) c;
20864
20865 /* We use {phrase} as metanotation in the class, so also escape literal
20866 * braces */
20867 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
20868 sv_catpvs(sv, "\\");
20869 sv_catpvn(sv, &string, 1);
20870 }
20871 else if (isMNEMONIC_CNTRL(c)) {
20872 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
20873 }
20874 else {
20875 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
20876 }
20877}
20878
20879#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
20880
20881STATIC void
20882S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
20883{
20884 /* Appends to 'sv' a displayable version of the range of code points from
20885 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
20886 * that have them, when they occur at the beginning or end of the range.
20887 * It uses hex to output the remaining code points, unless 'allow_literals'
20888 * is true, in which case the printable ASCII ones are output as-is (though
20889 * some of these will be escaped by put_code_point()).
20890 *
20891 * NOTE: This is designed only for printing ranges of code points that fit
20892 * inside an ANYOF bitmap. Higher code points are simply suppressed
20893 */
20894
20895 const unsigned int min_range_count = 3;
20896
20897 assert(start <= end);
20898
20899 PERL_ARGS_ASSERT_PUT_RANGE;
20900
20901 while (start <= end) {
20902 UV this_end;
20903 const char * format;
20904
20905 if (end - start < min_range_count) {
20906
20907 /* Output chars individually when they occur in short ranges */
20908 for (; start <= end; start++) {
20909 put_code_point(sv, start);
20910 }
20911 break;
20912 }
20913
20914 /* If permitted by the input options, and there is a possibility that
20915 * this range contains a printable literal, look to see if there is
20916 * one. */
20917 if (allow_literals && start <= MAX_PRINT_A) {
20918
20919 /* If the character at the beginning of the range isn't an ASCII
20920 * printable, effectively split the range into two parts:
20921 * 1) the portion before the first such printable,
20922 * 2) the rest
20923 * and output them separately. */
20924 if (! isPRINT_A(start)) {
20925 UV temp_end = start + 1;
20926
20927 /* There is no point looking beyond the final possible
20928 * printable, in MAX_PRINT_A */
20929 UV max = MIN(end, MAX_PRINT_A);
20930
20931 while (temp_end <= max && ! isPRINT_A(temp_end)) {
20932 temp_end++;
20933 }
20934
20935 /* Here, temp_end points to one beyond the first printable if
20936 * found, or to one beyond 'max' if not. If none found, make
20937 * sure that we use the entire range */
20938 if (temp_end > MAX_PRINT_A) {
20939 temp_end = end + 1;
20940 }
20941
20942 /* Output the first part of the split range: the part that
20943 * doesn't have printables, with the parameter set to not look
20944 * for literals (otherwise we would infinitely recurse) */
20945 put_range(sv, start, temp_end - 1, FALSE);
20946
20947 /* The 2nd part of the range (if any) starts here. */
20948 start = temp_end;
20949
20950 /* We do a continue, instead of dropping down, because even if
20951 * the 2nd part is non-empty, it could be so short that we want
20952 * to output it as individual characters, as tested for at the
20953 * top of this loop. */
20954 continue;
20955 }
20956
20957 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
20958 * output a sub-range of just the digits or letters, then process
20959 * the remaining portion as usual. */
20960 if (isALPHANUMERIC_A(start)) {
20961 UV mask = (isDIGIT_A(start))
20962 ? _CC_DIGIT
20963 : isUPPER_A(start)
20964 ? _CC_UPPER
20965 : _CC_LOWER;
20966 UV temp_end = start + 1;
20967
20968 /* Find the end of the sub-range that includes just the
20969 * characters in the same class as the first character in it */
20970 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
20971 temp_end++;
20972 }
20973 temp_end--;
20974
20975 /* For short ranges, don't duplicate the code above to output
20976 * them; just call recursively */
20977 if (temp_end - start < min_range_count) {
20978 put_range(sv, start, temp_end, FALSE);
20979 }
20980 else { /* Output as a range */
20981 put_code_point(sv, start);
20982 sv_catpvs(sv, "-");
20983 put_code_point(sv, temp_end);
20984 }
20985 start = temp_end + 1;
20986 continue;
20987 }
20988
20989 /* We output any other printables as individual characters */
20990 if (isPUNCT_A(start) || isSPACE_A(start)) {
20991 while (start <= end && (isPUNCT_A(start)
20992 || isSPACE_A(start)))
20993 {
20994 put_code_point(sv, start);
20995 start++;
20996 }
20997 continue;
20998 }
20999 } /* End of looking for literals */
21000
21001 /* Here is not to output as a literal. Some control characters have
21002 * mnemonic names. Split off any of those at the beginning and end of
21003 * the range to print mnemonically. It isn't possible for many of
21004 * these to be in a row, so this won't overwhelm with output */
21005 if ( start <= end
21006 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21007 {
21008 while (isMNEMONIC_CNTRL(start) && start <= end) {
21009 put_code_point(sv, start);
21010 start++;
21011 }
21012
21013 /* If this didn't take care of the whole range ... */
21014 if (start <= end) {
21015
21016 /* Look backwards from the end to find the final non-mnemonic
21017 * */
21018 UV temp_end = end;
21019 while (isMNEMONIC_CNTRL(temp_end)) {
21020 temp_end--;
21021 }
21022
21023 /* And separately output the interior range that doesn't start
21024 * or end with mnemonics */
21025 put_range(sv, start, temp_end, FALSE);
21026
21027 /* Then output the mnemonic trailing controls */
21028 start = temp_end + 1;
21029 while (start <= end) {
21030 put_code_point(sv, start);
21031 start++;
21032 }
21033 break;
21034 }
21035 }
21036
21037 /* As a final resort, output the range or subrange as hex. */
21038
21039 this_end = (end < NUM_ANYOF_CODE_POINTS)
21040 ? end
21041 : NUM_ANYOF_CODE_POINTS - 1;
21042#if NUM_ANYOF_CODE_POINTS > 256
21043 format = (this_end < 256)
21044 ? "\\x%02" UVXf "-\\x%02" UVXf
21045 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21046#else
21047 format = "\\x%02" UVXf "-\\x%02" UVXf;
21048#endif
21049 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21050 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21051 GCC_DIAG_RESTORE_STMT;
21052 break;
21053 }
21054}
21055
21056STATIC void
21057S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21058{
21059 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21060 * 'invlist' */
21061
21062 UV start, end;
21063 bool allow_literals = TRUE;
21064
21065 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21066
21067 /* Generally, it is more readable if printable characters are output as
21068 * literals, but if a range (nearly) spans all of them, it's best to output
21069 * it as a single range. This code will use a single range if all but 2
21070 * ASCII printables are in it */
21071 invlist_iterinit(invlist);
21072 while (invlist_iternext(invlist, &start, &end)) {
21073
21074 /* If the range starts beyond the final printable, it doesn't have any
21075 * in it */
21076 if (start > MAX_PRINT_A) {
21077 break;
21078 }
21079
21080 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
21081 * all but two, the range must start and end no later than 2 from
21082 * either end */
21083 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21084 if (end > MAX_PRINT_A) {
21085 end = MAX_PRINT_A;
21086 }
21087 if (start < ' ') {
21088 start = ' ';
21089 }
21090 if (end - start >= MAX_PRINT_A - ' ' - 2) {
21091 allow_literals = FALSE;
21092 }
21093 break;
21094 }
21095 }
21096 invlist_iterfinish(invlist);
21097
21098 /* Here we have figured things out. Output each range */
21099 invlist_iterinit(invlist);
21100 while (invlist_iternext(invlist, &start, &end)) {
21101 if (start >= NUM_ANYOF_CODE_POINTS) {
21102 break;
21103 }
21104 put_range(sv, start, end, allow_literals);
21105 }
21106 invlist_iterfinish(invlist);
21107
21108 return;
21109}
21110
21111STATIC SV*
21112S_put_charclass_bitmap_innards_common(pTHX_
21113 SV* invlist, /* The bitmap */
21114 SV* posixes, /* Under /l, things like [:word:], \S */
21115 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
21116 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
21117 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
21118 const bool invert /* Is the result to be inverted? */
21119)
21120{
21121 /* Create and return an SV containing a displayable version of the bitmap
21122 * and associated information determined by the input parameters. If the
21123 * output would have been only the inversion indicator '^', NULL is instead
21124 * returned. */
21125
21126 SV * output;
21127
21128 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21129
21130 if (invert) {
21131 output = newSVpvs("^");
21132 }
21133 else {
21134 output = newSVpvs("");
21135 }
21136
21137 /* First, the code points in the bitmap that are unconditionally there */
21138 put_charclass_bitmap_innards_invlist(output, invlist);
21139
21140 /* Traditionally, these have been placed after the main code points */
21141 if (posixes) {
21142 sv_catsv(output, posixes);
21143 }
21144
21145 if (only_utf8 && _invlist_len(only_utf8)) {
21146 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21147 put_charclass_bitmap_innards_invlist(output, only_utf8);
21148 }
21149
21150 if (not_utf8 && _invlist_len(not_utf8)) {
21151 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21152 put_charclass_bitmap_innards_invlist(output, not_utf8);
21153 }
21154
21155 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21156 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21157 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21158
21159 /* This is the only list in this routine that can legally contain code
21160 * points outside the bitmap range. The call just above to
21161 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21162 * output them here. There's about a half-dozen possible, and none in
21163 * contiguous ranges longer than 2 */
21164 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21165 UV start, end;
21166 SV* above_bitmap = NULL;
21167
21168 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21169
21170 invlist_iterinit(above_bitmap);
21171 while (invlist_iternext(above_bitmap, &start, &end)) {
21172 UV i;
21173
21174 for (i = start; i <= end; i++) {
21175 put_code_point(output, i);
21176 }
21177 }
21178 invlist_iterfinish(above_bitmap);
21179 SvREFCNT_dec_NN(above_bitmap);
21180 }
21181 }
21182
21183 if (invert && SvCUR(output) == 1) {
21184 return NULL;
21185 }
21186
21187 return output;
21188}
21189
21190STATIC bool
21191S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21192 char *bitmap,
21193 SV *nonbitmap_invlist,
21194 SV *only_utf8_locale_invlist,
21195 const regnode * const node,
21196 const bool force_as_is_display)
21197{
21198 /* Appends to 'sv' a displayable version of the innards of the bracketed
21199 * character class defined by the other arguments:
21200 * 'bitmap' points to the bitmap, or NULL if to ignore that.
21201 * 'nonbitmap_invlist' is an inversion list of the code points that are in
21202 * the bitmap range, but for some reason aren't in the bitmap; NULL if
21203 * none. The reasons for this could be that they require some
21204 * condition such as the target string being or not being in UTF-8
21205 * (under /d), or because they came from a user-defined property that
21206 * was not resolved at the time of the regex compilation (under /u)
21207 * 'only_utf8_locale_invlist' is an inversion list of the code points that
21208 * are valid only if the runtime locale is a UTF-8 one; NULL if none
21209 * 'node' is the regex pattern ANYOF node. It is needed only when the
21210 * above two parameters are not null, and is passed so that this
21211 * routine can tease apart the various reasons for them.
21212 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
21213 * to invert things to see if that leads to a cleaner display. If
21214 * FALSE, this routine is free to use its judgment about doing this.
21215 *
21216 * It returns TRUE if there was actually something output. (It may be that
21217 * the bitmap, etc is empty.)
21218 *
21219 * When called for outputting the bitmap of a non-ANYOF node, just pass the
21220 * bitmap, with the succeeding parameters set to NULL, and the final one to
21221 * FALSE.
21222 */
21223
21224 /* In general, it tries to display the 'cleanest' representation of the
21225 * innards, choosing whether to display them inverted or not, regardless of
21226 * whether the class itself is to be inverted. However, there are some
21227 * cases where it can't try inverting, as what actually matches isn't known
21228 * until runtime, and hence the inversion isn't either. */
21229 bool inverting_allowed = ! force_as_is_display;
21230
21231 int i;
21232 STRLEN orig_sv_cur = SvCUR(sv);
21233
21234 SV* invlist; /* Inversion list we accumulate of code points that
21235 are unconditionally matched */
21236 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
21237 UTF-8 */
21238 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
21239 */
21240 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
21241 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
21242 is UTF-8 */
21243
21244 SV* as_is_display; /* The output string when we take the inputs
21245 literally */
21246 SV* inverted_display; /* The output string when we invert the inputs */
21247
21248 U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21249
21250 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
21251 to match? */
21252 /* We are biased in favor of displaying things without them being inverted,
21253 * as that is generally easier to understand */
21254 const int bias = 5;
21255
21256 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21257
21258 /* Start off with whatever code points are passed in. (We clone, so we
21259 * don't change the caller's list) */
21260 if (nonbitmap_invlist) {
21261 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21262 invlist = invlist_clone(nonbitmap_invlist, NULL);
21263 }
21264 else { /* Worst case size is every other code point is matched */
21265 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21266 }
21267
21268 if (flags) {
21269 if (OP(node) == ANYOFD) {
21270
21271 /* This flag indicates that the code points below 0x100 in the
21272 * nonbitmap list are precisely the ones that match only when the
21273 * target is UTF-8 (they should all be non-ASCII). */
21274 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21275 {
21276 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21277 _invlist_subtract(invlist, only_utf8, &invlist);
21278 }
21279
21280 /* And this flag for matching all non-ASCII 0xFF and below */
21281 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21282 {
21283 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21284 }
21285 }
21286 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21287
21288 /* If either of these flags are set, what matches isn't
21289 * determinable except during execution, so don't know enough here
21290 * to invert */
21291 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21292 inverting_allowed = FALSE;
21293 }
21294
21295 /* What the posix classes match also varies at runtime, so these
21296 * will be output symbolically. */
21297 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21298 int i;
21299
21300 posixes = newSVpvs("");
21301 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21302 if (ANYOF_POSIXL_TEST(node, i)) {
21303 sv_catpv(posixes, anyofs[i]);
21304 }
21305 }
21306 }
21307 }
21308 }
21309
21310 /* Accumulate the bit map into the unconditional match list */
21311 if (bitmap) {
21312 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21313 if (BITMAP_TEST(bitmap, i)) {
21314 int start = i++;
21315 for (;
21316 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21317 i++)
21318 { /* empty */ }
21319 invlist = _add_range_to_invlist(invlist, start, i-1);
21320 }
21321 }
21322 }
21323
21324 /* Make sure that the conditional match lists don't have anything in them
21325 * that match unconditionally; otherwise the output is quite confusing.
21326 * This could happen if the code that populates these misses some
21327 * duplication. */
21328 if (only_utf8) {
21329 _invlist_subtract(only_utf8, invlist, &only_utf8);
21330 }
21331 if (not_utf8) {
21332 _invlist_subtract(not_utf8, invlist, &not_utf8);
21333 }
21334
21335 if (only_utf8_locale_invlist) {
21336
21337 /* Since this list is passed in, we have to make a copy before
21338 * modifying it */
21339 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21340
21341 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21342
21343 /* And, it can get really weird for us to try outputting an inverted
21344 * form of this list when it has things above the bitmap, so don't even
21345 * try */
21346 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21347 inverting_allowed = FALSE;
21348 }
21349 }
21350
21351 /* Calculate what the output would be if we take the input as-is */
21352 as_is_display = put_charclass_bitmap_innards_common(invlist,
21353 posixes,
21354 only_utf8,
21355 not_utf8,
21356 only_utf8_locale,
21357 invert);
21358
21359 /* If have to take the output as-is, just do that */
21360 if (! inverting_allowed) {
21361 if (as_is_display) {
21362 sv_catsv(sv, as_is_display);
21363 SvREFCNT_dec_NN(as_is_display);
21364 }
21365 }
21366 else { /* But otherwise, create the output again on the inverted input, and
21367 use whichever version is shorter */
21368
21369 int inverted_bias, as_is_bias;
21370
21371 /* We will apply our bias to whichever of the the results doesn't have
21372 * the '^' */
21373 if (invert) {
21374 invert = FALSE;
21375 as_is_bias = bias;
21376 inverted_bias = 0;
21377 }
21378 else {
21379 invert = TRUE;
21380 as_is_bias = 0;
21381 inverted_bias = bias;
21382 }
21383
21384 /* Now invert each of the lists that contribute to the output,
21385 * excluding from the result things outside the possible range */
21386
21387 /* For the unconditional inversion list, we have to add in all the
21388 * conditional code points, so that when inverted, they will be gone
21389 * from it */
21390 _invlist_union(only_utf8, invlist, &invlist);
21391 _invlist_union(not_utf8, invlist, &invlist);
21392 _invlist_union(only_utf8_locale, invlist, &invlist);
21393 _invlist_invert(invlist);
21394 _invlist_intersection(invlist, PL_InBitmap, &invlist);
21395
21396 if (only_utf8) {
21397 _invlist_invert(only_utf8);
21398 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21399 }
21400 else if (not_utf8) {
21401
21402 /* If a code point matches iff the target string is not in UTF-8,
21403 * then complementing the result has it not match iff not in UTF-8,
21404 * which is the same thing as matching iff it is UTF-8. */
21405 only_utf8 = not_utf8;
21406 not_utf8 = NULL;
21407 }
21408
21409 if (only_utf8_locale) {
21410 _invlist_invert(only_utf8_locale);
21411 _invlist_intersection(only_utf8_locale,
21412 PL_InBitmap,
21413 &only_utf8_locale);
21414 }
21415
21416 inverted_display = put_charclass_bitmap_innards_common(
21417 invlist,
21418 posixes,
21419 only_utf8,
21420 not_utf8,
21421 only_utf8_locale, invert);
21422
21423 /* Use the shortest representation, taking into account our bias
21424 * against showing it inverted */
21425 if ( inverted_display
21426 && ( ! as_is_display
21427 || ( SvCUR(inverted_display) + inverted_bias
21428 < SvCUR(as_is_display) + as_is_bias)))
21429 {
21430 sv_catsv(sv, inverted_display);
21431 }
21432 else if (as_is_display) {
21433 sv_catsv(sv, as_is_display);
21434 }
21435
21436 SvREFCNT_dec(as_is_display);
21437 SvREFCNT_dec(inverted_display);
21438 }
21439
21440 SvREFCNT_dec_NN(invlist);
21441 SvREFCNT_dec(only_utf8);
21442 SvREFCNT_dec(not_utf8);
21443 SvREFCNT_dec(posixes);
21444 SvREFCNT_dec(only_utf8_locale);
21445
21446 return SvCUR(sv) > orig_sv_cur;
21447}
21448
21449#define CLEAR_OPTSTART \
21450 if (optstart) STMT_START { \
21451 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
21452 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21453 optstart=NULL; \
21454 } STMT_END
21455
21456#define DUMPUNTIL(b,e) \
21457 CLEAR_OPTSTART; \
21458 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21459
21460STATIC const regnode *
21461S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
21462 const regnode *last, const regnode *plast,
21463 SV* sv, I32 indent, U32 depth)
21464{
21465 U8 op = PSEUDO; /* Arbitrary non-END op. */
21466 const regnode *next;
21467 const regnode *optstart= NULL;
21468
21469 RXi_GET_DECL(r, ri);
21470 GET_RE_DEBUG_FLAGS_DECL;
21471
21472 PERL_ARGS_ASSERT_DUMPUNTIL;
21473
21474#ifdef DEBUG_DUMPUNTIL
21475 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
21476 last ? last-start : 0, plast ? plast-start : 0);
21477#endif
21478
21479 if (plast && plast < last)
21480 last= plast;
21481
21482 while (PL_regkind[op] != END && (!last || node < last)) {
21483 assert(node);
21484 /* While that wasn't END last time... */
21485 NODE_ALIGN(node);
21486 op = OP(node);
21487 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
21488 indent--;
21489 next = regnext((regnode *)node);
21490
21491 /* Where, what. */
21492 if (OP(node) == OPTIMIZED) {
21493 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
21494 optstart = node;
21495 else
21496 goto after_print;
21497 } else
21498 CLEAR_OPTSTART;
21499
21500 regprop(r, sv, node, NULL, NULL);
21501 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
21502 (int)(2*indent + 1), "", SvPVX_const(sv));
21503
21504 if (OP(node) != OPTIMIZED) {
21505 if (next == NULL) /* Next ptr. */
21506 Perl_re_printf( aTHX_ " (0)");
21507 else if (PL_regkind[(U8)op] == BRANCH
21508 && PL_regkind[OP(next)] != BRANCH )
21509 Perl_re_printf( aTHX_ " (FAIL)");
21510 else
21511 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
21512 Perl_re_printf( aTHX_ "\n");
21513 }
21514
21515 after_print:
21516 if (PL_regkind[(U8)op] == BRANCHJ) {
21517 assert(next);
21518 {
21519 const regnode *nnode = (OP(next) == LONGJMP
21520 ? regnext((regnode *)next)
21521 : next);
21522 if (last && nnode > last)
21523 nnode = last;
21524 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
21525 }
21526 }
21527 else if (PL_regkind[(U8)op] == BRANCH) {
21528 assert(next);
21529 DUMPUNTIL(NEXTOPER(node), next);
21530 }
21531 else if ( PL_regkind[(U8)op] == TRIE ) {
21532 const regnode *this_trie = node;
21533 const char op = OP(node);
21534 const U32 n = ARG(node);
21535 const reg_ac_data * const ac = op>=AHOCORASICK ?
21536 (reg_ac_data *)ri->data->data[n] :
21537 NULL;
21538 const reg_trie_data * const trie =
21539 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
21540#ifdef DEBUGGING
21541 AV *const trie_words
21542 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
21543#endif
21544 const regnode *nextbranch= NULL;
21545 I32 word_idx;
21546 SvPVCLEAR(sv);
21547 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
21548 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
21549
21550 Perl_re_indentf( aTHX_ "%s ",
21551 indent+3,
21552 elem_ptr
21553 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
21554 SvCUR(*elem_ptr), PL_dump_re_max_len,
21555 PL_colors[0], PL_colors[1],
21556 (SvUTF8(*elem_ptr)
21557 ? PERL_PV_ESCAPE_UNI
21558 : 0)
21559 | PERL_PV_PRETTY_ELLIPSES
21560 | PERL_PV_PRETTY_LTGT
21561 )
21562 : "???"
21563 );
21564 if (trie->jump) {
21565 U16 dist= trie->jump[word_idx+1];
21566 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
21567 (UV)((dist ? this_trie + dist : next) - start));
21568 if (dist) {
21569 if (!nextbranch)
21570 nextbranch= this_trie + trie->jump[0];
21571 DUMPUNTIL(this_trie + dist, nextbranch);
21572 }
21573 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
21574 nextbranch= regnext((regnode *)nextbranch);
21575 } else {
21576 Perl_re_printf( aTHX_ "\n");
21577 }
21578 }
21579 if (last && next > last)
21580 node= last;
21581 else
21582 node= next;
21583 }
21584 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
21585 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
21586 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
21587 }
21588 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
21589 assert(next);
21590 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
21591 }
21592 else if ( op == PLUS || op == STAR) {
21593 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
21594 }
21595 else if (PL_regkind[(U8)op] == EXACT) {
21596 /* Literal string, where present. */
21597 node += NODE_SZ_STR(node) - 1;
21598 node = NEXTOPER(node);
21599 }
21600 else {
21601 node = NEXTOPER(node);
21602 node += regarglen[(U8)op];
21603 }
21604 if (op == CURLYX || op == OPEN || op == SROPEN)
21605 indent++;
21606 }
21607 CLEAR_OPTSTART;
21608#ifdef DEBUG_DUMPUNTIL
21609 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
21610#endif
21611 return node;
21612}
21613
21614#endif /* DEBUGGING */
21615
21616#ifndef PERL_IN_XSUB_RE
21617
21618#include "uni_keywords.h"
21619
21620void
21621Perl_init_uniprops(pTHX)
21622{
21623 /* Set up the inversion list global variables */
21624
21625 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21626 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
21627 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
21628 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
21629 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
21630 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
21631 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
21632 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
21633 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
21634 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
21635 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
21636 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
21637 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
21638 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
21639 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
21640 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
21641
21642 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21643 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
21644 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
21645 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
21646 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
21647 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
21648 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
21649 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
21650 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
21651 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
21652 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
21653 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
21654 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
21655 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
21656 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
21657 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
21658
21659 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
21660 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
21661 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
21662 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
21663 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
21664
21665 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
21666 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
21667 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
21668
21669 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
21670
21671 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
21672 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
21673
21674 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
21675 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
21676
21677 PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
21678 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21679 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
21680 PL_NonL1NonFinalFold = _new_invlist_C_array(
21681 NonL1_Perl_Non_Final_Folds_invlist);
21682
21683 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
21684 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
21685 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
21686 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
21687 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
21688 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
21689 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
21690
21691 /* The below are used only by deprecated functions. They could be removed */
21692 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
21693 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
21694 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
21695}
21696
21697SV *
21698Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
21699 const bool to_fold, bool * invert)
21700{
21701 /* Parse the interior meat of \p{} passed to this in 'name' with length
21702 * 'name_len', and return an inversion list if a property with 'name' is
21703 * found, or NULL if not. 'name' point to the input with leading and
21704 * trailing space trimmed. 'to_fold' indicates if /i is in effect.
21705 *
21706 * When the return is an inversion list, '*invert' will be set to a boolean
21707 * indicating if it should be inverted or not
21708 *
21709 * This currently doesn't handle all cases. A NULL return indicates the
21710 * caller should try a different approach
21711 */
21712
21713 char* lookup_name;
21714 bool stricter = FALSE;
21715 bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
21716 of the cjk numeric properties (though
21717 it requires extra effort to compile
21718 them) */
21719 unsigned int i;
21720 unsigned int j = 0, lookup_len;
21721 int equals_pos = -1; /* Where the '=' is found, or negative if none */
21722 int slash_pos = -1; /* Where the '/' is found, or negative if none */
21723 int table_index = 0;
21724 bool starts_with_In_or_Is = FALSE;
21725 Size_t lookup_offset = 0;
21726
21727 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
21728
21729 /* The input will be modified into 'lookup_name' */
21730 Newx(lookup_name, name_len, char);
21731 SAVEFREEPV(lookup_name);
21732
21733 /* Parse the input. */
21734 for (i = 0; i < name_len; i++) {
21735 char cur = name[i];
21736
21737 /* These characters can be freely ignored in most situations. Later it
21738 * may turn out we shouldn't have ignored them, and we have to reparse,
21739 * but we don't have enough information yet to make that decision */
21740 if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
21741 continue;
21742 }
21743
21744 /* Case differences are also ignored. Our lookup routine assumes
21745 * everything is lowercase */
21746 if (isUPPER_A(cur)) {
21747 lookup_name[j++] = toLOWER(cur);
21748 continue;
21749 }
21750
21751 /* A double colon is either an error, or a package qualifier to a
21752 * subroutine user-defined property; neither of which do we currently
21753 * handle
21754 *
21755 * But a single colon is a synonym for '=' */
21756 if (cur == ':') {
21757 if (i < name_len - 1 && name[i+1] == ':') {
21758 return NULL;
21759 }
21760 cur = '=';
21761 }
21762
21763 /* Otherwise, this character is part of the name. */
21764 lookup_name[j++] = cur;
21765
21766 /* Only the equals sign needs further processing */
21767 if (cur == '=') {
21768 equals_pos = j; /* Note where it occurred in the input */
21769 break;
21770 }
21771 }
21772
21773 /* Here, we are either done with the whole property name, if it was simple;
21774 * or are positioned just after the '=' if it is compound. */
21775
21776 if (equals_pos >= 0) {
21777 assert(! stricter); /* We shouldn't have set this yet */
21778
21779 /* Space immediately after the '=' is ignored */
21780 i++;
21781 for (; i < name_len; i++) {
21782 if (! isSPACE_A(name[i])) {
21783 break;
21784 }
21785 }
21786
21787 /* Certain properties need special handling. They may optionally be
21788 * prefixed by 'is'. Ignore that prefix for the purposes of checking
21789 * if this is one of those properties */
21790 if (memBEGINPs(lookup_name, name_len, "is")) {
21791 lookup_offset = 2;
21792 }
21793
21794 /* Then check if it is one of these properties. This is hard-coded
21795 * because easier this way, and the list is unlikely to change. There
21796 * are several properties like this in the Unihan DB, which is unlikely
21797 * to be compiled, and they all end with 'numeric'. The interiors
21798 * aren't checked for the precise property. This would stop working if
21799 * a cjk property were to be created that ended with 'numeric' and
21800 * wasn't a numeric type */
21801 is_nv_type = memEQs(lookup_name + lookup_offset,
21802 j - 1 - lookup_offset, "numericvalue")
21803 || memEQs(lookup_name + lookup_offset,
21804 j - 1 - lookup_offset, "nv")
21805 || ( memENDPs(lookup_name + lookup_offset,
21806 j - 1 - lookup_offset, "numeric")
21807 && ( memBEGINPs(lookup_name + lookup_offset,
21808 j - 1 - lookup_offset, "cjk")
21809 || memBEGINPs(lookup_name + lookup_offset,
21810 j - 1 - lookup_offset, "k")));
21811 if ( is_nv_type
21812 || memEQs(lookup_name + lookup_offset,
21813 j - 1 - lookup_offset, "canonicalcombiningclass")
21814 || memEQs(lookup_name + lookup_offset,
21815 j - 1 - lookup_offset, "ccc")
21816 || memEQs(lookup_name + lookup_offset,
21817 j - 1 - lookup_offset, "age")
21818 || memEQs(lookup_name + lookup_offset,
21819 j - 1 - lookup_offset, "in")
21820 || memEQs(lookup_name + lookup_offset,
21821 j - 1 - lookup_offset, "presentin"))
21822 {
21823 unsigned int k;
21824
21825 /* What makes these properties special is that the stuff after the
21826 * '=' is a number. Therefore, we can't throw away '-'
21827 * willy-nilly, as those could be a minus sign. Other stricter
21828 * rules also apply. However, these properties all can have the
21829 * rhs not be a number, in which case they contain at least one
21830 * alphabetic. In those cases, the stricter rules don't apply.
21831 * But the numeric type properties can have the alphas [Ee] to
21832 * signify an exponent, and it is still a number with stricter
21833 * rules. So look for an alpha that signifys not-strict */
21834 stricter = TRUE;
21835 for (k = i; k < name_len; k++) {
21836 if ( isALPHA_A(name[k])
21837 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
21838 {
21839 stricter = FALSE;
21840 break;
21841 }
21842 }
21843 }
21844
21845 if (stricter) {
21846
21847 /* A number may have a leading '+' or '-'. The latter is retained
21848 * */
21849 if (name[i] == '+') {
21850 i++;
21851 }
21852 else if (name[i] == '-') {
21853 lookup_name[j++] = '-';
21854 i++;
21855 }
21856
21857 /* Skip leading zeros including single underscores separating the
21858 * zeros, or between the final leading zero and the first other
21859 * digit */
21860 for (; i < name_len - 1; i++) {
21861 if ( name[i] != '0'
21862 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
21863 {
21864 break;
21865 }
21866 }
21867 }
21868 }
21869 else { /* No '=' */
21870
21871 /* We are now in a position to determine if this property should have
21872 * been parsed using stricter rules. Only a few are like that, and
21873 * unlikely to change. */
21874 if ( memBEGINPs(lookup_name, j, "perl")
21875 && memNEs(lookup_name + 4, j - 4, "space")
21876 && memNEs(lookup_name + 4, j - 4, "word"))
21877 {
21878 stricter = TRUE;
21879
21880 /* We set the inputs back to 0 and the code below will reparse,
21881 * using strict */
21882 i = j = 0;
21883 }
21884 }
21885
21886 /* Here, we have either finished the property, or are positioned to parse
21887 * the remainder, and we know if stricter rules apply. Finish out, if not
21888 * already done */
21889 for (; i < name_len; i++) {
21890 char cur = name[i];
21891
21892 /* In all instances, case differences are ignored, and we normalize to
21893 * lowercase */
21894 if (isUPPER_A(cur)) {
21895 lookup_name[j++] = toLOWER(cur);
21896 continue;
21897 }
21898
21899 /* An underscore is skipped, but not under strict rules unless it
21900 * separates two digits */
21901 if (cur == '_') {
21902 if ( stricter
21903 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
21904 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
21905 {
21906 lookup_name[j++] = '_';
21907 }
21908 continue;
21909 }
21910
21911 /* Hyphens are skipped except under strict */
21912 if (cur == '-' && ! stricter) {
21913 continue;
21914 }
21915
21916 /* XXX Bug in documentation. It says white space skipped adjacent to
21917 * non-word char. Maybe we should, but shouldn't skip it next to a dot
21918 * in a number */
21919 if (isSPACE_A(cur) && ! stricter) {
21920 continue;
21921 }
21922
21923 lookup_name[j++] = cur;
21924
21925 /* Unless this is a non-trailing slash, we are done with it */
21926 if (i >= name_len - 1 || cur != '/') {
21927 continue;
21928 }
21929
21930 slash_pos = j;
21931
21932 /* A slash in the 'numeric value' property indicates that what follows
21933 * is a denominator. It can have a leading '+' and '0's that should be
21934 * skipped. But we have never allowed a negative denominator, so treat
21935 * a minus like every other character. (No need to rule out a second
21936 * '/', as that won't match anything anyway */
21937 if (is_nv_type) {
21938 i++;
21939 if (i < name_len && name[i] == '+') {
21940 i++;
21941 }
21942
21943 /* Skip leading zeros including underscores separating digits */
21944 for (; i < name_len - 1; i++) {
21945 if ( name[i] != '0'
21946 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
21947 {
21948 break;
21949 }
21950 }
21951
21952 /* Store the first real character in the denominator */
21953 lookup_name[j++] = name[i];
21954 }
21955 }
21956
21957 /* Here are completely done parsing the input 'name', and 'lookup_name'
21958 * contains a copy, normalized.
21959 *
21960 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
21961 * different from without the underscores. */
21962 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
21963 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
21964 && UNLIKELY(name[name_len-1] == '_'))
21965 {
21966 lookup_name[j++] = '&';
21967 }
21968 else if (name_len > 2 && name[0] == 'I' && ( name[1] == 'n'
21969 || name[1] == 's'))
21970 {
21971
21972 /* Also, if the original input began with 'In' or 'Is', it could be a
21973 * subroutine call instead of a property names, which currently isn't
21974 * handled by this function. Subroutine calls can't happen if there is
21975 * an '=' in the name */
21976 if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
21977 {
21978 return NULL;
21979 }
21980
21981 starts_with_In_or_Is = TRUE;
21982 }
21983
21984 lookup_len = j; /* Use a more mnemonic name starting here */
21985
21986 /* Get the index into our pointer table of the inversion list corresponding
21987 * to the property */
21988 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
21989
21990 /* If it didn't find the property */
21991 if (table_index == 0) {
21992
21993 /* If didn't find the property, we try again stripping off any initial
21994 * 'In' or 'Is' */
21995 if (starts_with_In_or_Is) {
21996 lookup_name += 2;
21997 lookup_len -= 2;
21998 equals_pos -= 2;
21999 slash_pos -= 2;
22000
22001 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
22002 }
22003
22004 if (table_index == 0) {
22005 char * canonical;
22006
22007 /* If not found, and not a numeric type property, isn't a legal
22008 * property */
22009 if (! is_nv_type) {
22010 return NULL;
22011 }
22012
22013 /* But the numeric type properties need more work to decide. What
22014 * we do is make sure we have the number in canonical form and look
22015 * that up. */
22016
22017 if (slash_pos < 0) { /* No slash */
22018
22019 /* When it isn't a rational, take the input, convert it to a
22020 * NV, then create a canonical string representation of that
22021 * NV. */
22022
22023 NV value;
22024
22025 /* Get the value */
22026 if (my_atof3(lookup_name + equals_pos, &value,
22027 lookup_len - equals_pos)
22028 != lookup_name + lookup_len)
22029 {
22030 return NULL;
22031 }
22032
22033 /* If the value is an integer, the canonical value is integral */
22034 if (Perl_ceil(value) == value) {
22035 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
22036 equals_pos, lookup_name, value);
22037 }
22038 else { /* Otherwise, it is %e with a known precision */
22039 char * exp_ptr;
22040
22041 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
22042 equals_pos, lookup_name,
22043 PL_E_FORMAT_PRECISION, value);
22044
22045 /* The exponent generated is expecting two digits, whereas
22046 * %e on some systems will generate three. Remove leading
22047 * zeros in excess of 2 from the exponent. We start
22048 * looking for them after the '=' */
22049 exp_ptr = strchr(canonical + equals_pos, 'e');
22050 if (exp_ptr) {
22051 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
22052 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
22053
22054 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
22055
22056 if (excess_exponent_len > 0) {
22057 SSize_t leading_zeros = strspn(cur_ptr, "0");
22058 SSize_t excess_leading_zeros
22059 = MIN(leading_zeros, excess_exponent_len);
22060 if (excess_leading_zeros > 0) {
22061 Move(cur_ptr + excess_leading_zeros,
22062 cur_ptr,
22063 strlen(cur_ptr) - excess_leading_zeros
22064 + 1, /* Copy the NUL as well */
22065 char);
22066 }
22067 }
22068 }
22069 }
22070 }
22071 else { /* Has a slash. Create a rational in canonical form */
22072 UV numerator, denominator, gcd, trial;
22073 const char * end_ptr;
22074 const char * sign = "";
22075
22076 /* We can't just find the numerator, denominator, and do the
22077 * division, then use the method above, because that is
22078 * inexact. And the input could be a rational that is within
22079 * epsilon (given our precision) of a valid rational, and would
22080 * then incorrectly compare valid.
22081 *
22082 * We're only interested in the part after the '=' */
22083 const char * this_lookup_name = lookup_name + equals_pos;
22084 lookup_len -= equals_pos;
22085 slash_pos -= equals_pos;
22086
22087 /* Handle any leading minus */
22088 if (this_lookup_name[0] == '-') {
22089 sign = "-";
22090 this_lookup_name++;
22091 lookup_len--;
22092 slash_pos--;
22093 }
22094
22095 /* Convert the numerator to numeric */
22096 end_ptr = this_lookup_name + slash_pos;
22097 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
22098 return NULL;
22099 }
22100
22101 /* It better have included all characters before the slash */
22102 if (*end_ptr != '/') {
22103 return NULL;
22104 }
22105
22106 /* Set to look at just the denominator */
22107 this_lookup_name += slash_pos;
22108 lookup_len -= slash_pos;
22109 end_ptr = this_lookup_name + lookup_len;
22110
22111 /* Convert the denominator to numeric */
22112 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
22113 return NULL;
22114 }
22115
22116 /* It better be the rest of the characters, and don't divide by
22117 * 0 */
22118 if ( end_ptr != this_lookup_name + lookup_len
22119 || denominator == 0)
22120 {
22121 return NULL;
22122 }
22123
22124 /* Get the greatest common denominator using
22125 http://en.wikipedia.org/wiki/Euclidean_algorithm */
22126 gcd = numerator;
22127 trial = denominator;
22128 while (trial != 0) {
22129 UV temp = trial;
22130 trial = gcd % trial;
22131 gcd = temp;
22132 }
22133
22134 /* If already in lowest possible terms, we have already tried
22135 * looking this up */
22136 if (gcd == 1) {
22137 return NULL;
22138 }
22139
22140 /* Reduce the rational, which should put it in canonical form.
22141 * Then look it up */
22142 numerator /= gcd;
22143 denominator /= gcd;
22144
22145 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
22146 equals_pos, lookup_name, sign, numerator, denominator);
22147 }
22148
22149 /* Here, we have the number in canonical form. Try that */
22150 table_index = match_uniprop((U8 *) canonical, strlen(canonical));
22151 if (table_index == 0) {
22152 return NULL;
22153 }
22154 }
22155 }
22156
22157 /* The return is an index into a table of ptrs. A negative return
22158 * signifies that the real index is the absolute value, but the result
22159 * needs to be inverted */
22160 if (table_index < 0) {
22161 *invert = TRUE;
22162 table_index = -table_index;
22163 }
22164 else {
22165 *invert = FALSE;
22166 }
22167
22168 /* Out-of band indices indicate a deprecated property. The proper index is
22169 * modulo it with the table size. And dividing by the table size yields
22170 * an offset into a table constructed to contain the corresponding warning
22171 * message */
22172 if (table_index > MAX_UNI_KEYWORD_INDEX) {
22173 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
22174 table_index %= MAX_UNI_KEYWORD_INDEX;
22175 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
22176 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
22177 (int) name_len, name, deprecated_property_msgs[warning_offset]);
22178 }
22179
22180 /* In a few properties, a different property is used under /i. These are
22181 * unlikely to change, so are hard-coded here. */
22182 if (to_fold) {
22183 if ( table_index == UNI_XPOSIXUPPER
22184 || table_index == UNI_XPOSIXLOWER
22185 || table_index == UNI_TITLE)
22186 {
22187 table_index = UNI_CASED;
22188 }
22189 else if ( table_index == UNI_UPPERCASELETTER
22190 || table_index == UNI_LOWERCASELETTER
22191# ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
22192 || table_index == UNI_TITLECASELETTER
22193# endif
22194 ) {
22195 table_index = UNI_CASEDLETTER;
22196 }
22197 else if ( table_index == UNI_POSIXUPPER
22198 || table_index == UNI_POSIXLOWER)
22199 {
22200 table_index = UNI_POSIXALPHA;
22201 }
22202 }
22203
22204 /* Create and return the inversion list */
22205 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
22206}
22207
22208#endif
22209
22210/*
22211 * ex: set ts=8 sts=4 sw=4 et:
22212 */