This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Skip stat.t readability test on VMS.
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34#ifdef PERL_EXT_RE_BUILD
35#include "re_top.h"
36#endif
37
38/*
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
64 ****
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
74#define PERL_IN_REGCOMP_C
75#include "perl.h"
76
77#ifndef PERL_IN_XSUB_RE
78# include "INTERN.h"
79#endif
80
81#define REG_COMP_C
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
87
88#include "dquote_static.c"
89#include "charclass_invlists.h"
90
91#ifdef op
92#undef op
93#endif /* op */
94
95#ifdef MSDOS
96# if defined(BUGGY_MSC6)
97 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
98# pragma optimize("a",off)
99 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
100# pragma optimize("w",on )
101# endif /* BUGGY_MSC6 */
102#endif /* MSDOS */
103
104#ifndef STATIC
105#define STATIC static
106#endif
107
108typedef struct RExC_state_t {
109 U32 flags; /* are we folding, multilining? */
110 char *precomp; /* uncompiled string. */
111 REGEXP *rx_sv; /* The SV that is the regexp. */
112 regexp *rx; /* perl core regexp structure */
113 regexp_internal *rxi; /* internal data for regexp object pprivate field */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the allocated space */
120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* Capture buffer count, (OPEN). */
126 I32 cpar; /* Capture buffer count, (CLOSE). */
127 I32 nestroot; /* root parens we are in - used by accept */
128 I32 extralen;
129 I32 seen_zerolen;
130 I32 seen_evals;
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
138 I32 uni_semantics; /* If a d charset modifier should use unicode
139 rules, even if the pattern is not in
140 utf8 */
141 HV *paren_names; /* Paren names */
142
143 regnode **recurse; /* Recurse regops */
144 I32 recurse_count; /* Number of recurse regops */
145 I32 in_lookbehind;
146 I32 contains_locale;
147 I32 override_recoding;
148#if ADD_TO_REGEXEC
149 char *starttry; /* -Dr: where regtry was called. */
150#define RExC_starttry (pRExC_state->starttry)
151#endif
152#ifdef DEBUGGING
153 const char *lastparse;
154 I32 lastnum;
155 AV *paren_name_list; /* idx -> name */
156#define RExC_lastparse (pRExC_state->lastparse)
157#define RExC_lastnum (pRExC_state->lastnum)
158#define RExC_paren_name_list (pRExC_state->paren_name_list)
159#endif
160} RExC_state_t;
161
162#define RExC_flags (pRExC_state->flags)
163#define RExC_precomp (pRExC_state->precomp)
164#define RExC_rx_sv (pRExC_state->rx_sv)
165#define RExC_rx (pRExC_state->rx)
166#define RExC_rxi (pRExC_state->rxi)
167#define RExC_start (pRExC_state->start)
168#define RExC_end (pRExC_state->end)
169#define RExC_parse (pRExC_state->parse)
170#define RExC_whilem_seen (pRExC_state->whilem_seen)
171#ifdef RE_TRACK_PATTERN_OFFSETS
172#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
173#endif
174#define RExC_emit (pRExC_state->emit)
175#define RExC_emit_start (pRExC_state->emit_start)
176#define RExC_emit_bound (pRExC_state->emit_bound)
177#define RExC_naughty (pRExC_state->naughty)
178#define RExC_sawback (pRExC_state->sawback)
179#define RExC_seen (pRExC_state->seen)
180#define RExC_size (pRExC_state->size)
181#define RExC_npar (pRExC_state->npar)
182#define RExC_nestroot (pRExC_state->nestroot)
183#define RExC_extralen (pRExC_state->extralen)
184#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
185#define RExC_seen_evals (pRExC_state->seen_evals)
186#define RExC_utf8 (pRExC_state->utf8)
187#define RExC_uni_semantics (pRExC_state->uni_semantics)
188#define RExC_orig_utf8 (pRExC_state->orig_utf8)
189#define RExC_open_parens (pRExC_state->open_parens)
190#define RExC_close_parens (pRExC_state->close_parens)
191#define RExC_opend (pRExC_state->opend)
192#define RExC_paren_names (pRExC_state->paren_names)
193#define RExC_recurse (pRExC_state->recurse)
194#define RExC_recurse_count (pRExC_state->recurse_count)
195#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
196#define RExC_contains_locale (pRExC_state->contains_locale)
197#define RExC_override_recoding (pRExC_state->override_recoding)
198
199
200#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
201#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
202 ((*s) == '{' && regcurly(s)))
203
204#ifdef SPSTART
205#undef SPSTART /* dratted cpp namespace... */
206#endif
207/*
208 * Flags to be passed up and down.
209 */
210#define WORST 0 /* Worst case. */
211#define HASWIDTH 0x01 /* Known to match non-null strings. */
212
213/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
214 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
215#define SIMPLE 0x02
216#define SPSTART 0x04 /* Starts with * or +. */
217#define TRYAGAIN 0x08 /* Weeded out a declaration. */
218#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
219
220#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
221
222/* whether trie related optimizations are enabled */
223#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
224#define TRIE_STUDY_OPT
225#define FULL_TRIE_STUDY
226#define TRIE_STCLASS
227#endif
228
229
230
231#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
232#define PBITVAL(paren) (1 << ((paren) & 7))
233#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
234#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
235#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
236
237/* If not already in utf8, do a longjmp back to the beginning */
238#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
239#define REQUIRE_UTF8 STMT_START { \
240 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
241 } STMT_END
242
243/* About scan_data_t.
244
245 During optimisation we recurse through the regexp program performing
246 various inplace (keyhole style) optimisations. In addition study_chunk
247 and scan_commit populate this data structure with information about
248 what strings MUST appear in the pattern. We look for the longest
249 string that must appear at a fixed location, and we look for the
250 longest string that may appear at a floating location. So for instance
251 in the pattern:
252
253 /FOO[xX]A.*B[xX]BAR/
254
255 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
256 strings (because they follow a .* construct). study_chunk will identify
257 both FOO and BAR as being the longest fixed and floating strings respectively.
258
259 The strings can be composites, for instance
260
261 /(f)(o)(o)/
262
263 will result in a composite fixed substring 'foo'.
264
265 For each string some basic information is maintained:
266
267 - offset or min_offset
268 This is the position the string must appear at, or not before.
269 It also implicitly (when combined with minlenp) tells us how many
270 characters must match before the string we are searching for.
271 Likewise when combined with minlenp and the length of the string it
272 tells us how many characters must appear after the string we have
273 found.
274
275 - max_offset
276 Only used for floating strings. This is the rightmost point that
277 the string can appear at. If set to I32 max it indicates that the
278 string can occur infinitely far to the right.
279
280 - minlenp
281 A pointer to the minimum length of the pattern that the string
282 was found inside. This is important as in the case of positive
283 lookahead or positive lookbehind we can have multiple patterns
284 involved. Consider
285
286 /(?=FOO).*F/
287
288 The minimum length of the pattern overall is 3, the minimum length
289 of the lookahead part is 3, but the minimum length of the part that
290 will actually match is 1. So 'FOO's minimum length is 3, but the
291 minimum length for the F is 1. This is important as the minimum length
292 is used to determine offsets in front of and behind the string being
293 looked for. Since strings can be composites this is the length of the
294 pattern at the time it was committed with a scan_commit. Note that
295 the length is calculated by study_chunk, so that the minimum lengths
296 are not known until the full pattern has been compiled, thus the
297 pointer to the value.
298
299 - lookbehind
300
301 In the case of lookbehind the string being searched for can be
302 offset past the start point of the final matching string.
303 If this value was just blithely removed from the min_offset it would
304 invalidate some of the calculations for how many chars must match
305 before or after (as they are derived from min_offset and minlen and
306 the length of the string being searched for).
307 When the final pattern is compiled and the data is moved from the
308 scan_data_t structure into the regexp structure the information
309 about lookbehind is factored in, with the information that would
310 have been lost precalculated in the end_shift field for the
311 associated string.
312
313 The fields pos_min and pos_delta are used to store the minimum offset
314 and the delta to the maximum offset at the current point in the pattern.
315
316*/
317
318typedef struct scan_data_t {
319 /*I32 len_min; unused */
320 /*I32 len_delta; unused */
321 I32 pos_min;
322 I32 pos_delta;
323 SV *last_found;
324 I32 last_end; /* min value, <0 unless valid. */
325 I32 last_start_min;
326 I32 last_start_max;
327 SV **longest; /* Either &l_fixed, or &l_float. */
328 SV *longest_fixed; /* longest fixed string found in pattern */
329 I32 offset_fixed; /* offset where it starts */
330 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
331 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
332 SV *longest_float; /* longest floating string found in pattern */
333 I32 offset_float_min; /* earliest point in string it can appear */
334 I32 offset_float_max; /* latest point in string it can appear */
335 I32 *minlen_float; /* pointer to the minlen relevant to the string */
336 I32 lookbehind_float; /* is the position of the string modified by LB */
337 I32 flags;
338 I32 whilem_c;
339 I32 *last_closep;
340 struct regnode_charclass_class *start_class;
341} scan_data_t;
342
343/*
344 * Forward declarations for pregcomp()'s friends.
345 */
346
347static const scan_data_t zero_scan_data =
348 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
349
350#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
351#define SF_BEFORE_SEOL 0x0001
352#define SF_BEFORE_MEOL 0x0002
353#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
354#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
355
356#ifdef NO_UNARY_PLUS
357# define SF_FIX_SHIFT_EOL (0+2)
358# define SF_FL_SHIFT_EOL (0+4)
359#else
360# define SF_FIX_SHIFT_EOL (+2)
361# define SF_FL_SHIFT_EOL (+4)
362#endif
363
364#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
365#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
366
367#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
368#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
369#define SF_IS_INF 0x0040
370#define SF_HAS_PAR 0x0080
371#define SF_IN_PAR 0x0100
372#define SF_HAS_EVAL 0x0200
373#define SCF_DO_SUBSTR 0x0400
374#define SCF_DO_STCLASS_AND 0x0800
375#define SCF_DO_STCLASS_OR 0x1000
376#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
377#define SCF_WHILEM_VISITED_POS 0x2000
378
379#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
380#define SCF_SEEN_ACCEPT 0x8000
381
382#define UTF cBOOL(RExC_utf8)
383#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
384#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
385#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
386#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
387#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
388#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
389#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
390
391#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
392
393#define OOB_UNICODE 12345678
394#define OOB_NAMEDCLASS -1
395
396#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
397#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
398
399
400/* length of regex to show in messages that don't mark a position within */
401#define RegexLengthToShowInErrorMessages 127
402
403/*
404 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
405 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
406 * op/pragma/warn/regcomp.
407 */
408#define MARKER1 "<-- HERE" /* marker as it appears in the description */
409#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
410
411#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
415 * arg. Show regex, up to a maximum length. If it's too long, chop and add
416 * "...".
417 */
418#define _FAIL(code) STMT_START { \
419 const char *ellipses = ""; \
420 IV len = RExC_end - RExC_precomp; \
421 \
422 if (!SIZE_ONLY) \
423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
424 if (len > RegexLengthToShowInErrorMessages) { \
425 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
426 len = RegexLengthToShowInErrorMessages - 10; \
427 ellipses = "..."; \
428 } \
429 code; \
430} STMT_END
431
432#define FAIL(msg) _FAIL( \
433 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
434 msg, (int)len, RExC_precomp, ellipses))
435
436#define FAIL2(msg,arg) _FAIL( \
437 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
438 arg, (int)len, RExC_precomp, ellipses))
439
440/*
441 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
442 */
443#define Simple_vFAIL(m) STMT_START { \
444 const IV offset = RExC_parse - RExC_precomp; \
445 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
446 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
447} STMT_END
448
449/*
450 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
451 */
452#define vFAIL(m) STMT_START { \
453 if (!SIZE_ONLY) \
454 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
455 Simple_vFAIL(m); \
456} STMT_END
457
458/*
459 * Like Simple_vFAIL(), but accepts two arguments.
460 */
461#define Simple_vFAIL2(m,a1) STMT_START { \
462 const IV offset = RExC_parse - RExC_precomp; \
463 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
464 (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
466
467/*
468 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
469 */
470#define vFAIL2(m,a1) STMT_START { \
471 if (!SIZE_ONLY) \
472 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
473 Simple_vFAIL2(m, a1); \
474} STMT_END
475
476
477/*
478 * Like Simple_vFAIL(), but accepts three arguments.
479 */
480#define Simple_vFAIL3(m, a1, a2) STMT_START { \
481 const IV offset = RExC_parse - RExC_precomp; \
482 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
483 (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486/*
487 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
488 */
489#define vFAIL3(m,a1,a2) STMT_START { \
490 if (!SIZE_ONLY) \
491 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
492 Simple_vFAIL3(m, a1, a2); \
493} STMT_END
494
495/*
496 * Like Simple_vFAIL(), but accepts four arguments.
497 */
498#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
499 const IV offset = RExC_parse - RExC_precomp; \
500 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
501 (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
503
504#define ckWARNreg(loc,m) STMT_START { \
505 const IV offset = loc - RExC_precomp; \
506 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
507 (int)offset, RExC_precomp, RExC_precomp + offset); \
508} STMT_END
509
510#define ckWARNregdep(loc,m) STMT_START { \
511 const IV offset = loc - RExC_precomp; \
512 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
513 m REPORT_LOCATION, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
515} STMT_END
516
517#define ckWARN2regdep(loc,m, a1) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
520 m REPORT_LOCATION, \
521 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
524#define ckWARN2reg(loc, m, a1) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
530#define vWARN3(loc, m, a1, a2) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
535
536#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
537 const IV offset = loc - RExC_precomp; \
538 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
539 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
540} STMT_END
541
542#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
543 const IV offset = loc - RExC_precomp; \
544 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
545 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
546} STMT_END
547
548#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
549 const IV offset = loc - RExC_precomp; \
550 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
551 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
552} STMT_END
553
554#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
555 const IV offset = loc - RExC_precomp; \
556 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
557 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
558} STMT_END
559
560
561/* Allow for side effects in s */
562#define REGC(c,s) STMT_START { \
563 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
564} STMT_END
565
566/* Macros for recording node offsets. 20001227 mjd@plover.com
567 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
568 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
569 * Element 0 holds the number n.
570 * Position is 1 indexed.
571 */
572#ifndef RE_TRACK_PATTERN_OFFSETS
573#define Set_Node_Offset_To_R(node,byte)
574#define Set_Node_Offset(node,byte)
575#define Set_Cur_Node_Offset
576#define Set_Node_Length_To_R(node,len)
577#define Set_Node_Length(node,len)
578#define Set_Node_Cur_Length(node)
579#define Node_Offset(n)
580#define Node_Length(n)
581#define Set_Node_Offset_Length(node,offset,len)
582#define ProgLen(ri) ri->u.proglen
583#define SetProgLen(ri,x) ri->u.proglen = x
584#else
585#define ProgLen(ri) ri->u.offsets[0]
586#define SetProgLen(ri,x) ri->u.offsets[0] = x
587#define Set_Node_Offset_To_R(node,byte) STMT_START { \
588 if (! SIZE_ONLY) { \
589 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
590 __LINE__, (int)(node), (int)(byte))); \
591 if((node) < 0) { \
592 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
593 } else { \
594 RExC_offsets[2*(node)-1] = (byte); \
595 } \
596 } \
597} STMT_END
598
599#define Set_Node_Offset(node,byte) \
600 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
601#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
602
603#define Set_Node_Length_To_R(node,len) STMT_START { \
604 if (! SIZE_ONLY) { \
605 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
606 __LINE__, (int)(node), (int)(len))); \
607 if((node) < 0) { \
608 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
609 } else { \
610 RExC_offsets[2*(node)] = (len); \
611 } \
612 } \
613} STMT_END
614
615#define Set_Node_Length(node,len) \
616 Set_Node_Length_To_R((node)-RExC_emit_start, len)
617#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
618#define Set_Node_Cur_Length(node) \
619 Set_Node_Length(node, RExC_parse - parse_start)
620
621/* Get offsets and lengths */
622#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
623#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
624
625#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
626 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
627 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
628} STMT_END
629#endif
630
631#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
632#define EXPERIMENTAL_INPLACESCAN
633#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
634
635#define DEBUG_STUDYDATA(str,data,depth) \
636DEBUG_OPTIMISE_MORE_r(if(data){ \
637 PerlIO_printf(Perl_debug_log, \
638 "%*s" str "Pos:%"IVdf"/%"IVdf \
639 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
640 (int)(depth)*2, "", \
641 (IV)((data)->pos_min), \
642 (IV)((data)->pos_delta), \
643 (UV)((data)->flags), \
644 (IV)((data)->whilem_c), \
645 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
646 is_inf ? "INF " : "" \
647 ); \
648 if ((data)->last_found) \
649 PerlIO_printf(Perl_debug_log, \
650 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
651 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
652 SvPVX_const((data)->last_found), \
653 (IV)((data)->last_end), \
654 (IV)((data)->last_start_min), \
655 (IV)((data)->last_start_max), \
656 ((data)->longest && \
657 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
658 SvPVX_const((data)->longest_fixed), \
659 (IV)((data)->offset_fixed), \
660 ((data)->longest && \
661 (data)->longest==&((data)->longest_float)) ? "*" : "", \
662 SvPVX_const((data)->longest_float), \
663 (IV)((data)->offset_float_min), \
664 (IV)((data)->offset_float_max) \
665 ); \
666 PerlIO_printf(Perl_debug_log,"\n"); \
667});
668
669static void clear_re(pTHX_ void *r);
670
671/* Mark that we cannot extend a found fixed substring at this point.
672 Update the longest found anchored substring and the longest found
673 floating substrings if needed. */
674
675STATIC void
676S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
677{
678 const STRLEN l = CHR_SVLEN(data->last_found);
679 const STRLEN old_l = CHR_SVLEN(*data->longest);
680 GET_RE_DEBUG_FLAGS_DECL;
681
682 PERL_ARGS_ASSERT_SCAN_COMMIT;
683
684 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
685 SvSetMagicSV(*data->longest, data->last_found);
686 if (*data->longest == data->longest_fixed) {
687 data->offset_fixed = l ? data->last_start_min : data->pos_min;
688 if (data->flags & SF_BEFORE_EOL)
689 data->flags
690 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
691 else
692 data->flags &= ~SF_FIX_BEFORE_EOL;
693 data->minlen_fixed=minlenp;
694 data->lookbehind_fixed=0;
695 }
696 else { /* *data->longest == data->longest_float */
697 data->offset_float_min = l ? data->last_start_min : data->pos_min;
698 data->offset_float_max = (l
699 ? data->last_start_max
700 : data->pos_min + data->pos_delta);
701 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
702 data->offset_float_max = I32_MAX;
703 if (data->flags & SF_BEFORE_EOL)
704 data->flags
705 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
706 else
707 data->flags &= ~SF_FL_BEFORE_EOL;
708 data->minlen_float=minlenp;
709 data->lookbehind_float=0;
710 }
711 }
712 SvCUR_set(data->last_found, 0);
713 {
714 SV * const sv = data->last_found;
715 if (SvUTF8(sv) && SvMAGICAL(sv)) {
716 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
717 if (mg)
718 mg->mg_len = 0;
719 }
720 }
721 data->last_end = -1;
722 data->flags &= ~SF_BEFORE_EOL;
723 DEBUG_STUDYDATA("commit: ",data,0);
724}
725
726/* Can match anything (initialization) */
727STATIC void
728S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
729{
730 PERL_ARGS_ASSERT_CL_ANYTHING;
731
732 ANYOF_BITMAP_SETALL(cl);
733 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
734 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
735
736 /* If any portion of the regex is to operate under locale rules,
737 * initialization includes it. The reason this isn't done for all regexes
738 * is that the optimizer was written under the assumption that locale was
739 * all-or-nothing. Given the complexity and lack of documentation in the
740 * optimizer, and that there are inadequate test cases for locale, so many
741 * parts of it may not work properly, it is safest to avoid locale unless
742 * necessary. */
743 if (RExC_contains_locale) {
744 ANYOF_CLASS_SETALL(cl); /* /l uses class */
745 cl->flags |= ANYOF_LOCALE;
746 }
747 else {
748 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
749 }
750}
751
752/* Can match anything (initialization) */
753STATIC int
754S_cl_is_anything(const struct regnode_charclass_class *cl)
755{
756 int value;
757
758 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
759
760 for (value = 0; value <= ANYOF_MAX; value += 2)
761 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
762 return 1;
763 if (!(cl->flags & ANYOF_UNICODE_ALL))
764 return 0;
765 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
766 return 0;
767 return 1;
768}
769
770/* Can match anything (initialization) */
771STATIC void
772S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
773{
774 PERL_ARGS_ASSERT_CL_INIT;
775
776 Zero(cl, 1, struct regnode_charclass_class);
777 cl->type = ANYOF;
778 cl_anything(pRExC_state, cl);
779 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
780}
781
782/* These two functions currently do the exact same thing */
783#define cl_init_zero S_cl_init
784
785/* 'AND' a given class with another one. Can create false positives. 'cl'
786 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
787 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
788STATIC void
789S_cl_and(struct regnode_charclass_class *cl,
790 const struct regnode_charclass_class *and_with)
791{
792 PERL_ARGS_ASSERT_CL_AND;
793
794 assert(and_with->type == ANYOF);
795
796 /* I (khw) am not sure all these restrictions are necessary XXX */
797 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
798 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
799 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
800 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
801 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
802 int i;
803
804 if (and_with->flags & ANYOF_INVERT)
805 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
806 cl->bitmap[i] &= ~and_with->bitmap[i];
807 else
808 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
809 cl->bitmap[i] &= and_with->bitmap[i];
810 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
811
812 if (and_with->flags & ANYOF_INVERT) {
813
814 /* Here, the and'ed node is inverted. Get the AND of the flags that
815 * aren't affected by the inversion. Those that are affected are
816 * handled individually below */
817 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
818 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
819 cl->flags |= affected_flags;
820
821 /* We currently don't know how to deal with things that aren't in the
822 * bitmap, but we know that the intersection is no greater than what
823 * is already in cl, so let there be false positives that get sorted
824 * out after the synthetic start class succeeds, and the node is
825 * matched for real. */
826
827 /* The inversion of these two flags indicate that the resulting
828 * intersection doesn't have them */
829 if (and_with->flags & ANYOF_UNICODE_ALL) {
830 cl->flags &= ~ANYOF_UNICODE_ALL;
831 }
832 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
833 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
834 }
835 }
836 else { /* and'd node is not inverted */
837 U8 outside_bitmap_but_not_utf8; /* Temp variable */
838
839 if (! ANYOF_NONBITMAP(and_with)) {
840
841 /* Here 'and_with' doesn't match anything outside the bitmap
842 * (except possibly ANYOF_UNICODE_ALL), which means the
843 * intersection can't either, except for ANYOF_UNICODE_ALL, in
844 * which case we don't know what the intersection is, but it's no
845 * greater than what cl already has, so can just leave it alone,
846 * with possible false positives */
847 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
848 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
849 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
850 }
851 }
852 else if (! ANYOF_NONBITMAP(cl)) {
853
854 /* Here, 'and_with' does match something outside the bitmap, and cl
855 * doesn't have a list of things to match outside the bitmap. If
856 * cl can match all code points above 255, the intersection will
857 * be those above-255 code points that 'and_with' matches. If cl
858 * can't match all Unicode code points, it means that it can't
859 * match anything outside the bitmap (since the 'if' that got us
860 * into this block tested for that), so we leave the bitmap empty.
861 */
862 if (cl->flags & ANYOF_UNICODE_ALL) {
863 ARG_SET(cl, ARG(and_with));
864
865 /* and_with's ARG may match things that don't require UTF8.
866 * And now cl's will too, in spite of this being an 'and'. See
867 * the comments below about the kludge */
868 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
869 }
870 }
871 else {
872 /* Here, both 'and_with' and cl match something outside the
873 * bitmap. Currently we do not do the intersection, so just match
874 * whatever cl had at the beginning. */
875 }
876
877
878 /* Take the intersection of the two sets of flags. However, the
879 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
880 * kludge around the fact that this flag is not treated like the others
881 * which are initialized in cl_anything(). The way the optimizer works
882 * is that the synthetic start class (SSC) is initialized to match
883 * anything, and then the first time a real node is encountered, its
884 * values are AND'd with the SSC's with the result being the values of
885 * the real node. However, there are paths through the optimizer where
886 * the AND never gets called, so those initialized bits are set
887 * inappropriately, which is not usually a big deal, as they just cause
888 * false positives in the SSC, which will just mean a probably
889 * imperceptible slow down in execution. However this bit has a
890 * higher false positive consequence in that it can cause utf8.pm,
891 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
892 * bigger slowdown and also causes significant extra memory to be used.
893 * In order to prevent this, the code now takes a different tack. The
894 * bit isn't set unless some part of the regular expression needs it,
895 * but once set it won't get cleared. This means that these extra
896 * modules won't get loaded unless there was some path through the
897 * pattern that would have required them anyway, and so any false
898 * positives that occur by not ANDing them out when they could be
899 * aren't as severe as they would be if we treated this bit like all
900 * the others */
901 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
902 & ANYOF_NONBITMAP_NON_UTF8;
903 cl->flags &= and_with->flags;
904 cl->flags |= outside_bitmap_but_not_utf8;
905 }
906}
907
908/* 'OR' a given class with another one. Can create false positives. 'cl'
909 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
910 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
911STATIC void
912S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
913{
914 PERL_ARGS_ASSERT_CL_OR;
915
916 if (or_with->flags & ANYOF_INVERT) {
917
918 /* Here, the or'd node is to be inverted. This means we take the
919 * complement of everything not in the bitmap, but currently we don't
920 * know what that is, so give up and match anything */
921 if (ANYOF_NONBITMAP(or_with)) {
922 cl_anything(pRExC_state, cl);
923 }
924 /* We do not use
925 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
926 * <= (B1 | !B2) | (CL1 | !CL2)
927 * which is wasteful if CL2 is small, but we ignore CL2:
928 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
929 * XXXX Can we handle case-fold? Unclear:
930 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
931 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
932 */
933 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
934 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
935 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
936 int i;
937
938 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
939 cl->bitmap[i] |= ~or_with->bitmap[i];
940 } /* XXXX: logic is complicated otherwise */
941 else {
942 cl_anything(pRExC_state, cl);
943 }
944
945 /* And, we can just take the union of the flags that aren't affected
946 * by the inversion */
947 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
948
949 /* For the remaining flags:
950 ANYOF_UNICODE_ALL and inverted means to not match anything above
951 255, which means that the union with cl should just be
952 what cl has in it, so can ignore this flag
953 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
954 is 127-255 to match them, but then invert that, so the
955 union with cl should just be what cl has in it, so can
956 ignore this flag
957 */
958 } else { /* 'or_with' is not inverted */
959 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
960 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
961 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
962 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
963 int i;
964
965 /* OR char bitmap and class bitmap separately */
966 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
967 cl->bitmap[i] |= or_with->bitmap[i];
968 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
969 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
970 cl->classflags[i] |= or_with->classflags[i];
971 cl->flags |= ANYOF_CLASS;
972 }
973 }
974 else { /* XXXX: logic is complicated, leave it along for a moment. */
975 cl_anything(pRExC_state, cl);
976 }
977
978 if (ANYOF_NONBITMAP(or_with)) {
979
980 /* Use the added node's outside-the-bit-map match if there isn't a
981 * conflict. If there is a conflict (both nodes match something
982 * outside the bitmap, but what they match outside is not the same
983 * pointer, and hence not easily compared until XXX we extend
984 * inversion lists this far), give up and allow the start class to
985 * match everything outside the bitmap. If that stuff is all above
986 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
987 if (! ANYOF_NONBITMAP(cl)) {
988 ARG_SET(cl, ARG(or_with));
989 }
990 else if (ARG(cl) != ARG(or_with)) {
991
992 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
993 cl_anything(pRExC_state, cl);
994 }
995 else {
996 cl->flags |= ANYOF_UNICODE_ALL;
997 }
998 }
999 }
1000
1001 /* Take the union */
1002 cl->flags |= or_with->flags;
1003 }
1004}
1005
1006#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1007#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1008#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1009#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1010
1011
1012#ifdef DEBUGGING
1013/*
1014 dump_trie(trie,widecharmap,revcharmap)
1015 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1016 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1017
1018 These routines dump out a trie in a somewhat readable format.
1019 The _interim_ variants are used for debugging the interim
1020 tables that are used to generate the final compressed
1021 representation which is what dump_trie expects.
1022
1023 Part of the reason for their existence is to provide a form
1024 of documentation as to how the different representations function.
1025
1026*/
1027
1028/*
1029 Dumps the final compressed table form of the trie to Perl_debug_log.
1030 Used for debugging make_trie().
1031*/
1032
1033STATIC void
1034S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1035 AV *revcharmap, U32 depth)
1036{
1037 U32 state;
1038 SV *sv=sv_newmortal();
1039 int colwidth= widecharmap ? 6 : 4;
1040 U16 word;
1041 GET_RE_DEBUG_FLAGS_DECL;
1042
1043 PERL_ARGS_ASSERT_DUMP_TRIE;
1044
1045 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1046 (int)depth * 2 + 2,"",
1047 "Match","Base","Ofs" );
1048
1049 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1050 SV ** const tmp = av_fetch( revcharmap, state, 0);
1051 if ( tmp ) {
1052 PerlIO_printf( Perl_debug_log, "%*s",
1053 colwidth,
1054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1055 PL_colors[0], PL_colors[1],
1056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1057 PERL_PV_ESCAPE_FIRSTCHAR
1058 )
1059 );
1060 }
1061 }
1062 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1063 (int)depth * 2 + 2,"");
1064
1065 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1066 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1067 PerlIO_printf( Perl_debug_log, "\n");
1068
1069 for( state = 1 ; state < trie->statecount ; state++ ) {
1070 const U32 base = trie->states[ state ].trans.base;
1071
1072 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1073
1074 if ( trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1076 } else {
1077 PerlIO_printf( Perl_debug_log, "%6s", "" );
1078 }
1079
1080 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1081
1082 if ( base ) {
1083 U32 ofs = 0;
1084
1085 while( ( base + ofs < trie->uniquecharcount ) ||
1086 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1087 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1088 ofs++;
1089
1090 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1091
1092 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1093 if ( ( base + ofs >= trie->uniquecharcount ) &&
1094 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1095 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1096 {
1097 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1098 colwidth,
1099 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1100 } else {
1101 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1102 }
1103 }
1104
1105 PerlIO_printf( Perl_debug_log, "]");
1106
1107 }
1108 PerlIO_printf( Perl_debug_log, "\n" );
1109 }
1110 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1111 for (word=1; word <= trie->wordcount; word++) {
1112 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1113 (int)word, (int)(trie->wordinfo[word].prev),
1114 (int)(trie->wordinfo[word].len));
1115 }
1116 PerlIO_printf(Perl_debug_log, "\n" );
1117}
1118/*
1119 Dumps a fully constructed but uncompressed trie in list form.
1120 List tries normally only are used for construction when the number of
1121 possible chars (trie->uniquecharcount) is very high.
1122 Used for debugging make_trie().
1123*/
1124STATIC void
1125S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1126 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1127 U32 depth)
1128{
1129 U32 state;
1130 SV *sv=sv_newmortal();
1131 int colwidth= widecharmap ? 6 : 4;
1132 GET_RE_DEBUG_FLAGS_DECL;
1133
1134 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1135
1136 /* print out the table precompression. */
1137 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1138 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1139 "------:-----+-----------------\n" );
1140
1141 for( state=1 ; state < next_alloc ; state ++ ) {
1142 U16 charid;
1143
1144 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1145 (int)depth * 2 + 2,"", (UV)state );
1146 if ( ! trie->states[ state ].wordnum ) {
1147 PerlIO_printf( Perl_debug_log, "%5s| ","");
1148 } else {
1149 PerlIO_printf( Perl_debug_log, "W%4x| ",
1150 trie->states[ state ].wordnum
1151 );
1152 }
1153 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1154 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1155 if ( tmp ) {
1156 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1157 colwidth,
1158 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1159 PL_colors[0], PL_colors[1],
1160 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1161 PERL_PV_ESCAPE_FIRSTCHAR
1162 ) ,
1163 TRIE_LIST_ITEM(state,charid).forid,
1164 (UV)TRIE_LIST_ITEM(state,charid).newstate
1165 );
1166 if (!(charid % 10))
1167 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1168 (int)((depth * 2) + 14), "");
1169 }
1170 }
1171 PerlIO_printf( Perl_debug_log, "\n");
1172 }
1173}
1174
1175/*
1176 Dumps a fully constructed but uncompressed trie in table form.
1177 This is the normal DFA style state transition table, with a few
1178 twists to facilitate compression later.
1179 Used for debugging make_trie().
1180*/
1181STATIC void
1182S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1183 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1184 U32 depth)
1185{
1186 U32 state;
1187 U16 charid;
1188 SV *sv=sv_newmortal();
1189 int colwidth= widecharmap ? 6 : 4;
1190 GET_RE_DEBUG_FLAGS_DECL;
1191
1192 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1193
1194 /*
1195 print out the table precompression so that we can do a visual check
1196 that they are identical.
1197 */
1198
1199 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1200
1201 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1202 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1203 if ( tmp ) {
1204 PerlIO_printf( Perl_debug_log, "%*s",
1205 colwidth,
1206 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1207 PL_colors[0], PL_colors[1],
1208 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1209 PERL_PV_ESCAPE_FIRSTCHAR
1210 )
1211 );
1212 }
1213 }
1214
1215 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1216
1217 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1218 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1219 }
1220
1221 PerlIO_printf( Perl_debug_log, "\n" );
1222
1223 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1224
1225 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1226 (int)depth * 2 + 2,"",
1227 (UV)TRIE_NODENUM( state ) );
1228
1229 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1230 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1231 if (v)
1232 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1233 else
1234 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1235 }
1236 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1237 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1238 } else {
1239 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1240 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1241 }
1242 }
1243}
1244
1245#endif
1246
1247
1248/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1249 startbranch: the first branch in the whole branch sequence
1250 first : start branch of sequence of branch-exact nodes.
1251 May be the same as startbranch
1252 last : Thing following the last branch.
1253 May be the same as tail.
1254 tail : item following the branch sequence
1255 count : words in the sequence
1256 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1257 depth : indent depth
1258
1259Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1260
1261A trie is an N'ary tree where the branches are determined by digital
1262decomposition of the key. IE, at the root node you look up the 1st character and
1263follow that branch repeat until you find the end of the branches. Nodes can be
1264marked as "accepting" meaning they represent a complete word. Eg:
1265
1266 /he|she|his|hers/
1267
1268would convert into the following structure. Numbers represent states, letters
1269following numbers represent valid transitions on the letter from that state, if
1270the number is in square brackets it represents an accepting state, otherwise it
1271will be in parenthesis.
1272
1273 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1274 | |
1275 | (2)
1276 | |
1277 (1) +-i->(6)-+-s->[7]
1278 |
1279 +-s->(3)-+-h->(4)-+-e->[5]
1280
1281 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1282
1283This shows that when matching against the string 'hers' we will begin at state 1
1284read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1285then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1286is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1287single traverse. We store a mapping from accepting to state to which word was
1288matched, and then when we have multiple possibilities we try to complete the
1289rest of the regex in the order in which they occured in the alternation.
1290
1291The only prior NFA like behaviour that would be changed by the TRIE support is
1292the silent ignoring of duplicate alternations which are of the form:
1293
1294 / (DUPE|DUPE) X? (?{ ... }) Y /x
1295
1296Thus EVAL blocks following a trie may be called a different number of times with
1297and without the optimisation. With the optimisations dupes will be silently
1298ignored. This inconsistent behaviour of EVAL type nodes is well established as
1299the following demonstrates:
1300
1301 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1302
1303which prints out 'word' three times, but
1304
1305 'words'=~/(word|word|word)(?{ print $1 })S/
1306
1307which doesnt print it out at all. This is due to other optimisations kicking in.
1308
1309Example of what happens on a structural level:
1310
1311The regexp /(ac|ad|ab)+/ will produce the following debug output:
1312
1313 1: CURLYM[1] {1,32767}(18)
1314 5: BRANCH(8)
1315 6: EXACT <ac>(16)
1316 8: BRANCH(11)
1317 9: EXACT <ad>(16)
1318 11: BRANCH(14)
1319 12: EXACT <ab>(16)
1320 16: SUCCEED(0)
1321 17: NOTHING(18)
1322 18: END(0)
1323
1324This would be optimizable with startbranch=5, first=5, last=16, tail=16
1325and should turn into:
1326
1327 1: CURLYM[1] {1,32767}(18)
1328 5: TRIE(16)
1329 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1330 <ac>
1331 <ad>
1332 <ab>
1333 16: SUCCEED(0)
1334 17: NOTHING(18)
1335 18: END(0)
1336
1337Cases where tail != last would be like /(?foo|bar)baz/:
1338
1339 1: BRANCH(4)
1340 2: EXACT <foo>(8)
1341 4: BRANCH(7)
1342 5: EXACT <bar>(8)
1343 7: TAIL(8)
1344 8: EXACT <baz>(10)
1345 10: END(0)
1346
1347which would be optimizable with startbranch=1, first=1, last=7, tail=8
1348and would end up looking like:
1349
1350 1: TRIE(8)
1351 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1352 <foo>
1353 <bar>
1354 7: TAIL(8)
1355 8: EXACT <baz>(10)
1356 10: END(0)
1357
1358 d = uvuni_to_utf8_flags(d, uv, 0);
1359
1360is the recommended Unicode-aware way of saying
1361
1362 *(d++) = uv;
1363*/
1364
1365#define TRIE_STORE_REVCHAR \
1366 STMT_START { \
1367 if (UTF) { \
1368 SV *zlopp = newSV(2); \
1369 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1370 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1371 SvCUR_set(zlopp, kapow - flrbbbbb); \
1372 SvPOK_on(zlopp); \
1373 SvUTF8_on(zlopp); \
1374 av_push(revcharmap, zlopp); \
1375 } else { \
1376 char ooooff = (char)uvc; \
1377 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1378 } \
1379 } STMT_END
1380
1381#define TRIE_READ_CHAR STMT_START { \
1382 wordlen++; \
1383 if ( UTF ) { \
1384 if ( folder ) { \
1385 if ( foldlen > 0 ) { \
1386 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1387 foldlen -= len; \
1388 scan += len; \
1389 len = 0; \
1390 } else { \
1391 len = UTF8SKIP(uc);\
1392 uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
1393 foldlen -= UNISKIP( uvc ); \
1394 scan = foldbuf + UNISKIP( uvc ); \
1395 } \
1396 } else { \
1397 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1398 } \
1399 } else { \
1400 uvc = (U32)*uc; \
1401 len = 1; \
1402 } \
1403} STMT_END
1404
1405
1406
1407#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1408 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1409 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1410 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1411 } \
1412 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1413 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1414 TRIE_LIST_CUR( state )++; \
1415} STMT_END
1416
1417#define TRIE_LIST_NEW(state) STMT_START { \
1418 Newxz( trie->states[ state ].trans.list, \
1419 4, reg_trie_trans_le ); \
1420 TRIE_LIST_CUR( state ) = 1; \
1421 TRIE_LIST_LEN( state ) = 4; \
1422} STMT_END
1423
1424#define TRIE_HANDLE_WORD(state) STMT_START { \
1425 U16 dupe= trie->states[ state ].wordnum; \
1426 regnode * const noper_next = regnext( noper ); \
1427 \
1428 DEBUG_r({ \
1429 /* store the word for dumping */ \
1430 SV* tmp; \
1431 if (OP(noper) != NOTHING) \
1432 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1433 else \
1434 tmp = newSVpvn_utf8( "", 0, UTF ); \
1435 av_push( trie_words, tmp ); \
1436 }); \
1437 \
1438 curword++; \
1439 trie->wordinfo[curword].prev = 0; \
1440 trie->wordinfo[curword].len = wordlen; \
1441 trie->wordinfo[curword].accept = state; \
1442 \
1443 if ( noper_next < tail ) { \
1444 if (!trie->jump) \
1445 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1446 trie->jump[curword] = (U16)(noper_next - convert); \
1447 if (!jumper) \
1448 jumper = noper_next; \
1449 if (!nextbranch) \
1450 nextbranch= regnext(cur); \
1451 } \
1452 \
1453 if ( dupe ) { \
1454 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1455 /* chain, so that when the bits of chain are later */\
1456 /* linked together, the dups appear in the chain */\
1457 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1458 trie->wordinfo[dupe].prev = curword; \
1459 } else { \
1460 /* we haven't inserted this word yet. */ \
1461 trie->states[ state ].wordnum = curword; \
1462 } \
1463} STMT_END
1464
1465
1466#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1467 ( ( base + charid >= ucharcount \
1468 && base + charid < ubound \
1469 && state == trie->trans[ base - ucharcount + charid ].check \
1470 && trie->trans[ base - ucharcount + charid ].next ) \
1471 ? trie->trans[ base - ucharcount + charid ].next \
1472 : ( state==1 ? special : 0 ) \
1473 )
1474
1475#define MADE_TRIE 1
1476#define MADE_JUMP_TRIE 2
1477#define MADE_EXACT_TRIE 4
1478
1479STATIC I32
1480S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1481{
1482 dVAR;
1483 /* first pass, loop through and scan words */
1484 reg_trie_data *trie;
1485 HV *widecharmap = NULL;
1486 AV *revcharmap = newAV();
1487 regnode *cur;
1488 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1489 STRLEN len = 0;
1490 UV uvc = 0;
1491 U16 curword = 0;
1492 U32 next_alloc = 0;
1493 regnode *jumper = NULL;
1494 regnode *nextbranch = NULL;
1495 regnode *convert = NULL;
1496 U32 *prev_states; /* temp array mapping each state to previous one */
1497 /* we just use folder as a flag in utf8 */
1498 const U8 * folder = NULL;
1499
1500#ifdef DEBUGGING
1501 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1502 AV *trie_words = NULL;
1503 /* along with revcharmap, this only used during construction but both are
1504 * useful during debugging so we store them in the struct when debugging.
1505 */
1506#else
1507 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1508 STRLEN trie_charcount=0;
1509#endif
1510 SV *re_trie_maxbuff;
1511 GET_RE_DEBUG_FLAGS_DECL;
1512
1513 PERL_ARGS_ASSERT_MAKE_TRIE;
1514#ifndef DEBUGGING
1515 PERL_UNUSED_ARG(depth);
1516#endif
1517
1518 switch (flags) {
1519 case EXACT: break;
1520 case EXACTFA:
1521 case EXACTFU: folder = PL_fold_latin1; break;
1522 case EXACTF: folder = PL_fold; break;
1523 case EXACTFL: folder = PL_fold_locale; break;
1524 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1525 }
1526
1527 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1528 trie->refcount = 1;
1529 trie->startstate = 1;
1530 trie->wordcount = word_count;
1531 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1532 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1533 if (!(UTF && folder))
1534 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1535 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1536 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1537
1538 DEBUG_r({
1539 trie_words = newAV();
1540 });
1541
1542 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1543 if (!SvIOK(re_trie_maxbuff)) {
1544 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1545 }
1546 DEBUG_OPTIMISE_r({
1547 PerlIO_printf( Perl_debug_log,
1548 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1549 (int)depth * 2 + 2, "",
1550 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1551 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1552 (int)depth);
1553 });
1554
1555 /* Find the node we are going to overwrite */
1556 if ( first == startbranch && OP( last ) != BRANCH ) {
1557 /* whole branch chain */
1558 convert = first;
1559 } else {
1560 /* branch sub-chain */
1561 convert = NEXTOPER( first );
1562 }
1563
1564 /* -- First loop and Setup --
1565
1566 We first traverse the branches and scan each word to determine if it
1567 contains widechars, and how many unique chars there are, this is
1568 important as we have to build a table with at least as many columns as we
1569 have unique chars.
1570
1571 We use an array of integers to represent the character codes 0..255
1572 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1573 native representation of the character value as the key and IV's for the
1574 coded index.
1575
1576 *TODO* If we keep track of how many times each character is used we can
1577 remap the columns so that the table compression later on is more
1578 efficient in terms of memory by ensuring the most common value is in the
1579 middle and the least common are on the outside. IMO this would be better
1580 than a most to least common mapping as theres a decent chance the most
1581 common letter will share a node with the least common, meaning the node
1582 will not be compressible. With a middle is most common approach the worst
1583 case is when we have the least common nodes twice.
1584
1585 */
1586
1587 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1588 regnode * const noper = NEXTOPER( cur );
1589 const U8 *uc = (U8*)STRING( noper );
1590 const U8 * const e = uc + STR_LEN( noper );
1591 STRLEN foldlen = 0;
1592 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593 const U8 *scan = (U8*)NULL;
1594 U32 wordlen = 0; /* required init */
1595 STRLEN chars = 0;
1596 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1597
1598 if (OP(noper) == NOTHING) {
1599 trie->minlen= 0;
1600 continue;
1601 }
1602 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1603 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1604 regardless of encoding */
1605
1606 for ( ; uc < e ; uc += len ) {
1607 TRIE_CHARCOUNT(trie)++;
1608 TRIE_READ_CHAR;
1609 chars++;
1610 if ( uvc < 256 ) {
1611 if ( !trie->charmap[ uvc ] ) {
1612 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1613 if ( folder )
1614 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1615 TRIE_STORE_REVCHAR;
1616 }
1617 if ( set_bit ) {
1618 /* store the codepoint in the bitmap, and its folded
1619 * equivalent. */
1620 TRIE_BITMAP_SET(trie,uvc);
1621
1622 /* store the folded codepoint */
1623 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1624
1625 if ( !UTF ) {
1626 /* store first byte of utf8 representation of
1627 variant codepoints */
1628 if (! UNI_IS_INVARIANT(uvc)) {
1629 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1630 }
1631 }
1632 set_bit = 0; /* We've done our bit :-) */
1633 }
1634 } else {
1635 SV** svpp;
1636 if ( !widecharmap )
1637 widecharmap = newHV();
1638
1639 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1640
1641 if ( !svpp )
1642 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1643
1644 if ( !SvTRUE( *svpp ) ) {
1645 sv_setiv( *svpp, ++trie->uniquecharcount );
1646 TRIE_STORE_REVCHAR;
1647 }
1648 }
1649 }
1650 if( cur == first ) {
1651 trie->minlen=chars;
1652 trie->maxlen=chars;
1653 } else if (chars < trie->minlen) {
1654 trie->minlen=chars;
1655 } else if (chars > trie->maxlen) {
1656 trie->maxlen=chars;
1657 }
1658
1659 } /* end first pass */
1660 DEBUG_TRIE_COMPILE_r(
1661 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1662 (int)depth * 2 + 2,"",
1663 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1664 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1665 (int)trie->minlen, (int)trie->maxlen )
1666 );
1667
1668 /*
1669 We now know what we are dealing with in terms of unique chars and
1670 string sizes so we can calculate how much memory a naive
1671 representation using a flat table will take. If it's over a reasonable
1672 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1673 conservative but potentially much slower representation using an array
1674 of lists.
1675
1676 At the end we convert both representations into the same compressed
1677 form that will be used in regexec.c for matching with. The latter
1678 is a form that cannot be used to construct with but has memory
1679 properties similar to the list form and access properties similar
1680 to the table form making it both suitable for fast searches and
1681 small enough that its feasable to store for the duration of a program.
1682
1683 See the comment in the code where the compressed table is produced
1684 inplace from the flat tabe representation for an explanation of how
1685 the compression works.
1686
1687 */
1688
1689
1690 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1691 prev_states[1] = 0;
1692
1693 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1694 /*
1695 Second Pass -- Array Of Lists Representation
1696
1697 Each state will be represented by a list of charid:state records
1698 (reg_trie_trans_le) the first such element holds the CUR and LEN
1699 points of the allocated array. (See defines above).
1700
1701 We build the initial structure using the lists, and then convert
1702 it into the compressed table form which allows faster lookups
1703 (but cant be modified once converted).
1704 */
1705
1706 STRLEN transcount = 1;
1707
1708 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1709 "%*sCompiling trie using list compiler\n",
1710 (int)depth * 2 + 2, ""));
1711
1712 trie->states = (reg_trie_state *)
1713 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1714 sizeof(reg_trie_state) );
1715 TRIE_LIST_NEW(1);
1716 next_alloc = 2;
1717
1718 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1719
1720 regnode * const noper = NEXTOPER( cur );
1721 U8 *uc = (U8*)STRING( noper );
1722 const U8 * const e = uc + STR_LEN( noper );
1723 U32 state = 1; /* required init */
1724 U16 charid = 0; /* sanity init */
1725 U8 *scan = (U8*)NULL; /* sanity init */
1726 STRLEN foldlen = 0; /* required init */
1727 U32 wordlen = 0; /* required init */
1728 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1729
1730 if (OP(noper) != NOTHING) {
1731 for ( ; uc < e ; uc += len ) {
1732
1733 TRIE_READ_CHAR;
1734
1735 if ( uvc < 256 ) {
1736 charid = trie->charmap[ uvc ];
1737 } else {
1738 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1739 if ( !svpp ) {
1740 charid = 0;
1741 } else {
1742 charid=(U16)SvIV( *svpp );
1743 }
1744 }
1745 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1746 if ( charid ) {
1747
1748 U16 check;
1749 U32 newstate = 0;
1750
1751 charid--;
1752 if ( !trie->states[ state ].trans.list ) {
1753 TRIE_LIST_NEW( state );
1754 }
1755 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1756 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1757 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1758 break;
1759 }
1760 }
1761 if ( ! newstate ) {
1762 newstate = next_alloc++;
1763 prev_states[newstate] = state;
1764 TRIE_LIST_PUSH( state, charid, newstate );
1765 transcount++;
1766 }
1767 state = newstate;
1768 } else {
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1770 }
1771 }
1772 }
1773 TRIE_HANDLE_WORD(state);
1774
1775 } /* end second pass */
1776
1777 /* next alloc is the NEXT state to be allocated */
1778 trie->statecount = next_alloc;
1779 trie->states = (reg_trie_state *)
1780 PerlMemShared_realloc( trie->states,
1781 next_alloc
1782 * sizeof(reg_trie_state) );
1783
1784 /* and now dump it out before we compress it */
1785 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1786 revcharmap, next_alloc,
1787 depth+1)
1788 );
1789
1790 trie->trans = (reg_trie_trans *)
1791 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1792 {
1793 U32 state;
1794 U32 tp = 0;
1795 U32 zp = 0;
1796
1797
1798 for( state=1 ; state < next_alloc ; state ++ ) {
1799 U32 base=0;
1800
1801 /*
1802 DEBUG_TRIE_COMPILE_MORE_r(
1803 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1804 );
1805 */
1806
1807 if (trie->states[state].trans.list) {
1808 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1809 U16 maxid=minid;
1810 U16 idx;
1811
1812 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1813 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1814 if ( forid < minid ) {
1815 minid=forid;
1816 } else if ( forid > maxid ) {
1817 maxid=forid;
1818 }
1819 }
1820 if ( transcount < tp + maxid - minid + 1) {
1821 transcount *= 2;
1822 trie->trans = (reg_trie_trans *)
1823 PerlMemShared_realloc( trie->trans,
1824 transcount
1825 * sizeof(reg_trie_trans) );
1826 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1827 }
1828 base = trie->uniquecharcount + tp - minid;
1829 if ( maxid == minid ) {
1830 U32 set = 0;
1831 for ( ; zp < tp ; zp++ ) {
1832 if ( ! trie->trans[ zp ].next ) {
1833 base = trie->uniquecharcount + zp - minid;
1834 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1835 trie->trans[ zp ].check = state;
1836 set = 1;
1837 break;
1838 }
1839 }
1840 if ( !set ) {
1841 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1842 trie->trans[ tp ].check = state;
1843 tp++;
1844 zp = tp;
1845 }
1846 } else {
1847 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1848 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1849 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1850 trie->trans[ tid ].check = state;
1851 }
1852 tp += ( maxid - minid + 1 );
1853 }
1854 Safefree(trie->states[ state ].trans.list);
1855 }
1856 /*
1857 DEBUG_TRIE_COMPILE_MORE_r(
1858 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1859 );
1860 */
1861 trie->states[ state ].trans.base=base;
1862 }
1863 trie->lasttrans = tp + 1;
1864 }
1865 } else {
1866 /*
1867 Second Pass -- Flat Table Representation.
1868
1869 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1870 We know that we will need Charcount+1 trans at most to store the data
1871 (one row per char at worst case) So we preallocate both structures
1872 assuming worst case.
1873
1874 We then construct the trie using only the .next slots of the entry
1875 structs.
1876
1877 We use the .check field of the first entry of the node temporarily to
1878 make compression both faster and easier by keeping track of how many non
1879 zero fields are in the node.
1880
1881 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1882 transition.
1883
1884 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1885 number representing the first entry of the node, and state as a
1886 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1887 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1888 are 2 entrys per node. eg:
1889
1890 A B A B
1891 1. 2 4 1. 3 7
1892 2. 0 3 3. 0 5
1893 3. 0 0 5. 0 0
1894 4. 0 0 7. 0 0
1895
1896 The table is internally in the right hand, idx form. However as we also
1897 have to deal with the states array which is indexed by nodenum we have to
1898 use TRIE_NODENUM() to convert.
1899
1900 */
1901 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1902 "%*sCompiling trie using table compiler\n",
1903 (int)depth * 2 + 2, ""));
1904
1905 trie->trans = (reg_trie_trans *)
1906 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1907 * trie->uniquecharcount + 1,
1908 sizeof(reg_trie_trans) );
1909 trie->states = (reg_trie_state *)
1910 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1911 sizeof(reg_trie_state) );
1912 next_alloc = trie->uniquecharcount + 1;
1913
1914
1915 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1916
1917 regnode * const noper = NEXTOPER( cur );
1918 const U8 *uc = (U8*)STRING( noper );
1919 const U8 * const e = uc + STR_LEN( noper );
1920
1921 U32 state = 1; /* required init */
1922
1923 U16 charid = 0; /* sanity init */
1924 U32 accept_state = 0; /* sanity init */
1925 U8 *scan = (U8*)NULL; /* sanity init */
1926
1927 STRLEN foldlen = 0; /* required init */
1928 U32 wordlen = 0; /* required init */
1929 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1930
1931 if ( OP(noper) != NOTHING ) {
1932 for ( ; uc < e ; uc += len ) {
1933
1934 TRIE_READ_CHAR;
1935
1936 if ( uvc < 256 ) {
1937 charid = trie->charmap[ uvc ];
1938 } else {
1939 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1940 charid = svpp ? (U16)SvIV(*svpp) : 0;
1941 }
1942 if ( charid ) {
1943 charid--;
1944 if ( !trie->trans[ state + charid ].next ) {
1945 trie->trans[ state + charid ].next = next_alloc;
1946 trie->trans[ state ].check++;
1947 prev_states[TRIE_NODENUM(next_alloc)]
1948 = TRIE_NODENUM(state);
1949 next_alloc += trie->uniquecharcount;
1950 }
1951 state = trie->trans[ state + charid ].next;
1952 } else {
1953 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1954 }
1955 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1956 }
1957 }
1958 accept_state = TRIE_NODENUM( state );
1959 TRIE_HANDLE_WORD(accept_state);
1960
1961 } /* end second pass */
1962
1963 /* and now dump it out before we compress it */
1964 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1965 revcharmap,
1966 next_alloc, depth+1));
1967
1968 {
1969 /*
1970 * Inplace compress the table.*
1971
1972 For sparse data sets the table constructed by the trie algorithm will
1973 be mostly 0/FAIL transitions or to put it another way mostly empty.
1974 (Note that leaf nodes will not contain any transitions.)
1975
1976 This algorithm compresses the tables by eliminating most such
1977 transitions, at the cost of a modest bit of extra work during lookup:
1978
1979 - Each states[] entry contains a .base field which indicates the
1980 index in the state[] array wheres its transition data is stored.
1981
1982 - If .base is 0 there are no valid transitions from that node.
1983
1984 - If .base is nonzero then charid is added to it to find an entry in
1985 the trans array.
1986
1987 -If trans[states[state].base+charid].check!=state then the
1988 transition is taken to be a 0/Fail transition. Thus if there are fail
1989 transitions at the front of the node then the .base offset will point
1990 somewhere inside the previous nodes data (or maybe even into a node
1991 even earlier), but the .check field determines if the transition is
1992 valid.
1993
1994 XXX - wrong maybe?
1995 The following process inplace converts the table to the compressed
1996 table: We first do not compress the root node 1,and mark all its
1997 .check pointers as 1 and set its .base pointer as 1 as well. This
1998 allows us to do a DFA construction from the compressed table later,
1999 and ensures that any .base pointers we calculate later are greater
2000 than 0.
2001
2002 - We set 'pos' to indicate the first entry of the second node.
2003
2004 - We then iterate over the columns of the node, finding the first and
2005 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2006 and set the .check pointers accordingly, and advance pos
2007 appropriately and repreat for the next node. Note that when we copy
2008 the next pointers we have to convert them from the original
2009 NODEIDX form to NODENUM form as the former is not valid post
2010 compression.
2011
2012 - If a node has no transitions used we mark its base as 0 and do not
2013 advance the pos pointer.
2014
2015 - If a node only has one transition we use a second pointer into the
2016 structure to fill in allocated fail transitions from other states.
2017 This pointer is independent of the main pointer and scans forward
2018 looking for null transitions that are allocated to a state. When it
2019 finds one it writes the single transition into the "hole". If the
2020 pointer doesnt find one the single transition is appended as normal.
2021
2022 - Once compressed we can Renew/realloc the structures to release the
2023 excess space.
2024
2025 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2026 specifically Fig 3.47 and the associated pseudocode.
2027
2028 demq
2029 */
2030 const U32 laststate = TRIE_NODENUM( next_alloc );
2031 U32 state, charid;
2032 U32 pos = 0, zp=0;
2033 trie->statecount = laststate;
2034
2035 for ( state = 1 ; state < laststate ; state++ ) {
2036 U8 flag = 0;
2037 const U32 stateidx = TRIE_NODEIDX( state );
2038 const U32 o_used = trie->trans[ stateidx ].check;
2039 U32 used = trie->trans[ stateidx ].check;
2040 trie->trans[ stateidx ].check = 0;
2041
2042 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2043 if ( flag || trie->trans[ stateidx + charid ].next ) {
2044 if ( trie->trans[ stateidx + charid ].next ) {
2045 if (o_used == 1) {
2046 for ( ; zp < pos ; zp++ ) {
2047 if ( ! trie->trans[ zp ].next ) {
2048 break;
2049 }
2050 }
2051 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2052 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2053 trie->trans[ zp ].check = state;
2054 if ( ++zp > pos ) pos = zp;
2055 break;
2056 }
2057 used--;
2058 }
2059 if ( !flag ) {
2060 flag = 1;
2061 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2062 }
2063 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2064 trie->trans[ pos ].check = state;
2065 pos++;
2066 }
2067 }
2068 }
2069 trie->lasttrans = pos + 1;
2070 trie->states = (reg_trie_state *)
2071 PerlMemShared_realloc( trie->states, laststate
2072 * sizeof(reg_trie_state) );
2073 DEBUG_TRIE_COMPILE_MORE_r(
2074 PerlIO_printf( Perl_debug_log,
2075 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2076 (int)depth * 2 + 2,"",
2077 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2078 (IV)next_alloc,
2079 (IV)pos,
2080 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2081 );
2082
2083 } /* end table compress */
2084 }
2085 DEBUG_TRIE_COMPILE_MORE_r(
2086 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2087 (int)depth * 2 + 2, "",
2088 (UV)trie->statecount,
2089 (UV)trie->lasttrans)
2090 );
2091 /* resize the trans array to remove unused space */
2092 trie->trans = (reg_trie_trans *)
2093 PerlMemShared_realloc( trie->trans, trie->lasttrans
2094 * sizeof(reg_trie_trans) );
2095
2096 { /* Modify the program and insert the new TRIE node */
2097 U8 nodetype =(U8)(flags & 0xFF);
2098 char *str=NULL;
2099
2100#ifdef DEBUGGING
2101 regnode *optimize = NULL;
2102#ifdef RE_TRACK_PATTERN_OFFSETS
2103
2104 U32 mjd_offset = 0;
2105 U32 mjd_nodelen = 0;
2106#endif /* RE_TRACK_PATTERN_OFFSETS */
2107#endif /* DEBUGGING */
2108 /*
2109 This means we convert either the first branch or the first Exact,
2110 depending on whether the thing following (in 'last') is a branch
2111 or not and whther first is the startbranch (ie is it a sub part of
2112 the alternation or is it the whole thing.)
2113 Assuming its a sub part we convert the EXACT otherwise we convert
2114 the whole branch sequence, including the first.
2115 */
2116 /* Find the node we are going to overwrite */
2117 if ( first != startbranch || OP( last ) == BRANCH ) {
2118 /* branch sub-chain */
2119 NEXT_OFF( first ) = (U16)(last - first);
2120#ifdef RE_TRACK_PATTERN_OFFSETS
2121 DEBUG_r({
2122 mjd_offset= Node_Offset((convert));
2123 mjd_nodelen= Node_Length((convert));
2124 });
2125#endif
2126 /* whole branch chain */
2127 }
2128#ifdef RE_TRACK_PATTERN_OFFSETS
2129 else {
2130 DEBUG_r({
2131 const regnode *nop = NEXTOPER( convert );
2132 mjd_offset= Node_Offset((nop));
2133 mjd_nodelen= Node_Length((nop));
2134 });
2135 }
2136 DEBUG_OPTIMISE_r(
2137 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2138 (int)depth * 2 + 2, "",
2139 (UV)mjd_offset, (UV)mjd_nodelen)
2140 );
2141#endif
2142 /* But first we check to see if there is a common prefix we can
2143 split out as an EXACT and put in front of the TRIE node. */
2144 trie->startstate= 1;
2145 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2146 U32 state;
2147 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2148 U32 ofs = 0;
2149 I32 idx = -1;
2150 U32 count = 0;
2151 const U32 base = trie->states[ state ].trans.base;
2152
2153 if ( trie->states[state].wordnum )
2154 count = 1;
2155
2156 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2157 if ( ( base + ofs >= trie->uniquecharcount ) &&
2158 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2159 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2160 {
2161 if ( ++count > 1 ) {
2162 SV **tmp = av_fetch( revcharmap, ofs, 0);
2163 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2164 if ( state == 1 ) break;
2165 if ( count == 2 ) {
2166 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2167 DEBUG_OPTIMISE_r(
2168 PerlIO_printf(Perl_debug_log,
2169 "%*sNew Start State=%"UVuf" Class: [",
2170 (int)depth * 2 + 2, "",
2171 (UV)state));
2172 if (idx >= 0) {
2173 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2174 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2175
2176 TRIE_BITMAP_SET(trie,*ch);
2177 if ( folder )
2178 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2179 DEBUG_OPTIMISE_r(
2180 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2181 );
2182 }
2183 }
2184 TRIE_BITMAP_SET(trie,*ch);
2185 if ( folder )
2186 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2187 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2188 }
2189 idx = ofs;
2190 }
2191 }
2192 if ( count == 1 ) {
2193 SV **tmp = av_fetch( revcharmap, idx, 0);
2194 STRLEN len;
2195 char *ch = SvPV( *tmp, len );
2196 DEBUG_OPTIMISE_r({
2197 SV *sv=sv_newmortal();
2198 PerlIO_printf( Perl_debug_log,
2199 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2200 (int)depth * 2 + 2, "",
2201 (UV)state, (UV)idx,
2202 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2203 PL_colors[0], PL_colors[1],
2204 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2205 PERL_PV_ESCAPE_FIRSTCHAR
2206 )
2207 );
2208 });
2209 if ( state==1 ) {
2210 OP( convert ) = nodetype;
2211 str=STRING(convert);
2212 STR_LEN(convert)=0;
2213 }
2214 STR_LEN(convert) += len;
2215 while (len--)
2216 *str++ = *ch++;
2217 } else {
2218#ifdef DEBUGGING
2219 if (state>1)
2220 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2221#endif
2222 break;
2223 }
2224 }
2225 trie->prefixlen = (state-1);
2226 if (str) {
2227 regnode *n = convert+NODE_SZ_STR(convert);
2228 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2229 trie->startstate = state;
2230 trie->minlen -= (state - 1);
2231 trie->maxlen -= (state - 1);
2232#ifdef DEBUGGING
2233 /* At least the UNICOS C compiler choked on this
2234 * being argument to DEBUG_r(), so let's just have
2235 * it right here. */
2236 if (
2237#ifdef PERL_EXT_RE_BUILD
2238 1
2239#else
2240 DEBUG_r_TEST
2241#endif
2242 ) {
2243 regnode *fix = convert;
2244 U32 word = trie->wordcount;
2245 mjd_nodelen++;
2246 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2247 while( ++fix < n ) {
2248 Set_Node_Offset_Length(fix, 0, 0);
2249 }
2250 while (word--) {
2251 SV ** const tmp = av_fetch( trie_words, word, 0 );
2252 if (tmp) {
2253 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2254 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2255 else
2256 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2257 }
2258 }
2259 }
2260#endif
2261 if (trie->maxlen) {
2262 convert = n;
2263 } else {
2264 NEXT_OFF(convert) = (U16)(tail - convert);
2265 DEBUG_r(optimize= n);
2266 }
2267 }
2268 }
2269 if (!jumper)
2270 jumper = last;
2271 if ( trie->maxlen ) {
2272 NEXT_OFF( convert ) = (U16)(tail - convert);
2273 ARG_SET( convert, data_slot );
2274 /* Store the offset to the first unabsorbed branch in
2275 jump[0], which is otherwise unused by the jump logic.
2276 We use this when dumping a trie and during optimisation. */
2277 if (trie->jump)
2278 trie->jump[0] = (U16)(nextbranch - convert);
2279
2280 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2281 * and there is a bitmap
2282 * and the first "jump target" node we found leaves enough room
2283 * then convert the TRIE node into a TRIEC node, with the bitmap
2284 * embedded inline in the opcode - this is hypothetically faster.
2285 */
2286 if ( !trie->states[trie->startstate].wordnum
2287 && trie->bitmap
2288 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2289 {
2290 OP( convert ) = TRIEC;
2291 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2292 PerlMemShared_free(trie->bitmap);
2293 trie->bitmap= NULL;
2294 } else
2295 OP( convert ) = TRIE;
2296
2297 /* store the type in the flags */
2298 convert->flags = nodetype;
2299 DEBUG_r({
2300 optimize = convert
2301 + NODE_STEP_REGNODE
2302 + regarglen[ OP( convert ) ];
2303 });
2304 /* XXX We really should free up the resource in trie now,
2305 as we won't use them - (which resources?) dmq */
2306 }
2307 /* needed for dumping*/
2308 DEBUG_r(if (optimize) {
2309 regnode *opt = convert;
2310
2311 while ( ++opt < optimize) {
2312 Set_Node_Offset_Length(opt,0,0);
2313 }
2314 /*
2315 Try to clean up some of the debris left after the
2316 optimisation.
2317 */
2318 while( optimize < jumper ) {
2319 mjd_nodelen += Node_Length((optimize));
2320 OP( optimize ) = OPTIMIZED;
2321 Set_Node_Offset_Length(optimize,0,0);
2322 optimize++;
2323 }
2324 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2325 });
2326 } /* end node insert */
2327
2328 /* Finish populating the prev field of the wordinfo array. Walk back
2329 * from each accept state until we find another accept state, and if
2330 * so, point the first word's .prev field at the second word. If the
2331 * second already has a .prev field set, stop now. This will be the
2332 * case either if we've already processed that word's accept state,
2333 * or that state had multiple words, and the overspill words were
2334 * already linked up earlier.
2335 */
2336 {
2337 U16 word;
2338 U32 state;
2339 U16 prev;
2340
2341 for (word=1; word <= trie->wordcount; word++) {
2342 prev = 0;
2343 if (trie->wordinfo[word].prev)
2344 continue;
2345 state = trie->wordinfo[word].accept;
2346 while (state) {
2347 state = prev_states[state];
2348 if (!state)
2349 break;
2350 prev = trie->states[state].wordnum;
2351 if (prev)
2352 break;
2353 }
2354 trie->wordinfo[word].prev = prev;
2355 }
2356 Safefree(prev_states);
2357 }
2358
2359
2360 /* and now dump out the compressed format */
2361 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2362
2363 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2364#ifdef DEBUGGING
2365 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2366 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2367#else
2368 SvREFCNT_dec(revcharmap);
2369#endif
2370 return trie->jump
2371 ? MADE_JUMP_TRIE
2372 : trie->startstate>1
2373 ? MADE_EXACT_TRIE
2374 : MADE_TRIE;
2375}
2376
2377STATIC void
2378S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2379{
2380/* The Trie is constructed and compressed now so we can build a fail array if it's needed
2381
2382 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2383 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2384 ISBN 0-201-10088-6
2385
2386 We find the fail state for each state in the trie, this state is the longest proper
2387 suffix of the current state's 'word' that is also a proper prefix of another word in our
2388 trie. State 1 represents the word '' and is thus the default fail state. This allows
2389 the DFA not to have to restart after its tried and failed a word at a given point, it
2390 simply continues as though it had been matching the other word in the first place.
2391 Consider
2392 'abcdgu'=~/abcdefg|cdgu/
2393 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2394 fail, which would bring us to the state representing 'd' in the second word where we would
2395 try 'g' and succeed, proceeding to match 'cdgu'.
2396 */
2397 /* add a fail transition */
2398 const U32 trie_offset = ARG(source);
2399 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2400 U32 *q;
2401 const U32 ucharcount = trie->uniquecharcount;
2402 const U32 numstates = trie->statecount;
2403 const U32 ubound = trie->lasttrans + ucharcount;
2404 U32 q_read = 0;
2405 U32 q_write = 0;
2406 U32 charid;
2407 U32 base = trie->states[ 1 ].trans.base;
2408 U32 *fail;
2409 reg_ac_data *aho;
2410 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2411 GET_RE_DEBUG_FLAGS_DECL;
2412
2413 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2414#ifndef DEBUGGING
2415 PERL_UNUSED_ARG(depth);
2416#endif
2417
2418
2419 ARG_SET( stclass, data_slot );
2420 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2421 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2422 aho->trie=trie_offset;
2423 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2424 Copy( trie->states, aho->states, numstates, reg_trie_state );
2425 Newxz( q, numstates, U32);
2426 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2427 aho->refcount = 1;
2428 fail = aho->fail;
2429 /* initialize fail[0..1] to be 1 so that we always have
2430 a valid final fail state */
2431 fail[ 0 ] = fail[ 1 ] = 1;
2432
2433 for ( charid = 0; charid < ucharcount ; charid++ ) {
2434 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2435 if ( newstate ) {
2436 q[ q_write ] = newstate;
2437 /* set to point at the root */
2438 fail[ q[ q_write++ ] ]=1;
2439 }
2440 }
2441 while ( q_read < q_write) {
2442 const U32 cur = q[ q_read++ % numstates ];
2443 base = trie->states[ cur ].trans.base;
2444
2445 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2446 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2447 if (ch_state) {
2448 U32 fail_state = cur;
2449 U32 fail_base;
2450 do {
2451 fail_state = fail[ fail_state ];
2452 fail_base = aho->states[ fail_state ].trans.base;
2453 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2454
2455 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2456 fail[ ch_state ] = fail_state;
2457 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2458 {
2459 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2460 }
2461 q[ q_write++ % numstates] = ch_state;
2462 }
2463 }
2464 }
2465 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2466 when we fail in state 1, this allows us to use the
2467 charclass scan to find a valid start char. This is based on the principle
2468 that theres a good chance the string being searched contains lots of stuff
2469 that cant be a start char.
2470 */
2471 fail[ 0 ] = fail[ 1 ] = 0;
2472 DEBUG_TRIE_COMPILE_r({
2473 PerlIO_printf(Perl_debug_log,
2474 "%*sStclass Failtable (%"UVuf" states): 0",
2475 (int)(depth * 2), "", (UV)numstates
2476 );
2477 for( q_read=1; q_read<numstates; q_read++ ) {
2478 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2479 }
2480 PerlIO_printf(Perl_debug_log, "\n");
2481 });
2482 Safefree(q);
2483 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2484}
2485
2486
2487/*
2488 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2489 * These need to be revisited when a newer toolchain becomes available.
2490 */
2491#if defined(__sparc64__) && defined(__GNUC__)
2492# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2493# undef SPARC64_GCC_WORKAROUND
2494# define SPARC64_GCC_WORKAROUND 1
2495# endif
2496#endif
2497
2498#define DEBUG_PEEP(str,scan,depth) \
2499 DEBUG_OPTIMISE_r({if (scan){ \
2500 SV * const mysv=sv_newmortal(); \
2501 regnode *Next = regnext(scan); \
2502 regprop(RExC_rx, mysv, scan); \
2503 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2504 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2505 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2506 }});
2507
2508
2509/* The below joins as many adjacent EXACTish nodes as possible into a single
2510 * one, and looks for problematic sequences of characters whose folds vs.
2511 * non-folds have sufficiently different lengths, that the optimizer would be
2512 * fooled into rejecting legitimate matches of them, and the trie construction
2513 * code can't cope with them. The joining is only done if:
2514 * 1) there is room in the current conglomerated node to entirely contain the
2515 * next one.
2516 * 2) they are the exact same node type
2517 *
2518 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2519 * these get optimized out
2520 *
2521 * If there are problematic code sequences, *min_subtract is set to the delta
2522 * that the minimum size of the node can be less than its actual size. And,
2523 * the node type of the result is changed to reflect that it contains these
2524 * sequences.
2525 *
2526 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2527 * and contains LATIN SMALL LETTER SHARP S
2528 *
2529 * This is as good a place as any to discuss the design of handling these
2530 * problematic sequences. It's been wrong in Perl for a very long time. There
2531 * are three code points in Unicode whose folded lengths differ so much from
2532 * the un-folded lengths that it causes problems for the optimizer and trie
2533 * construction. Why only these are problematic, and not others where lengths
2534 * also differ is something I (khw) do not understand. New versions of Unicode
2535 * might add more such code points. Hopefully the logic in fold_grind.t that
2536 * figures out what to test (in part by verifying that each size-combination
2537 * gets tested) will catch any that do come along, so they can be added to the
2538 * special handling below. The chances of new ones are actually rather small,
2539 * as most, if not all, of the world's scripts that have casefolding have
2540 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2541 * made to allow compatibility with pre-existing standards, and almost all of
2542 * those have already been dealt with. These would otherwise be the most
2543 * likely candidates for generating further tricky sequences. In other words,
2544 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2545 * with pre-existing standards, and there aren't many of those left.
2546 *
2547 * The previous designs for dealing with these involved assigning a special
2548 * node for them. This approach doesn't work, as evidenced by this example:
2549 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2550 * Both these fold to "sss", but if the pattern is parsed to create a node of
2551 * that would match just the \xDF, it won't be able to handle the case where a
2552 * successful match would have to cross the node's boundary. The new approach
2553 * that hopefully generally solves the problem generates an EXACTFU_SS node
2554 * that is "sss".
2555 *
2556 * There are a number of components to the approach (a lot of work for just
2557 * three code points!):
2558 * 1) This routine examines each EXACTFish node that could contain the
2559 * problematic sequences. It returns in *min_subtract how much to
2560 * subtract from the the actual length of the string to get a real minimum
2561 * for one that could match it. This number is usually 0 except for the
2562 * problematic sequences. This delta is used by the caller to adjust the
2563 * min length of the match, and the delta between min and max, so that the
2564 * optimizer doesn't reject these possibilities based on size constraints.
2565 * 2) These sequences are not currently correctly handled by the trie code
2566 * either, so it changes the joined node type to ops that are not handled
2567 * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2568 * 3) This is sufficient for the two Greek sequences (described below), but
2569 * the one involving the Sharp s (\xDF) needs more. The node type
2570 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2571 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2572 * case where there is a possible fold length change. That means that a
2573 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2574 * itself with length changes, and so can be processed faster. regexec.c
2575 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2576 * is pre-folded by regcomp.c. This saves effort in regex matching.
2577 * However, probably mostly for historical reasons, the pre-folding isn't
2578 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2579 * nodes, as what they fold to isn't known until runtime.) The fold
2580 * possibilities for the non-UTF8 patterns are quite simple, except for
2581 * the sharp s. All the ones that don't involve a UTF-8 target string
2582 * are members of a fold-pair, and arrays are set up for all of them
2583 * that quickly find the other member of the pair. It might actually
2584 * be faster to pre-fold these, but it isn't currently done, except for
2585 * the sharp s. Code elsewhere in this file makes sure that it gets
2586 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2587 * issues described in the next item.
2588 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2589 * 'ss' or not is not knowable at compile time. It will match iff the
2590 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2591 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2592 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2593 * described in item 3). An assumption that the optimizer part of
2594 * regexec.c (probably unwittingly) makes is that a character in the
2595 * pattern corresponds to at most a single character in the target string.
2596 * (And I do mean character, and not byte here, unlike other parts of the
2597 * documentation that have never been updated to account for multibyte
2598 * Unicode.) This assumption is wrong only in this case, as all other
2599 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2600 * virtue of having this file pre-fold UTF-8 patterns. I'm
2601 * reluctant to try to change this assumption, so instead the code punts.
2602 * This routine examines EXACTF nodes for the sharp s, and returns a
2603 * boolean indicating whether or not the node is an EXACTF node that
2604 * contains a sharp s. When it is true, the caller sets a flag that later
2605 * causes the optimizer in this file to not set values for the floating
2606 * and fixed string lengths, and thus avoids the optimizer code in
2607 * regexec.c that makes the invalid assumption. Thus, there is no
2608 * optimization based on string lengths for EXACTF nodes that contain the
2609 * sharp s. This only happens for /id rules (which means the pattern
2610 * isn't in UTF-8).
2611 */
2612
2613#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2614 if (PL_regkind[OP(scan)] == EXACT) \
2615 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2616
2617STATIC U32
2618S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2619 /* Merge several consecutive EXACTish nodes into one. */
2620 regnode *n = regnext(scan);
2621 U32 stringok = 1;
2622 regnode *next = scan + NODE_SZ_STR(scan);
2623 U32 merged = 0;
2624 U32 stopnow = 0;
2625#ifdef DEBUGGING
2626 regnode *stop = scan;
2627 GET_RE_DEBUG_FLAGS_DECL;
2628#else
2629 PERL_UNUSED_ARG(depth);
2630#endif
2631
2632 PERL_ARGS_ASSERT_JOIN_EXACT;
2633#ifndef EXPERIMENTAL_INPLACESCAN
2634 PERL_UNUSED_ARG(flags);
2635 PERL_UNUSED_ARG(val);
2636#endif
2637 DEBUG_PEEP("join",scan,depth);
2638
2639 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2640 * EXACT ones that are mergeable to the current one. */
2641 while (n
2642 && (PL_regkind[OP(n)] == NOTHING
2643 || (stringok && OP(n) == OP(scan)))
2644 && NEXT_OFF(n)
2645 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2646 {
2647
2648 if (OP(n) == TAIL || n > next)
2649 stringok = 0;
2650 if (PL_regkind[OP(n)] == NOTHING) {
2651 DEBUG_PEEP("skip:",n,depth);
2652 NEXT_OFF(scan) += NEXT_OFF(n);
2653 next = n + NODE_STEP_REGNODE;
2654#ifdef DEBUGGING
2655 if (stringok)
2656 stop = n;
2657#endif
2658 n = regnext(n);
2659 }
2660 else if (stringok) {
2661 const unsigned int oldl = STR_LEN(scan);
2662 regnode * const nnext = regnext(n);
2663
2664 if (oldl + STR_LEN(n) > U8_MAX)
2665 break;
2666
2667 DEBUG_PEEP("merg",n,depth);
2668 merged++;
2669
2670 NEXT_OFF(scan) += NEXT_OFF(n);
2671 STR_LEN(scan) += STR_LEN(n);
2672 next = n + NODE_SZ_STR(n);
2673 /* Now we can overwrite *n : */
2674 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2675#ifdef DEBUGGING
2676 stop = next - 1;
2677#endif
2678 n = nnext;
2679 if (stopnow) break;
2680 }
2681
2682#ifdef EXPERIMENTAL_INPLACESCAN
2683 if (flags && !NEXT_OFF(n)) {
2684 DEBUG_PEEP("atch", val, depth);
2685 if (reg_off_by_arg[OP(n)]) {
2686 ARG_SET(n, val - n);
2687 }
2688 else {
2689 NEXT_OFF(n) = val - n;
2690 }
2691 stopnow = 1;
2692 }
2693#endif
2694 }
2695
2696 *min_subtract = 0;
2697 *has_exactf_sharp_s = FALSE;
2698
2699 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2700 * can now analyze for sequences of problematic code points. (Prior to
2701 * this final joining, sequences could have been split over boundaries, and
2702 * hence missed). The sequences only happen in folding, hence for any
2703 * non-EXACT EXACTish node */
2704 if (OP(scan) != EXACT) {
2705 U8 *s;
2706 U8 * s0 = (U8*) STRING(scan);
2707 U8 * const s_end = s0 + STR_LEN(scan);
2708
2709 /* The below is perhaps overboard, but this allows us to save a test
2710 * each time through the loop at the expense of a mask. This is
2711 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2712 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2713 * This uses an exclusive 'or' to find that bit and then inverts it to
2714 * form a mask, with just a single 0, in the bit position where 'S' and
2715 * 's' differ. */
2716 const U8 S_or_s_mask = ~ ('S' ^ 's');
2717 const U8 s_masked = 's' & S_or_s_mask;
2718
2719 /* One pass is made over the node's string looking for all the
2720 * possibilities. to avoid some tests in the loop, there are two main
2721 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2722 * non-UTF-8 */
2723 if (UTF) {
2724
2725 /* There are two problematic Greek code points in Unicode
2726 * casefolding
2727 *
2728 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2729 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2730 *
2731 * which casefold to
2732 *
2733 * Unicode UTF-8
2734 *
2735 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2736 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2737 *
2738 * This means that in case-insensitive matching (or "loose
2739 * matching", as Unicode calls it), an EXACTF of length six (the
2740 * UTF-8 encoded byte length of the above casefolded versions) can
2741 * match a target string of length two (the byte length of UTF-8
2742 * encoded U+0390 or U+03B0). This would rather mess up the
2743 * minimum length computation. (there are other code points that
2744 * also fold to these two sequences, but the delta is smaller)
2745 *
2746 * If these sequences are found, the minimum length is decreased by
2747 * four (six minus two).
2748 *
2749 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2750 * LETTER SHARP S. We decrease the min length by 1 for each
2751 * occurrence of 'ss' found */
2752
2753#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2754# define U390_first_byte 0xb4
2755 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2756# define U3B0_first_byte 0xb5
2757 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2758#else
2759# define U390_first_byte 0xce
2760 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2761# define U3B0_first_byte 0xcf
2762 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2763#endif
2764 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2765 yields a net of 0 */
2766 /* Examine the string for one of the problematic sequences */
2767 for (s = s0;
2768 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2769 * sequence we are looking for is 2 */
2770 s += UTF8SKIP(s))
2771 {
2772
2773 /* Look for the first byte in each problematic sequence */
2774 switch (*s) {
2775 /* We don't have to worry about other things that fold to
2776 * 's' (such as the long s, U+017F), as all above-latin1
2777 * code points have been pre-folded */
2778 case 's':
2779 case 'S':
2780
2781 /* Current character is an 's' or 'S'. If next one is
2782 * as well, we have the dreaded sequence */
2783 if (((*(s+1) & S_or_s_mask) == s_masked)
2784 /* These two node types don't have special handling
2785 * for 'ss' */
2786 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2787 {
2788 *min_subtract += 1;
2789 OP(scan) = EXACTFU_SS;
2790 s++; /* No need to look at this character again */
2791 }
2792 break;
2793
2794 case U390_first_byte:
2795 if (s_end - s >= len
2796
2797 /* The 1's are because are skipping comparing the
2798 * first byte */
2799 && memEQ(s + 1, U390_tail, len - 1))
2800 {
2801 goto greek_sequence;
2802 }
2803 break;
2804
2805 case U3B0_first_byte:
2806 if (! (s_end - s >= len
2807 && memEQ(s + 1, U3B0_tail, len - 1)))
2808 {
2809 break;
2810 }
2811 greek_sequence:
2812 *min_subtract += 4;
2813
2814 /* This can't currently be handled by trie's, so change
2815 * the node type to indicate this. If EXACTFA and
2816 * EXACTFL were ever to be handled by trie's, this
2817 * would have to be changed. If this node has already
2818 * been changed to EXACTFU_SS in this loop, leave it as
2819 * is. (I (khw) think it doesn't matter in regexec.c
2820 * for UTF patterns, but no need to change it */
2821 if (OP(scan) == EXACTFU) {
2822 OP(scan) = EXACTFU_NO_TRIE;
2823 }
2824 s += 6; /* We already know what this sequence is. Skip
2825 the rest of it */
2826 break;
2827 }
2828 }
2829 }
2830 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2831
2832 /* Here, the pattern is not UTF-8. We need to look only for the
2833 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2834 * in the final position. Otherwise we can stop looking 1 byte
2835 * earlier because have to find both the first and second 's' */
2836 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2837
2838 for (s = s0; s < upper; s++) {
2839 switch (*s) {
2840 case 'S':
2841 case 's':
2842 if (s_end - s > 1
2843 && ((*(s+1) & S_or_s_mask) == s_masked))
2844 {
2845 *min_subtract += 1;
2846
2847 /* EXACTF nodes need to know that the minimum
2848 * length changed so that a sharp s in the string
2849 * can match this ss in the pattern, but they
2850 * remain EXACTF nodes, as they are not trie'able,
2851 * so don't have to invent a new node type to
2852 * exclude them from the trie code */
2853 if (OP(scan) != EXACTF) {
2854 OP(scan) = EXACTFU_SS;
2855 }
2856 s++;
2857 }
2858 break;
2859 case LATIN_SMALL_LETTER_SHARP_S:
2860 if (OP(scan) == EXACTF) {
2861 *has_exactf_sharp_s = TRUE;
2862 }
2863 break;
2864 }
2865 }
2866 }
2867 }
2868
2869#ifdef DEBUGGING
2870 /* Allow dumping but overwriting the collection of skipped
2871 * ops and/or strings with fake optimized ops */
2872 n = scan + NODE_SZ_STR(scan);
2873 while (n <= stop) {
2874 OP(n) = OPTIMIZED;
2875 FLAGS(n) = 0;
2876 NEXT_OFF(n) = 0;
2877 n++;
2878 }
2879#endif
2880 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2881 return stopnow;
2882}
2883
2884/* REx optimizer. Converts nodes into quicker variants "in place".
2885 Finds fixed substrings. */
2886
2887/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2888 to the position after last scanned or to NULL. */
2889
2890#define INIT_AND_WITHP \
2891 assert(!and_withp); \
2892 Newx(and_withp,1,struct regnode_charclass_class); \
2893 SAVEFREEPV(and_withp)
2894
2895/* this is a chain of data about sub patterns we are processing that
2896 need to be handled separately/specially in study_chunk. Its so
2897 we can simulate recursion without losing state. */
2898struct scan_frame;
2899typedef struct scan_frame {
2900 regnode *last; /* last node to process in this frame */
2901 regnode *next; /* next node to process when last is reached */
2902 struct scan_frame *prev; /*previous frame*/
2903 I32 stop; /* what stopparen do we use */
2904} scan_frame;
2905
2906
2907#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2908
2909#define CASE_SYNST_FNC(nAmE) \
2910case nAmE: \
2911 if (flags & SCF_DO_STCLASS_AND) { \
2912 for (value = 0; value < 256; value++) \
2913 if (!is_ ## nAmE ## _cp(value)) \
2914 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2915 } \
2916 else { \
2917 for (value = 0; value < 256; value++) \
2918 if (is_ ## nAmE ## _cp(value)) \
2919 ANYOF_BITMAP_SET(data->start_class, value); \
2920 } \
2921 break; \
2922case N ## nAmE: \
2923 if (flags & SCF_DO_STCLASS_AND) { \
2924 for (value = 0; value < 256; value++) \
2925 if (is_ ## nAmE ## _cp(value)) \
2926 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2927 } \
2928 else { \
2929 for (value = 0; value < 256; value++) \
2930 if (!is_ ## nAmE ## _cp(value)) \
2931 ANYOF_BITMAP_SET(data->start_class, value); \
2932 } \
2933 break
2934
2935
2936
2937STATIC I32
2938S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2939 I32 *minlenp, I32 *deltap,
2940 regnode *last,
2941 scan_data_t *data,
2942 I32 stopparen,
2943 U8* recursed,
2944 struct regnode_charclass_class *and_withp,
2945 U32 flags, U32 depth)
2946 /* scanp: Start here (read-write). */
2947 /* deltap: Write maxlen-minlen here. */
2948 /* last: Stop before this one. */
2949 /* data: string data about the pattern */
2950 /* stopparen: treat close N as END */
2951 /* recursed: which subroutines have we recursed into */
2952 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2953{
2954 dVAR;
2955 I32 min = 0, pars = 0, code;
2956 regnode *scan = *scanp, *next;
2957 I32 delta = 0;
2958 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2959 int is_inf_internal = 0; /* The studied chunk is infinite */
2960 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2961 scan_data_t data_fake;
2962 SV *re_trie_maxbuff = NULL;
2963 regnode *first_non_open = scan;
2964 I32 stopmin = I32_MAX;
2965 scan_frame *frame = NULL;
2966 GET_RE_DEBUG_FLAGS_DECL;
2967
2968 PERL_ARGS_ASSERT_STUDY_CHUNK;
2969
2970#ifdef DEBUGGING
2971 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2972#endif
2973
2974 if ( depth == 0 ) {
2975 while (first_non_open && OP(first_non_open) == OPEN)
2976 first_non_open=regnext(first_non_open);
2977 }
2978
2979
2980 fake_study_recurse:
2981 while ( scan && OP(scan) != END && scan < last ){
2982 UV min_subtract = 0; /* How much to subtract from the minimum node
2983 length to get a real minimum (because the
2984 folded version may be shorter) */
2985 bool has_exactf_sharp_s = FALSE;
2986 /* Peephole optimizer: */
2987 DEBUG_STUDYDATA("Peep:", data,depth);
2988 DEBUG_PEEP("Peep",scan,depth);
2989
2990 /* Its not clear to khw or hv why this is done here, and not in the
2991 * clauses that deal with EXACT nodes. khw's guess is that it's
2992 * because of a previous design */
2993 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
2994
2995 /* Follow the next-chain of the current node and optimize
2996 away all the NOTHINGs from it. */
2997 if (OP(scan) != CURLYX) {
2998 const int max = (reg_off_by_arg[OP(scan)]
2999 ? I32_MAX
3000 /* I32 may be smaller than U16 on CRAYs! */
3001 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3002 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3003 int noff;
3004 regnode *n = scan;
3005
3006 /* Skip NOTHING and LONGJMP. */
3007 while ((n = regnext(n))
3008 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3009 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3010 && off + noff < max)
3011 off += noff;
3012 if (reg_off_by_arg[OP(scan)])
3013 ARG(scan) = off;
3014 else
3015 NEXT_OFF(scan) = off;
3016 }
3017
3018
3019
3020 /* The principal pseudo-switch. Cannot be a switch, since we
3021 look into several different things. */
3022 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3023 || OP(scan) == IFTHEN) {
3024 next = regnext(scan);
3025 code = OP(scan);
3026 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3027
3028 if (OP(next) == code || code == IFTHEN) {
3029 /* NOTE - There is similar code to this block below for handling
3030 TRIE nodes on a re-study. If you change stuff here check there
3031 too. */
3032 I32 max1 = 0, min1 = I32_MAX, num = 0;
3033 struct regnode_charclass_class accum;
3034 regnode * const startbranch=scan;
3035
3036 if (flags & SCF_DO_SUBSTR)
3037 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3038 if (flags & SCF_DO_STCLASS)
3039 cl_init_zero(pRExC_state, &accum);
3040
3041 while (OP(scan) == code) {
3042 I32 deltanext, minnext, f = 0, fake;
3043 struct regnode_charclass_class this_class;
3044
3045 num++;
3046 data_fake.flags = 0;
3047 if (data) {
3048 data_fake.whilem_c = data->whilem_c;
3049 data_fake.last_closep = data->last_closep;
3050 }
3051 else
3052 data_fake.last_closep = &fake;
3053
3054 data_fake.pos_delta = delta;
3055 next = regnext(scan);
3056 scan = NEXTOPER(scan);
3057 if (code != BRANCH)
3058 scan = NEXTOPER(scan);
3059 if (flags & SCF_DO_STCLASS) {
3060 cl_init(pRExC_state, &this_class);
3061 data_fake.start_class = &this_class;
3062 f = SCF_DO_STCLASS_AND;
3063 }
3064 if (flags & SCF_WHILEM_VISITED_POS)
3065 f |= SCF_WHILEM_VISITED_POS;
3066
3067 /* we suppose the run is continuous, last=next...*/
3068 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3069 next, &data_fake,
3070 stopparen, recursed, NULL, f,depth+1);
3071 if (min1 > minnext)
3072 min1 = minnext;
3073 if (max1 < minnext + deltanext)
3074 max1 = minnext + deltanext;
3075 if (deltanext == I32_MAX)
3076 is_inf = is_inf_internal = 1;
3077 scan = next;
3078 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3079 pars++;
3080 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3081 if ( stopmin > minnext)
3082 stopmin = min + min1;
3083 flags &= ~SCF_DO_SUBSTR;
3084 if (data)
3085 data->flags |= SCF_SEEN_ACCEPT;
3086 }
3087 if (data) {
3088 if (data_fake.flags & SF_HAS_EVAL)
3089 data->flags |= SF_HAS_EVAL;
3090 data->whilem_c = data_fake.whilem_c;
3091 }
3092 if (flags & SCF_DO_STCLASS)
3093 cl_or(pRExC_state, &accum, &this_class);
3094 }
3095 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3096 min1 = 0;
3097 if (flags & SCF_DO_SUBSTR) {
3098 data->pos_min += min1;
3099 data->pos_delta += max1 - min1;
3100 if (max1 != min1 || is_inf)
3101 data->longest = &(data->longest_float);
3102 }
3103 min += min1;
3104 delta += max1 - min1;
3105 if (flags & SCF_DO_STCLASS_OR) {
3106 cl_or(pRExC_state, data->start_class, &accum);
3107 if (min1) {
3108 cl_and(data->start_class, and_withp);
3109 flags &= ~SCF_DO_STCLASS;
3110 }
3111 }
3112 else if (flags & SCF_DO_STCLASS_AND) {
3113 if (min1) {
3114 cl_and(data->start_class, &accum);
3115 flags &= ~SCF_DO_STCLASS;
3116 }
3117 else {
3118 /* Switch to OR mode: cache the old value of
3119 * data->start_class */
3120 INIT_AND_WITHP;
3121 StructCopy(data->start_class, and_withp,
3122 struct regnode_charclass_class);
3123 flags &= ~SCF_DO_STCLASS_AND;
3124 StructCopy(&accum, data->start_class,
3125 struct regnode_charclass_class);
3126 flags |= SCF_DO_STCLASS_OR;
3127 data->start_class->flags |= ANYOF_EOS;
3128 }
3129 }
3130
3131 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3132 /* demq.
3133
3134 Assuming this was/is a branch we are dealing with: 'scan' now
3135 points at the item that follows the branch sequence, whatever
3136 it is. We now start at the beginning of the sequence and look
3137 for subsequences of
3138
3139 BRANCH->EXACT=>x1
3140 BRANCH->EXACT=>x2
3141 tail
3142
3143 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3144
3145 If we can find such a subsequence we need to turn the first
3146 element into a trie and then add the subsequent branch exact
3147 strings to the trie.
3148
3149 We have two cases
3150
3151 1. patterns where the whole set of branches can be converted.
3152
3153 2. patterns where only a subset can be converted.
3154
3155 In case 1 we can replace the whole set with a single regop
3156 for the trie. In case 2 we need to keep the start and end
3157 branches so
3158
3159 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3160 becomes BRANCH TRIE; BRANCH X;
3161
3162 There is an additional case, that being where there is a
3163 common prefix, which gets split out into an EXACT like node
3164 preceding the TRIE node.
3165
3166 If x(1..n)==tail then we can do a simple trie, if not we make
3167 a "jump" trie, such that when we match the appropriate word
3168 we "jump" to the appropriate tail node. Essentially we turn
3169 a nested if into a case structure of sorts.
3170
3171 */
3172
3173 int made=0;
3174 if (!re_trie_maxbuff) {
3175 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3176 if (!SvIOK(re_trie_maxbuff))
3177 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3178 }
3179 if ( SvIV(re_trie_maxbuff)>=0 ) {
3180 regnode *cur;
3181 regnode *first = (regnode *)NULL;
3182 regnode *last = (regnode *)NULL;
3183 regnode *tail = scan;
3184 U8 optype = 0;
3185 U32 count=0;
3186
3187#ifdef DEBUGGING
3188 SV * const mysv = sv_newmortal(); /* for dumping */
3189#endif
3190 /* var tail is used because there may be a TAIL
3191 regop in the way. Ie, the exacts will point to the
3192 thing following the TAIL, but the last branch will
3193 point at the TAIL. So we advance tail. If we
3194 have nested (?:) we may have to move through several
3195 tails.
3196 */
3197
3198 while ( OP( tail ) == TAIL ) {
3199 /* this is the TAIL generated by (?:) */
3200 tail = regnext( tail );
3201 }
3202
3203
3204 DEBUG_OPTIMISE_r({
3205 regprop(RExC_rx, mysv, tail );
3206 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3207 (int)depth * 2 + 2, "",
3208 "Looking for TRIE'able sequences. Tail node is: ",
3209 SvPV_nolen_const( mysv )
3210 );
3211 });
3212
3213 /*
3214
3215 step through the branches, cur represents each
3216 branch, noper is the first thing to be matched
3217 as part of that branch and noper_next is the
3218 regnext() of that node. if noper is an EXACT
3219 and noper_next is the same as scan (our current
3220 position in the regex) then the EXACT branch is
3221 a possible optimization target. Once we have
3222 two or more consecutive such branches we can
3223 create a trie of the EXACT's contents and stich
3224 it in place. If the sequence represents all of
3225 the branches we eliminate the whole thing and
3226 replace it with a single TRIE. If it is a
3227 subsequence then we need to stitch it in. This
3228 means the first branch has to remain, and needs
3229 to be repointed at the item on the branch chain
3230 following the last branch optimized. This could
3231 be either a BRANCH, in which case the
3232 subsequence is internal, or it could be the
3233 item following the branch sequence in which
3234 case the subsequence is at the end.
3235
3236 */
3237
3238 /* dont use tail as the end marker for this traverse */
3239 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3240 regnode * const noper = NEXTOPER( cur );
3241#if defined(DEBUGGING) || defined(NOJUMPTRIE)
3242 regnode * const noper_next = regnext( noper );
3243#endif
3244
3245 DEBUG_OPTIMISE_r({
3246 regprop(RExC_rx, mysv, cur);
3247 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3248 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3249
3250 regprop(RExC_rx, mysv, noper);
3251 PerlIO_printf( Perl_debug_log, " -> %s",
3252 SvPV_nolen_const(mysv));
3253
3254 if ( noper_next ) {
3255 regprop(RExC_rx, mysv, noper_next );
3256 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3257 SvPV_nolen_const(mysv));
3258 }
3259 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3260 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3261 });
3262 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3263 : PL_regkind[ OP( noper ) ] == EXACT )
3264 || OP(noper) == NOTHING )
3265#ifdef NOJUMPTRIE
3266 && noper_next == tail
3267#endif
3268 && count < U16_MAX)
3269 {
3270 count++;
3271 if ( !first || optype == NOTHING ) {
3272 if (!first) first = cur;
3273 optype = OP( noper );
3274 } else {
3275 last = cur;
3276 }
3277 } else {
3278/*
3279 Currently the trie logic handles case insensitive matching properly only
3280 when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3281 semantics).
3282
3283 If/when this is fixed the following define can be swapped
3284 in below to fully enable trie logic.
3285
3286#define TRIE_TYPE_IS_SAFE 1
3287
3288Note that join_exact() assumes that the other types of EXACTFish nodes are not
3289used in tries, so that would have to be updated if this changed
3290
3291*/
3292#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
3293
3294 if ( last && TRIE_TYPE_IS_SAFE ) {
3295 make_trie( pRExC_state,
3296 startbranch, first, cur, tail, count,
3297 optype, depth+1 );
3298 }
3299 if ( PL_regkind[ OP( noper ) ] == EXACT
3300#ifdef NOJUMPTRIE
3301 && noper_next == tail
3302#endif
3303 ){
3304 count = 1;
3305 first = cur;
3306 optype = OP( noper );
3307 } else {
3308 count = 0;
3309 first = NULL;
3310 optype = 0;
3311 }
3312 last = NULL;
3313 }
3314 }
3315 DEBUG_OPTIMISE_r({
3316 regprop(RExC_rx, mysv, cur);
3317 PerlIO_printf( Perl_debug_log,
3318 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3319 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3320
3321 });
3322
3323 if ( last && TRIE_TYPE_IS_SAFE ) {
3324 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3325#ifdef TRIE_STUDY_OPT
3326 if ( ((made == MADE_EXACT_TRIE &&
3327 startbranch == first)
3328 || ( first_non_open == first )) &&
3329 depth==0 ) {
3330 flags |= SCF_TRIE_RESTUDY;
3331 if ( startbranch == first
3332 && scan == tail )
3333 {
3334 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3335 }
3336 }
3337#endif
3338 }
3339 }
3340
3341 } /* do trie */
3342
3343 }
3344 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3345 scan = NEXTOPER(NEXTOPER(scan));
3346 } else /* single branch is optimized. */
3347 scan = NEXTOPER(scan);
3348 continue;
3349 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3350 scan_frame *newframe = NULL;
3351 I32 paren;
3352 regnode *start;
3353 regnode *end;
3354
3355 if (OP(scan) != SUSPEND) {
3356 /* set the pointer */
3357 if (OP(scan) == GOSUB) {
3358 paren = ARG(scan);
3359 RExC_recurse[ARG2L(scan)] = scan;
3360 start = RExC_open_parens[paren-1];
3361 end = RExC_close_parens[paren-1];
3362 } else {
3363 paren = 0;
3364 start = RExC_rxi->program + 1;
3365 end = RExC_opend;
3366 }
3367 if (!recursed) {
3368 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3369 SAVEFREEPV(recursed);
3370 }
3371 if (!PAREN_TEST(recursed,paren+1)) {
3372 PAREN_SET(recursed,paren+1);
3373 Newx(newframe,1,scan_frame);
3374 } else {
3375 if (flags & SCF_DO_SUBSTR) {
3376 SCAN_COMMIT(pRExC_state,data,minlenp);
3377 data->longest = &(data->longest_float);
3378 }
3379 is_inf = is_inf_internal = 1;
3380 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3381 cl_anything(pRExC_state, data->start_class);
3382 flags &= ~SCF_DO_STCLASS;
3383 }
3384 } else {
3385 Newx(newframe,1,scan_frame);
3386 paren = stopparen;
3387 start = scan+2;
3388 end = regnext(scan);
3389 }
3390 if (newframe) {
3391 assert(start);
3392 assert(end);
3393 SAVEFREEPV(newframe);
3394 newframe->next = regnext(scan);
3395 newframe->last = last;
3396 newframe->stop = stopparen;
3397 newframe->prev = frame;
3398
3399 frame = newframe;
3400 scan = start;
3401 stopparen = paren;
3402 last = end;
3403
3404 continue;
3405 }
3406 }
3407 else if (OP(scan) == EXACT) {
3408 I32 l = STR_LEN(scan);
3409 UV uc;
3410 if (UTF) {
3411 const U8 * const s = (U8*)STRING(scan);
3412 l = utf8_length(s, s + l);
3413 uc = utf8_to_uvchr(s, NULL);
3414 } else {
3415 uc = *((U8*)STRING(scan));
3416 }
3417 min += l;
3418 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3419 /* The code below prefers earlier match for fixed
3420 offset, later match for variable offset. */
3421 if (data->last_end == -1) { /* Update the start info. */
3422 data->last_start_min = data->pos_min;
3423 data->last_start_max = is_inf
3424 ? I32_MAX : data->pos_min + data->pos_delta;
3425 }
3426 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3427 if (UTF)
3428 SvUTF8_on(data->last_found);
3429 {
3430 SV * const sv = data->last_found;
3431 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3432 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3433 if (mg && mg->mg_len >= 0)
3434 mg->mg_len += utf8_length((U8*)STRING(scan),
3435 (U8*)STRING(scan)+STR_LEN(scan));
3436 }
3437 data->last_end = data->pos_min + l;
3438 data->pos_min += l; /* As in the first entry. */
3439 data->flags &= ~SF_BEFORE_EOL;
3440 }
3441 if (flags & SCF_DO_STCLASS_AND) {
3442 /* Check whether it is compatible with what we know already! */
3443 int compat = 1;
3444
3445
3446 /* If compatible, we or it in below. It is compatible if is
3447 * in the bitmp and either 1) its bit or its fold is set, or 2)
3448 * it's for a locale. Even if there isn't unicode semantics
3449 * here, at runtime there may be because of matching against a
3450 * utf8 string, so accept a possible false positive for
3451 * latin1-range folds */
3452 if (uc >= 0x100 ||
3453 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3454 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3455 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3456 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3457 )
3458 {
3459 compat = 0;
3460 }
3461 ANYOF_CLASS_ZERO(data->start_class);
3462 ANYOF_BITMAP_ZERO(data->start_class);
3463 if (compat)
3464 ANYOF_BITMAP_SET(data->start_class, uc);
3465 else if (uc >= 0x100) {
3466 int i;
3467
3468 /* Some Unicode code points fold to the Latin1 range; as
3469 * XXX temporary code, instead of figuring out if this is
3470 * one, just assume it is and set all the start class bits
3471 * that could be some such above 255 code point's fold
3472 * which will generate fals positives. As the code
3473 * elsewhere that does compute the fold settles down, it
3474 * can be extracted out and re-used here */
3475 for (i = 0; i < 256; i++){
3476 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3477 ANYOF_BITMAP_SET(data->start_class, i);
3478 }
3479 }
3480 }
3481 data->start_class->flags &= ~ANYOF_EOS;
3482 if (uc < 0x100)
3483 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3484 }
3485 else if (flags & SCF_DO_STCLASS_OR) {
3486 /* false positive possible if the class is case-folded */
3487 if (uc < 0x100)
3488 ANYOF_BITMAP_SET(data->start_class, uc);
3489 else
3490 data->start_class->flags |= ANYOF_UNICODE_ALL;
3491 data->start_class->flags &= ~ANYOF_EOS;
3492 cl_and(data->start_class, and_withp);
3493 }
3494 flags &= ~SCF_DO_STCLASS;
3495 }
3496 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3497 I32 l = STR_LEN(scan);
3498 UV uc = *((U8*)STRING(scan));
3499
3500 /* Search for fixed substrings supports EXACT only. */
3501 if (flags & SCF_DO_SUBSTR) {
3502 assert(data);
3503 SCAN_COMMIT(pRExC_state, data, minlenp);
3504 }
3505 if (UTF) {
3506 const U8 * const s = (U8 *)STRING(scan);
3507 l = utf8_length(s, s + l);
3508 uc = utf8_to_uvchr(s, NULL);
3509 }
3510 else if (has_exactf_sharp_s) {
3511 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3512 }
3513 min += l - min_subtract;
3514 if (min < 0) {
3515 min = 0;
3516 }
3517 delta += min_subtract;
3518 if (flags & SCF_DO_SUBSTR) {
3519 data->pos_min += l - min_subtract;
3520 if (data->pos_min < 0) {
3521 data->pos_min = 0;
3522 }
3523 data->pos_delta += min_subtract;
3524 if (min_subtract) {
3525 data->longest = &(data->longest_float);
3526 }
3527 }
3528 if (flags & SCF_DO_STCLASS_AND) {
3529 /* Check whether it is compatible with what we know already! */
3530 int compat = 1;
3531 if (uc >= 0x100 ||
3532 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3533 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3534 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3535 {
3536 compat = 0;
3537 }
3538 ANYOF_CLASS_ZERO(data->start_class);
3539 ANYOF_BITMAP_ZERO(data->start_class);
3540 if (compat) {
3541 ANYOF_BITMAP_SET(data->start_class, uc);
3542 data->start_class->flags &= ~ANYOF_EOS;
3543 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3544 if (OP(scan) == EXACTFL) {
3545 /* XXX This set is probably no longer necessary, and
3546 * probably wrong as LOCALE now is on in the initial
3547 * state */
3548 data->start_class->flags |= ANYOF_LOCALE;
3549 }
3550 else {
3551
3552 /* Also set the other member of the fold pair. In case
3553 * that unicode semantics is called for at runtime, use
3554 * the full latin1 fold. (Can't do this for locale,
3555 * because not known until runtime) */
3556 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3557
3558 /* All other (EXACTFL handled above) folds except under
3559 * /iaa that include s, S, and sharp_s also may include
3560 * the others */
3561 if (OP(scan) != EXACTFA) {
3562 if (uc == 's' || uc == 'S') {
3563 ANYOF_BITMAP_SET(data->start_class,
3564 LATIN_SMALL_LETTER_SHARP_S);
3565 }
3566 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3567 ANYOF_BITMAP_SET(data->start_class, 's');
3568 ANYOF_BITMAP_SET(data->start_class, 'S');
3569 }
3570 }
3571 }
3572 }
3573 else if (uc >= 0x100) {
3574 int i;
3575 for (i = 0; i < 256; i++){
3576 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3577 ANYOF_BITMAP_SET(data->start_class, i);
3578 }
3579 }
3580 }
3581 }
3582 else if (flags & SCF_DO_STCLASS_OR) {
3583 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3584 /* false positive possible if the class is case-folded.
3585 Assume that the locale settings are the same... */
3586 if (uc < 0x100) {
3587 ANYOF_BITMAP_SET(data->start_class, uc);
3588 if (OP(scan) != EXACTFL) {
3589
3590 /* And set the other member of the fold pair, but
3591 * can't do that in locale because not known until
3592 * run-time */
3593 ANYOF_BITMAP_SET(data->start_class,
3594 PL_fold_latin1[uc]);
3595
3596 /* All folds except under /iaa that include s, S,
3597 * and sharp_s also may include the others */
3598 if (OP(scan) != EXACTFA) {
3599 if (uc == 's' || uc == 'S') {
3600 ANYOF_BITMAP_SET(data->start_class,
3601 LATIN_SMALL_LETTER_SHARP_S);
3602 }
3603 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3604 ANYOF_BITMAP_SET(data->start_class, 's');
3605 ANYOF_BITMAP_SET(data->start_class, 'S');
3606 }
3607 }
3608 }
3609 }
3610 data->start_class->flags &= ~ANYOF_EOS;
3611 }
3612 cl_and(data->start_class, and_withp);
3613 }
3614 flags &= ~SCF_DO_STCLASS;
3615 }
3616 else if (REGNODE_VARIES(OP(scan))) {
3617 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3618 I32 f = flags, pos_before = 0;
3619 regnode * const oscan = scan;
3620 struct regnode_charclass_class this_class;
3621 struct regnode_charclass_class *oclass = NULL;
3622 I32 next_is_eval = 0;
3623
3624 switch (PL_regkind[OP(scan)]) {
3625 case WHILEM: /* End of (?:...)* . */
3626 scan = NEXTOPER(scan);
3627 goto finish;
3628 case PLUS:
3629 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3630 next = NEXTOPER(scan);
3631 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3632 mincount = 1;
3633 maxcount = REG_INFTY;
3634 next = regnext(scan);
3635 scan = NEXTOPER(scan);
3636 goto do_curly;
3637 }
3638 }
3639 if (flags & SCF_DO_SUBSTR)
3640 data->pos_min++;
3641 min++;
3642 /* Fall through. */
3643 case STAR:
3644 if (flags & SCF_DO_STCLASS) {
3645 mincount = 0;
3646 maxcount = REG_INFTY;
3647 next = regnext(scan);
3648 scan = NEXTOPER(scan);
3649 goto do_curly;
3650 }
3651 is_inf = is_inf_internal = 1;
3652 scan = regnext(scan);
3653 if (flags & SCF_DO_SUBSTR) {
3654 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3655 data->longest = &(data->longest_float);
3656 }
3657 goto optimize_curly_tail;
3658 case CURLY:
3659 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3660 && (scan->flags == stopparen))
3661 {
3662 mincount = 1;
3663 maxcount = 1;
3664 } else {
3665 mincount = ARG1(scan);
3666 maxcount = ARG2(scan);
3667 }
3668 next = regnext(scan);
3669 if (OP(scan) == CURLYX) {
3670 I32 lp = (data ? *(data->last_closep) : 0);
3671 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3672 }
3673 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3674 next_is_eval = (OP(scan) == EVAL);
3675 do_curly:
3676 if (flags & SCF_DO_SUBSTR) {
3677 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3678 pos_before = data->pos_min;
3679 }
3680 if (data) {
3681 fl = data->flags;
3682 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3683 if (is_inf)
3684 data->flags |= SF_IS_INF;
3685 }
3686 if (flags & SCF_DO_STCLASS) {
3687 cl_init(pRExC_state, &this_class);
3688 oclass = data->start_class;
3689 data->start_class = &this_class;
3690 f |= SCF_DO_STCLASS_AND;
3691 f &= ~SCF_DO_STCLASS_OR;
3692 }
3693 /* Exclude from super-linear cache processing any {n,m}
3694 regops for which the combination of input pos and regex
3695 pos is not enough information to determine if a match
3696 will be possible.
3697
3698 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3699 regex pos at the \s*, the prospects for a match depend not
3700 only on the input position but also on how many (bar\s*)
3701 repeats into the {4,8} we are. */
3702 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3703 f &= ~SCF_WHILEM_VISITED_POS;
3704
3705 /* This will finish on WHILEM, setting scan, or on NULL: */
3706 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3707 last, data, stopparen, recursed, NULL,
3708 (mincount == 0
3709 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3710
3711 if (flags & SCF_DO_STCLASS)
3712 data->start_class = oclass;
3713 if (mincount == 0 || minnext == 0) {
3714 if (flags & SCF_DO_STCLASS_OR) {
3715 cl_or(pRExC_state, data->start_class, &this_class);
3716 }
3717 else if (flags & SCF_DO_STCLASS_AND) {
3718 /* Switch to OR mode: cache the old value of
3719 * data->start_class */
3720 INIT_AND_WITHP;
3721 StructCopy(data->start_class, and_withp,
3722 struct regnode_charclass_class);
3723 flags &= ~SCF_DO_STCLASS_AND;
3724 StructCopy(&this_class, data->start_class,
3725 struct regnode_charclass_class);
3726 flags |= SCF_DO_STCLASS_OR;
3727 data->start_class->flags |= ANYOF_EOS;
3728 }
3729 } else { /* Non-zero len */
3730 if (flags & SCF_DO_STCLASS_OR) {
3731 cl_or(pRExC_state, data->start_class, &this_class);
3732 cl_and(data->start_class, and_withp);
3733 }
3734 else if (flags & SCF_DO_STCLASS_AND)
3735 cl_and(data->start_class, &this_class);
3736 flags &= ~SCF_DO_STCLASS;
3737 }
3738 if (!scan) /* It was not CURLYX, but CURLY. */
3739 scan = next;
3740 if ( /* ? quantifier ok, except for (?{ ... }) */
3741 (next_is_eval || !(mincount == 0 && maxcount == 1))
3742 && (minnext == 0) && (deltanext == 0)
3743 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3744 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3745 {
3746 ckWARNreg(RExC_parse,
3747 "Quantifier unexpected on zero-length expression");
3748 }
3749
3750 min += minnext * mincount;
3751 is_inf_internal |= ((maxcount == REG_INFTY
3752 && (minnext + deltanext) > 0)
3753 || deltanext == I32_MAX);
3754 is_inf |= is_inf_internal;
3755 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3756
3757 /* Try powerful optimization CURLYX => CURLYN. */
3758 if ( OP(oscan) == CURLYX && data
3759 && data->flags & SF_IN_PAR
3760 && !(data->flags & SF_HAS_EVAL)
3761 && !deltanext && minnext == 1 ) {
3762 /* Try to optimize to CURLYN. */
3763 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3764 regnode * const nxt1 = nxt;
3765#ifdef DEBUGGING
3766 regnode *nxt2;
3767#endif
3768
3769 /* Skip open. */
3770 nxt = regnext(nxt);
3771 if (!REGNODE_SIMPLE(OP(nxt))
3772 && !(PL_regkind[OP(nxt)] == EXACT
3773 && STR_LEN(nxt) == 1))
3774 goto nogo;
3775#ifdef DEBUGGING
3776 nxt2 = nxt;
3777#endif
3778 nxt = regnext(nxt);
3779 if (OP(nxt) != CLOSE)
3780 goto nogo;
3781 if (RExC_open_parens) {
3782 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3783 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3784 }
3785 /* Now we know that nxt2 is the only contents: */
3786 oscan->flags = (U8)ARG(nxt);
3787 OP(oscan) = CURLYN;
3788 OP(nxt1) = NOTHING; /* was OPEN. */
3789
3790#ifdef DEBUGGING
3791 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3792 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3793 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3794 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3795 OP(nxt + 1) = OPTIMIZED; /* was count. */
3796 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3797#endif
3798 }
3799 nogo:
3800
3801 /* Try optimization CURLYX => CURLYM. */
3802 if ( OP(oscan) == CURLYX && data
3803 && !(data->flags & SF_HAS_PAR)
3804 && !(data->flags & SF_HAS_EVAL)
3805 && !deltanext /* atom is fixed width */
3806 && minnext != 0 /* CURLYM can't handle zero width */
3807 ) {
3808 /* XXXX How to optimize if data == 0? */
3809 /* Optimize to a simpler form. */
3810 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3811 regnode *nxt2;
3812
3813 OP(oscan) = CURLYM;
3814 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3815 && (OP(nxt2) != WHILEM))
3816 nxt = nxt2;
3817 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3818 /* Need to optimize away parenths. */
3819 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3820 /* Set the parenth number. */
3821 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3822
3823 oscan->flags = (U8)ARG(nxt);
3824 if (RExC_open_parens) {
3825 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3826 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3827 }
3828 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3829 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3830
3831#ifdef DEBUGGING
3832 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3833 OP(nxt + 1) = OPTIMIZED; /* was count. */
3834 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3835 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3836#endif
3837#if 0
3838 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3839 regnode *nnxt = regnext(nxt1);
3840 if (nnxt == nxt) {
3841 if (reg_off_by_arg[OP(nxt1)])
3842 ARG_SET(nxt1, nxt2 - nxt1);
3843 else if (nxt2 - nxt1 < U16_MAX)
3844 NEXT_OFF(nxt1) = nxt2 - nxt1;
3845 else
3846 OP(nxt) = NOTHING; /* Cannot beautify */
3847 }
3848 nxt1 = nnxt;
3849 }
3850#endif
3851 /* Optimize again: */
3852 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3853 NULL, stopparen, recursed, NULL, 0,depth+1);
3854 }
3855 else
3856 oscan->flags = 0;
3857 }
3858 else if ((OP(oscan) == CURLYX)
3859 && (flags & SCF_WHILEM_VISITED_POS)
3860 /* See the comment on a similar expression above.
3861 However, this time it's not a subexpression
3862 we care about, but the expression itself. */
3863 && (maxcount == REG_INFTY)
3864 && data && ++data->whilem_c < 16) {
3865 /* This stays as CURLYX, we can put the count/of pair. */
3866 /* Find WHILEM (as in regexec.c) */
3867 regnode *nxt = oscan + NEXT_OFF(oscan);
3868
3869 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3870 nxt += ARG(nxt);
3871 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3872 | (RExC_whilem_seen << 4)); /* On WHILEM */
3873 }
3874 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3875 pars++;
3876 if (flags & SCF_DO_SUBSTR) {
3877 SV *last_str = NULL;
3878 int counted = mincount != 0;
3879
3880 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3881#if defined(SPARC64_GCC_WORKAROUND)
3882 I32 b = 0;
3883 STRLEN l = 0;
3884 const char *s = NULL;
3885 I32 old = 0;
3886
3887 if (pos_before >= data->last_start_min)
3888 b = pos_before;
3889 else
3890 b = data->last_start_min;
3891
3892 l = 0;
3893 s = SvPV_const(data->last_found, l);
3894 old = b - data->last_start_min;
3895
3896#else
3897 I32 b = pos_before >= data->last_start_min
3898 ? pos_before : data->last_start_min;
3899 STRLEN l;
3900 const char * const s = SvPV_const(data->last_found, l);
3901 I32 old = b - data->last_start_min;
3902#endif
3903
3904 if (UTF)
3905 old = utf8_hop((U8*)s, old) - (U8*)s;
3906 l -= old;
3907 /* Get the added string: */
3908 last_str = newSVpvn_utf8(s + old, l, UTF);
3909 if (deltanext == 0 && pos_before == b) {
3910 /* What was added is a constant string */
3911 if (mincount > 1) {
3912 SvGROW(last_str, (mincount * l) + 1);
3913 repeatcpy(SvPVX(last_str) + l,
3914 SvPVX_const(last_str), l, mincount - 1);
3915 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3916 /* Add additional parts. */
3917 SvCUR_set(data->last_found,
3918 SvCUR(data->last_found) - l);
3919 sv_catsv(data->last_found, last_str);
3920 {
3921 SV * sv = data->last_found;
3922 MAGIC *mg =
3923 SvUTF8(sv) && SvMAGICAL(sv) ?
3924 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3925 if (mg && mg->mg_len >= 0)
3926 mg->mg_len += CHR_SVLEN(last_str) - l;
3927 }
3928 data->last_end += l * (mincount - 1);
3929 }
3930 } else {
3931 /* start offset must point into the last copy */
3932 data->last_start_min += minnext * (mincount - 1);
3933 data->last_start_max += is_inf ? I32_MAX
3934 : (maxcount - 1) * (minnext + data->pos_delta);
3935 }
3936 }
3937 /* It is counted once already... */
3938 data->pos_min += minnext * (mincount - counted);
3939 data->pos_delta += - counted * deltanext +
3940 (minnext + deltanext) * maxcount - minnext * mincount;
3941 if (mincount != maxcount) {
3942 /* Cannot extend fixed substrings found inside
3943 the group. */
3944 SCAN_COMMIT(pRExC_state,data,minlenp);
3945 if (mincount && last_str) {
3946 SV * const sv = data->last_found;
3947 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3948 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3949
3950 if (mg)
3951 mg->mg_len = -1;
3952 sv_setsv(sv, last_str);
3953 data->last_end = data->pos_min;
3954 data->last_start_min =
3955 data->pos_min - CHR_SVLEN(last_str);
3956 data->last_start_max = is_inf
3957 ? I32_MAX
3958 : data->pos_min + data->pos_delta
3959 - CHR_SVLEN(last_str);
3960 }
3961 data->longest = &(data->longest_float);
3962 }
3963 SvREFCNT_dec(last_str);
3964 }
3965 if (data && (fl & SF_HAS_EVAL))
3966 data->flags |= SF_HAS_EVAL;
3967 optimize_curly_tail:
3968 if (OP(oscan) != CURLYX) {
3969 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3970 && NEXT_OFF(next))
3971 NEXT_OFF(oscan) += NEXT_OFF(next);
3972 }
3973 continue;
3974 default: /* REF, ANYOFV, and CLUMP only? */
3975 if (flags & SCF_DO_SUBSTR) {
3976 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3977 data->longest = &(data->longest_float);
3978 }
3979 is_inf = is_inf_internal = 1;
3980 if (flags & SCF_DO_STCLASS_OR)
3981 cl_anything(pRExC_state, data->start_class);
3982 flags &= ~SCF_DO_STCLASS;
3983 break;
3984 }
3985 }
3986 else if (OP(scan) == LNBREAK) {
3987 if (flags & SCF_DO_STCLASS) {
3988 int value = 0;
3989 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3990 if (flags & SCF_DO_STCLASS_AND) {
3991 for (value = 0; value < 256; value++)
3992 if (!is_VERTWS_cp(value))
3993 ANYOF_BITMAP_CLEAR(data->start_class, value);
3994 }
3995 else {
3996 for (value = 0; value < 256; value++)
3997 if (is_VERTWS_cp(value))
3998 ANYOF_BITMAP_SET(data->start_class, value);
3999 }
4000 if (flags & SCF_DO_STCLASS_OR)
4001 cl_and(data->start_class, and_withp);
4002 flags &= ~SCF_DO_STCLASS;
4003 }
4004 min += 1;
4005 delta += 1;
4006 if (flags & SCF_DO_SUBSTR) {
4007 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4008 data->pos_min += 1;
4009 data->pos_delta += 1;
4010 data->longest = &(data->longest_float);
4011 }
4012 }
4013 else if (REGNODE_SIMPLE(OP(scan))) {
4014 int value = 0;
4015
4016 if (flags & SCF_DO_SUBSTR) {
4017 SCAN_COMMIT(pRExC_state,data,minlenp);
4018 data->pos_min++;
4019 }
4020 min++;
4021 if (flags & SCF_DO_STCLASS) {
4022 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4023
4024 /* Some of the logic below assumes that switching
4025 locale on will only add false positives. */
4026 switch (PL_regkind[OP(scan)]) {
4027 case SANY:
4028 default:
4029 do_default:
4030 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4031 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4032 cl_anything(pRExC_state, data->start_class);
4033 break;
4034 case REG_ANY:
4035 if (OP(scan) == SANY)
4036 goto do_default;
4037 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4038 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4039 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4040 cl_anything(pRExC_state, data->start_class);
4041 }
4042 if (flags & SCF_DO_STCLASS_AND || !value)
4043 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4044 break;
4045 case ANYOF:
4046 if (flags & SCF_DO_STCLASS_AND)
4047 cl_and(data->start_class,
4048 (struct regnode_charclass_class*)scan);
4049 else
4050 cl_or(pRExC_state, data->start_class,
4051 (struct regnode_charclass_class*)scan);
4052 break;
4053 case ALNUM:
4054 if (flags & SCF_DO_STCLASS_AND) {
4055 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4056 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4057 if (OP(scan) == ALNUMU) {
4058 for (value = 0; value < 256; value++) {
4059 if (!isWORDCHAR_L1(value)) {
4060 ANYOF_BITMAP_CLEAR(data->start_class, value);
4061 }
4062 }
4063 } else {
4064 for (value = 0; value < 256; value++) {
4065 if (!isALNUM(value)) {
4066 ANYOF_BITMAP_CLEAR(data->start_class, value);
4067 }
4068 }
4069 }
4070 }
4071 }
4072 else {
4073 if (data->start_class->flags & ANYOF_LOCALE)
4074 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4075
4076 /* Even if under locale, set the bits for non-locale
4077 * in case it isn't a true locale-node. This will
4078 * create false positives if it truly is locale */
4079 if (OP(scan) == ALNUMU) {
4080 for (value = 0; value < 256; value++) {
4081 if (isWORDCHAR_L1(value)) {
4082 ANYOF_BITMAP_SET(data->start_class, value);
4083 }
4084 }
4085 } else {
4086 for (value = 0; value < 256; value++) {
4087 if (isALNUM(value)) {
4088 ANYOF_BITMAP_SET(data->start_class, value);
4089 }
4090 }
4091 }
4092 }
4093 break;
4094 case NALNUM:
4095 if (flags & SCF_DO_STCLASS_AND) {
4096 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4098 if (OP(scan) == NALNUMU) {
4099 for (value = 0; value < 256; value++) {
4100 if (isWORDCHAR_L1(value)) {
4101 ANYOF_BITMAP_CLEAR(data->start_class, value);
4102 }
4103 }
4104 } else {
4105 for (value = 0; value < 256; value++) {
4106 if (isALNUM(value)) {
4107 ANYOF_BITMAP_CLEAR(data->start_class, value);
4108 }
4109 }
4110 }
4111 }
4112 }
4113 else {
4114 if (data->start_class->flags & ANYOF_LOCALE)
4115 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4116
4117 /* Even if under locale, set the bits for non-locale in
4118 * case it isn't a true locale-node. This will create
4119 * false positives if it truly is locale */
4120 if (OP(scan) == NALNUMU) {
4121 for (value = 0; value < 256; value++) {
4122 if (! isWORDCHAR_L1(value)) {
4123 ANYOF_BITMAP_SET(data->start_class, value);
4124 }
4125 }
4126 } else {
4127 for (value = 0; value < 256; value++) {
4128 if (! isALNUM(value)) {
4129 ANYOF_BITMAP_SET(data->start_class, value);
4130 }
4131 }
4132 }
4133 }
4134 break;
4135 case SPACE:
4136 if (flags & SCF_DO_STCLASS_AND) {
4137 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4138 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4139 if (OP(scan) == SPACEU) {
4140 for (value = 0; value < 256; value++) {
4141 if (!isSPACE_L1(value)) {
4142 ANYOF_BITMAP_CLEAR(data->start_class, value);
4143 }
4144 }
4145 } else {
4146 for (value = 0; value < 256; value++) {
4147 if (!isSPACE(value)) {
4148 ANYOF_BITMAP_CLEAR(data->start_class, value);
4149 }
4150 }
4151 }
4152 }
4153 }
4154 else {
4155 if (data->start_class->flags & ANYOF_LOCALE) {
4156 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4157 }
4158 if (OP(scan) == SPACEU) {
4159 for (value = 0; value < 256; value++) {
4160 if (isSPACE_L1(value)) {
4161 ANYOF_BITMAP_SET(data->start_class, value);
4162 }
4163 }
4164 } else {
4165 for (value = 0; value < 256; value++) {
4166 if (isSPACE(value)) {
4167 ANYOF_BITMAP_SET(data->start_class, value);
4168 }
4169 }
4170 }
4171 }
4172 break;
4173 case NSPACE:
4174 if (flags & SCF_DO_STCLASS_AND) {
4175 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4176 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4177 if (OP(scan) == NSPACEU) {
4178 for (value = 0; value < 256; value++) {
4179 if (isSPACE_L1(value)) {
4180 ANYOF_BITMAP_CLEAR(data->start_class, value);
4181 }
4182 }
4183 } else {
4184 for (value = 0; value < 256; value++) {
4185 if (isSPACE(value)) {
4186 ANYOF_BITMAP_CLEAR(data->start_class, value);
4187 }
4188 }
4189 }
4190 }
4191 }
4192 else {
4193 if (data->start_class->flags & ANYOF_LOCALE)
4194 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4195 if (OP(scan) == NSPACEU) {
4196 for (value = 0; value < 256; value++) {
4197 if (!isSPACE_L1(value)) {
4198 ANYOF_BITMAP_SET(data->start_class, value);
4199 }
4200 }
4201 }
4202 else {
4203 for (value = 0; value < 256; value++) {
4204 if (!isSPACE(value)) {
4205 ANYOF_BITMAP_SET(data->start_class, value);
4206 }
4207 }
4208 }
4209 }
4210 break;
4211 case DIGIT:
4212 if (flags & SCF_DO_STCLASS_AND) {
4213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4215 for (value = 0; value < 256; value++)
4216 if (!isDIGIT(value))
4217 ANYOF_BITMAP_CLEAR(data->start_class, value);
4218 }
4219 }
4220 else {
4221 if (data->start_class->flags & ANYOF_LOCALE)
4222 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4223 for (value = 0; value < 256; value++)
4224 if (isDIGIT(value))
4225 ANYOF_BITMAP_SET(data->start_class, value);
4226 }
4227 break;
4228 case NDIGIT:
4229 if (flags & SCF_DO_STCLASS_AND) {
4230 if (!(data->start_class->flags & ANYOF_LOCALE))
4231 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4232 for (value = 0; value < 256; value++)
4233 if (isDIGIT(value))
4234 ANYOF_BITMAP_CLEAR(data->start_class, value);
4235 }
4236 else {
4237 if (data->start_class->flags & ANYOF_LOCALE)
4238 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4239 for (value = 0; value < 256; value++)
4240 if (!isDIGIT(value))
4241 ANYOF_BITMAP_SET(data->start_class, value);
4242 }
4243 break;
4244 CASE_SYNST_FNC(VERTWS);
4245 CASE_SYNST_FNC(HORIZWS);
4246
4247 }
4248 if (flags & SCF_DO_STCLASS_OR)
4249 cl_and(data->start_class, and_withp);
4250 flags &= ~SCF_DO_STCLASS;
4251 }
4252 }
4253 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4254 data->flags |= (OP(scan) == MEOL
4255 ? SF_BEFORE_MEOL
4256 : SF_BEFORE_SEOL);
4257 }
4258 else if ( PL_regkind[OP(scan)] == BRANCHJ
4259 /* Lookbehind, or need to calculate parens/evals/stclass: */
4260 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4261 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4262 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4263 || OP(scan) == UNLESSM )
4264 {
4265 /* Negative Lookahead/lookbehind
4266 In this case we can't do fixed string optimisation.
4267 */
4268
4269 I32 deltanext, minnext, fake = 0;
4270 regnode *nscan;
4271 struct regnode_charclass_class intrnl;
4272 int f = 0;
4273
4274 data_fake.flags = 0;
4275 if (data) {
4276 data_fake.whilem_c = data->whilem_c;
4277 data_fake.last_closep = data->last_closep;
4278 }
4279 else
4280 data_fake.last_closep = &fake;
4281 data_fake.pos_delta = delta;
4282 if ( flags & SCF_DO_STCLASS && !scan->flags
4283 && OP(scan) == IFMATCH ) { /* Lookahead */
4284 cl_init(pRExC_state, &intrnl);
4285 data_fake.start_class = &intrnl;
4286 f |= SCF_DO_STCLASS_AND;
4287 }
4288 if (flags & SCF_WHILEM_VISITED_POS)
4289 f |= SCF_WHILEM_VISITED_POS;
4290 next = regnext(scan);
4291 nscan = NEXTOPER(NEXTOPER(scan));
4292 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4293 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4294 if (scan->flags) {
4295 if (deltanext) {
4296 FAIL("Variable length lookbehind not implemented");
4297 }
4298 else if (minnext > (I32)U8_MAX) {
4299 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4300 }
4301 scan->flags = (U8)minnext;
4302 }
4303 if (data) {
4304 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4305 pars++;
4306 if (data_fake.flags & SF_HAS_EVAL)
4307 data->flags |= SF_HAS_EVAL;
4308 data->whilem_c = data_fake.whilem_c;
4309 }
4310 if (f & SCF_DO_STCLASS_AND) {
4311 if (flags & SCF_DO_STCLASS_OR) {
4312 /* OR before, AND after: ideally we would recurse with
4313 * data_fake to get the AND applied by study of the
4314 * remainder of the pattern, and then derecurse;
4315 * *** HACK *** for now just treat as "no information".
4316 * See [perl #56690].
4317 */
4318 cl_init(pRExC_state, data->start_class);
4319 } else {
4320 /* AND before and after: combine and continue */
4321 const int was = (data->start_class->flags & ANYOF_EOS);
4322
4323 cl_and(data->start_class, &intrnl);
4324 if (was)
4325 data->start_class->flags |= ANYOF_EOS;
4326 }
4327 }
4328 }
4329#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4330 else {
4331 /* Positive Lookahead/lookbehind
4332 In this case we can do fixed string optimisation,
4333 but we must be careful about it. Note in the case of
4334 lookbehind the positions will be offset by the minimum
4335 length of the pattern, something we won't know about
4336 until after the recurse.
4337 */
4338 I32 deltanext, fake = 0;
4339 regnode *nscan;
4340 struct regnode_charclass_class intrnl;
4341 int f = 0;
4342 /* We use SAVEFREEPV so that when the full compile
4343 is finished perl will clean up the allocated
4344 minlens when it's all done. This way we don't
4345 have to worry about freeing them when we know
4346 they wont be used, which would be a pain.
4347 */
4348 I32 *minnextp;
4349 Newx( minnextp, 1, I32 );
4350 SAVEFREEPV(minnextp);
4351
4352 if (data) {
4353 StructCopy(data, &data_fake, scan_data_t);
4354 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4355 f |= SCF_DO_SUBSTR;
4356 if (scan->flags)
4357 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4358 data_fake.last_found=newSVsv(data->last_found);
4359 }
4360 }
4361 else
4362 data_fake.last_closep = &fake;
4363 data_fake.flags = 0;
4364 data_fake.pos_delta = delta;
4365 if (is_inf)
4366 data_fake.flags |= SF_IS_INF;
4367 if ( flags & SCF_DO_STCLASS && !scan->flags
4368 && OP(scan) == IFMATCH ) { /* Lookahead */
4369 cl_init(pRExC_state, &intrnl);
4370 data_fake.start_class = &intrnl;
4371 f |= SCF_DO_STCLASS_AND;
4372 }
4373 if (flags & SCF_WHILEM_VISITED_POS)
4374 f |= SCF_WHILEM_VISITED_POS;
4375 next = regnext(scan);
4376 nscan = NEXTOPER(NEXTOPER(scan));
4377
4378 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4379 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4380 if (scan->flags) {
4381 if (deltanext) {
4382 FAIL("Variable length lookbehind not implemented");
4383 }
4384 else if (*minnextp > (I32)U8_MAX) {
4385 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4386 }
4387 scan->flags = (U8)*minnextp;
4388 }
4389
4390 *minnextp += min;
4391
4392 if (f & SCF_DO_STCLASS_AND) {
4393 const int was = (data->start_class->flags & ANYOF_EOS);
4394
4395 cl_and(data->start_class, &intrnl);
4396 if (was)
4397 data->start_class->flags |= ANYOF_EOS;
4398 }
4399 if (data) {
4400 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4401 pars++;
4402 if (data_fake.flags & SF_HAS_EVAL)
4403 data->flags |= SF_HAS_EVAL;
4404 data->whilem_c = data_fake.whilem_c;
4405 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4406 if (RExC_rx->minlen<*minnextp)
4407 RExC_rx->minlen=*minnextp;
4408 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4409 SvREFCNT_dec(data_fake.last_found);
4410
4411 if ( data_fake.minlen_fixed != minlenp )
4412 {
4413 data->offset_fixed= data_fake.offset_fixed;
4414 data->minlen_fixed= data_fake.minlen_fixed;
4415 data->lookbehind_fixed+= scan->flags;
4416 }
4417 if ( data_fake.minlen_float != minlenp )
4418 {
4419 data->minlen_float= data_fake.minlen_float;
4420 data->offset_float_min=data_fake.offset_float_min;
4421 data->offset_float_max=data_fake.offset_float_max;
4422 data->lookbehind_float+= scan->flags;
4423 }
4424 }
4425 }
4426
4427
4428 }
4429#endif
4430 }
4431 else if (OP(scan) == OPEN) {
4432 if (stopparen != (I32)ARG(scan))
4433 pars++;
4434 }
4435 else if (OP(scan) == CLOSE) {
4436 if (stopparen == (I32)ARG(scan)) {
4437 break;
4438 }
4439 if ((I32)ARG(scan) == is_par) {
4440 next = regnext(scan);
4441
4442 if ( next && (OP(next) != WHILEM) && next < last)
4443 is_par = 0; /* Disable optimization */
4444 }
4445 if (data)
4446 *(data->last_closep) = ARG(scan);
4447 }
4448 else if (OP(scan) == EVAL) {
4449 if (data)
4450 data->flags |= SF_HAS_EVAL;
4451 }
4452 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4453 if (flags & SCF_DO_SUBSTR) {
4454 SCAN_COMMIT(pRExC_state,data,minlenp);
4455 flags &= ~SCF_DO_SUBSTR;
4456 }
4457 if (data && OP(scan)==ACCEPT) {
4458 data->flags |= SCF_SEEN_ACCEPT;
4459 if (stopmin > min)
4460 stopmin = min;
4461 }
4462 }
4463 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4464 {
4465 if (flags & SCF_DO_SUBSTR) {
4466 SCAN_COMMIT(pRExC_state,data,minlenp);
4467 data->longest = &(data->longest_float);
4468 }
4469 is_inf = is_inf_internal = 1;
4470 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4471 cl_anything(pRExC_state, data->start_class);
4472 flags &= ~SCF_DO_STCLASS;
4473 }
4474 else if (OP(scan) == GPOS) {
4475 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4476 !(delta || is_inf || (data && data->pos_delta)))
4477 {
4478 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4479 RExC_rx->extflags |= RXf_ANCH_GPOS;
4480 if (RExC_rx->gofs < (U32)min)
4481 RExC_rx->gofs = min;
4482 } else {
4483 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4484 RExC_rx->gofs = 0;
4485 }
4486 }
4487#ifdef TRIE_STUDY_OPT
4488#ifdef FULL_TRIE_STUDY
4489 else if (PL_regkind[OP(scan)] == TRIE) {
4490 /* NOTE - There is similar code to this block above for handling
4491 BRANCH nodes on the initial study. If you change stuff here
4492 check there too. */
4493 regnode *trie_node= scan;
4494 regnode *tail= regnext(scan);
4495 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4496 I32 max1 = 0, min1 = I32_MAX;
4497 struct regnode_charclass_class accum;
4498
4499 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4500 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4501 if (flags & SCF_DO_STCLASS)
4502 cl_init_zero(pRExC_state, &accum);
4503
4504 if (!trie->jump) {
4505 min1= trie->minlen;
4506 max1= trie->maxlen;
4507 } else {
4508 const regnode *nextbranch= NULL;
4509 U32 word;
4510
4511 for ( word=1 ; word <= trie->wordcount ; word++)
4512 {
4513 I32 deltanext=0, minnext=0, f = 0, fake;
4514 struct regnode_charclass_class this_class;
4515
4516 data_fake.flags = 0;
4517 if (data) {
4518 data_fake.whilem_c = data->whilem_c;
4519 data_fake.last_closep = data->last_closep;
4520 }
4521 else
4522 data_fake.last_closep = &fake;
4523 data_fake.pos_delta = delta;
4524 if (flags & SCF_DO_STCLASS) {
4525 cl_init(pRExC_state, &this_class);
4526 data_fake.start_class = &this_class;
4527 f = SCF_DO_STCLASS_AND;
4528 }
4529 if (flags & SCF_WHILEM_VISITED_POS)
4530 f |= SCF_WHILEM_VISITED_POS;
4531
4532 if (trie->jump[word]) {
4533 if (!nextbranch)
4534 nextbranch = trie_node + trie->jump[0];
4535 scan= trie_node + trie->jump[word];
4536 /* We go from the jump point to the branch that follows
4537 it. Note this means we need the vestigal unused branches
4538 even though they arent otherwise used.
4539 */
4540 minnext = study_chunk(pRExC_state, &scan, minlenp,
4541 &deltanext, (regnode *)nextbranch, &data_fake,
4542 stopparen, recursed, NULL, f,depth+1);
4543 }
4544 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4545 nextbranch= regnext((regnode*)nextbranch);
4546
4547 if (min1 > (I32)(minnext + trie->minlen))
4548 min1 = minnext + trie->minlen;
4549 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4550 max1 = minnext + deltanext + trie->maxlen;
4551 if (deltanext == I32_MAX)
4552 is_inf = is_inf_internal = 1;
4553
4554 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4555 pars++;
4556 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4557 if ( stopmin > min + min1)
4558 stopmin = min + min1;
4559 flags &= ~SCF_DO_SUBSTR;
4560 if (data)
4561 data->flags |= SCF_SEEN_ACCEPT;
4562 }
4563 if (data) {
4564 if (data_fake.flags & SF_HAS_EVAL)
4565 data->flags |= SF_HAS_EVAL;
4566 data->whilem_c = data_fake.whilem_c;
4567 }
4568 if (flags & SCF_DO_STCLASS)
4569 cl_or(pRExC_state, &accum, &this_class);
4570 }
4571 }
4572 if (flags & SCF_DO_SUBSTR) {
4573 data->pos_min += min1;
4574 data->pos_delta += max1 - min1;
4575 if (max1 != min1 || is_inf)
4576 data->longest = &(data->longest_float);
4577 }
4578 min += min1;
4579 delta += max1 - min1;
4580 if (flags & SCF_DO_STCLASS_OR) {
4581 cl_or(pRExC_state, data->start_class, &accum);
4582 if (min1) {
4583 cl_and(data->start_class, and_withp);
4584 flags &= ~SCF_DO_STCLASS;
4585 }
4586 }
4587 else if (flags & SCF_DO_STCLASS_AND) {
4588 if (min1) {
4589 cl_and(data->start_class, &accum);
4590 flags &= ~SCF_DO_STCLASS;
4591 }
4592 else {
4593 /* Switch to OR mode: cache the old value of
4594 * data->start_class */
4595 INIT_AND_WITHP;
4596 StructCopy(data->start_class, and_withp,
4597 struct regnode_charclass_class);
4598 flags &= ~SCF_DO_STCLASS_AND;
4599 StructCopy(&accum, data->start_class,
4600 struct regnode_charclass_class);
4601 flags |= SCF_DO_STCLASS_OR;
4602 data->start_class->flags |= ANYOF_EOS;
4603 }
4604 }
4605 scan= tail;
4606 continue;
4607 }
4608#else
4609 else if (PL_regkind[OP(scan)] == TRIE) {
4610 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4611 U8*bang=NULL;
4612
4613 min += trie->minlen;
4614 delta += (trie->maxlen - trie->minlen);
4615 flags &= ~SCF_DO_STCLASS; /* xxx */
4616 if (flags & SCF_DO_SUBSTR) {
4617 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4618 data->pos_min += trie->minlen;
4619 data->pos_delta += (trie->maxlen - trie->minlen);
4620 if (trie->maxlen != trie->minlen)
4621 data->longest = &(data->longest_float);
4622 }
4623 if (trie->jump) /* no more substrings -- for now /grr*/
4624 flags &= ~SCF_DO_SUBSTR;
4625 }
4626#endif /* old or new */
4627#endif /* TRIE_STUDY_OPT */
4628
4629 /* Else: zero-length, ignore. */
4630 scan = regnext(scan);
4631 }
4632 if (frame) {
4633 last = frame->last;
4634 scan = frame->next;
4635 stopparen = frame->stop;
4636 frame = frame->prev;
4637 goto fake_study_recurse;
4638 }
4639
4640 finish:
4641 assert(!frame);
4642 DEBUG_STUDYDATA("pre-fin:",data,depth);
4643
4644 *scanp = scan;
4645 *deltap = is_inf_internal ? I32_MAX : delta;
4646 if (flags & SCF_DO_SUBSTR && is_inf)
4647 data->pos_delta = I32_MAX - data->pos_min;
4648 if (is_par > (I32)U8_MAX)
4649 is_par = 0;
4650 if (is_par && pars==1 && data) {
4651 data->flags |= SF_IN_PAR;
4652 data->flags &= ~SF_HAS_PAR;
4653 }
4654 else if (pars && data) {
4655 data->flags |= SF_HAS_PAR;
4656 data->flags &= ~SF_IN_PAR;
4657 }
4658 if (flags & SCF_DO_STCLASS_OR)
4659 cl_and(data->start_class, and_withp);
4660 if (flags & SCF_TRIE_RESTUDY)
4661 data->flags |= SCF_TRIE_RESTUDY;
4662
4663 DEBUG_STUDYDATA("post-fin:",data,depth);
4664
4665 return min < stopmin ? min : stopmin;
4666}
4667
4668STATIC U32
4669S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4670{
4671 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4672
4673 PERL_ARGS_ASSERT_ADD_DATA;
4674
4675 Renewc(RExC_rxi->data,
4676 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4677 char, struct reg_data);
4678 if(count)
4679 Renew(RExC_rxi->data->what, count + n, U8);
4680 else
4681 Newx(RExC_rxi->data->what, n, U8);
4682 RExC_rxi->data->count = count + n;
4683 Copy(s, RExC_rxi->data->what + count, n, U8);
4684 return count;
4685}
4686
4687/*XXX: todo make this not included in a non debugging perl */
4688#ifndef PERL_IN_XSUB_RE
4689void
4690Perl_reginitcolors(pTHX)
4691{
4692 dVAR;
4693 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4694 if (s) {
4695 char *t = savepv(s);
4696 int i = 0;
4697 PL_colors[0] = t;
4698 while (++i < 6) {
4699 t = strchr(t, '\t');
4700 if (t) {
4701 *t = '\0';
4702 PL_colors[i] = ++t;
4703 }
4704 else
4705 PL_colors[i] = t = (char *)"";
4706 }
4707 } else {
4708 int i = 0;
4709 while (i < 6)
4710 PL_colors[i++] = (char *)"";
4711 }
4712 PL_colorset = 1;
4713}
4714#endif
4715
4716
4717#ifdef TRIE_STUDY_OPT
4718#define CHECK_RESTUDY_GOTO \
4719 if ( \
4720 (data.flags & SCF_TRIE_RESTUDY) \
4721 && ! restudied++ \
4722 ) goto reStudy
4723#else
4724#define CHECK_RESTUDY_GOTO
4725#endif
4726
4727/*
4728 - pregcomp - compile a regular expression into internal code
4729 *
4730 * We can't allocate space until we know how big the compiled form will be,
4731 * but we can't compile it (and thus know how big it is) until we've got a
4732 * place to put the code. So we cheat: we compile it twice, once with code
4733 * generation turned off and size counting turned on, and once "for real".
4734 * This also means that we don't allocate space until we are sure that the
4735 * thing really will compile successfully, and we never have to move the
4736 * code and thus invalidate pointers into it. (Note that it has to be in
4737 * one piece because free() must be able to free it all.) [NB: not true in perl]
4738 *
4739 * Beware that the optimization-preparation code in here knows about some
4740 * of the structure of the compiled regexp. [I'll say.]
4741 */
4742
4743
4744
4745#ifndef PERL_IN_XSUB_RE
4746#define RE_ENGINE_PTR &PL_core_reg_engine
4747#else
4748extern const struct regexp_engine my_reg_engine;
4749#define RE_ENGINE_PTR &my_reg_engine
4750#endif
4751
4752#ifndef PERL_IN_XSUB_RE
4753REGEXP *
4754Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4755{
4756 dVAR;
4757 HV * const table = GvHV(PL_hintgv);
4758
4759 PERL_ARGS_ASSERT_PREGCOMP;
4760
4761 /* Dispatch a request to compile a regexp to correct
4762 regexp engine. */
4763 if (table) {
4764 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4765 GET_RE_DEBUG_FLAGS_DECL;
4766 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4767 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4768 DEBUG_COMPILE_r({
4769 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4770 SvIV(*ptr));
4771 });
4772 return CALLREGCOMP_ENG(eng, pattern, flags);
4773 }
4774 }
4775 return Perl_re_compile(aTHX_ pattern, flags);
4776}
4777#endif
4778
4779REGEXP *
4780Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4781{
4782 dVAR;
4783 REGEXP *rx;
4784 struct regexp *r;
4785 register regexp_internal *ri;
4786 STRLEN plen;
4787 char* VOL exp;
4788 char* xend;
4789 regnode *scan;
4790 I32 flags;
4791 I32 minlen = 0;
4792 U32 pm_flags;
4793
4794 /* these are all flags - maybe they should be turned
4795 * into a single int with different bit masks */
4796 I32 sawlookahead = 0;
4797 I32 sawplus = 0;
4798 I32 sawopen = 0;
4799 bool used_setjump = FALSE;
4800 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
4801
4802 U8 jump_ret = 0;
4803 dJMPENV;
4804 scan_data_t data;
4805 RExC_state_t RExC_state;
4806 RExC_state_t * const pRExC_state = &RExC_state;
4807#ifdef TRIE_STUDY_OPT
4808 int restudied;
4809 RExC_state_t copyRExC_state;
4810#endif
4811 GET_RE_DEBUG_FLAGS_DECL;
4812
4813 PERL_ARGS_ASSERT_RE_COMPILE;
4814
4815 DEBUG_r(if (!PL_colorset) reginitcolors());
4816
4817 /* Initialize these here instead of as-needed, as is quick and avoids
4818 * having to test them each time otherwise */
4819 if (! PL_AboveLatin1) {
4820 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4821 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4822 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
4823
4824 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4825 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4826
4827 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4828 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4829
4830 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4831 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4832
4833 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4834
4835 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4836 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4837
4838 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4839
4840 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4841 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4842
4843 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4844 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4845
4846 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4847 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4848
4849 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4850 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4851
4852 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4853 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4854
4855 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4856 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4857
4858 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4859 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4860
4861 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4862 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4863
4864 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4865
4866 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4867 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4868
4869 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4870 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
4871 }
4872
4873 exp = SvPV(pattern, plen);
4874
4875 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4876 RExC_utf8 = RExC_orig_utf8 = 0;
4877 }
4878 else {
4879 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4880 }
4881 RExC_uni_semantics = 0;
4882 RExC_contains_locale = 0;
4883
4884 /****************** LONG JUMP TARGET HERE***********************/
4885 /* Longjmp back to here if have to switch in midstream to utf8 */
4886 if (! RExC_orig_utf8) {
4887 JMPENV_PUSH(jump_ret);
4888 used_setjump = TRUE;
4889 }
4890
4891 if (jump_ret == 0) { /* First time through */
4892 xend = exp + plen;
4893
4894 DEBUG_COMPILE_r({
4895 SV *dsv= sv_newmortal();
4896 RE_PV_QUOTED_DECL(s, RExC_utf8,
4897 dsv, exp, plen, 60);
4898 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4899 PL_colors[4],PL_colors[5],s);
4900 });
4901 }
4902 else { /* longjumped back */
4903 STRLEN len = plen;
4904
4905 /* If the cause for the longjmp was other than changing to utf8, pop
4906 * our own setjmp, and longjmp to the correct handler */
4907 if (jump_ret != UTF8_LONGJMP) {
4908 JMPENV_POP;
4909 JMPENV_JUMP(jump_ret);
4910 }
4911
4912 GET_RE_DEBUG_FLAGS;
4913
4914 /* It's possible to write a regexp in ascii that represents Unicode
4915 codepoints outside of the byte range, such as via \x{100}. If we
4916 detect such a sequence we have to convert the entire pattern to utf8
4917 and then recompile, as our sizing calculation will have been based
4918 on 1 byte == 1 character, but we will need to use utf8 to encode
4919 at least some part of the pattern, and therefore must convert the whole
4920 thing.
4921 -- dmq */
4922 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4923 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4924 exp = (char*)Perl_bytes_to_utf8(aTHX_
4925 (U8*)SvPV_nomg(pattern, plen),
4926 &len);
4927 xend = exp + len;
4928 RExC_orig_utf8 = RExC_utf8 = 1;
4929 SAVEFREEPV(exp);
4930 }
4931
4932#ifdef TRIE_STUDY_OPT
4933 restudied = 0;
4934#endif
4935
4936 pm_flags = orig_pm_flags;
4937
4938 if (initial_charset == REGEX_LOCALE_CHARSET) {
4939 RExC_contains_locale = 1;
4940 }
4941 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4942
4943 /* Set to use unicode semantics if the pattern is in utf8 and has the
4944 * 'depends' charset specified, as it means unicode when utf8 */
4945 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
4946 }
4947
4948 RExC_precomp = exp;
4949 RExC_flags = pm_flags;
4950 RExC_sawback = 0;
4951
4952 RExC_seen = 0;
4953 RExC_in_lookbehind = 0;
4954 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4955 RExC_seen_evals = 0;
4956 RExC_extralen = 0;
4957 RExC_override_recoding = 0;
4958
4959 /* First pass: determine size, legality. */
4960 RExC_parse = exp;
4961 RExC_start = exp;
4962 RExC_end = xend;
4963 RExC_naughty = 0;
4964 RExC_npar = 1;
4965 RExC_nestroot = 0;
4966 RExC_size = 0L;
4967 RExC_emit = &PL_regdummy;
4968 RExC_whilem_seen = 0;
4969 RExC_open_parens = NULL;
4970 RExC_close_parens = NULL;
4971 RExC_opend = NULL;
4972 RExC_paren_names = NULL;
4973#ifdef DEBUGGING
4974 RExC_paren_name_list = NULL;
4975#endif
4976 RExC_recurse = NULL;
4977 RExC_recurse_count = 0;
4978
4979#if 0 /* REGC() is (currently) a NOP at the first pass.
4980 * Clever compilers notice this and complain. --jhi */
4981 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4982#endif
4983 DEBUG_PARSE_r(
4984 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4985 RExC_lastnum=0;
4986 RExC_lastparse=NULL;
4987 );
4988 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4989 RExC_precomp = NULL;
4990 return(NULL);
4991 }
4992
4993 /* Here, finished first pass. Get rid of any added setjmp */
4994 if (used_setjump) {
4995 JMPENV_POP;
4996 }
4997
4998 DEBUG_PARSE_r({
4999 PerlIO_printf(Perl_debug_log,
5000 "Required size %"IVdf" nodes\n"
5001 "Starting second pass (creation)\n",
5002 (IV)RExC_size);
5003 RExC_lastnum=0;
5004 RExC_lastparse=NULL;
5005 });
5006
5007 /* The first pass could have found things that force Unicode semantics */
5008 if ((RExC_utf8 || RExC_uni_semantics)
5009 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5010 {
5011 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5012 }
5013
5014 /* Small enough for pointer-storage convention?
5015 If extralen==0, this means that we will not need long jumps. */
5016 if (RExC_size >= 0x10000L && RExC_extralen)
5017 RExC_size += RExC_extralen;
5018 else
5019 RExC_extralen = 0;
5020 if (RExC_whilem_seen > 15)
5021 RExC_whilem_seen = 15;
5022
5023 /* Allocate space and zero-initialize. Note, the two step process
5024 of zeroing when in debug mode, thus anything assigned has to
5025 happen after that */
5026 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5027 r = (struct regexp*)SvANY(rx);
5028 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5029 char, regexp_internal);
5030 if ( r == NULL || ri == NULL )
5031 FAIL("Regexp out of space");
5032#ifdef DEBUGGING
5033 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5034 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5035#else
5036 /* bulk initialize base fields with 0. */
5037 Zero(ri, sizeof(regexp_internal), char);
5038#endif
5039
5040 /* non-zero initialization begins here */
5041 RXi_SET( r, ri );
5042 r->engine= RE_ENGINE_PTR;
5043 r->extflags = pm_flags;
5044 {
5045 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5046 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5047
5048 /* The caret is output if there are any defaults: if not all the STD
5049 * flags are set, or if no character set specifier is needed */
5050 bool has_default =
5051 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5052 || ! has_charset);
5053 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5054 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5055 >> RXf_PMf_STD_PMMOD_SHIFT);
5056 const char *fptr = STD_PAT_MODS; /*"msix"*/
5057 char *p;
5058 /* Allocate for the worst case, which is all the std flags are turned
5059 * on. If more precision is desired, we could do a population count of
5060 * the flags set. This could be done with a small lookup table, or by
5061 * shifting, masking and adding, or even, when available, assembly
5062 * language for a machine-language population count.
5063 * We never output a minus, as all those are defaults, so are
5064 * covered by the caret */
5065 const STRLEN wraplen = plen + has_p + has_runon
5066 + has_default /* If needs a caret */
5067
5068 /* If needs a character set specifier */
5069 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5070 + (sizeof(STD_PAT_MODS) - 1)
5071 + (sizeof("(?:)") - 1);
5072
5073 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5074 SvPOK_on(rx);
5075 SvFLAGS(rx) |= SvUTF8(pattern);
5076 *p++='('; *p++='?';
5077
5078 /* If a default, cover it using the caret */
5079 if (has_default) {
5080 *p++= DEFAULT_PAT_MOD;
5081 }
5082 if (has_charset) {
5083 STRLEN len;
5084 const char* const name = get_regex_charset_name(r->extflags, &len);
5085 Copy(name, p, len, char);
5086 p += len;
5087 }
5088 if (has_p)
5089 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5090 {
5091 char ch;
5092 while((ch = *fptr++)) {
5093 if(reganch & 1)
5094 *p++ = ch;
5095 reganch >>= 1;
5096 }
5097 }
5098
5099 *p++ = ':';
5100 Copy(RExC_precomp, p, plen, char);
5101 assert ((RX_WRAPPED(rx) - p) < 16);
5102 r->pre_prefix = p - RX_WRAPPED(rx);
5103 p += plen;
5104 if (has_runon)
5105 *p++ = '\n';
5106 *p++ = ')';
5107 *p = 0;
5108 SvCUR_set(rx, p - SvPVX_const(rx));
5109 }
5110
5111 r->intflags = 0;
5112 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5113
5114 if (RExC_seen & REG_SEEN_RECURSE) {
5115 Newxz(RExC_open_parens, RExC_npar,regnode *);
5116 SAVEFREEPV(RExC_open_parens);
5117 Newxz(RExC_close_parens,RExC_npar,regnode *);
5118 SAVEFREEPV(RExC_close_parens);
5119 }
5120
5121 /* Useful during FAIL. */
5122#ifdef RE_TRACK_PATTERN_OFFSETS
5123 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5124 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5125 "%s %"UVuf" bytes for offset annotations.\n",
5126 ri->u.offsets ? "Got" : "Couldn't get",
5127 (UV)((2*RExC_size+1) * sizeof(U32))));
5128#endif
5129 SetProgLen(ri,RExC_size);
5130 RExC_rx_sv = rx;
5131 RExC_rx = r;
5132 RExC_rxi = ri;
5133
5134 /* Second pass: emit code. */
5135 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
5136 RExC_parse = exp;
5137 RExC_end = xend;
5138 RExC_naughty = 0;
5139 RExC_npar = 1;
5140 RExC_emit_start = ri->program;
5141 RExC_emit = ri->program;
5142 RExC_emit_bound = ri->program + RExC_size + 1;
5143
5144 /* Store the count of eval-groups for security checks: */
5145 RExC_rx->seen_evals = RExC_seen_evals;
5146 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5147 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5148 ReREFCNT_dec(rx);
5149 return(NULL);
5150 }
5151 /* XXXX To minimize changes to RE engine we always allocate
5152 3-units-long substrs field. */
5153 Newx(r->substrs, 1, struct reg_substr_data);
5154 if (RExC_recurse_count) {
5155 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5156 SAVEFREEPV(RExC_recurse);
5157 }
5158
5159reStudy:
5160 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5161 Zero(r->substrs, 1, struct reg_substr_data);
5162
5163#ifdef TRIE_STUDY_OPT
5164 if (!restudied) {
5165 StructCopy(&zero_scan_data, &data, scan_data_t);
5166 copyRExC_state = RExC_state;
5167 } else {
5168 U32 seen=RExC_seen;
5169 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5170
5171 RExC_state = copyRExC_state;
5172 if (seen & REG_TOP_LEVEL_BRANCHES)
5173 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5174 else
5175 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5176 if (data.last_found) {
5177 SvREFCNT_dec(data.longest_fixed);
5178 SvREFCNT_dec(data.longest_float);
5179 SvREFCNT_dec(data.last_found);
5180 }
5181 StructCopy(&zero_scan_data, &data, scan_data_t);
5182 }
5183#else
5184 StructCopy(&zero_scan_data, &data, scan_data_t);
5185#endif
5186
5187 /* Dig out information for optimizations. */
5188 r->extflags = RExC_flags; /* was pm_op */
5189 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5190
5191 if (UTF)
5192 SvUTF8_on(rx); /* Unicode in it? */
5193 ri->regstclass = NULL;
5194 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5195 r->intflags |= PREGf_NAUGHTY;
5196 scan = ri->program + 1; /* First BRANCH. */
5197
5198 /* testing for BRANCH here tells us whether there is "must appear"
5199 data in the pattern. If there is then we can use it for optimisations */
5200 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5201 I32 fake;
5202 STRLEN longest_float_length, longest_fixed_length;
5203 struct regnode_charclass_class ch_class; /* pointed to by data */
5204 int stclass_flag;
5205 I32 last_close = 0; /* pointed to by data */
5206 regnode *first= scan;
5207 regnode *first_next= regnext(first);
5208 /*
5209 * Skip introductions and multiplicators >= 1
5210 * so that we can extract the 'meat' of the pattern that must
5211 * match in the large if() sequence following.
5212 * NOTE that EXACT is NOT covered here, as it is normally
5213 * picked up by the optimiser separately.
5214 *
5215 * This is unfortunate as the optimiser isnt handling lookahead
5216 * properly currently.
5217 *
5218 */
5219 while ((OP(first) == OPEN && (sawopen = 1)) ||
5220 /* An OR of *one* alternative - should not happen now. */
5221 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5222 /* for now we can't handle lookbehind IFMATCH*/
5223 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5224 (OP(first) == PLUS) ||
5225 (OP(first) == MINMOD) ||
5226 /* An {n,m} with n>0 */
5227 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5228 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5229 {
5230 /*
5231 * the only op that could be a regnode is PLUS, all the rest
5232 * will be regnode_1 or regnode_2.
5233 *
5234 */
5235 if (OP(first) == PLUS)
5236 sawplus = 1;
5237 else
5238 first += regarglen[OP(first)];
5239
5240 first = NEXTOPER(first);
5241 first_next= regnext(first);
5242 }
5243
5244 /* Starting-point info. */
5245 again:
5246 DEBUG_PEEP("first:",first,0);
5247 /* Ignore EXACT as we deal with it later. */
5248 if (PL_regkind[OP(first)] == EXACT) {
5249 if (OP(first) == EXACT)
5250 NOOP; /* Empty, get anchored substr later. */
5251 else
5252 ri->regstclass = first;
5253 }
5254#ifdef TRIE_STCLASS
5255 else if (PL_regkind[OP(first)] == TRIE &&
5256 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5257 {
5258 regnode *trie_op;
5259 /* this can happen only on restudy */
5260 if ( OP(first) == TRIE ) {
5261 struct regnode_1 *trieop = (struct regnode_1 *)
5262 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5263 StructCopy(first,trieop,struct regnode_1);
5264 trie_op=(regnode *)trieop;
5265 } else {
5266 struct regnode_charclass *trieop = (struct regnode_charclass *)
5267 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5268 StructCopy(first,trieop,struct regnode_charclass);
5269 trie_op=(regnode *)trieop;
5270 }
5271 OP(trie_op)+=2;
5272 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5273 ri->regstclass = trie_op;
5274 }
5275#endif
5276 else if (REGNODE_SIMPLE(OP(first)))
5277 ri->regstclass = first;
5278 else if (PL_regkind[OP(first)] == BOUND ||
5279 PL_regkind[OP(first)] == NBOUND)
5280 ri->regstclass = first;
5281 else if (PL_regkind[OP(first)] == BOL) {
5282 r->extflags |= (OP(first) == MBOL
5283 ? RXf_ANCH_MBOL
5284 : (OP(first) == SBOL
5285 ? RXf_ANCH_SBOL
5286 : RXf_ANCH_BOL));
5287 first = NEXTOPER(first);
5288 goto again;
5289 }
5290 else if (OP(first) == GPOS) {
5291 r->extflags |= RXf_ANCH_GPOS;
5292 first = NEXTOPER(first);
5293 goto again;
5294 }
5295 else if ((!sawopen || !RExC_sawback) &&
5296 (OP(first) == STAR &&
5297 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5298 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5299 {
5300 /* turn .* into ^.* with an implied $*=1 */
5301 const int type =
5302 (OP(NEXTOPER(first)) == REG_ANY)
5303 ? RXf_ANCH_MBOL
5304 : RXf_ANCH_SBOL;
5305 r->extflags |= type;
5306 r->intflags |= PREGf_IMPLICIT;
5307 first = NEXTOPER(first);
5308 goto again;
5309 }
5310 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5311 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5312 /* x+ must match at the 1st pos of run of x's */
5313 r->intflags |= PREGf_SKIP;
5314
5315 /* Scan is after the zeroth branch, first is atomic matcher. */
5316#ifdef TRIE_STUDY_OPT
5317 DEBUG_PARSE_r(
5318 if (!restudied)
5319 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5320 (IV)(first - scan + 1))
5321 );
5322#else
5323 DEBUG_PARSE_r(
5324 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5325 (IV)(first - scan + 1))
5326 );
5327#endif
5328
5329
5330 /*
5331 * If there's something expensive in the r.e., find the
5332 * longest literal string that must appear and make it the
5333 * regmust. Resolve ties in favor of later strings, since
5334 * the regstart check works with the beginning of the r.e.
5335 * and avoiding duplication strengthens checking. Not a
5336 * strong reason, but sufficient in the absence of others.
5337 * [Now we resolve ties in favor of the earlier string if
5338 * it happens that c_offset_min has been invalidated, since the
5339 * earlier string may buy us something the later one won't.]
5340 */
5341
5342 data.longest_fixed = newSVpvs("");
5343 data.longest_float = newSVpvs("");
5344 data.last_found = newSVpvs("");
5345 data.longest = &(data.longest_fixed);
5346 first = scan;
5347 if (!ri->regstclass) {
5348 cl_init(pRExC_state, &ch_class);
5349 data.start_class = &ch_class;
5350 stclass_flag = SCF_DO_STCLASS_AND;
5351 } else /* XXXX Check for BOUND? */
5352 stclass_flag = 0;
5353 data.last_closep = &last_close;
5354
5355 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5356 &data, -1, NULL, NULL,
5357 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5358
5359
5360 CHECK_RESTUDY_GOTO;
5361
5362
5363 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5364 && data.last_start_min == 0 && data.last_end > 0
5365 && !RExC_seen_zerolen
5366 && !(RExC_seen & REG_SEEN_VERBARG)
5367 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5368 r->extflags |= RXf_CHECK_ALL;
5369 scan_commit(pRExC_state, &data,&minlen,0);
5370 SvREFCNT_dec(data.last_found);
5371
5372 /* Note that code very similar to this but for anchored string
5373 follows immediately below, changes may need to be made to both.
5374 Be careful.
5375 */
5376 longest_float_length = CHR_SVLEN(data.longest_float);
5377 if (longest_float_length
5378 || (data.flags & SF_FL_BEFORE_EOL
5379 && (!(data.flags & SF_FL_BEFORE_MEOL)
5380 || (RExC_flags & RXf_PMf_MULTILINE))))
5381 {
5382 I32 t,ml;
5383
5384 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5385 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5386 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5387 && data.offset_fixed == data.offset_float_min
5388 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5389 goto remove_float; /* As in (a)+. */
5390
5391 /* copy the information about the longest float from the reg_scan_data
5392 over to the program. */
5393 if (SvUTF8(data.longest_float)) {
5394 r->float_utf8 = data.longest_float;
5395 r->float_substr = NULL;
5396 } else {
5397 r->float_substr = data.longest_float;
5398 r->float_utf8 = NULL;
5399 }
5400 /* float_end_shift is how many chars that must be matched that
5401 follow this item. We calculate it ahead of time as once the
5402 lookbehind offset is added in we lose the ability to correctly
5403 calculate it.*/
5404 ml = data.minlen_float ? *(data.minlen_float)
5405 : (I32)longest_float_length;
5406 r->float_end_shift = ml - data.offset_float_min
5407 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5408 + data.lookbehind_float;
5409 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5410 r->float_max_offset = data.offset_float_max;
5411 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5412 r->float_max_offset -= data.lookbehind_float;
5413
5414 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5415 && (!(data.flags & SF_FL_BEFORE_MEOL)
5416 || (RExC_flags & RXf_PMf_MULTILINE)));
5417 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5418 }
5419 else {
5420 remove_float:
5421 r->float_substr = r->float_utf8 = NULL;
5422 SvREFCNT_dec(data.longest_float);
5423 longest_float_length = 0;
5424 }
5425
5426 /* Note that code very similar to this but for floating string
5427 is immediately above, changes may need to be made to both.
5428 Be careful.
5429 */
5430 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5431
5432 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5433 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5434 && (longest_fixed_length
5435 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5436 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5437 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5438 {
5439 I32 t,ml;
5440
5441 /* copy the information about the longest fixed
5442 from the reg_scan_data over to the program. */
5443 if (SvUTF8(data.longest_fixed)) {
5444 r->anchored_utf8 = data.longest_fixed;
5445 r->anchored_substr = NULL;
5446 } else {
5447 r->anchored_substr = data.longest_fixed;
5448 r->anchored_utf8 = NULL;
5449 }
5450 /* fixed_end_shift is how many chars that must be matched that
5451 follow this item. We calculate it ahead of time as once the
5452 lookbehind offset is added in we lose the ability to correctly
5453 calculate it.*/
5454 ml = data.minlen_fixed ? *(data.minlen_fixed)
5455 : (I32)longest_fixed_length;
5456 r->anchored_end_shift = ml - data.offset_fixed
5457 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5458 + data.lookbehind_fixed;
5459 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5460
5461 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5462 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5463 || (RExC_flags & RXf_PMf_MULTILINE)));
5464 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5465 }
5466 else {
5467 r->anchored_substr = r->anchored_utf8 = NULL;
5468 SvREFCNT_dec(data.longest_fixed);
5469 longest_fixed_length = 0;
5470 }
5471 if (ri->regstclass
5472 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5473 ri->regstclass = NULL;
5474
5475 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5476 && stclass_flag
5477 && !(data.start_class->flags & ANYOF_EOS)
5478 && !cl_is_anything(data.start_class))
5479 {
5480 const U32 n = add_data(pRExC_state, 1, "f");
5481 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5482
5483 Newx(RExC_rxi->data->data[n], 1,
5484 struct regnode_charclass_class);
5485 StructCopy(data.start_class,
5486 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5487 struct regnode_charclass_class);
5488 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5489 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5490 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5491 regprop(r, sv, (regnode*)data.start_class);
5492 PerlIO_printf(Perl_debug_log,
5493 "synthetic stclass \"%s\".\n",
5494 SvPVX_const(sv));});
5495 }
5496
5497 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5498 if (longest_fixed_length > longest_float_length) {
5499 r->check_end_shift = r->anchored_end_shift;
5500 r->check_substr = r->anchored_substr;
5501 r->check_utf8 = r->anchored_utf8;
5502 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5503 if (r->extflags & RXf_ANCH_SINGLE)
5504 r->extflags |= RXf_NOSCAN;
5505 }
5506 else {
5507 r->check_end_shift = r->float_end_shift;
5508 r->check_substr = r->float_substr;
5509 r->check_utf8 = r->float_utf8;
5510 r->check_offset_min = r->float_min_offset;
5511 r->check_offset_max = r->float_max_offset;
5512 }
5513 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5514 This should be changed ASAP! */
5515 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5516 r->extflags |= RXf_USE_INTUIT;
5517 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5518 r->extflags |= RXf_INTUIT_TAIL;
5519 }
5520 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5521 if ( (STRLEN)minlen < longest_float_length )
5522 minlen= longest_float_length;
5523 if ( (STRLEN)minlen < longest_fixed_length )
5524 minlen= longest_fixed_length;
5525 */
5526 }
5527 else {
5528 /* Several toplevels. Best we can is to set minlen. */
5529 I32 fake;
5530 struct regnode_charclass_class ch_class;
5531 I32 last_close = 0;
5532
5533 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5534
5535 scan = ri->program + 1;
5536 cl_init(pRExC_state, &ch_class);
5537 data.start_class = &ch_class;
5538 data.last_closep = &last_close;
5539
5540
5541 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5542 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5543
5544 CHECK_RESTUDY_GOTO;
5545
5546 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5547 = r->float_substr = r->float_utf8 = NULL;
5548
5549 if (!(data.start_class->flags & ANYOF_EOS)
5550 && !cl_is_anything(data.start_class))
5551 {
5552 const U32 n = add_data(pRExC_state, 1, "f");
5553 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
5554
5555 Newx(RExC_rxi->data->data[n], 1,
5556 struct regnode_charclass_class);
5557 StructCopy(data.start_class,
5558 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5559 struct regnode_charclass_class);
5560 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5561 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5562 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5563 regprop(r, sv, (regnode*)data.start_class);
5564 PerlIO_printf(Perl_debug_log,
5565 "synthetic stclass \"%s\".\n",
5566 SvPVX_const(sv));});
5567 }
5568 }
5569
5570 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5571 the "real" pattern. */
5572 DEBUG_OPTIMISE_r({
5573 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5574 (IV)minlen, (IV)r->minlen);
5575 });
5576 r->minlenret = minlen;
5577 if (r->minlen < minlen)
5578 r->minlen = minlen;
5579
5580 if (RExC_seen & REG_SEEN_GPOS)
5581 r->extflags |= RXf_GPOS_SEEN;
5582 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5583 r->extflags |= RXf_LOOKBEHIND_SEEN;
5584 if (RExC_seen & REG_SEEN_EVAL)
5585 r->extflags |= RXf_EVAL_SEEN;
5586 if (RExC_seen & REG_SEEN_CANY)
5587 r->extflags |= RXf_CANY_SEEN;
5588 if (RExC_seen & REG_SEEN_VERBARG)
5589 r->intflags |= PREGf_VERBARG_SEEN;
5590 if (RExC_seen & REG_SEEN_CUTGROUP)
5591 r->intflags |= PREGf_CUTGROUP_SEEN;
5592 if (RExC_paren_names)
5593 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5594 else
5595 RXp_PAREN_NAMES(r) = NULL;
5596
5597#ifdef STUPID_PATTERN_CHECKS
5598 if (RX_PRELEN(rx) == 0)
5599 r->extflags |= RXf_NULL;
5600 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5601 /* XXX: this should happen BEFORE we compile */
5602 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5603 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5604 r->extflags |= RXf_WHITE;
5605 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5606 r->extflags |= RXf_START_ONLY;
5607#else
5608 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5609 /* XXX: this should happen BEFORE we compile */
5610 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5611 else {
5612 regnode *first = ri->program + 1;
5613 U8 fop = OP(first);
5614
5615 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
5616 r->extflags |= RXf_NULL;
5617 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
5618 r->extflags |= RXf_START_ONLY;
5619 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5620 && OP(regnext(first)) == END)
5621 r->extflags |= RXf_WHITE;
5622 }
5623#endif
5624#ifdef DEBUGGING
5625 if (RExC_paren_names) {
5626 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5627 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5628 } else
5629#endif
5630 ri->name_list_idx = 0;
5631
5632 if (RExC_recurse_count) {
5633 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5634 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5635 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5636 }
5637 }
5638 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5639 /* assume we don't need to swap parens around before we match */
5640
5641 DEBUG_DUMP_r({
5642 PerlIO_printf(Perl_debug_log,"Final program:\n");
5643 regdump(r);
5644 });
5645#ifdef RE_TRACK_PATTERN_OFFSETS
5646 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5647 const U32 len = ri->u.offsets[0];
5648 U32 i;
5649 GET_RE_DEBUG_FLAGS_DECL;
5650 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5651 for (i = 1; i <= len; i++) {
5652 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5653 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5654 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5655 }
5656 PerlIO_printf(Perl_debug_log, "\n");
5657 });
5658#endif
5659 return rx;
5660}
5661
5662#undef RE_ENGINE_PTR
5663
5664
5665SV*
5666Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5667 const U32 flags)
5668{
5669 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5670
5671 PERL_UNUSED_ARG(value);
5672
5673 if (flags & RXapif_FETCH) {
5674 return reg_named_buff_fetch(rx, key, flags);
5675 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5676 Perl_croak_no_modify(aTHX);
5677 return NULL;
5678 } else if (flags & RXapif_EXISTS) {
5679 return reg_named_buff_exists(rx, key, flags)
5680 ? &PL_sv_yes
5681 : &PL_sv_no;
5682 } else if (flags & RXapif_REGNAMES) {
5683 return reg_named_buff_all(rx, flags);
5684 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5685 return reg_named_buff_scalar(rx, flags);
5686 } else {
5687 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5688 return NULL;
5689 }
5690}
5691
5692SV*
5693Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5694 const U32 flags)
5695{
5696 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5697 PERL_UNUSED_ARG(lastkey);
5698
5699 if (flags & RXapif_FIRSTKEY)
5700 return reg_named_buff_firstkey(rx, flags);
5701 else if (flags & RXapif_NEXTKEY)
5702 return reg_named_buff_nextkey(rx, flags);
5703 else {
5704 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5705 return NULL;
5706 }
5707}
5708
5709SV*
5710Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5711 const U32 flags)
5712{
5713 AV *retarray = NULL;
5714 SV *ret;
5715 struct regexp *const rx = (struct regexp *)SvANY(r);
5716
5717 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5718
5719 if (flags & RXapif_ALL)
5720 retarray=newAV();
5721
5722 if (rx && RXp_PAREN_NAMES(rx)) {
5723 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5724 if (he_str) {
5725 IV i;
5726 SV* sv_dat=HeVAL(he_str);
5727 I32 *nums=(I32*)SvPVX(sv_dat);
5728 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5729 if ((I32)(rx->nparens) >= nums[i]
5730 && rx->offs[nums[i]].start != -1
5731 && rx->offs[nums[i]].end != -1)
5732 {
5733 ret = newSVpvs("");
5734 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5735 if (!retarray)
5736 return ret;
5737 } else {
5738 if (retarray)
5739 ret = newSVsv(&PL_sv_undef);
5740 }
5741 if (retarray)
5742 av_push(retarray, ret);
5743 }
5744 if (retarray)
5745 return newRV_noinc(MUTABLE_SV(retarray));
5746 }
5747 }
5748 return NULL;
5749}
5750
5751bool
5752Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5753 const U32 flags)
5754{
5755 struct regexp *const rx = (struct regexp *)SvANY(r);
5756
5757 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5758
5759 if (rx && RXp_PAREN_NAMES(rx)) {
5760 if (flags & RXapif_ALL) {
5761 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5762 } else {
5763 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5764 if (sv) {
5765 SvREFCNT_dec(sv);
5766 return TRUE;
5767 } else {
5768 return FALSE;
5769 }
5770 }
5771 } else {
5772 return FALSE;
5773 }
5774}
5775
5776SV*
5777Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5778{
5779 struct regexp *const rx = (struct regexp *)SvANY(r);
5780
5781 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5782
5783 if ( rx && RXp_PAREN_NAMES(rx) ) {
5784 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5785
5786 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5787 } else {
5788 return FALSE;
5789 }
5790}
5791
5792SV*
5793Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5794{
5795 struct regexp *const rx = (struct regexp *)SvANY(r);
5796 GET_RE_DEBUG_FLAGS_DECL;
5797
5798 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5799
5800 if (rx && RXp_PAREN_NAMES(rx)) {
5801 HV *hv = RXp_PAREN_NAMES(rx);
5802 HE *temphe;
5803 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5804 IV i;
5805 IV parno = 0;
5806 SV* sv_dat = HeVAL(temphe);
5807 I32 *nums = (I32*)SvPVX(sv_dat);
5808 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5809 if ((I32)(rx->lastparen) >= nums[i] &&
5810 rx->offs[nums[i]].start != -1 &&
5811 rx->offs[nums[i]].end != -1)
5812 {
5813 parno = nums[i];
5814 break;
5815 }
5816 }
5817 if (parno || flags & RXapif_ALL) {
5818 return newSVhek(HeKEY_hek(temphe));
5819 }
5820 }
5821 }
5822 return NULL;
5823}
5824
5825SV*
5826Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5827{
5828 SV *ret;
5829 AV *av;
5830 I32 length;
5831 struct regexp *const rx = (struct regexp *)SvANY(r);
5832
5833 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5834
5835 if (rx && RXp_PAREN_NAMES(rx)) {
5836 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5837 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5838 } else if (flags & RXapif_ONE) {
5839 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5840 av = MUTABLE_AV(SvRV(ret));
5841 length = av_len(av);
5842 SvREFCNT_dec(ret);
5843 return newSViv(length + 1);
5844 } else {
5845 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5846 return NULL;
5847 }
5848 }
5849 return &PL_sv_undef;
5850}
5851
5852SV*
5853Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5854{
5855 struct regexp *const rx = (struct regexp *)SvANY(r);
5856 AV *av = newAV();
5857
5858 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5859
5860 if (rx && RXp_PAREN_NAMES(rx)) {
5861 HV *hv= RXp_PAREN_NAMES(rx);
5862 HE *temphe;
5863 (void)hv_iterinit(hv);
5864 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5865 IV i;
5866 IV parno = 0;
5867 SV* sv_dat = HeVAL(temphe);
5868 I32 *nums = (I32*)SvPVX(sv_dat);
5869 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5870 if ((I32)(rx->lastparen) >= nums[i] &&
5871 rx->offs[nums[i]].start != -1 &&
5872 rx->offs[nums[i]].end != -1)
5873 {
5874 parno = nums[i];
5875 break;
5876 }
5877 }
5878 if (parno || flags & RXapif_ALL) {
5879 av_push(av, newSVhek(HeKEY_hek(temphe)));
5880 }
5881 }
5882 }
5883
5884 return newRV_noinc(MUTABLE_SV(av));
5885}
5886
5887void
5888Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5889 SV * const sv)
5890{
5891 struct regexp *const rx = (struct regexp *)SvANY(r);
5892 char *s = NULL;
5893 I32 i = 0;
5894 I32 s1, t1;
5895
5896 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5897
5898 if (!rx->subbeg) {
5899 sv_setsv(sv,&PL_sv_undef);
5900 return;
5901 }
5902 else
5903 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5904 /* $` */
5905 i = rx->offs[0].start;
5906 s = rx->subbeg;
5907 }
5908 else
5909 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5910 /* $' */
5911 s = rx->subbeg + rx->offs[0].end;
5912 i = rx->sublen - rx->offs[0].end;
5913 }
5914 else
5915 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5916 (s1 = rx->offs[paren].start) != -1 &&
5917 (t1 = rx->offs[paren].end) != -1)
5918 {
5919 /* $& $1 ... */
5920 i = t1 - s1;
5921 s = rx->subbeg + s1;
5922 } else {
5923 sv_setsv(sv,&PL_sv_undef);
5924 return;
5925 }
5926 assert(rx->sublen >= (s - rx->subbeg) + i );
5927 if (i >= 0) {
5928 const int oldtainted = PL_tainted;
5929 TAINT_NOT;
5930 sv_setpvn(sv, s, i);
5931 PL_tainted = oldtainted;
5932 if ( (rx->extflags & RXf_CANY_SEEN)
5933 ? (RXp_MATCH_UTF8(rx)
5934 && (!i || is_utf8_string((U8*)s, i)))
5935 : (RXp_MATCH_UTF8(rx)) )
5936 {
5937 SvUTF8_on(sv);
5938 }
5939 else
5940 SvUTF8_off(sv);
5941 if (PL_tainting) {
5942 if (RXp_MATCH_TAINTED(rx)) {
5943 if (SvTYPE(sv) >= SVt_PVMG) {
5944 MAGIC* const mg = SvMAGIC(sv);
5945 MAGIC* mgt;
5946 PL_tainted = 1;
5947 SvMAGIC_set(sv, mg->mg_moremagic);
5948 SvTAINT(sv);
5949 if ((mgt = SvMAGIC(sv))) {
5950 mg->mg_moremagic = mgt;
5951 SvMAGIC_set(sv, mg);
5952 }
5953 } else {
5954 PL_tainted = 1;
5955 SvTAINT(sv);
5956 }
5957 } else
5958 SvTAINTED_off(sv);
5959 }
5960 } else {
5961 sv_setsv(sv,&PL_sv_undef);
5962 return;
5963 }
5964}
5965
5966void
5967Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5968 SV const * const value)
5969{
5970 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5971
5972 PERL_UNUSED_ARG(rx);
5973 PERL_UNUSED_ARG(paren);
5974 PERL_UNUSED_ARG(value);
5975
5976 if (!PL_localizing)
5977 Perl_croak_no_modify(aTHX);
5978}
5979
5980I32
5981Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5982 const I32 paren)
5983{
5984 struct regexp *const rx = (struct regexp *)SvANY(r);
5985 I32 i;
5986 I32 s1, t1;
5987
5988 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5989
5990 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5991 switch (paren) {
5992 /* $` / ${^PREMATCH} */
5993 case RX_BUFF_IDX_PREMATCH:
5994 if (rx->offs[0].start != -1) {
5995 i = rx->offs[0].start;
5996 if (i > 0) {
5997 s1 = 0;
5998 t1 = i;
5999 goto getlen;
6000 }
6001 }
6002 return 0;
6003 /* $' / ${^POSTMATCH} */
6004 case RX_BUFF_IDX_POSTMATCH:
6005 if (rx->offs[0].end != -1) {
6006 i = rx->sublen - rx->offs[0].end;
6007 if (i > 0) {
6008 s1 = rx->offs[0].end;
6009 t1 = rx->sublen;
6010 goto getlen;
6011 }
6012 }
6013 return 0;
6014 /* $& / ${^MATCH}, $1, $2, ... */
6015 default:
6016 if (paren <= (I32)rx->nparens &&
6017 (s1 = rx->offs[paren].start) != -1 &&
6018 (t1 = rx->offs[paren].end) != -1)
6019 {
6020 i = t1 - s1;
6021 goto getlen;
6022 } else {
6023 if (ckWARN(WARN_UNINITIALIZED))
6024 report_uninit((const SV *)sv);
6025 return 0;
6026 }
6027 }
6028 getlen:
6029 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6030 const char * const s = rx->subbeg + s1;
6031 const U8 *ep;
6032 STRLEN el;
6033
6034 i = t1 - s1;
6035 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6036 i = el;
6037 }
6038 return i;
6039}
6040
6041SV*
6042Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6043{
6044 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6045 PERL_UNUSED_ARG(rx);
6046 if (0)
6047 return NULL;
6048 else
6049 return newSVpvs("Regexp");
6050}
6051
6052/* Scans the name of a named buffer from the pattern.
6053 * If flags is REG_RSN_RETURN_NULL returns null.
6054 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6055 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6056 * to the parsed name as looked up in the RExC_paren_names hash.
6057 * If there is an error throws a vFAIL().. type exception.
6058 */
6059
6060#define REG_RSN_RETURN_NULL 0
6061#define REG_RSN_RETURN_NAME 1
6062#define REG_RSN_RETURN_DATA 2
6063
6064STATIC SV*
6065S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6066{
6067 char *name_start = RExC_parse;
6068
6069 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6070
6071 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6072 /* skip IDFIRST by using do...while */
6073 if (UTF)
6074 do {
6075 RExC_parse += UTF8SKIP(RExC_parse);
6076 } while (isALNUM_utf8((U8*)RExC_parse));
6077 else
6078 do {
6079 RExC_parse++;
6080 } while (isALNUM(*RExC_parse));
6081 }
6082
6083 if ( flags ) {
6084 SV* sv_name
6085 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6086 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6087 if ( flags == REG_RSN_RETURN_NAME)
6088 return sv_name;
6089 else if (flags==REG_RSN_RETURN_DATA) {
6090 HE *he_str = NULL;
6091 SV *sv_dat = NULL;
6092 if ( ! sv_name ) /* should not happen*/
6093 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6094 if (RExC_paren_names)
6095 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6096 if ( he_str )
6097 sv_dat = HeVAL(he_str);
6098 if ( ! sv_dat )
6099 vFAIL("Reference to nonexistent named group");
6100 return sv_dat;
6101 }
6102 else {
6103 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6104 (unsigned long) flags);
6105 }
6106 /* NOT REACHED */
6107 }
6108 return NULL;
6109}
6110
6111#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6112 int rem=(int)(RExC_end - RExC_parse); \
6113 int cut; \
6114 int num; \
6115 int iscut=0; \
6116 if (rem>10) { \
6117 rem=10; \
6118 iscut=1; \
6119 } \
6120 cut=10-rem; \
6121 if (RExC_lastparse!=RExC_parse) \
6122 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6123 rem, RExC_parse, \
6124 cut + 4, \
6125 iscut ? "..." : "<" \
6126 ); \
6127 else \
6128 PerlIO_printf(Perl_debug_log,"%16s",""); \
6129 \
6130 if (SIZE_ONLY) \
6131 num = RExC_size + 1; \
6132 else \
6133 num=REG_NODE_NUM(RExC_emit); \
6134 if (RExC_lastnum!=num) \
6135 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6136 else \
6137 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6138 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6139 (int)((depth*2)), "", \
6140 (funcname) \
6141 ); \
6142 RExC_lastnum=num; \
6143 RExC_lastparse=RExC_parse; \
6144})
6145
6146
6147
6148#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6149 DEBUG_PARSE_MSG((funcname)); \
6150 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6151})
6152#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6153 DEBUG_PARSE_MSG((funcname)); \
6154 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6155})
6156
6157/* This section of code defines the inversion list object and its methods. The
6158 * interfaces are highly subject to change, so as much as possible is static to
6159 * this file. An inversion list is here implemented as a malloc'd C UV array
6160 * with some added info that is placed as UVs at the beginning in a header
6161 * portion. An inversion list for Unicode is an array of code points, sorted
6162 * by ordinal number. The zeroth element is the first code point in the list.
6163 * The 1th element is the first element beyond that not in the list. In other
6164 * words, the first range is
6165 * invlist[0]..(invlist[1]-1)
6166 * The other ranges follow. Thus every element whose index is divisible by two
6167 * marks the beginning of a range that is in the list, and every element not
6168 * divisible by two marks the beginning of a range not in the list. A single
6169 * element inversion list that contains the single code point N generally
6170 * consists of two elements
6171 * invlist[0] == N
6172 * invlist[1] == N+1
6173 * (The exception is when N is the highest representable value on the
6174 * machine, in which case the list containing just it would be a single
6175 * element, itself. By extension, if the last range in the list extends to
6176 * infinity, then the first element of that range will be in the inversion list
6177 * at a position that is divisible by two, and is the final element in the
6178 * list.)
6179 * Taking the complement (inverting) an inversion list is quite simple, if the
6180 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6181 * This implementation reserves an element at the beginning of each inversion list
6182 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6183 * beginning of the list is either that element if 0, or the next one if 1.
6184 *
6185 * More about inversion lists can be found in "Unicode Demystified"
6186 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6187 * More will be coming when functionality is added later.
6188 *
6189 * The inversion list data structure is currently implemented as an SV pointing
6190 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6191 * array of UV whose memory management is automatically handled by the existing
6192 * facilities for SV's.
6193 *
6194 * Some of the methods should always be private to the implementation, and some
6195 * should eventually be made public */
6196
6197#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6198#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6199
6200/* This is a combination of a version and data structure type, so that one
6201 * being passed in can be validated to be an inversion list of the correct
6202 * vintage. When the structure of the header is changed, a new random number
6203 * in the range 2**31-1 should be generated and the new() method changed to
6204 * insert that at this location. Then, if an auxiliary program doesn't change
6205 * correspondingly, it will be discovered immediately */
6206#define INVLIST_VERSION_ID_OFFSET 2
6207#define INVLIST_VERSION_ID 1064334010
6208
6209/* For safety, when adding new elements, remember to #undef them at the end of
6210 * the inversion list code section */
6211
6212#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6213/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6214 * contains the code point U+00000, and begins here. If 1, the inversion list
6215 * doesn't contain U+0000, and it begins at the next UV in the array.
6216 * Inverting an inversion list consists of adding or removing the 0 at the
6217 * beginning of it. By reserving a space for that 0, inversion can be made
6218 * very fast */
6219
6220#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6221
6222/* Internally things are UVs */
6223#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6224#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6225
6226#define INVLIST_INITIAL_LEN 10
6227
6228PERL_STATIC_INLINE UV*
6229S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6230{
6231 /* Returns a pointer to the first element in the inversion list's array.
6232 * This is called upon initialization of an inversion list. Where the
6233 * array begins depends on whether the list has the code point U+0000
6234 * in it or not. The other parameter tells it whether the code that
6235 * follows this call is about to put a 0 in the inversion list or not.
6236 * The first element is either the element with 0, if 0, or the next one,
6237 * if 1 */
6238
6239 UV* zero = get_invlist_zero_addr(invlist);
6240
6241 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6242
6243 /* Must be empty */
6244 assert(! *get_invlist_len_addr(invlist));
6245
6246 /* 1^1 = 0; 1^0 = 1 */
6247 *zero = 1 ^ will_have_0;
6248 return zero + *zero;
6249}
6250
6251PERL_STATIC_INLINE UV*
6252S_invlist_array(pTHX_ SV* const invlist)
6253{
6254 /* Returns the pointer to the inversion list's array. Every time the
6255 * length changes, this needs to be called in case malloc or realloc moved
6256 * it */
6257
6258 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6259
6260 /* Must not be empty. If these fail, you probably didn't check for <len>
6261 * being non-zero before trying to get the array */
6262 assert(*get_invlist_len_addr(invlist));
6263 assert(*get_invlist_zero_addr(invlist) == 0
6264 || *get_invlist_zero_addr(invlist) == 1);
6265
6266 /* The array begins either at the element reserved for zero if the
6267 * list contains 0 (that element will be set to 0), or otherwise the next
6268 * element (in which case the reserved element will be set to 1). */
6269 return (UV *) (get_invlist_zero_addr(invlist)
6270 + *get_invlist_zero_addr(invlist));
6271}
6272
6273PERL_STATIC_INLINE UV*
6274S_get_invlist_len_addr(pTHX_ SV* invlist)
6275{
6276 /* Return the address of the UV that contains the current number
6277 * of used elements in the inversion list */
6278
6279 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6280
6281 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6282}
6283
6284PERL_STATIC_INLINE UV
6285S_invlist_len(pTHX_ SV* const invlist)
6286{
6287 /* Returns the current number of elements stored in the inversion list's
6288 * array */
6289
6290 PERL_ARGS_ASSERT_INVLIST_LEN;
6291
6292 return *get_invlist_len_addr(invlist);
6293}
6294
6295PERL_STATIC_INLINE void
6296S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6297{
6298 /* Sets the current number of elements stored in the inversion list */
6299
6300 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6301
6302 *get_invlist_len_addr(invlist) = len;
6303
6304 assert(len <= SvLEN(invlist));
6305
6306 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6307 /* If the list contains U+0000, that element is part of the header,
6308 * and should not be counted as part of the array. It will contain
6309 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6310 * subtract:
6311 * SvCUR_set(invlist,
6312 * TO_INTERNAL_SIZE(len
6313 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6314 * But, this is only valid if len is not 0. The consequences of not doing
6315 * this is that the memory allocation code may think that 1 more UV is
6316 * being used than actually is, and so might do an unnecessary grow. That
6317 * seems worth not bothering to make this the precise amount.
6318 *
6319 * Note that when inverting, SvCUR shouldn't change */
6320}
6321
6322PERL_STATIC_INLINE UV
6323S_invlist_max(pTHX_ SV* const invlist)
6324{
6325 /* Returns the maximum number of elements storable in the inversion list's
6326 * array, without having to realloc() */
6327
6328 PERL_ARGS_ASSERT_INVLIST_MAX;
6329
6330 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6331}
6332
6333PERL_STATIC_INLINE UV*
6334S_get_invlist_zero_addr(pTHX_ SV* invlist)
6335{
6336 /* Return the address of the UV that is reserved to hold 0 if the inversion
6337 * list contains 0. This has to be the last element of the heading, as the
6338 * list proper starts with either it if 0, or the next element if not.
6339 * (But we force it to contain either 0 or 1) */
6340
6341 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6342
6343 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6344}
6345
6346#ifndef PERL_IN_XSUB_RE
6347SV*
6348Perl__new_invlist(pTHX_ IV initial_size)
6349{
6350
6351 /* Return a pointer to a newly constructed inversion list, with enough
6352 * space to store 'initial_size' elements. If that number is negative, a
6353 * system default is used instead */
6354
6355 SV* new_list;
6356
6357 if (initial_size < 0) {
6358 initial_size = INVLIST_INITIAL_LEN;
6359 }
6360
6361 /* Allocate the initial space */
6362 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6363 invlist_set_len(new_list, 0);
6364
6365 /* Force iterinit() to be used to get iteration to work */
6366 *get_invlist_iter_addr(new_list) = UV_MAX;
6367
6368 /* This should force a segfault if a method doesn't initialize this
6369 * properly */
6370 *get_invlist_zero_addr(new_list) = UV_MAX;
6371
6372 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6373#if HEADER_LENGTH != 4
6374# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6375#endif
6376
6377 return new_list;
6378}
6379#endif
6380
6381STATIC SV*
6382S__new_invlist_C_array(pTHX_ UV* list)
6383{
6384 /* Return a pointer to a newly constructed inversion list, initialized to
6385 * point to <list>, which has to be in the exact correct inversion list
6386 * form, including internal fields. Thus this is a dangerous routine that
6387 * should not be used in the wrong hands */
6388
6389 SV* invlist = newSV_type(SVt_PV);
6390
6391 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6392
6393 SvPV_set(invlist, (char *) list);
6394 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6395 shouldn't touch it */
6396 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6397
6398 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6399 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6400 }
6401
6402 return invlist;
6403}
6404
6405STATIC void
6406S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6407{
6408 /* Grow the maximum size of an inversion list */
6409
6410 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6411
6412 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6413}
6414
6415PERL_STATIC_INLINE void
6416S_invlist_trim(pTHX_ SV* const invlist)
6417{
6418 PERL_ARGS_ASSERT_INVLIST_TRIM;
6419
6420 /* Change the length of the inversion list to how many entries it currently
6421 * has */
6422
6423 SvPV_shrink_to_cur((SV *) invlist);
6424}
6425
6426/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6427 * etc */
6428#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6429#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6430
6431#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6432
6433#ifndef PERL_IN_XSUB_RE
6434void
6435Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6436{
6437 /* Subject to change or removal. Append the range from 'start' to 'end' at
6438 * the end of the inversion list. The range must be above any existing
6439 * ones. */
6440
6441 UV* array;
6442 UV max = invlist_max(invlist);
6443 UV len = invlist_len(invlist);
6444
6445 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6446
6447 if (len == 0) { /* Empty lists must be initialized */
6448 array = _invlist_array_init(invlist, start == 0);
6449 }
6450 else {
6451 /* Here, the existing list is non-empty. The current max entry in the
6452 * list is generally the first value not in the set, except when the
6453 * set extends to the end of permissible values, in which case it is
6454 * the first entry in that final set, and so this call is an attempt to
6455 * append out-of-order */
6456
6457 UV final_element = len - 1;
6458 array = invlist_array(invlist);
6459 if (array[final_element] > start
6460 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6461 {
6462 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6463 array[final_element], start,
6464 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6465 }
6466
6467 /* Here, it is a legal append. If the new range begins with the first
6468 * value not in the set, it is extending the set, so the new first
6469 * value not in the set is one greater than the newly extended range.
6470 * */
6471 if (array[final_element] == start) {
6472 if (end != UV_MAX) {
6473 array[final_element] = end + 1;
6474 }
6475 else {
6476 /* But if the end is the maximum representable on the machine,
6477 * just let the range that this would extend to have no end */
6478 invlist_set_len(invlist, len - 1);
6479 }
6480 return;
6481 }
6482 }
6483
6484 /* Here the new range doesn't extend any existing set. Add it */
6485
6486 len += 2; /* Includes an element each for the start and end of range */
6487
6488 /* If overflows the existing space, extend, which may cause the array to be
6489 * moved */
6490 if (max < len) {
6491 invlist_extend(invlist, len);
6492 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6493 failure in invlist_array() */
6494 array = invlist_array(invlist);
6495 }
6496 else {
6497 invlist_set_len(invlist, len);
6498 }
6499
6500 /* The next item on the list starts the range, the one after that is
6501 * one past the new range. */
6502 array[len - 2] = start;
6503 if (end != UV_MAX) {
6504 array[len - 1] = end + 1;
6505 }
6506 else {
6507 /* But if the end is the maximum representable on the machine, just let
6508 * the range have no end */
6509 invlist_set_len(invlist, len - 1);
6510 }
6511}
6512
6513STATIC IV
6514S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6515{
6516 /* Searches the inversion list for the entry that contains the input code
6517 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6518 * return value is the index into the list's array of the range that
6519 * contains <cp> */
6520
6521 IV low = 0;
6522 IV high = invlist_len(invlist);
6523 const UV * const array = invlist_array(invlist);
6524
6525 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6526
6527 /* If list is empty or the code point is before the first element, return
6528 * failure. */
6529 if (high == 0 || cp < array[0]) {
6530 return -1;
6531 }
6532
6533 /* Binary search. What we are looking for is <i> such that
6534 * array[i] <= cp < array[i+1]
6535 * The loop below converges on the i+1. */
6536 while (low < high) {
6537 IV mid = (low + high) / 2;
6538 if (array[mid] <= cp) {
6539 low = mid + 1;
6540
6541 /* We could do this extra test to exit the loop early.
6542 if (cp < array[low]) {
6543 return mid;
6544 }
6545 */
6546 }
6547 else { /* cp < array[mid] */
6548 high = mid;
6549 }
6550 }
6551
6552 return high - 1;
6553}
6554
6555void
6556Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6557{
6558 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6559 * but is used when the swash has an inversion list. This makes this much
6560 * faster, as it uses a binary search instead of a linear one. This is
6561 * intimately tied to that function, and perhaps should be in utf8.c,
6562 * except it is intimately tied to inversion lists as well. It assumes
6563 * that <swatch> is all 0's on input */
6564
6565 UV current = start;
6566 const IV len = invlist_len(invlist);
6567 IV i;
6568 const UV * array;
6569
6570 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6571
6572 if (len == 0) { /* Empty inversion list */
6573 return;
6574 }
6575
6576 array = invlist_array(invlist);
6577
6578 /* Find which element it is */
6579 i = invlist_search(invlist, start);
6580
6581 /* We populate from <start> to <end> */
6582 while (current < end) {
6583 UV upper;
6584
6585 /* The inversion list gives the results for every possible code point
6586 * after the first one in the list. Only those ranges whose index is
6587 * even are ones that the inversion list matches. For the odd ones,
6588 * and if the initial code point is not in the list, we have to skip
6589 * forward to the next element */
6590 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6591 i++;
6592 if (i >= len) { /* Finished if beyond the end of the array */
6593 return;
6594 }
6595 current = array[i];
6596 if (current >= end) { /* Finished if beyond the end of what we
6597 are populating */
6598 return;
6599 }
6600 }
6601 assert(current >= start);
6602
6603 /* The current range ends one below the next one, except don't go past
6604 * <end> */
6605 i++;
6606 upper = (i < len && array[i] < end) ? array[i] : end;
6607
6608 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6609 * for each code point in it */
6610 for (; current < upper; current++) {
6611 const STRLEN offset = (STRLEN)(current - start);
6612 swatch[offset >> 3] |= 1 << (offset & 7);
6613 }
6614
6615 /* Quit if at the end of the list */
6616 if (i >= len) {
6617
6618 /* But first, have to deal with the highest possible code point on
6619 * the platform. The previous code assumes that <end> is one
6620 * beyond where we want to populate, but that is impossible at the
6621 * platform's infinity, so have to handle it specially */
6622 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6623 {
6624 const STRLEN offset = (STRLEN)(end - start);
6625 swatch[offset >> 3] |= 1 << (offset & 7);
6626 }
6627 return;
6628 }
6629
6630 /* Advance to the next range, which will be for code points not in the
6631 * inversion list */
6632 current = array[i];
6633 }
6634
6635 return;
6636}
6637
6638
6639void
6640Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
6641{
6642 /* Take the union of two inversion lists and point <output> to it. *output
6643 * should be defined upon input, and if it points to one of the two lists,
6644 * the reference count to that list will be decremented. The first list,
6645 * <a>, may be NULL, in which case a copy of the second list is returned.
6646 * If <complement_b> is TRUE, the union is taken of the complement
6647 * (inversion) of <b> instead of b itself.
6648 *
6649 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6650 * Richard Gillam, published by Addison-Wesley, and explained at some
6651 * length there. The preface says to incorporate its examples into your
6652 * code at your own risk.
6653 *
6654 * The algorithm is like a merge sort.
6655 *
6656 * XXX A potential performance improvement is to keep track as we go along
6657 * if only one of the inputs contributes to the result, meaning the other
6658 * is a subset of that one. In that case, we can skip the final copy and
6659 * return the larger of the input lists, but then outside code might need
6660 * to keep track of whether to free the input list or not */
6661
6662 UV* array_a; /* a's array */
6663 UV* array_b;
6664 UV len_a; /* length of a's array */
6665 UV len_b;
6666
6667 SV* u; /* the resulting union */
6668 UV* array_u;
6669 UV len_u;
6670
6671 UV i_a = 0; /* current index into a's array */
6672 UV i_b = 0;
6673 UV i_u = 0;
6674
6675 /* running count, as explained in the algorithm source book; items are
6676 * stopped accumulating and are output when the count changes to/from 0.
6677 * The count is incremented when we start a range that's in the set, and
6678 * decremented when we start a range that's not in the set. So its range
6679 * is 0 to 2. Only when the count is zero is something not in the set.
6680 */
6681 UV count = 0;
6682
6683 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
6684 assert(a != b);
6685
6686 /* If either one is empty, the union is the other one */
6687 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
6688 if (*output == a) {
6689 if (a != NULL) {
6690 SvREFCNT_dec(a);
6691 }
6692 }
6693 if (*output != b) {
6694 *output = invlist_clone(b);
6695 if (complement_b) {
6696 _invlist_invert(*output);
6697 }
6698 } /* else *output already = b; */
6699 return;
6700 }
6701 else if ((len_b = invlist_len(b)) == 0) {
6702 if (*output == b) {
6703 SvREFCNT_dec(b);
6704 }
6705
6706 /* The complement of an empty list is a list that has everything in it,
6707 * so the union with <a> includes everything too */
6708 if (complement_b) {
6709 if (a == *output) {
6710 SvREFCNT_dec(a);
6711 }
6712 *output = _new_invlist(1);
6713 _append_range_to_invlist(*output, 0, UV_MAX);
6714 }
6715 else if (*output != a) {
6716 *output = invlist_clone(a);
6717 }
6718 /* else *output already = a; */
6719 return;
6720 }
6721
6722 /* Here both lists exist and are non-empty */
6723 array_a = invlist_array(a);
6724 array_b = invlist_array(b);
6725
6726 /* If are to take the union of 'a' with the complement of b, set it
6727 * up so are looking at b's complement. */
6728 if (complement_b) {
6729
6730 /* To complement, we invert: if the first element is 0, remove it. To
6731 * do this, we just pretend the array starts one later, and clear the
6732 * flag as we don't have to do anything else later */
6733 if (array_b[0] == 0) {
6734 array_b++;
6735 len_b--;
6736 complement_b = FALSE;
6737 }
6738 else {
6739
6740 /* But if the first element is not zero, we unshift a 0 before the
6741 * array. The data structure reserves a space for that 0 (which
6742 * should be a '1' right now), so physical shifting is unneeded,
6743 * but temporarily change that element to 0. Before exiting the
6744 * routine, we must restore the element to '1' */
6745 array_b--;
6746 len_b++;
6747 array_b[0] = 0;
6748 }
6749 }
6750
6751 /* Size the union for the worst case: that the sets are completely
6752 * disjoint */
6753 u = _new_invlist(len_a + len_b);
6754
6755 /* Will contain U+0000 if either component does */
6756 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6757 || (len_b > 0 && array_b[0] == 0));
6758
6759 /* Go through each list item by item, stopping when exhausted one of
6760 * them */
6761 while (i_a < len_a && i_b < len_b) {
6762 UV cp; /* The element to potentially add to the union's array */
6763 bool cp_in_set; /* is it in the the input list's set or not */
6764
6765 /* We need to take one or the other of the two inputs for the union.
6766 * Since we are merging two sorted lists, we take the smaller of the
6767 * next items. In case of a tie, we take the one that is in its set
6768 * first. If we took one not in the set first, it would decrement the
6769 * count, possibly to 0 which would cause it to be output as ending the
6770 * range, and the next time through we would take the same number, and
6771 * output it again as beginning the next range. By doing it the
6772 * opposite way, there is no possibility that the count will be
6773 * momentarily decremented to 0, and thus the two adjoining ranges will
6774 * be seamlessly merged. (In a tie and both are in the set or both not
6775 * in the set, it doesn't matter which we take first.) */
6776 if (array_a[i_a] < array_b[i_b]
6777 || (array_a[i_a] == array_b[i_b]
6778 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
6779 {
6780 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
6781 cp= array_a[i_a++];
6782 }
6783 else {
6784 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
6785 cp= array_b[i_b++];
6786 }
6787
6788 /* Here, have chosen which of the two inputs to look at. Only output
6789 * if the running count changes to/from 0, which marks the
6790 * beginning/end of a range in that's in the set */
6791 if (cp_in_set) {
6792 if (count == 0) {
6793 array_u[i_u++] = cp;
6794 }
6795 count++;
6796 }
6797 else {
6798 count--;
6799 if (count == 0) {
6800 array_u[i_u++] = cp;
6801 }
6802 }
6803 }
6804
6805 /* Here, we are finished going through at least one of the lists, which
6806 * means there is something remaining in at most one. We check if the list
6807 * that hasn't been exhausted is positioned such that we are in the middle
6808 * of a range in its set or not. (i_a and i_b point to the element beyond
6809 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6810 * is potentially more to output.
6811 * There are four cases:
6812 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6813 * in the union is entirely from the non-exhausted set.
6814 * 2) Both were in their sets, count is 2. Nothing further should
6815 * be output, as everything that remains will be in the exhausted
6816 * list's set, hence in the union; decrementing to 1 but not 0 insures
6817 * that
6818 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6819 * Nothing further should be output because the union includes
6820 * everything from the exhausted set. Not decrementing ensures that.
6821 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6822 * decrementing to 0 insures that we look at the remainder of the
6823 * non-exhausted set */
6824 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6825 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
6826 {
6827 count--;
6828 }
6829
6830 /* The final length is what we've output so far, plus what else is about to
6831 * be output. (If 'count' is non-zero, then the input list we exhausted
6832 * has everything remaining up to the machine's limit in its set, and hence
6833 * in the union, so there will be no further output. */
6834 len_u = i_u;
6835 if (count == 0) {
6836 /* At most one of the subexpressions will be non-zero */
6837 len_u += (len_a - i_a) + (len_b - i_b);
6838 }
6839
6840 /* Set result to final length, which can change the pointer to array_u, so
6841 * re-find it */
6842 if (len_u != invlist_len(u)) {
6843 invlist_set_len(u, len_u);
6844 invlist_trim(u);
6845 array_u = invlist_array(u);
6846 }
6847
6848 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6849 * the other) ended with everything above it not in its set. That means
6850 * that the remaining part of the union is precisely the same as the
6851 * non-exhausted list, so can just copy it unchanged. (If both list were
6852 * exhausted at the same time, then the operations below will be both 0.)
6853 */
6854 if (count == 0) {
6855 IV copy_count; /* At most one will have a non-zero copy count */
6856 if ((copy_count = len_a - i_a) > 0) {
6857 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6858 }
6859 else if ((copy_count = len_b - i_b) > 0) {
6860 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6861 }
6862 }
6863
6864 /* We may be removing a reference to one of the inputs */
6865 if (a == *output || b == *output) {
6866 SvREFCNT_dec(*output);
6867 }
6868
6869 /* If we've changed b, restore it */
6870 if (complement_b) {
6871 array_b[0] = 1;
6872 }
6873
6874 *output = u;
6875 return;
6876}
6877
6878void
6879Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
6880{
6881 /* Take the intersection of two inversion lists and point <i> to it. *i
6882 * should be defined upon input, and if it points to one of the two lists,
6883 * the reference count to that list will be decremented.
6884 * If <complement_b> is TRUE, the result will be the intersection of <a>
6885 * and the complement (or inversion) of <b> instead of <b> directly.
6886 *
6887 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6888 * Richard Gillam, published by Addison-Wesley, and explained at some
6889 * length there. The preface says to incorporate its examples into your
6890 * code at your own risk. In fact, it had bugs
6891 *
6892 * The algorithm is like a merge sort, and is essentially the same as the
6893 * union above
6894 */
6895
6896 UV* array_a; /* a's array */
6897 UV* array_b;
6898 UV len_a; /* length of a's array */
6899 UV len_b;
6900
6901 SV* r; /* the resulting intersection */
6902 UV* array_r;
6903 UV len_r;
6904
6905 UV i_a = 0; /* current index into a's array */
6906 UV i_b = 0;
6907 UV i_r = 0;
6908
6909 /* running count, as explained in the algorithm source book; items are
6910 * stopped accumulating and are output when the count changes to/from 2.
6911 * The count is incremented when we start a range that's in the set, and
6912 * decremented when we start a range that's not in the set. So its range
6913 * is 0 to 2. Only when the count is 2 is something in the intersection.
6914 */
6915 UV count = 0;
6916
6917 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
6918 assert(a != b);
6919
6920 /* Special case if either one is empty */
6921 len_a = invlist_len(a);
6922 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
6923
6924 if (len_a != 0 && complement_b) {
6925
6926 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6927 * be empty. Here, also we are using 'b's complement, which hence
6928 * must be every possible code point. Thus the intersection is
6929 * simply 'a'. */
6930 if (*i != a) {
6931 *i = invlist_clone(a);
6932
6933 if (*i == b) {
6934 SvREFCNT_dec(b);
6935 }
6936 }
6937 /* else *i is already 'a' */
6938 return;
6939 }
6940
6941 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
6942 * intersection must be empty */
6943 if (*i == a) {
6944 SvREFCNT_dec(a);
6945 }
6946 else if (*i == b) {
6947 SvREFCNT_dec(b);
6948 }
6949 *i = _new_invlist(0);
6950 return;
6951 }
6952
6953 /* Here both lists exist and are non-empty */
6954 array_a = invlist_array(a);
6955 array_b = invlist_array(b);
6956
6957 /* If are to take the intersection of 'a' with the complement of b, set it
6958 * up so are looking at b's complement. */
6959 if (complement_b) {
6960
6961 /* To complement, we invert: if the first element is 0, remove it. To
6962 * do this, we just pretend the array starts one later, and clear the
6963 * flag as we don't have to do anything else later */
6964 if (array_b[0] == 0) {
6965 array_b++;
6966 len_b--;
6967 complement_b = FALSE;
6968 }
6969 else {
6970
6971 /* But if the first element is not zero, we unshift a 0 before the
6972 * array. The data structure reserves a space for that 0 (which
6973 * should be a '1' right now), so physical shifting is unneeded,
6974 * but temporarily change that element to 0. Before exiting the
6975 * routine, we must restore the element to '1' */
6976 array_b--;
6977 len_b++;
6978 array_b[0] = 0;
6979 }
6980 }
6981
6982 /* Size the intersection for the worst case: that the intersection ends up
6983 * fragmenting everything to be completely disjoint */
6984 r= _new_invlist(len_a + len_b);
6985
6986 /* Will contain U+0000 iff both components do */
6987 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6988 && len_b > 0 && array_b[0] == 0);
6989
6990 /* Go through each list item by item, stopping when exhausted one of
6991 * them */
6992 while (i_a < len_a && i_b < len_b) {
6993 UV cp; /* The element to potentially add to the intersection's
6994 array */
6995 bool cp_in_set; /* Is it in the input list's set or not */
6996
6997 /* We need to take one or the other of the two inputs for the
6998 * intersection. Since we are merging two sorted lists, we take the
6999 * smaller of the next items. In case of a tie, we take the one that
7000 * is not in its set first (a difference from the union algorithm). If
7001 * we took one in the set first, it would increment the count, possibly
7002 * to 2 which would cause it to be output as starting a range in the
7003 * intersection, and the next time through we would take that same
7004 * number, and output it again as ending the set. By doing it the
7005 * opposite of this, there is no possibility that the count will be
7006 * momentarily incremented to 2. (In a tie and both are in the set or
7007 * both not in the set, it doesn't matter which we take first.) */
7008 if (array_a[i_a] < array_b[i_b]
7009 || (array_a[i_a] == array_b[i_b]
7010 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7011 {
7012 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7013 cp= array_a[i_a++];
7014 }
7015 else {
7016 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7017 cp= array_b[i_b++];
7018 }
7019
7020 /* Here, have chosen which of the two inputs to look at. Only output
7021 * if the running count changes to/from 2, which marks the
7022 * beginning/end of a range that's in the intersection */
7023 if (cp_in_set) {
7024 count++;
7025 if (count == 2) {
7026 array_r[i_r++] = cp;
7027 }
7028 }
7029 else {
7030 if (count == 2) {
7031 array_r[i_r++] = cp;
7032 }
7033 count--;
7034 }
7035 }
7036
7037 /* Here, we are finished going through at least one of the lists, which
7038 * means there is something remaining in at most one. We check if the list
7039 * that has been exhausted is positioned such that we are in the middle
7040 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7041 * the ones we care about.) There are four cases:
7042 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7043 * nothing left in the intersection.
7044 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7045 * above 2. What should be output is exactly that which is in the
7046 * non-exhausted set, as everything it has is also in the intersection
7047 * set, and everything it doesn't have can't be in the intersection
7048 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7049 * gets incremented to 2. Like the previous case, the intersection is
7050 * everything that remains in the non-exhausted set.
7051 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7052 * remains 1. And the intersection has nothing more. */
7053 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7054 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7055 {
7056 count++;
7057 }
7058
7059 /* The final length is what we've output so far plus what else is in the
7060 * intersection. At most one of the subexpressions below will be non-zero */
7061 len_r = i_r;
7062 if (count >= 2) {
7063 len_r += (len_a - i_a) + (len_b - i_b);
7064 }
7065
7066 /* Set result to final length, which can change the pointer to array_r, so
7067 * re-find it */
7068 if (len_r != invlist_len(r)) {
7069 invlist_set_len(r, len_r);
7070 invlist_trim(r);
7071 array_r = invlist_array(r);
7072 }
7073
7074 /* Finish outputting any remaining */
7075 if (count >= 2) { /* At most one will have a non-zero copy count */
7076 IV copy_count;
7077 if ((copy_count = len_a - i_a) > 0) {
7078 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7079 }
7080 else if ((copy_count = len_b - i_b) > 0) {
7081 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7082 }
7083 }
7084
7085 /* We may be removing a reference to one of the inputs */
7086 if (a == *i || b == *i) {
7087 SvREFCNT_dec(*i);
7088 }
7089
7090 /* If we've changed b, restore it */
7091 if (complement_b) {
7092 array_b[0] = 1;
7093 }
7094
7095 *i = r;
7096 return;
7097}
7098
7099#endif
7100
7101STATIC SV*
7102S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7103{
7104 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7105 * set. A pointer to the inversion list is returned. This may actually be
7106 * a new list, in which case the passed in one has been destroyed. The
7107 * passed in inversion list can be NULL, in which case a new one is created
7108 * with just the one range in it */
7109
7110 SV* range_invlist;
7111 UV len;
7112
7113 if (invlist == NULL) {
7114 invlist = _new_invlist(2);
7115 len = 0;
7116 }
7117 else {
7118 len = invlist_len(invlist);
7119 }
7120
7121 /* If comes after the final entry, can just append it to the end */
7122 if (len == 0
7123 || start >= invlist_array(invlist)
7124 [invlist_len(invlist) - 1])
7125 {
7126 _append_range_to_invlist(invlist, start, end);
7127 return invlist;
7128 }
7129
7130 /* Here, can't just append things, create and return a new inversion list
7131 * which is the union of this range and the existing inversion list */
7132 range_invlist = _new_invlist(2);
7133 _append_range_to_invlist(range_invlist, start, end);
7134
7135 _invlist_union(invlist, range_invlist, &invlist);
7136
7137 /* The temporary can be freed */
7138 SvREFCNT_dec(range_invlist);
7139
7140 return invlist;
7141}
7142
7143PERL_STATIC_INLINE SV*
7144S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7145 return add_range_to_invlist(invlist, cp, cp);
7146}
7147
7148#ifndef PERL_IN_XSUB_RE
7149void
7150Perl__invlist_invert(pTHX_ SV* const invlist)
7151{
7152 /* Complement the input inversion list. This adds a 0 if the list didn't
7153 * have a zero; removes it otherwise. As described above, the data
7154 * structure is set up so that this is very efficient */
7155
7156 UV* len_pos = get_invlist_len_addr(invlist);
7157
7158 PERL_ARGS_ASSERT__INVLIST_INVERT;
7159
7160 /* The inverse of matching nothing is matching everything */
7161 if (*len_pos == 0) {
7162 _append_range_to_invlist(invlist, 0, UV_MAX);
7163 return;
7164 }
7165
7166 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7167 * zero element was a 0, so it is being removed, so the length decrements
7168 * by 1; and vice-versa. SvCUR is unaffected */
7169 if (*get_invlist_zero_addr(invlist) ^= 1) {
7170 (*len_pos)--;
7171 }
7172 else {
7173 (*len_pos)++;
7174 }
7175}
7176
7177void
7178Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7179{
7180 /* Complement the input inversion list (which must be a Unicode property,
7181 * all of which don't match above the Unicode maximum code point.) And
7182 * Perl has chosen to not have the inversion match above that either. This
7183 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7184 */
7185
7186 UV len;
7187 UV* array;
7188
7189 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7190
7191 _invlist_invert(invlist);
7192
7193 len = invlist_len(invlist);
7194
7195 if (len != 0) { /* If empty do nothing */
7196 array = invlist_array(invlist);
7197 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7198 /* Add 0x110000. First, grow if necessary */
7199 len++;
7200 if (invlist_max(invlist) < len) {
7201 invlist_extend(invlist, len);
7202 array = invlist_array(invlist);
7203 }
7204 invlist_set_len(invlist, len);
7205 array[len - 1] = PERL_UNICODE_MAX + 1;
7206 }
7207 else { /* Remove the 0x110000 */
7208 invlist_set_len(invlist, len - 1);
7209 }
7210 }
7211
7212 return;
7213}
7214#endif
7215
7216PERL_STATIC_INLINE SV*
7217S_invlist_clone(pTHX_ SV* const invlist)
7218{
7219
7220 /* Return a new inversion list that is a copy of the input one, which is
7221 * unchanged */
7222
7223 /* Need to allocate extra space to accommodate Perl's addition of a
7224 * trailing NUL to SvPV's, since it thinks they are always strings */
7225 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7226 STRLEN length = SvCUR(invlist);
7227
7228 PERL_ARGS_ASSERT_INVLIST_CLONE;
7229
7230 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7231 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7232
7233 return new_invlist;
7234}
7235
7236PERL_STATIC_INLINE UV*
7237S_get_invlist_iter_addr(pTHX_ SV* invlist)
7238{
7239 /* Return the address of the UV that contains the current iteration
7240 * position */
7241
7242 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7243
7244 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7245}
7246
7247PERL_STATIC_INLINE UV*
7248S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7249{
7250 /* Return the address of the UV that contains the version id. */
7251
7252 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7253
7254 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7255}
7256
7257PERL_STATIC_INLINE void
7258S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7259{
7260 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7261
7262 *get_invlist_iter_addr(invlist) = 0;
7263}
7264
7265STATIC bool
7266S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7267{
7268 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7269 * This call sets in <*start> and <*end>, the next range in <invlist>.
7270 * Returns <TRUE> if successful and the next call will return the next
7271 * range; <FALSE> if was already at the end of the list. If the latter,
7272 * <*start> and <*end> are unchanged, and the next call to this function
7273 * will start over at the beginning of the list */
7274
7275 UV* pos = get_invlist_iter_addr(invlist);
7276 UV len = invlist_len(invlist);
7277 UV *array;
7278
7279 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7280
7281 if (*pos >= len) {
7282 *pos = UV_MAX; /* Force iternit() to be required next time */
7283 return FALSE;
7284 }
7285
7286 array = invlist_array(invlist);
7287
7288 *start = array[(*pos)++];
7289
7290 if (*pos >= len) {
7291 *end = UV_MAX;
7292 }
7293 else {
7294 *end = array[(*pos)++] - 1;
7295 }
7296
7297 return TRUE;
7298}
7299
7300#ifndef PERL_IN_XSUB_RE
7301SV *
7302Perl__invlist_contents(pTHX_ SV* const invlist)
7303{
7304 /* Get the contents of an inversion list into a string SV so that they can
7305 * be printed out. It uses the format traditionally done for debug tracing
7306 */
7307
7308 UV start, end;
7309 SV* output = newSVpvs("\n");
7310
7311 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7312
7313 invlist_iterinit(invlist);
7314 while (invlist_iternext(invlist, &start, &end)) {
7315 if (end == UV_MAX) {
7316 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7317 }
7318 else if (end != start) {
7319 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7320 start, end);
7321 }
7322 else {
7323 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7324 }
7325 }
7326
7327 return output;
7328}
7329#endif
7330
7331#if 0
7332void
7333S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7334{
7335 /* Dumps out the ranges in an inversion list. The string 'header'
7336 * if present is output on a line before the first range */
7337
7338 UV start, end;
7339
7340 if (header && strlen(header)) {
7341 PerlIO_printf(Perl_debug_log, "%s\n", header);
7342 }
7343 invlist_iterinit(invlist);
7344 while (invlist_iternext(invlist, &start, &end)) {
7345 if (end == UV_MAX) {
7346 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7347 }
7348 else {
7349 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7350 }
7351 }
7352}
7353#endif
7354
7355#undef HEADER_LENGTH
7356#undef INVLIST_INITIAL_LENGTH
7357#undef TO_INTERNAL_SIZE
7358#undef FROM_INTERNAL_SIZE
7359#undef INVLIST_LEN_OFFSET
7360#undef INVLIST_ZERO_OFFSET
7361#undef INVLIST_ITER_OFFSET
7362#undef INVLIST_VERSION_ID
7363
7364/* End of inversion list object */
7365
7366/*
7367 - reg - regular expression, i.e. main body or parenthesized thing
7368 *
7369 * Caller must absorb opening parenthesis.
7370 *
7371 * Combining parenthesis handling with the base level of regular expression
7372 * is a trifle forced, but the need to tie the tails of the branches to what
7373 * follows makes it hard to avoid.
7374 */
7375#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7376#ifdef DEBUGGING
7377#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7378#else
7379#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7380#endif
7381
7382STATIC regnode *
7383S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7384 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7385{
7386 dVAR;
7387 register regnode *ret; /* Will be the head of the group. */
7388 register regnode *br;
7389 register regnode *lastbr;
7390 register regnode *ender = NULL;
7391 register I32 parno = 0;
7392 I32 flags;
7393 U32 oregflags = RExC_flags;
7394 bool have_branch = 0;
7395 bool is_open = 0;
7396 I32 freeze_paren = 0;
7397 I32 after_freeze = 0;
7398
7399 /* for (?g), (?gc), and (?o) warnings; warning
7400 about (?c) will warn about (?g) -- japhy */
7401
7402#define WASTED_O 0x01
7403#define WASTED_G 0x02
7404#define WASTED_C 0x04
7405#define WASTED_GC (0x02|0x04)
7406 I32 wastedflags = 0x00;
7407
7408 char * parse_start = RExC_parse; /* MJD */
7409 char * const oregcomp_parse = RExC_parse;
7410
7411 GET_RE_DEBUG_FLAGS_DECL;
7412
7413 PERL_ARGS_ASSERT_REG;
7414 DEBUG_PARSE("reg ");
7415
7416 *flagp = 0; /* Tentatively. */
7417
7418
7419 /* Make an OPEN node, if parenthesized. */
7420 if (paren) {
7421 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7422 char *start_verb = RExC_parse;
7423 STRLEN verb_len = 0;
7424 char *start_arg = NULL;
7425 unsigned char op = 0;
7426 int argok = 1;
7427 int internal_argval = 0; /* internal_argval is only useful if !argok */
7428 while ( *RExC_parse && *RExC_parse != ')' ) {
7429 if ( *RExC_parse == ':' ) {
7430 start_arg = RExC_parse + 1;
7431 break;
7432 }
7433 RExC_parse++;
7434 }
7435 ++start_verb;
7436 verb_len = RExC_parse - start_verb;
7437 if ( start_arg ) {
7438 RExC_parse++;
7439 while ( *RExC_parse && *RExC_parse != ')' )
7440 RExC_parse++;
7441 if ( *RExC_parse != ')' )
7442 vFAIL("Unterminated verb pattern argument");
7443 if ( RExC_parse == start_arg )
7444 start_arg = NULL;
7445 } else {
7446 if ( *RExC_parse != ')' )
7447 vFAIL("Unterminated verb pattern");
7448 }
7449
7450 switch ( *start_verb ) {
7451 case 'A': /* (*ACCEPT) */
7452 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7453 op = ACCEPT;
7454 internal_argval = RExC_nestroot;
7455 }
7456 break;
7457 case 'C': /* (*COMMIT) */
7458 if ( memEQs(start_verb,verb_len,"COMMIT") )
7459 op = COMMIT;
7460 break;
7461 case 'F': /* (*FAIL) */
7462 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7463 op = OPFAIL;
7464 argok = 0;
7465 }
7466 break;
7467 case ':': /* (*:NAME) */
7468 case 'M': /* (*MARK:NAME) */
7469 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7470 op = MARKPOINT;
7471 argok = -1;
7472 }
7473 break;
7474 case 'P': /* (*PRUNE) */
7475 if ( memEQs(start_verb,verb_len,"PRUNE") )
7476 op = PRUNE;
7477 break;
7478 case 'S': /* (*SKIP) */
7479 if ( memEQs(start_verb,verb_len,"SKIP") )
7480 op = SKIP;
7481 break;
7482 case 'T': /* (*THEN) */
7483 /* [19:06] <TimToady> :: is then */
7484 if ( memEQs(start_verb,verb_len,"THEN") ) {
7485 op = CUTGROUP;
7486 RExC_seen |= REG_SEEN_CUTGROUP;
7487 }
7488 break;
7489 }
7490 if ( ! op ) {
7491 RExC_parse++;
7492 vFAIL3("Unknown verb pattern '%.*s'",
7493 verb_len, start_verb);
7494 }
7495 if ( argok ) {
7496 if ( start_arg && internal_argval ) {
7497 vFAIL3("Verb pattern '%.*s' may not have an argument",
7498 verb_len, start_verb);
7499 } else if ( argok < 0 && !start_arg ) {
7500 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7501 verb_len, start_verb);
7502 } else {
7503 ret = reganode(pRExC_state, op, internal_argval);
7504 if ( ! internal_argval && ! SIZE_ONLY ) {
7505 if (start_arg) {
7506 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7507 ARG(ret) = add_data( pRExC_state, 1, "S" );
7508 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
7509 ret->flags = 0;
7510 } else {
7511 ret->flags = 1;
7512 }
7513 }
7514 }
7515 if (!internal_argval)
7516 RExC_seen |= REG_SEEN_VERBARG;
7517 } else if ( start_arg ) {
7518 vFAIL3("Verb pattern '%.*s' may not have an argument",
7519 verb_len, start_verb);
7520 } else {
7521 ret = reg_node(pRExC_state, op);
7522 }
7523 nextchar(pRExC_state);
7524 return ret;
7525 } else
7526 if (*RExC_parse == '?') { /* (?...) */
7527 bool is_logical = 0;
7528 const char * const seqstart = RExC_parse;
7529 bool has_use_defaults = FALSE;
7530
7531 RExC_parse++;
7532 paren = *RExC_parse++;
7533 ret = NULL; /* For look-ahead/behind. */
7534 switch (paren) {
7535
7536 case 'P': /* (?P...) variants for those used to PCRE/Python */
7537 paren = *RExC_parse++;
7538 if ( paren == '<') /* (?P<...>) named capture */
7539 goto named_capture;
7540 else if (paren == '>') { /* (?P>name) named recursion */
7541 goto named_recursion;
7542 }
7543 else if (paren == '=') { /* (?P=...) named backref */
7544 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7545 you change this make sure you change that */
7546 char* name_start = RExC_parse;
7547 U32 num = 0;
7548 SV *sv_dat = reg_scan_name(pRExC_state,
7549 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7550 if (RExC_parse == name_start || *RExC_parse != ')')
7551 vFAIL2("Sequence %.3s... not terminated",parse_start);
7552
7553 if (!SIZE_ONLY) {
7554 num = add_data( pRExC_state, 1, "S" );
7555 RExC_rxi->data->data[num]=(void*)sv_dat;
7556 SvREFCNT_inc_simple_void(sv_dat);
7557 }
7558 RExC_sawback = 1;
7559 ret = reganode(pRExC_state,
7560 ((! FOLD)
7561 ? NREF
7562 : (MORE_ASCII_RESTRICTED)
7563 ? NREFFA
7564 : (AT_LEAST_UNI_SEMANTICS)
7565 ? NREFFU
7566 : (LOC)
7567 ? NREFFL
7568 : NREFF),
7569 num);
7570 *flagp |= HASWIDTH;
7571
7572 Set_Node_Offset(ret, parse_start+1);
7573 Set_Node_Cur_Length(ret); /* MJD */
7574
7575 nextchar(pRExC_state);
7576 return ret;
7577 }
7578 RExC_parse++;
7579 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7580 /*NOTREACHED*/
7581 case '<': /* (?<...) */
7582 if (*RExC_parse == '!')
7583 paren = ',';
7584 else if (*RExC_parse != '=')
7585 named_capture:
7586 { /* (?<...>) */
7587 char *name_start;
7588 SV *svname;
7589 paren= '>';
7590 case '\'': /* (?'...') */
7591 name_start= RExC_parse;
7592 svname = reg_scan_name(pRExC_state,
7593 SIZE_ONLY ? /* reverse test from the others */
7594 REG_RSN_RETURN_NAME :
7595 REG_RSN_RETURN_NULL);
7596 if (RExC_parse == name_start) {
7597 RExC_parse++;
7598 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7599 /*NOTREACHED*/
7600 }
7601 if (*RExC_parse != paren)
7602 vFAIL2("Sequence (?%c... not terminated",
7603 paren=='>' ? '<' : paren);
7604 if (SIZE_ONLY) {
7605 HE *he_str;
7606 SV *sv_dat = NULL;
7607 if (!svname) /* shouldn't happen */
7608 Perl_croak(aTHX_
7609 "panic: reg_scan_name returned NULL");
7610 if (!RExC_paren_names) {
7611 RExC_paren_names= newHV();
7612 sv_2mortal(MUTABLE_SV(RExC_paren_names));
7613#ifdef DEBUGGING
7614 RExC_paren_name_list= newAV();
7615 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
7616#endif
7617 }
7618 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
7619 if ( he_str )
7620 sv_dat = HeVAL(he_str);
7621 if ( ! sv_dat ) {
7622 /* croak baby croak */
7623 Perl_croak(aTHX_
7624 "panic: paren_name hash element allocation failed");
7625 } else if ( SvPOK(sv_dat) ) {
7626 /* (?|...) can mean we have dupes so scan to check
7627 its already been stored. Maybe a flag indicating
7628 we are inside such a construct would be useful,
7629 but the arrays are likely to be quite small, so
7630 for now we punt -- dmq */
7631 IV count = SvIV(sv_dat);
7632 I32 *pv = (I32*)SvPVX(sv_dat);
7633 IV i;
7634 for ( i = 0 ; i < count ; i++ ) {
7635 if ( pv[i] == RExC_npar ) {
7636 count = 0;
7637 break;
7638 }
7639 }
7640 if ( count ) {
7641 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7642 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7643 pv[count] = RExC_npar;
7644 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
7645 }
7646 } else {
7647 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7648 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7649 SvIOK_on(sv_dat);
7650 SvIV_set(sv_dat, 1);
7651 }
7652#ifdef DEBUGGING
7653 /* Yes this does cause a memory leak in debugging Perls */
7654 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7655 SvREFCNT_dec(svname);
7656#endif
7657
7658 /*sv_dump(sv_dat);*/
7659 }
7660 nextchar(pRExC_state);
7661 paren = 1;
7662 goto capturing_parens;
7663 }
7664 RExC_seen |= REG_SEEN_LOOKBEHIND;
7665 RExC_in_lookbehind++;
7666 RExC_parse++;
7667 case '=': /* (?=...) */
7668 RExC_seen_zerolen++;
7669 break;
7670 case '!': /* (?!...) */
7671 RExC_seen_zerolen++;
7672 if (*RExC_parse == ')') {
7673 ret=reg_node(pRExC_state, OPFAIL);
7674 nextchar(pRExC_state);
7675 return ret;
7676 }
7677 break;
7678 case '|': /* (?|...) */
7679 /* branch reset, behave like a (?:...) except that
7680 buffers in alternations share the same numbers */
7681 paren = ':';
7682 after_freeze = freeze_paren = RExC_npar;
7683 break;
7684 case ':': /* (?:...) */
7685 case '>': /* (?>...) */
7686 break;
7687 case '$': /* (?$...) */
7688 case '@': /* (?@...) */
7689 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
7690 break;
7691 case '#': /* (?#...) */
7692 while (*RExC_parse && *RExC_parse != ')')
7693 RExC_parse++;
7694 if (*RExC_parse != ')')
7695 FAIL("Sequence (?#... not terminated");
7696 nextchar(pRExC_state);
7697 *flagp = TRYAGAIN;
7698 return NULL;
7699 case '0' : /* (?0) */
7700 case 'R' : /* (?R) */
7701 if (*RExC_parse != ')')
7702 FAIL("Sequence (?R) not terminated");
7703 ret = reg_node(pRExC_state, GOSTART);
7704 *flagp |= POSTPONED;
7705 nextchar(pRExC_state);
7706 return ret;
7707 /*notreached*/
7708 { /* named and numeric backreferences */
7709 I32 num;
7710 case '&': /* (?&NAME) */
7711 parse_start = RExC_parse - 1;
7712 named_recursion:
7713 {
7714 SV *sv_dat = reg_scan_name(pRExC_state,
7715 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7716 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7717 }
7718 goto gen_recurse_regop;
7719 /* NOT REACHED */
7720 case '+':
7721 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7722 RExC_parse++;
7723 vFAIL("Illegal pattern");
7724 }
7725 goto parse_recursion;
7726 /* NOT REACHED*/
7727 case '-': /* (?-1) */
7728 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7729 RExC_parse--; /* rewind to let it be handled later */
7730 goto parse_flags;
7731 }
7732 /*FALLTHROUGH */
7733 case '1': case '2': case '3': case '4': /* (?1) */
7734 case '5': case '6': case '7': case '8': case '9':
7735 RExC_parse--;
7736 parse_recursion:
7737 num = atoi(RExC_parse);
7738 parse_start = RExC_parse - 1; /* MJD */
7739 if (*RExC_parse == '-')
7740 RExC_parse++;
7741 while (isDIGIT(*RExC_parse))
7742 RExC_parse++;
7743 if (*RExC_parse!=')')
7744 vFAIL("Expecting close bracket");
7745
7746 gen_recurse_regop:
7747 if ( paren == '-' ) {
7748 /*
7749 Diagram of capture buffer numbering.
7750 Top line is the normal capture buffer numbers
7751 Bottom line is the negative indexing as from
7752 the X (the (?-2))
7753
7754 + 1 2 3 4 5 X 6 7
7755 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7756 - 5 4 3 2 1 X x x
7757
7758 */
7759 num = RExC_npar + num;
7760 if (num < 1) {
7761 RExC_parse++;
7762 vFAIL("Reference to nonexistent group");
7763 }
7764 } else if ( paren == '+' ) {
7765 num = RExC_npar + num - 1;
7766 }
7767
7768 ret = reganode(pRExC_state, GOSUB, num);
7769 if (!SIZE_ONLY) {
7770 if (num > (I32)RExC_rx->nparens) {
7771 RExC_parse++;
7772 vFAIL("Reference to nonexistent group");
7773 }
7774 ARG2L_SET( ret, RExC_recurse_count++);
7775 RExC_emit++;
7776 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
7777 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
7778 } else {
7779 RExC_size++;
7780 }
7781 RExC_seen |= REG_SEEN_RECURSE;
7782 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
7783 Set_Node_Offset(ret, parse_start); /* MJD */
7784
7785 *flagp |= POSTPONED;
7786 nextchar(pRExC_state);
7787 return ret;
7788 } /* named and numeric backreferences */
7789 /* NOT REACHED */
7790
7791 case '?': /* (??...) */
7792 is_logical = 1;
7793 if (*RExC_parse != '{') {
7794 RExC_parse++;
7795 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7796 /*NOTREACHED*/
7797 }
7798 *flagp |= POSTPONED;
7799 paren = *RExC_parse++;
7800 /* FALL THROUGH */
7801 case '{': /* (?{...}) */
7802 {
7803 I32 count = 1;
7804 U32 n = 0;
7805 char c;
7806 char *s = RExC_parse;
7807
7808 RExC_seen_zerolen++;
7809 RExC_seen |= REG_SEEN_EVAL;
7810 while (count && (c = *RExC_parse)) {
7811 if (c == '\\') {
7812 if (RExC_parse[1])
7813 RExC_parse++;
7814 }
7815 else if (c == '{')
7816 count++;
7817 else if (c == '}')
7818 count--;
7819 RExC_parse++;
7820 }
7821 if (*RExC_parse != ')') {
7822 RExC_parse = s;
7823 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7824 }
7825 if (!SIZE_ONLY) {
7826 PAD *pad;
7827 OP_4tree *sop, *rop;
7828 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
7829
7830 ENTER;
7831 Perl_save_re_context(aTHX);
7832 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
7833 sop->op_private |= OPpREFCOUNTED;
7834 /* re_dup will OpREFCNT_inc */
7835 OpREFCNT_set(sop, 1);
7836 LEAVE;
7837
7838 n = add_data(pRExC_state, 3, "nop");
7839 RExC_rxi->data->data[n] = (void*)rop;
7840 RExC_rxi->data->data[n+1] = (void*)sop;
7841 RExC_rxi->data->data[n+2] = (void*)pad;
7842 SvREFCNT_dec(sv);
7843 }
7844 else { /* First pass */
7845 if (PL_reginterp_cnt < ++RExC_seen_evals
7846 && IN_PERL_RUNTIME)
7847 /* No compiled RE interpolated, has runtime
7848 components ===> unsafe. */
7849 FAIL("Eval-group not allowed at runtime, use re 'eval'");
7850 if (PL_tainting && PL_tainted)
7851 FAIL("Eval-group in insecure regular expression");
7852#if PERL_VERSION > 8
7853 if (IN_PERL_COMPILETIME)
7854 PL_cv_has_eval = 1;
7855#endif
7856 }
7857
7858 nextchar(pRExC_state);
7859 if (is_logical) {
7860 ret = reg_node(pRExC_state, LOGICAL);
7861 if (!SIZE_ONLY)
7862 ret->flags = 2;
7863 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
7864 /* deal with the length of this later - MJD */
7865 return ret;
7866 }
7867 ret = reganode(pRExC_state, EVAL, n);
7868 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7869 Set_Node_Offset(ret, parse_start);
7870 return ret;
7871 }
7872 case '(': /* (?(?{...})...) and (?(?=...)...) */
7873 {
7874 int is_define= 0;
7875 if (RExC_parse[0] == '?') { /* (?(?...)) */
7876 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7877 || RExC_parse[1] == '<'
7878 || RExC_parse[1] == '{') { /* Lookahead or eval. */
7879 I32 flag;
7880
7881 ret = reg_node(pRExC_state, LOGICAL);
7882 if (!SIZE_ONLY)
7883 ret->flags = 1;
7884 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
7885 goto insert_if;
7886 }
7887 }
7888 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
7889 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7890 {
7891 char ch = RExC_parse[0] == '<' ? '>' : '\'';
7892 char *name_start= RExC_parse++;
7893 U32 num = 0;
7894 SV *sv_dat=reg_scan_name(pRExC_state,
7895 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7896 if (RExC_parse == name_start || *RExC_parse != ch)
7897 vFAIL2("Sequence (?(%c... not terminated",
7898 (ch == '>' ? '<' : ch));
7899 RExC_parse++;
7900 if (!SIZE_ONLY) {
7901 num = add_data( pRExC_state, 1, "S" );
7902 RExC_rxi->data->data[num]=(void*)sv_dat;
7903 SvREFCNT_inc_simple_void(sv_dat);
7904 }
7905 ret = reganode(pRExC_state,NGROUPP,num);
7906 goto insert_if_check_paren;
7907 }
7908 else if (RExC_parse[0] == 'D' &&
7909 RExC_parse[1] == 'E' &&
7910 RExC_parse[2] == 'F' &&
7911 RExC_parse[3] == 'I' &&
7912 RExC_parse[4] == 'N' &&
7913 RExC_parse[5] == 'E')
7914 {
7915 ret = reganode(pRExC_state,DEFINEP,0);
7916 RExC_parse +=6 ;
7917 is_define = 1;
7918 goto insert_if_check_paren;
7919 }
7920 else if (RExC_parse[0] == 'R') {
7921 RExC_parse++;
7922 parno = 0;
7923 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7924 parno = atoi(RExC_parse++);
7925 while (isDIGIT(*RExC_parse))
7926 RExC_parse++;
7927 } else if (RExC_parse[0] == '&') {
7928 SV *sv_dat;
7929 RExC_parse++;
7930 sv_dat = reg_scan_name(pRExC_state,
7931 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7932 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7933 }
7934 ret = reganode(pRExC_state,INSUBP,parno);
7935 goto insert_if_check_paren;
7936 }
7937 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7938 /* (?(1)...) */
7939 char c;
7940 parno = atoi(RExC_parse++);
7941
7942 while (isDIGIT(*RExC_parse))
7943 RExC_parse++;
7944 ret = reganode(pRExC_state, GROUPP, parno);
7945
7946 insert_if_check_paren:
7947 if ((c = *nextchar(pRExC_state)) != ')')
7948 vFAIL("Switch condition not recognized");
7949 insert_if:
7950 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7951 br = regbranch(pRExC_state, &flags, 1,depth+1);
7952 if (br == NULL)
7953 br = reganode(pRExC_state, LONGJMP, 0);
7954 else
7955 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
7956 c = *nextchar(pRExC_state);
7957 if (flags&HASWIDTH)
7958 *flagp |= HASWIDTH;
7959 if (c == '|') {
7960 if (is_define)
7961 vFAIL("(?(DEFINE)....) does not allow branches");
7962 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
7963 regbranch(pRExC_state, &flags, 1,depth+1);
7964 REGTAIL(pRExC_state, ret, lastbr);
7965 if (flags&HASWIDTH)
7966 *flagp |= HASWIDTH;
7967 c = *nextchar(pRExC_state);
7968 }
7969 else
7970 lastbr = NULL;
7971 if (c != ')')
7972 vFAIL("Switch (?(condition)... contains too many branches");
7973 ender = reg_node(pRExC_state, TAIL);
7974 REGTAIL(pRExC_state, br, ender);
7975 if (lastbr) {
7976 REGTAIL(pRExC_state, lastbr, ender);
7977 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
7978 }
7979 else
7980 REGTAIL(pRExC_state, ret, ender);
7981 RExC_size++; /* XXX WHY do we need this?!!
7982 For large programs it seems to be required
7983 but I can't figure out why. -- dmq*/
7984 return ret;
7985 }
7986 else {
7987 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
7988 }
7989 }
7990 case 0:
7991 RExC_parse--; /* for vFAIL to print correctly */
7992 vFAIL("Sequence (? incomplete");
7993 break;
7994 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7995 that follow */
7996 has_use_defaults = TRUE;
7997 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
7998 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7999 ? REGEX_UNICODE_CHARSET
8000 : REGEX_DEPENDS_CHARSET);
8001 goto parse_flags;
8002 default:
8003 --RExC_parse;
8004 parse_flags: /* (?i) */
8005 {
8006 U32 posflags = 0, negflags = 0;
8007 U32 *flagsp = &posflags;
8008 char has_charset_modifier = '\0';
8009 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8010 ? REGEX_UNICODE_CHARSET
8011 : REGEX_DEPENDS_CHARSET;
8012
8013 while (*RExC_parse) {
8014 /* && strchr("iogcmsx", *RExC_parse) */
8015 /* (?g), (?gc) and (?o) are useless here
8016 and must be globally applied -- japhy */
8017 switch (*RExC_parse) {
8018 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8019 case LOCALE_PAT_MOD:
8020 if (has_charset_modifier) {
8021 goto excess_modifier;
8022 }
8023 else if (flagsp == &negflags) {
8024 goto neg_modifier;
8025 }
8026 cs = REGEX_LOCALE_CHARSET;
8027 has_charset_modifier = LOCALE_PAT_MOD;
8028 RExC_contains_locale = 1;
8029 break;
8030 case UNICODE_PAT_MOD:
8031 if (has_charset_modifier) {
8032 goto excess_modifier;
8033 }
8034 else if (flagsp == &negflags) {
8035 goto neg_modifier;
8036 }
8037 cs = REGEX_UNICODE_CHARSET;
8038 has_charset_modifier = UNICODE_PAT_MOD;
8039 break;
8040 case ASCII_RESTRICT_PAT_MOD:
8041 if (flagsp == &negflags) {
8042 goto neg_modifier;
8043 }
8044 if (has_charset_modifier) {
8045 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8046 goto excess_modifier;
8047 }
8048 /* Doubled modifier implies more restricted */
8049 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8050 }
8051 else {
8052 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8053 }
8054 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8055 break;
8056 case DEPENDS_PAT_MOD:
8057 if (has_use_defaults) {
8058 goto fail_modifiers;
8059 }
8060 else if (flagsp == &negflags) {
8061 goto neg_modifier;
8062 }
8063 else if (has_charset_modifier) {
8064 goto excess_modifier;
8065 }
8066
8067 /* The dual charset means unicode semantics if the
8068 * pattern (or target, not known until runtime) are
8069 * utf8, or something in the pattern indicates unicode
8070 * semantics */
8071 cs = (RExC_utf8 || RExC_uni_semantics)
8072 ? REGEX_UNICODE_CHARSET
8073 : REGEX_DEPENDS_CHARSET;
8074 has_charset_modifier = DEPENDS_PAT_MOD;
8075 break;
8076 excess_modifier:
8077 RExC_parse++;
8078 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8079 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8080 }
8081 else if (has_charset_modifier == *(RExC_parse - 1)) {
8082 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8083 }
8084 else {
8085 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8086 }
8087 /*NOTREACHED*/
8088 neg_modifier:
8089 RExC_parse++;
8090 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8091 /*NOTREACHED*/
8092 case ONCE_PAT_MOD: /* 'o' */
8093 case GLOBAL_PAT_MOD: /* 'g' */
8094 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8095 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8096 if (! (wastedflags & wflagbit) ) {
8097 wastedflags |= wflagbit;
8098 vWARN5(
8099 RExC_parse + 1,
8100 "Useless (%s%c) - %suse /%c modifier",
8101 flagsp == &negflags ? "?-" : "?",
8102 *RExC_parse,
8103 flagsp == &negflags ? "don't " : "",
8104 *RExC_parse
8105 );
8106 }
8107 }
8108 break;
8109
8110 case CONTINUE_PAT_MOD: /* 'c' */
8111 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8112 if (! (wastedflags & WASTED_C) ) {
8113 wastedflags |= WASTED_GC;
8114 vWARN3(
8115 RExC_parse + 1,
8116 "Useless (%sc) - %suse /gc modifier",
8117 flagsp == &negflags ? "?-" : "?",
8118 flagsp == &negflags ? "don't " : ""
8119 );
8120 }
8121 }
8122 break;
8123 case KEEPCOPY_PAT_MOD: /* 'p' */
8124 if (flagsp == &negflags) {
8125 if (SIZE_ONLY)
8126 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8127 } else {
8128 *flagsp |= RXf_PMf_KEEPCOPY;
8129 }
8130 break;
8131 case '-':
8132 /* A flag is a default iff it is following a minus, so
8133 * if there is a minus, it means will be trying to
8134 * re-specify a default which is an error */
8135 if (has_use_defaults || flagsp == &negflags) {
8136 fail_modifiers:
8137 RExC_parse++;
8138 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8139 /*NOTREACHED*/
8140 }
8141 flagsp = &negflags;
8142 wastedflags = 0; /* reset so (?g-c) warns twice */
8143 break;
8144 case ':':
8145 paren = ':';
8146 /*FALLTHROUGH*/
8147 case ')':
8148 RExC_flags |= posflags;
8149 RExC_flags &= ~negflags;
8150 set_regex_charset(&RExC_flags, cs);
8151 if (paren != ':') {
8152 oregflags |= posflags;
8153 oregflags &= ~negflags;
8154 set_regex_charset(&oregflags, cs);
8155 }
8156 nextchar(pRExC_state);
8157 if (paren != ':') {
8158 *flagp = TRYAGAIN;
8159 return NULL;
8160 } else {
8161 ret = NULL;
8162 goto parse_rest;
8163 }
8164 /*NOTREACHED*/
8165 default:
8166 RExC_parse++;
8167 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8168 /*NOTREACHED*/
8169 }
8170 ++RExC_parse;
8171 }
8172 }} /* one for the default block, one for the switch */
8173 }
8174 else { /* (...) */
8175 capturing_parens:
8176 parno = RExC_npar;
8177 RExC_npar++;
8178
8179 ret = reganode(pRExC_state, OPEN, parno);
8180 if (!SIZE_ONLY ){
8181 if (!RExC_nestroot)
8182 RExC_nestroot = parno;
8183 if (RExC_seen & REG_SEEN_RECURSE
8184 && !RExC_open_parens[parno-1])
8185 {
8186 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8187 "Setting open paren #%"IVdf" to %d\n",
8188 (IV)parno, REG_NODE_NUM(ret)));
8189 RExC_open_parens[parno-1]= ret;
8190 }
8191 }
8192 Set_Node_Length(ret, 1); /* MJD */
8193 Set_Node_Offset(ret, RExC_parse); /* MJD */
8194 is_open = 1;
8195 }
8196 }
8197 else /* ! paren */
8198 ret = NULL;
8199
8200 parse_rest:
8201 /* Pick up the branches, linking them together. */
8202 parse_start = RExC_parse; /* MJD */
8203 br = regbranch(pRExC_state, &flags, 1,depth+1);
8204
8205 /* branch_len = (paren != 0); */
8206
8207 if (br == NULL)
8208 return(NULL);
8209 if (*RExC_parse == '|') {
8210 if (!SIZE_ONLY && RExC_extralen) {
8211 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8212 }
8213 else { /* MJD */
8214 reginsert(pRExC_state, BRANCH, br, depth+1);
8215 Set_Node_Length(br, paren != 0);
8216 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8217 }
8218 have_branch = 1;
8219 if (SIZE_ONLY)
8220 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8221 }
8222 else if (paren == ':') {
8223 *flagp |= flags&SIMPLE;
8224 }
8225 if (is_open) { /* Starts with OPEN. */
8226 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8227 }
8228 else if (paren != '?') /* Not Conditional */
8229 ret = br;
8230 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8231 lastbr = br;
8232 while (*RExC_parse == '|') {
8233 if (!SIZE_ONLY && RExC_extralen) {
8234 ender = reganode(pRExC_state, LONGJMP,0);
8235 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8236 }
8237 if (SIZE_ONLY)
8238 RExC_extralen += 2; /* Account for LONGJMP. */
8239 nextchar(pRExC_state);
8240 if (freeze_paren) {
8241 if (RExC_npar > after_freeze)
8242 after_freeze = RExC_npar;
8243 RExC_npar = freeze_paren;
8244 }
8245 br = regbranch(pRExC_state, &flags, 0, depth+1);
8246
8247 if (br == NULL)
8248 return(NULL);
8249 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8250 lastbr = br;
8251 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8252 }
8253
8254 if (have_branch || paren != ':') {
8255 /* Make a closing node, and hook it on the end. */
8256 switch (paren) {
8257 case ':':
8258 ender = reg_node(pRExC_state, TAIL);
8259 break;
8260 case 1:
8261 ender = reganode(pRExC_state, CLOSE, parno);
8262 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8263 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8264 "Setting close paren #%"IVdf" to %d\n",
8265 (IV)parno, REG_NODE_NUM(ender)));
8266 RExC_close_parens[parno-1]= ender;
8267 if (RExC_nestroot == parno)
8268 RExC_nestroot = 0;
8269 }
8270 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8271 Set_Node_Length(ender,1); /* MJD */
8272 break;
8273 case '<':
8274 case ',':
8275 case '=':
8276 case '!':
8277 *flagp &= ~HASWIDTH;
8278 /* FALL THROUGH */
8279 case '>':
8280 ender = reg_node(pRExC_state, SUCCEED);
8281 break;
8282 case 0:
8283 ender = reg_node(pRExC_state, END);
8284 if (!SIZE_ONLY) {
8285 assert(!RExC_opend); /* there can only be one! */
8286 RExC_opend = ender;
8287 }
8288 break;
8289 }
8290 REGTAIL(pRExC_state, lastbr, ender);
8291
8292 if (have_branch && !SIZE_ONLY) {
8293 if (depth==1)
8294 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8295
8296 /* Hook the tails of the branches to the closing node. */
8297 for (br = ret; br; br = regnext(br)) {
8298 const U8 op = PL_regkind[OP(br)];
8299 if (op == BRANCH) {
8300 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8301 }
8302 else if (op == BRANCHJ) {
8303 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8304 }
8305 }
8306 }
8307 }
8308
8309 {
8310 const char *p;
8311 static const char parens[] = "=!<,>";
8312
8313 if (paren && (p = strchr(parens, paren))) {
8314 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8315 int flag = (p - parens) > 1;
8316
8317 if (paren == '>')
8318 node = SUSPEND, flag = 0;
8319 reginsert(pRExC_state, node,ret, depth+1);
8320 Set_Node_Cur_Length(ret);
8321 Set_Node_Offset(ret, parse_start + 1);
8322 ret->flags = flag;
8323 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8324 }
8325 }
8326
8327 /* Check for proper termination. */
8328 if (paren) {
8329 RExC_flags = oregflags;
8330 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8331 RExC_parse = oregcomp_parse;
8332 vFAIL("Unmatched (");
8333 }
8334 }
8335 else if (!paren && RExC_parse < RExC_end) {
8336 if (*RExC_parse == ')') {
8337 RExC_parse++;
8338 vFAIL("Unmatched )");
8339 }
8340 else
8341 FAIL("Junk on end of regexp"); /* "Can't happen". */
8342 /* NOTREACHED */
8343 }
8344
8345 if (RExC_in_lookbehind) {
8346 RExC_in_lookbehind--;
8347 }
8348 if (after_freeze > RExC_npar)
8349 RExC_npar = after_freeze;
8350 return(ret);
8351}
8352
8353/*
8354 - regbranch - one alternative of an | operator
8355 *
8356 * Implements the concatenation operator.
8357 */
8358STATIC regnode *
8359S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8360{
8361 dVAR;
8362 register regnode *ret;
8363 register regnode *chain = NULL;
8364 register regnode *latest;
8365 I32 flags = 0, c = 0;
8366 GET_RE_DEBUG_FLAGS_DECL;
8367
8368 PERL_ARGS_ASSERT_REGBRANCH;
8369
8370 DEBUG_PARSE("brnc");
8371
8372 if (first)
8373 ret = NULL;
8374 else {
8375 if (!SIZE_ONLY && RExC_extralen)
8376 ret = reganode(pRExC_state, BRANCHJ,0);
8377 else {
8378 ret = reg_node(pRExC_state, BRANCH);
8379 Set_Node_Length(ret, 1);
8380 }
8381 }
8382
8383 if (!first && SIZE_ONLY)
8384 RExC_extralen += 1; /* BRANCHJ */
8385
8386 *flagp = WORST; /* Tentatively. */
8387
8388 RExC_parse--;
8389 nextchar(pRExC_state);
8390 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8391 flags &= ~TRYAGAIN;
8392 latest = regpiece(pRExC_state, &flags,depth+1);
8393 if (latest == NULL) {
8394 if (flags & TRYAGAIN)
8395 continue;
8396 return(NULL);
8397 }
8398 else if (ret == NULL)
8399 ret = latest;
8400 *flagp |= flags&(HASWIDTH|POSTPONED);
8401 if (chain == NULL) /* First piece. */
8402 *flagp |= flags&SPSTART;
8403 else {
8404 RExC_naughty++;
8405 REGTAIL(pRExC_state, chain, latest);
8406 }
8407 chain = latest;
8408 c++;
8409 }
8410 if (chain == NULL) { /* Loop ran zero times. */
8411 chain = reg_node(pRExC_state, NOTHING);
8412 if (ret == NULL)
8413 ret = chain;
8414 }
8415 if (c == 1) {
8416 *flagp |= flags&SIMPLE;
8417 }
8418
8419 return ret;
8420}
8421
8422/*
8423 - regpiece - something followed by possible [*+?]
8424 *
8425 * Note that the branching code sequences used for ? and the general cases
8426 * of * and + are somewhat optimized: they use the same NOTHING node as
8427 * both the endmarker for their branch list and the body of the last branch.
8428 * It might seem that this node could be dispensed with entirely, but the
8429 * endmarker role is not redundant.
8430 */
8431STATIC regnode *
8432S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8433{
8434 dVAR;
8435 register regnode *ret;
8436 register char op;
8437 register char *next;
8438 I32 flags;
8439 const char * const origparse = RExC_parse;
8440 I32 min;
8441 I32 max = REG_INFTY;
8442#ifdef RE_TRACK_PATTERN_OFFSETS
8443 char *parse_start;
8444#endif
8445 const char *maxpos = NULL;
8446 GET_RE_DEBUG_FLAGS_DECL;
8447
8448 PERL_ARGS_ASSERT_REGPIECE;
8449
8450 DEBUG_PARSE("piec");
8451
8452 ret = regatom(pRExC_state, &flags,depth+1);
8453 if (ret == NULL) {
8454 if (flags & TRYAGAIN)
8455 *flagp |= TRYAGAIN;
8456 return(NULL);
8457 }
8458
8459 op = *RExC_parse;
8460
8461 if (op == '{' && regcurly(RExC_parse)) {
8462 maxpos = NULL;
8463#ifdef RE_TRACK_PATTERN_OFFSETS
8464 parse_start = RExC_parse; /* MJD */
8465#endif
8466 next = RExC_parse + 1;
8467 while (isDIGIT(*next) || *next == ',') {
8468 if (*next == ',') {
8469 if (maxpos)
8470 break;
8471 else
8472 maxpos = next;
8473 }
8474 next++;
8475 }
8476 if (*next == '}') { /* got one */
8477 if (!maxpos)
8478 maxpos = next;
8479 RExC_parse++;
8480 min = atoi(RExC_parse);
8481 if (*maxpos == ',')
8482 maxpos++;
8483 else
8484 maxpos = RExC_parse;
8485 max = atoi(maxpos);
8486 if (!max && *maxpos != '0')
8487 max = REG_INFTY; /* meaning "infinity" */
8488 else if (max >= REG_INFTY)
8489 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
8490 RExC_parse = next;
8491 nextchar(pRExC_state);
8492
8493 do_curly:
8494 if ((flags&SIMPLE)) {
8495 RExC_naughty += 2 + RExC_naughty / 2;
8496 reginsert(pRExC_state, CURLY, ret, depth+1);
8497 Set_Node_Offset(ret, parse_start+1); /* MJD */
8498 Set_Node_Cur_Length(ret);
8499 }
8500 else {
8501 regnode * const w = reg_node(pRExC_state, WHILEM);
8502
8503 w->flags = 0;
8504 REGTAIL(pRExC_state, ret, w);
8505 if (!SIZE_ONLY && RExC_extralen) {
8506 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8507 reginsert(pRExC_state, NOTHING,ret, depth+1);
8508 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8509 }
8510 reginsert(pRExC_state, CURLYX,ret, depth+1);
8511 /* MJD hk */
8512 Set_Node_Offset(ret, parse_start+1);
8513 Set_Node_Length(ret,
8514 op == '{' ? (RExC_parse - parse_start) : 1);
8515
8516 if (!SIZE_ONLY && RExC_extralen)
8517 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
8518 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
8519 if (SIZE_ONLY)
8520 RExC_whilem_seen++, RExC_extralen += 3;
8521 RExC_naughty += 4 + RExC_naughty; /* compound interest */
8522 }
8523 ret->flags = 0;
8524
8525 if (min > 0)
8526 *flagp = WORST;
8527 if (max > 0)
8528 *flagp |= HASWIDTH;
8529 if (max < min)
8530 vFAIL("Can't do {n,m} with n > m");
8531 if (!SIZE_ONLY) {
8532 ARG1_SET(ret, (U16)min);
8533 ARG2_SET(ret, (U16)max);
8534 }
8535
8536 goto nest_check;
8537 }
8538 }
8539
8540 if (!ISMULT1(op)) {
8541 *flagp = flags;
8542 return(ret);
8543 }
8544
8545#if 0 /* Now runtime fix should be reliable. */
8546
8547 /* if this is reinstated, don't forget to put this back into perldiag:
8548
8549 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8550
8551 (F) The part of the regexp subject to either the * or + quantifier
8552 could match an empty string. The {#} shows in the regular
8553 expression about where the problem was discovered.
8554
8555 */
8556
8557 if (!(flags&HASWIDTH) && op != '?')
8558 vFAIL("Regexp *+ operand could be empty");
8559#endif
8560
8561#ifdef RE_TRACK_PATTERN_OFFSETS
8562 parse_start = RExC_parse;
8563#endif
8564 nextchar(pRExC_state);
8565
8566 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
8567
8568 if (op == '*' && (flags&SIMPLE)) {
8569 reginsert(pRExC_state, STAR, ret, depth+1);
8570 ret->flags = 0;
8571 RExC_naughty += 4;
8572 }
8573 else if (op == '*') {
8574 min = 0;
8575 goto do_curly;
8576 }
8577 else if (op == '+' && (flags&SIMPLE)) {
8578 reginsert(pRExC_state, PLUS, ret, depth+1);
8579 ret->flags = 0;
8580 RExC_naughty += 3;
8581 }
8582 else if (op == '+') {
8583 min = 1;
8584 goto do_curly;
8585 }
8586 else if (op == '?') {
8587 min = 0; max = 1;
8588 goto do_curly;
8589 }
8590 nest_check:
8591 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8592 ckWARN3reg(RExC_parse,
8593 "%.*s matches null string many times",
8594 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8595 origparse);
8596 }
8597
8598 if (RExC_parse < RExC_end && *RExC_parse == '?') {
8599 nextchar(pRExC_state);
8600 reginsert(pRExC_state, MINMOD, ret, depth+1);
8601 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
8602 }
8603#ifndef REG_ALLOW_MINMOD_SUSPEND
8604 else
8605#endif
8606 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8607 regnode *ender;
8608 nextchar(pRExC_state);
8609 ender = reg_node(pRExC_state, SUCCEED);
8610 REGTAIL(pRExC_state, ret, ender);
8611 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8612 ret->flags = 0;
8613 ender = reg_node(pRExC_state, TAIL);
8614 REGTAIL(pRExC_state, ret, ender);
8615 /*ret= ender;*/
8616 }
8617
8618 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
8619 RExC_parse++;
8620 vFAIL("Nested quantifiers");
8621 }
8622
8623 return(ret);
8624}
8625
8626
8627/* reg_namedseq(pRExC_state,UVp, UV depth)
8628
8629 This is expected to be called by a parser routine that has
8630 recognized '\N' and needs to handle the rest. RExC_parse is
8631 expected to point at the first char following the N at the time
8632 of the call.
8633
8634 The \N may be inside (indicated by valuep not being NULL) or outside a
8635 character class.
8636
8637 \N may begin either a named sequence, or if outside a character class, mean
8638 to match a non-newline. For non single-quoted regexes, the tokenizer has
8639 attempted to decide which, and in the case of a named sequence converted it
8640 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8641 where c1... are the characters in the sequence. For single-quoted regexes,
8642 the tokenizer passes the \N sequence through unchanged; this code will not
8643 attempt to determine this nor expand those. The net effect is that if the
8644 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8645 signals that this \N occurrence means to match a non-newline.
8646
8647 Only the \N{U+...} form should occur in a character class, for the same
8648 reason that '.' inside a character class means to just match a period: it
8649 just doesn't make sense.
8650
8651 If valuep is non-null then it is assumed that we are parsing inside
8652 of a charclass definition and the first codepoint in the resolved
8653 string is returned via *valuep and the routine will return NULL.
8654 In this mode if a multichar string is returned from the charnames
8655 handler, a warning will be issued, and only the first char in the
8656 sequence will be examined. If the string returned is zero length
8657 then the value of *valuep is undefined and NON-NULL will
8658 be returned to indicate failure. (This will NOT be a valid pointer
8659 to a regnode.)
8660
8661 If valuep is null then it is assumed that we are parsing normal text and a
8662 new EXACT node is inserted into the program containing the resolved string,
8663 and a pointer to the new node is returned. But if the string is zero length
8664 a NOTHING node is emitted instead.
8665
8666 On success RExC_parse is set to the char following the endbrace.
8667 Parsing failures will generate a fatal error via vFAIL(...)
8668 */
8669STATIC regnode *
8670S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
8671{
8672 char * endbrace; /* '}' following the name */
8673 regnode *ret = NULL;
8674 char* p;
8675
8676 GET_RE_DEBUG_FLAGS_DECL;
8677
8678 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
8679
8680 GET_RE_DEBUG_FLAGS;
8681
8682 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8683 * modifier. The other meaning does not */
8684 p = (RExC_flags & RXf_PMf_EXTENDED)
8685 ? regwhite( pRExC_state, RExC_parse )
8686 : RExC_parse;
8687
8688 /* Disambiguate between \N meaning a named character versus \N meaning
8689 * [^\n]. The former is assumed when it can't be the latter. */
8690 if (*p != '{' || regcurly(p)) {
8691 RExC_parse = p;
8692 if (valuep) {
8693 /* no bare \N in a charclass */
8694 vFAIL("\\N in a character class must be a named character: \\N{...}");
8695 }
8696 nextchar(pRExC_state);
8697 ret = reg_node(pRExC_state, REG_ANY);
8698 *flagp |= HASWIDTH|SIMPLE;
8699 RExC_naughty++;
8700 RExC_parse--;
8701 Set_Node_Length(ret, 1); /* MJD */
8702 return ret;
8703 }
8704
8705 /* Here, we have decided it should be a named sequence */
8706
8707 /* The test above made sure that the next real character is a '{', but
8708 * under the /x modifier, it could be separated by space (or a comment and
8709 * \n) and this is not allowed (for consistency with \x{...} and the
8710 * tokenizer handling of \N{NAME}). */
8711 if (*RExC_parse != '{') {
8712 vFAIL("Missing braces on \\N{}");
8713 }
8714
8715 RExC_parse++; /* Skip past the '{' */
8716
8717 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8718 || ! (endbrace == RExC_parse /* nothing between the {} */
8719 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8720 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8721 {
8722 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8723 vFAIL("\\N{NAME} must be resolved by the lexer");
8724 }
8725
8726 if (endbrace == RExC_parse) { /* empty: \N{} */
8727 if (! valuep) {
8728 RExC_parse = endbrace + 1;
8729 return reg_node(pRExC_state,NOTHING);
8730 }
8731
8732 if (SIZE_ONLY) {
8733 ckWARNreg(RExC_parse,
8734 "Ignoring zero length \\N{} in character class"
8735 );
8736 RExC_parse = endbrace + 1;
8737 }
8738 *valuep = 0;
8739 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
8740 }
8741
8742 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
8743 RExC_parse += 2; /* Skip past the 'U+' */
8744
8745 if (valuep) { /* In a bracketed char class */
8746 /* We only pay attention to the first char of
8747 multichar strings being returned. I kinda wonder
8748 if this makes sense as it does change the behaviour
8749 from earlier versions, OTOH that behaviour was broken
8750 as well. XXX Solution is to recharacterize as
8751 [rest-of-class]|multi1|multi2... */
8752
8753 STRLEN length_of_hex;
8754 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8755 | PERL_SCAN_DISALLOW_PREFIX
8756 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8757
8758 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8759 if (endchar < endbrace) {
8760 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8761 }
8762
8763 length_of_hex = (STRLEN)(endchar - RExC_parse);
8764 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8765
8766 /* The tokenizer should have guaranteed validity, but it's possible to
8767 * bypass it by using single quoting, so check */
8768 if (length_of_hex == 0
8769 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8770 {
8771 RExC_parse += length_of_hex; /* Includes all the valid */
8772 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8773 ? UTF8SKIP(RExC_parse)
8774 : 1;
8775 /* Guard against malformed utf8 */
8776 if (RExC_parse >= endchar) RExC_parse = endchar;
8777 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8778 }
8779
8780 RExC_parse = endbrace + 1;
8781 if (endchar == endbrace) return NULL;
8782
8783 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
8784 }
8785 else { /* Not a char class */
8786
8787 /* What is done here is to convert this to a sub-pattern of the form
8788 * (?:\x{char1}\x{char2}...)
8789 * and then call reg recursively. That way, it retains its atomicness,
8790 * while not having to worry about special handling that some code
8791 * points may have. toke.c has converted the original Unicode values
8792 * to native, so that we can just pass on the hex values unchanged. We
8793 * do have to set a flag to keep recoding from happening in the
8794 * recursion */
8795
8796 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8797 STRLEN len;
8798 char *endchar; /* Points to '.' or '}' ending cur char in the input
8799 stream */
8800 char *orig_end = RExC_end;
8801
8802 while (RExC_parse < endbrace) {
8803
8804 /* Code points are separated by dots. If none, there is only one
8805 * code point, and is terminated by the brace */
8806 endchar = RExC_parse + strcspn(RExC_parse, ".}");
8807
8808 /* Convert to notation the rest of the code understands */
8809 sv_catpv(substitute_parse, "\\x{");
8810 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8811 sv_catpv(substitute_parse, "}");
8812
8813 /* Point to the beginning of the next character in the sequence. */
8814 RExC_parse = endchar + 1;
8815 }
8816 sv_catpv(substitute_parse, ")");
8817
8818 RExC_parse = SvPV(substitute_parse, len);
8819
8820 /* Don't allow empty number */
8821 if (len < 8) {
8822 vFAIL("Invalid hexadecimal number in \\N{U+...}");
8823 }
8824 RExC_end = RExC_parse + len;
8825
8826 /* The values are Unicode, and therefore not subject to recoding */
8827 RExC_override_recoding = 1;
8828
8829 ret = reg(pRExC_state, 1, flagp, depth+1);
8830
8831 RExC_parse = endbrace;
8832 RExC_end = orig_end;
8833 RExC_override_recoding = 0;
8834
8835 nextchar(pRExC_state);
8836 }
8837
8838 return ret;
8839}
8840
8841
8842/*
8843 * reg_recode
8844 *
8845 * It returns the code point in utf8 for the value in *encp.
8846 * value: a code value in the source encoding
8847 * encp: a pointer to an Encode object
8848 *
8849 * If the result from Encode is not a single character,
8850 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8851 */
8852STATIC UV
8853S_reg_recode(pTHX_ const char value, SV **encp)
8854{
8855 STRLEN numlen = 1;
8856 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
8857 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
8858 const STRLEN newlen = SvCUR(sv);
8859 UV uv = UNICODE_REPLACEMENT;
8860
8861 PERL_ARGS_ASSERT_REG_RECODE;
8862
8863 if (newlen)
8864 uv = SvUTF8(sv)
8865 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8866 : *(U8*)s;
8867
8868 if (!newlen || numlen != newlen) {
8869 uv = UNICODE_REPLACEMENT;
8870 *encp = NULL;
8871 }
8872 return uv;
8873}
8874
8875
8876/*
8877 - regatom - the lowest level
8878
8879 Try to identify anything special at the start of the pattern. If there
8880 is, then handle it as required. This may involve generating a single regop,
8881 such as for an assertion; or it may involve recursing, such as to
8882 handle a () structure.
8883
8884 If the string doesn't start with something special then we gobble up
8885 as much literal text as we can.
8886
8887 Once we have been able to handle whatever type of thing started the
8888 sequence, we return.
8889
8890 Note: we have to be careful with escapes, as they can be both literal
8891 and special, and in the case of \10 and friends can either, depending
8892 on context. Specifically there are two separate switches for handling
8893 escape sequences, with the one for handling literal escapes requiring
8894 a dummy entry for all of the special escapes that are actually handled
8895 by the other.
8896*/
8897
8898STATIC regnode *
8899S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
8900{
8901 dVAR;
8902 register regnode *ret = NULL;
8903 I32 flags;
8904 char *parse_start = RExC_parse;
8905 U8 op;
8906 GET_RE_DEBUG_FLAGS_DECL;
8907 DEBUG_PARSE("atom");
8908 *flagp = WORST; /* Tentatively. */
8909
8910 PERL_ARGS_ASSERT_REGATOM;
8911
8912tryagain:
8913 switch ((U8)*RExC_parse) {
8914 case '^':
8915 RExC_seen_zerolen++;
8916 nextchar(pRExC_state);
8917 if (RExC_flags & RXf_PMf_MULTILINE)
8918 ret = reg_node(pRExC_state, MBOL);
8919 else if (RExC_flags & RXf_PMf_SINGLELINE)
8920 ret = reg_node(pRExC_state, SBOL);
8921 else
8922 ret = reg_node(pRExC_state, BOL);
8923 Set_Node_Length(ret, 1); /* MJD */
8924 break;
8925 case '$':
8926 nextchar(pRExC_state);
8927 if (*RExC_parse)
8928 RExC_seen_zerolen++;
8929 if (RExC_flags & RXf_PMf_MULTILINE)
8930 ret = reg_node(pRExC_state, MEOL);
8931 else if (RExC_flags & RXf_PMf_SINGLELINE)
8932 ret = reg_node(pRExC_state, SEOL);
8933 else
8934 ret = reg_node(pRExC_state, EOL);
8935 Set_Node_Length(ret, 1); /* MJD */
8936 break;
8937 case '.':
8938 nextchar(pRExC_state);
8939 if (RExC_flags & RXf_PMf_SINGLELINE)
8940 ret = reg_node(pRExC_state, SANY);
8941 else
8942 ret = reg_node(pRExC_state, REG_ANY);
8943 *flagp |= HASWIDTH|SIMPLE;
8944 RExC_naughty++;
8945 Set_Node_Length(ret, 1); /* MJD */
8946 break;
8947 case '[':
8948 {
8949 char * const oregcomp_parse = ++RExC_parse;
8950 ret = regclass(pRExC_state,depth+1);
8951 if (*RExC_parse != ']') {
8952 RExC_parse = oregcomp_parse;
8953 vFAIL("Unmatched [");
8954 }
8955 nextchar(pRExC_state);
8956 *flagp |= HASWIDTH|SIMPLE;
8957 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
8958 break;
8959 }
8960 case '(':
8961 nextchar(pRExC_state);
8962 ret = reg(pRExC_state, 1, &flags,depth+1);
8963 if (ret == NULL) {
8964 if (flags & TRYAGAIN) {
8965 if (RExC_parse == RExC_end) {
8966 /* Make parent create an empty node if needed. */
8967 *flagp |= TRYAGAIN;
8968 return(NULL);
8969 }
8970 goto tryagain;
8971 }
8972 return(NULL);
8973 }
8974 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
8975 break;
8976 case '|':
8977 case ')':
8978 if (flags & TRYAGAIN) {
8979 *flagp |= TRYAGAIN;
8980 return NULL;
8981 }
8982 vFAIL("Internal urp");
8983 /* Supposed to be caught earlier. */
8984 break;
8985 case '{':
8986 if (!regcurly(RExC_parse)) {
8987 RExC_parse++;
8988 goto defchar;
8989 }
8990 /* FALL THROUGH */
8991 case '?':
8992 case '+':
8993 case '*':
8994 RExC_parse++;
8995 vFAIL("Quantifier follows nothing");
8996 break;
8997 case '\\':
8998 /* Special Escapes
8999
9000 This switch handles escape sequences that resolve to some kind
9001 of special regop and not to literal text. Escape sequnces that
9002 resolve to literal text are handled below in the switch marked
9003 "Literal Escapes".
9004
9005 Every entry in this switch *must* have a corresponding entry
9006 in the literal escape switch. However, the opposite is not
9007 required, as the default for this switch is to jump to the
9008 literal text handling code.
9009 */
9010 switch ((U8)*++RExC_parse) {
9011 /* Special Escapes */
9012 case 'A':
9013 RExC_seen_zerolen++;
9014 ret = reg_node(pRExC_state, SBOL);
9015 *flagp |= SIMPLE;
9016 goto finish_meta_pat;
9017 case 'G':
9018 ret = reg_node(pRExC_state, GPOS);
9019 RExC_seen |= REG_SEEN_GPOS;
9020 *flagp |= SIMPLE;
9021 goto finish_meta_pat;
9022 case 'K':
9023 RExC_seen_zerolen++;
9024 ret = reg_node(pRExC_state, KEEPS);
9025 *flagp |= SIMPLE;
9026 /* XXX:dmq : disabling in-place substitution seems to
9027 * be necessary here to avoid cases of memory corruption, as
9028 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9029 */
9030 RExC_seen |= REG_SEEN_LOOKBEHIND;
9031 goto finish_meta_pat;
9032 case 'Z':
9033 ret = reg_node(pRExC_state, SEOL);
9034 *flagp |= SIMPLE;
9035 RExC_seen_zerolen++; /* Do not optimize RE away */
9036 goto finish_meta_pat;
9037 case 'z':
9038 ret = reg_node(pRExC_state, EOS);
9039 *flagp |= SIMPLE;
9040 RExC_seen_zerolen++; /* Do not optimize RE away */
9041 goto finish_meta_pat;
9042 case 'C':
9043 ret = reg_node(pRExC_state, CANY);
9044 RExC_seen |= REG_SEEN_CANY;
9045 *flagp |= HASWIDTH|SIMPLE;
9046 goto finish_meta_pat;
9047 case 'X':
9048 ret = reg_node(pRExC_state, CLUMP);
9049 *flagp |= HASWIDTH;
9050 goto finish_meta_pat;
9051 case 'w':
9052 switch (get_regex_charset(RExC_flags)) {
9053 case REGEX_LOCALE_CHARSET:
9054 op = ALNUML;
9055 break;
9056 case REGEX_UNICODE_CHARSET:
9057 op = ALNUMU;
9058 break;
9059 case REGEX_ASCII_RESTRICTED_CHARSET:
9060 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9061 op = ALNUMA;
9062 break;
9063 case REGEX_DEPENDS_CHARSET:
9064 op = ALNUM;
9065 break;
9066 default:
9067 goto bad_charset;
9068 }
9069 ret = reg_node(pRExC_state, op);
9070 *flagp |= HASWIDTH|SIMPLE;
9071 goto finish_meta_pat;
9072 case 'W':
9073 switch (get_regex_charset(RExC_flags)) {
9074 case REGEX_LOCALE_CHARSET:
9075 op = NALNUML;
9076 break;
9077 case REGEX_UNICODE_CHARSET:
9078 op = NALNUMU;
9079 break;
9080 case REGEX_ASCII_RESTRICTED_CHARSET:
9081 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9082 op = NALNUMA;
9083 break;
9084 case REGEX_DEPENDS_CHARSET:
9085 op = NALNUM;
9086 break;
9087 default:
9088 goto bad_charset;
9089 }
9090 ret = reg_node(pRExC_state, op);
9091 *flagp |= HASWIDTH|SIMPLE;
9092 goto finish_meta_pat;
9093 case 'b':
9094 RExC_seen_zerolen++;
9095 RExC_seen |= REG_SEEN_LOOKBEHIND;
9096 switch (get_regex_charset(RExC_flags)) {
9097 case REGEX_LOCALE_CHARSET:
9098 op = BOUNDL;
9099 break;
9100 case REGEX_UNICODE_CHARSET:
9101 op = BOUNDU;
9102 break;
9103 case REGEX_ASCII_RESTRICTED_CHARSET:
9104 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9105 op = BOUNDA;
9106 break;
9107 case REGEX_DEPENDS_CHARSET:
9108 op = BOUND;
9109 break;
9110 default:
9111 goto bad_charset;
9112 }
9113 ret = reg_node(pRExC_state, op);
9114 FLAGS(ret) = get_regex_charset(RExC_flags);
9115 *flagp |= SIMPLE;
9116 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9117 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9118 }
9119 goto finish_meta_pat;
9120 case 'B':
9121 RExC_seen_zerolen++;
9122 RExC_seen |= REG_SEEN_LOOKBEHIND;
9123 switch (get_regex_charset(RExC_flags)) {
9124 case REGEX_LOCALE_CHARSET:
9125 op = NBOUNDL;
9126 break;
9127 case REGEX_UNICODE_CHARSET:
9128 op = NBOUNDU;
9129 break;
9130 case REGEX_ASCII_RESTRICTED_CHARSET:
9131 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9132 op = NBOUNDA;
9133 break;
9134 case REGEX_DEPENDS_CHARSET:
9135 op = NBOUND;
9136 break;
9137 default:
9138 goto bad_charset;
9139 }
9140 ret = reg_node(pRExC_state, op);
9141 FLAGS(ret) = get_regex_charset(RExC_flags);
9142 *flagp |= SIMPLE;
9143 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9144 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9145 }
9146 goto finish_meta_pat;
9147 case 's':
9148 switch (get_regex_charset(RExC_flags)) {
9149 case REGEX_LOCALE_CHARSET:
9150 op = SPACEL;
9151 break;
9152 case REGEX_UNICODE_CHARSET:
9153 op = SPACEU;
9154 break;
9155 case REGEX_ASCII_RESTRICTED_CHARSET:
9156 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9157 op = SPACEA;
9158 break;
9159 case REGEX_DEPENDS_CHARSET:
9160 op = SPACE;
9161 break;
9162 default:
9163 goto bad_charset;
9164 }
9165 ret = reg_node(pRExC_state, op);
9166 *flagp |= HASWIDTH|SIMPLE;
9167 goto finish_meta_pat;
9168 case 'S':
9169 switch (get_regex_charset(RExC_flags)) {
9170 case REGEX_LOCALE_CHARSET:
9171 op = NSPACEL;
9172 break;
9173 case REGEX_UNICODE_CHARSET:
9174 op = NSPACEU;
9175 break;
9176 case REGEX_ASCII_RESTRICTED_CHARSET:
9177 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9178 op = NSPACEA;
9179 break;
9180 case REGEX_DEPENDS_CHARSET:
9181 op = NSPACE;
9182 break;
9183 default:
9184 goto bad_charset;
9185 }
9186 ret = reg_node(pRExC_state, op);
9187 *flagp |= HASWIDTH|SIMPLE;
9188 goto finish_meta_pat;
9189 case 'd':
9190 switch (get_regex_charset(RExC_flags)) {
9191 case REGEX_LOCALE_CHARSET:
9192 op = DIGITL;
9193 break;
9194 case REGEX_ASCII_RESTRICTED_CHARSET:
9195 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9196 op = DIGITA;
9197 break;
9198 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9199 case REGEX_UNICODE_CHARSET:
9200 op = DIGIT;
9201 break;
9202 default:
9203 goto bad_charset;
9204 }
9205 ret = reg_node(pRExC_state, op);
9206 *flagp |= HASWIDTH|SIMPLE;
9207 goto finish_meta_pat;
9208 case 'D':
9209 switch (get_regex_charset(RExC_flags)) {
9210 case REGEX_LOCALE_CHARSET:
9211 op = NDIGITL;
9212 break;
9213 case REGEX_ASCII_RESTRICTED_CHARSET:
9214 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9215 op = NDIGITA;
9216 break;
9217 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9218 case REGEX_UNICODE_CHARSET:
9219 op = NDIGIT;
9220 break;
9221 default:
9222 goto bad_charset;
9223 }
9224 ret = reg_node(pRExC_state, op);
9225 *flagp |= HASWIDTH|SIMPLE;
9226 goto finish_meta_pat;
9227 case 'R':
9228 ret = reg_node(pRExC_state, LNBREAK);
9229 *flagp |= HASWIDTH|SIMPLE;
9230 goto finish_meta_pat;
9231 case 'h':
9232 ret = reg_node(pRExC_state, HORIZWS);
9233 *flagp |= HASWIDTH|SIMPLE;
9234 goto finish_meta_pat;
9235 case 'H':
9236 ret = reg_node(pRExC_state, NHORIZWS);
9237 *flagp |= HASWIDTH|SIMPLE;
9238 goto finish_meta_pat;
9239 case 'v':
9240 ret = reg_node(pRExC_state, VERTWS);
9241 *flagp |= HASWIDTH|SIMPLE;
9242 goto finish_meta_pat;
9243 case 'V':
9244 ret = reg_node(pRExC_state, NVERTWS);
9245 *flagp |= HASWIDTH|SIMPLE;
9246 finish_meta_pat:
9247 nextchar(pRExC_state);
9248 Set_Node_Length(ret, 2); /* MJD */
9249 break;
9250 case 'p':
9251 case 'P':
9252 {
9253 char* const oldregxend = RExC_end;
9254#ifdef DEBUGGING
9255 char* parse_start = RExC_parse - 2;
9256#endif
9257
9258 if (RExC_parse[1] == '{') {
9259 /* a lovely hack--pretend we saw [\pX] instead */
9260 RExC_end = strchr(RExC_parse, '}');
9261 if (!RExC_end) {
9262 const U8 c = (U8)*RExC_parse;
9263 RExC_parse += 2;
9264 RExC_end = oldregxend;
9265 vFAIL2("Missing right brace on \\%c{}", c);
9266 }
9267 RExC_end++;
9268 }
9269 else {
9270 RExC_end = RExC_parse + 2;
9271 if (RExC_end > oldregxend)
9272 RExC_end = oldregxend;
9273 }
9274 RExC_parse--;
9275
9276 ret = regclass(pRExC_state,depth+1);
9277
9278 RExC_end = oldregxend;
9279 RExC_parse--;
9280
9281 Set_Node_Offset(ret, parse_start + 2);
9282 Set_Node_Cur_Length(ret);
9283 nextchar(pRExC_state);
9284 *flagp |= HASWIDTH|SIMPLE;
9285 }
9286 break;
9287 case 'N':
9288 /* Handle \N and \N{NAME} here and not below because it can be
9289 multicharacter. join_exact() will join them up later on.
9290 Also this makes sure that things like /\N{BLAH}+/ and
9291 \N{BLAH} being multi char Just Happen. dmq*/
9292 ++RExC_parse;
9293 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9294 break;
9295 case 'k': /* Handle \k<NAME> and \k'NAME' */
9296 parse_named_seq:
9297 {
9298 char ch= RExC_parse[1];
9299 if (ch != '<' && ch != '\'' && ch != '{') {
9300 RExC_parse++;
9301 vFAIL2("Sequence %.2s... not terminated",parse_start);
9302 } else {
9303 /* this pretty much dupes the code for (?P=...) in reg(), if
9304 you change this make sure you change that */
9305 char* name_start = (RExC_parse += 2);
9306 U32 num = 0;
9307 SV *sv_dat = reg_scan_name(pRExC_state,
9308 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9309 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9310 if (RExC_parse == name_start || *RExC_parse != ch)
9311 vFAIL2("Sequence %.3s... not terminated",parse_start);
9312
9313 if (!SIZE_ONLY) {
9314 num = add_data( pRExC_state, 1, "S" );
9315 RExC_rxi->data->data[num]=(void*)sv_dat;
9316 SvREFCNT_inc_simple_void(sv_dat);
9317 }
9318
9319 RExC_sawback = 1;
9320 ret = reganode(pRExC_state,
9321 ((! FOLD)
9322 ? NREF
9323 : (MORE_ASCII_RESTRICTED)
9324 ? NREFFA
9325 : (AT_LEAST_UNI_SEMANTICS)
9326 ? NREFFU
9327 : (LOC)
9328 ? NREFFL
9329 : NREFF),
9330 num);
9331 *flagp |= HASWIDTH;
9332
9333 /* override incorrect value set in reganode MJD */
9334 Set_Node_Offset(ret, parse_start+1);
9335 Set_Node_Cur_Length(ret); /* MJD */
9336 nextchar(pRExC_state);
9337
9338 }
9339 break;
9340 }
9341 case 'g':
9342 case '1': case '2': case '3': case '4':
9343 case '5': case '6': case '7': case '8': case '9':
9344 {
9345 I32 num;
9346 bool isg = *RExC_parse == 'g';
9347 bool isrel = 0;
9348 bool hasbrace = 0;
9349 if (isg) {
9350 RExC_parse++;
9351 if (*RExC_parse == '{') {
9352 RExC_parse++;
9353 hasbrace = 1;
9354 }
9355 if (*RExC_parse == '-') {
9356 RExC_parse++;
9357 isrel = 1;
9358 }
9359 if (hasbrace && !isDIGIT(*RExC_parse)) {
9360 if (isrel) RExC_parse--;
9361 RExC_parse -= 2;
9362 goto parse_named_seq;
9363 } }
9364 num = atoi(RExC_parse);
9365 if (isg && num == 0)
9366 vFAIL("Reference to invalid group 0");
9367 if (isrel) {
9368 num = RExC_npar - num;
9369 if (num < 1)
9370 vFAIL("Reference to nonexistent or unclosed group");
9371 }
9372 if (!isg && num > 9 && num >= RExC_npar)
9373 goto defchar;
9374 else {
9375 char * const parse_start = RExC_parse - 1; /* MJD */
9376 while (isDIGIT(*RExC_parse))
9377 RExC_parse++;
9378 if (parse_start == RExC_parse - 1)
9379 vFAIL("Unterminated \\g... pattern");
9380 if (hasbrace) {
9381 if (*RExC_parse != '}')
9382 vFAIL("Unterminated \\g{...} pattern");
9383 RExC_parse++;
9384 }
9385 if (!SIZE_ONLY) {
9386 if (num > (I32)RExC_rx->nparens)
9387 vFAIL("Reference to nonexistent group");
9388 }
9389 RExC_sawback = 1;
9390 ret = reganode(pRExC_state,
9391 ((! FOLD)
9392 ? REF
9393 : (MORE_ASCII_RESTRICTED)
9394 ? REFFA
9395 : (AT_LEAST_UNI_SEMANTICS)
9396 ? REFFU
9397 : (LOC)
9398 ? REFFL
9399 : REFF),
9400 num);
9401 *flagp |= HASWIDTH;
9402
9403 /* override incorrect value set in reganode MJD */
9404 Set_Node_Offset(ret, parse_start+1);
9405 Set_Node_Cur_Length(ret); /* MJD */
9406 RExC_parse--;
9407 nextchar(pRExC_state);
9408 }
9409 }
9410 break;
9411 case '\0':
9412 if (RExC_parse >= RExC_end)
9413 FAIL("Trailing \\");
9414 /* FALL THROUGH */
9415 default:
9416 /* Do not generate "unrecognized" warnings here, we fall
9417 back into the quick-grab loop below */
9418 parse_start--;
9419 goto defchar;
9420 }
9421 break;
9422
9423 case '#':
9424 if (RExC_flags & RXf_PMf_EXTENDED) {
9425 if ( reg_skipcomment( pRExC_state ) )
9426 goto tryagain;
9427 }
9428 /* FALL THROUGH */
9429
9430 default:
9431
9432 parse_start = RExC_parse - 1;
9433
9434 RExC_parse++;
9435
9436 defchar: {
9437 register STRLEN len;
9438 register UV ender;
9439 register char *p;
9440 char *s;
9441 STRLEN foldlen;
9442 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
9443 U8 node_type;
9444
9445 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9446 * it is folded to 'ss' even if not utf8 */
9447 bool is_exactfu_sharp_s;
9448
9449 ender = 0;
9450 node_type = ((! FOLD) ? EXACT
9451 : (LOC)
9452 ? EXACTFL
9453 : (MORE_ASCII_RESTRICTED)
9454 ? EXACTFA
9455 : (AT_LEAST_UNI_SEMANTICS)
9456 ? EXACTFU
9457 : EXACTF);
9458 ret = reg_node(pRExC_state, node_type);
9459 s = STRING(ret);
9460
9461 /* XXX The node can hold up to 255 bytes, yet this only goes to
9462 * 127. I (khw) do not know why. Keeping it somewhat less than
9463 * 255 allows us to not have to worry about overflow due to
9464 * converting to utf8 and fold expansion, but that value is
9465 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9466 * split up by this limit into a single one using the real max of
9467 * 255. Even at 127, this breaks under rare circumstances. If
9468 * folding, we do not want to split a node at a character that is a
9469 * non-final in a multi-char fold, as an input string could just
9470 * happen to want to match across the node boundary. The join
9471 * would solve that problem if the join actually happens. But a
9472 * series of more than two nodes in a row each of 127 would cause
9473 * the first join to succeed to get to 254, but then there wouldn't
9474 * be room for the next one, which could at be one of those split
9475 * multi-char folds. I don't know of any fool-proof solution. One
9476 * could back off to end with only a code point that isn't such a
9477 * non-final, but it is possible for there not to be any in the
9478 * entire node. */
9479 for (len = 0, p = RExC_parse - 1;
9480 len < 127 && p < RExC_end;
9481 len++)
9482 {
9483 char * const oldp = p;
9484
9485 if (RExC_flags & RXf_PMf_EXTENDED)
9486 p = regwhite( pRExC_state, p );
9487 switch ((U8)*p) {
9488 case '^':
9489 case '$':
9490 case '.':
9491 case '[':
9492 case '(':
9493 case ')':
9494 case '|':
9495 goto loopdone;
9496 case '\\':
9497 /* Literal Escapes Switch
9498
9499 This switch is meant to handle escape sequences that
9500 resolve to a literal character.
9501
9502 Every escape sequence that represents something
9503 else, like an assertion or a char class, is handled
9504 in the switch marked 'Special Escapes' above in this
9505 routine, but also has an entry here as anything that
9506 isn't explicitly mentioned here will be treated as
9507 an unescaped equivalent literal.
9508 */
9509
9510 switch ((U8)*++p) {
9511 /* These are all the special escapes. */
9512 case 'A': /* Start assertion */
9513 case 'b': case 'B': /* Word-boundary assertion*/
9514 case 'C': /* Single char !DANGEROUS! */
9515 case 'd': case 'D': /* digit class */
9516 case 'g': case 'G': /* generic-backref, pos assertion */
9517 case 'h': case 'H': /* HORIZWS */
9518 case 'k': case 'K': /* named backref, keep marker */
9519 case 'N': /* named char sequence */
9520 case 'p': case 'P': /* Unicode property */
9521 case 'R': /* LNBREAK */
9522 case 's': case 'S': /* space class */
9523 case 'v': case 'V': /* VERTWS */
9524 case 'w': case 'W': /* word class */
9525 case 'X': /* eXtended Unicode "combining character sequence" */
9526 case 'z': case 'Z': /* End of line/string assertion */
9527 --p;
9528 goto loopdone;
9529
9530 /* Anything after here is an escape that resolves to a
9531 literal. (Except digits, which may or may not)
9532 */
9533 case 'n':
9534 ender = '\n';
9535 p++;
9536 break;
9537 case 'r':
9538 ender = '\r';
9539 p++;
9540 break;
9541 case 't':
9542 ender = '\t';
9543 p++;
9544 break;
9545 case 'f':
9546 ender = '\f';
9547 p++;
9548 break;
9549 case 'e':
9550 ender = ASCII_TO_NATIVE('\033');
9551 p++;
9552 break;
9553 case 'a':
9554 ender = ASCII_TO_NATIVE('\007');
9555 p++;
9556 break;
9557 case 'o':
9558 {
9559 STRLEN brace_len = len;
9560 UV result;
9561 const char* error_msg;
9562
9563 bool valid = grok_bslash_o(p,
9564 &result,
9565 &brace_len,
9566 &error_msg,
9567 1);
9568 p += brace_len;
9569 if (! valid) {
9570 RExC_parse = p; /* going to die anyway; point
9571 to exact spot of failure */
9572 vFAIL(error_msg);
9573 }
9574 else
9575 {
9576 ender = result;
9577 }
9578 if (PL_encoding && ender < 0x100) {
9579 goto recode_encoding;
9580 }
9581 if (ender > 0xff) {
9582 REQUIRE_UTF8;
9583 }
9584 break;
9585 }
9586 case 'x':
9587 if (*++p == '{') {
9588 char* const e = strchr(p, '}');
9589
9590 if (!e) {
9591 RExC_parse = p + 1;
9592 vFAIL("Missing right brace on \\x{}");
9593 }
9594 else {
9595 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9596 | PERL_SCAN_DISALLOW_PREFIX;
9597 STRLEN numlen = e - p - 1;
9598 ender = grok_hex(p + 1, &numlen, &flags, NULL);
9599 if (ender > 0xff)
9600 REQUIRE_UTF8;
9601 p = e + 1;
9602 }
9603 }
9604 else {
9605 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
9606 STRLEN numlen = 2;
9607 ender = grok_hex(p, &numlen, &flags, NULL);
9608 p += numlen;
9609 }
9610 if (PL_encoding && ender < 0x100)
9611 goto recode_encoding;
9612 break;
9613 case 'c':
9614 p++;
9615 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
9616 break;
9617 case '0': case '1': case '2': case '3':case '4':
9618 case '5': case '6': case '7': case '8':case '9':
9619 if (*p == '0' ||
9620 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
9621 {
9622 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9623 STRLEN numlen = 3;
9624 ender = grok_oct(p, &numlen, &flags, NULL);
9625 if (ender > 0xff) {
9626 REQUIRE_UTF8;
9627 }
9628 p += numlen;
9629 }
9630 else {
9631 --p;
9632 goto loopdone;
9633 }
9634 if (PL_encoding && ender < 0x100)
9635 goto recode_encoding;
9636 break;
9637 recode_encoding:
9638 if (! RExC_override_recoding) {
9639 SV* enc = PL_encoding;
9640 ender = reg_recode((const char)(U8)ender, &enc);
9641 if (!enc && SIZE_ONLY)
9642 ckWARNreg(p, "Invalid escape in the specified encoding");
9643 REQUIRE_UTF8;
9644 }
9645 break;
9646 case '\0':
9647 if (p >= RExC_end)
9648 FAIL("Trailing \\");
9649 /* FALL THROUGH */
9650 default:
9651 if (!SIZE_ONLY&& isALPHA(*p)) {
9652 /* Include any { following the alpha to emphasize
9653 * that it could be part of an escape at some point
9654 * in the future */
9655 int len = (*(p + 1) == '{') ? 2 : 1;
9656 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9657 }
9658 goto normal_default;
9659 }
9660 break;
9661 default:
9662 normal_default:
9663 if (UTF8_IS_START(*p) && UTF) {
9664 STRLEN numlen;
9665 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9666 &numlen, UTF8_ALLOW_DEFAULT);
9667 p += numlen;
9668 }
9669 else
9670 ender = (U8) *p++;
9671 break;
9672 } /* End of switch on the literal */
9673
9674 is_exactfu_sharp_s = (node_type == EXACTFU
9675 && ender == LATIN_SMALL_LETTER_SHARP_S);
9676 if ( RExC_flags & RXf_PMf_EXTENDED)
9677 p = regwhite( pRExC_state, p );
9678 if ((UTF && FOLD) || is_exactfu_sharp_s) {
9679 /* Prime the casefolded buffer. Locale rules, which apply
9680 * only to code points < 256, aren't known until execution,
9681 * so for them, just output the original character using
9682 * utf8. If we start to fold non-UTF patterns, be sure to
9683 * update join_exact() */
9684 if (LOC && ender < 256) {
9685 if (UNI_IS_INVARIANT(ender)) {
9686 *tmpbuf = (U8) ender;
9687 foldlen = 1;
9688 } else {
9689 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9690 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9691 foldlen = 2;
9692 }
9693 }
9694 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9695 */
9696 ender = toLOWER(ender);
9697 *tmpbuf = (U8) ender;
9698 foldlen = 1;
9699 }
9700 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9701
9702 /* Locale and /aa require more selectivity about the
9703 * fold, so are handled below. Otherwise, here, just
9704 * use the fold */
9705 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9706 }
9707 else {
9708 /* Under locale rules or /aa we are not to mix,
9709 * respectively, ords < 256 or ASCII with non-. So
9710 * reject folds that mix them, using only the
9711 * non-folded code point. So do the fold to a
9712 * temporary, and inspect each character in it. */
9713 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9714 U8* s = trialbuf;
9715 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9716 U8* e = s + foldlen;
9717 bool fold_ok = TRUE;
9718
9719 while (s < e) {
9720 if (isASCII(*s)
9721 || (LOC && (UTF8_IS_INVARIANT(*s)
9722 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9723 {
9724 fold_ok = FALSE;
9725 break;
9726 }
9727 s += UTF8SKIP(s);
9728 }
9729 if (fold_ok) {
9730 Copy(trialbuf, tmpbuf, foldlen, U8);
9731 ender = tmpender;
9732 }
9733 else {
9734 uvuni_to_utf8(tmpbuf, ender);
9735 foldlen = UNISKIP(ender);
9736 }
9737 }
9738 }
9739 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
9740 if (len)
9741 p = oldp;
9742 else if (UTF || is_exactfu_sharp_s) {
9743 if (FOLD) {
9744 /* Emit all the Unicode characters. */
9745 STRLEN numlen;
9746 for (foldbuf = tmpbuf;
9747 foldlen;
9748 foldlen -= numlen) {
9749 ender = utf8_to_uvchr(foldbuf, &numlen);
9750 if (numlen > 0) {
9751 const STRLEN unilen = reguni(pRExC_state, ender, s);
9752 s += unilen;
9753 len += unilen;
9754 /* In EBCDIC the numlen
9755 * and unilen can differ. */
9756 foldbuf += numlen;
9757 if (numlen >= foldlen)
9758 break;
9759 }
9760 else
9761 break; /* "Can't happen." */
9762 }
9763 }
9764 else {
9765 const STRLEN unilen = reguni(pRExC_state, ender, s);
9766 if (unilen > 0) {
9767 s += unilen;
9768 len += unilen;
9769 }
9770 }
9771 }
9772 else {
9773 len++;
9774 REGC((char)ender, s++);
9775 }
9776 break;
9777 }
9778 if (UTF || is_exactfu_sharp_s) {
9779 if (FOLD) {
9780 /* Emit all the Unicode characters. */
9781 STRLEN numlen;
9782 for (foldbuf = tmpbuf;
9783 foldlen;
9784 foldlen -= numlen) {
9785 ender = utf8_to_uvchr(foldbuf, &numlen);
9786 if (numlen > 0) {
9787 const STRLEN unilen = reguni(pRExC_state, ender, s);
9788 len += unilen;
9789 s += unilen;
9790 /* In EBCDIC the numlen
9791 * and unilen can differ. */
9792 foldbuf += numlen;
9793 if (numlen >= foldlen)
9794 break;
9795 }
9796 else
9797 break;
9798 }
9799 }
9800 else {
9801 const STRLEN unilen = reguni(pRExC_state, ender, s);
9802 if (unilen > 0) {
9803 s += unilen;
9804 len += unilen;
9805 }
9806 }
9807 len--;
9808 }
9809 else {
9810 REGC((char)ender, s++);
9811 }
9812 }
9813 loopdone: /* Jumped to when encounters something that shouldn't be in
9814 the node */
9815 RExC_parse = p - 1;
9816 Set_Node_Cur_Length(ret); /* MJD */
9817 nextchar(pRExC_state);
9818 {
9819 /* len is STRLEN which is unsigned, need to copy to signed */
9820 IV iv = len;
9821 if (iv < 0)
9822 vFAIL("Internal disaster");
9823 }
9824 if (len > 0)
9825 *flagp |= HASWIDTH;
9826 if (len == 1 && UNI_IS_INVARIANT(ender))
9827 *flagp |= SIMPLE;
9828
9829 if (SIZE_ONLY)
9830 RExC_size += STR_SZ(len);
9831 else {
9832 STR_LEN(ret) = len;
9833 RExC_emit += STR_SZ(len);
9834 }
9835 }
9836 break;
9837 }
9838
9839 return(ret);
9840
9841/* Jumped to when an unrecognized character set is encountered */
9842bad_charset:
9843 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9844 return(NULL);
9845}
9846
9847STATIC char *
9848S_regwhite( RExC_state_t *pRExC_state, char *p )
9849{
9850 const char *e = RExC_end;
9851
9852 PERL_ARGS_ASSERT_REGWHITE;
9853
9854 while (p < e) {
9855 if (isSPACE(*p))
9856 ++p;
9857 else if (*p == '#') {
9858 bool ended = 0;
9859 do {
9860 if (*p++ == '\n') {
9861 ended = 1;
9862 break;
9863 }
9864 } while (p < e);
9865 if (!ended)
9866 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9867 }
9868 else
9869 break;
9870 }
9871 return p;
9872}
9873
9874/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9875 Character classes ([:foo:]) can also be negated ([:^foo:]).
9876 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9877 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
9878 but trigger failures because they are currently unimplemented. */
9879
9880#define POSIXCC_DONE(c) ((c) == ':')
9881#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9882#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9883
9884STATIC I32
9885S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
9886{
9887 dVAR;
9888 I32 namedclass = OOB_NAMEDCLASS;
9889
9890 PERL_ARGS_ASSERT_REGPPOSIXCC;
9891
9892 if (value == '[' && RExC_parse + 1 < RExC_end &&
9893 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9894 POSIXCC(UCHARAT(RExC_parse))) {
9895 const char c = UCHARAT(RExC_parse);
9896 char* const s = RExC_parse++;
9897
9898 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
9899 RExC_parse++;
9900 if (RExC_parse == RExC_end)
9901 /* Grandfather lone [:, [=, [. */
9902 RExC_parse = s;
9903 else {
9904 const char* const t = RExC_parse++; /* skip over the c */
9905 assert(*t == c);
9906
9907 if (UCHARAT(RExC_parse) == ']') {
9908 const char *posixcc = s + 1;
9909 RExC_parse++; /* skip over the ending ] */
9910
9911 if (*s == ':') {
9912 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9913 const I32 skip = t - posixcc;
9914
9915 /* Initially switch on the length of the name. */
9916 switch (skip) {
9917 case 4:
9918 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9919 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
9920 break;
9921 case 5:
9922 /* Names all of length 5. */
9923 /* alnum alpha ascii blank cntrl digit graph lower
9924 print punct space upper */
9925 /* Offset 4 gives the best switch position. */
9926 switch (posixcc[4]) {
9927 case 'a':
9928 if (memEQ(posixcc, "alph", 4)) /* alpha */
9929 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
9930 break;
9931 case 'e':
9932 if (memEQ(posixcc, "spac", 4)) /* space */
9933 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
9934 break;
9935 case 'h':
9936 if (memEQ(posixcc, "grap", 4)) /* graph */
9937 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
9938 break;
9939 case 'i':
9940 if (memEQ(posixcc, "asci", 4)) /* ascii */
9941 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
9942 break;
9943 case 'k':
9944 if (memEQ(posixcc, "blan", 4)) /* blank */
9945 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
9946 break;
9947 case 'l':
9948 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9949 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
9950 break;
9951 case 'm':
9952 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9953 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
9954 break;
9955 case 'r':
9956 if (memEQ(posixcc, "lowe", 4)) /* lower */
9957 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9958 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9959 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
9960 break;
9961 case 't':
9962 if (memEQ(posixcc, "digi", 4)) /* digit */
9963 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9964 else if (memEQ(posixcc, "prin", 4)) /* print */
9965 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9966 else if (memEQ(posixcc, "punc", 4)) /* punct */
9967 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
9968 break;
9969 }
9970 break;
9971 case 6:
9972 if (memEQ(posixcc, "xdigit", 6))
9973 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
9974 break;
9975 }
9976
9977 if (namedclass == OOB_NAMEDCLASS)
9978 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9979 t - s - 1, s + 1);
9980 assert (posixcc[skip] == ':');
9981 assert (posixcc[skip+1] == ']');
9982 } else if (!SIZE_ONLY) {
9983 /* [[=foo=]] and [[.foo.]] are still future. */
9984
9985 /* adjust RExC_parse so the warning shows after
9986 the class closes */
9987 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
9988 RExC_parse++;
9989 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9990 }
9991 } else {
9992 /* Maternal grandfather:
9993 * "[:" ending in ":" but not in ":]" */
9994 RExC_parse = s;
9995 }
9996 }
9997 }
9998
9999 return namedclass;
10000}
10001
10002STATIC void
10003S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10004{
10005 dVAR;
10006
10007 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10008
10009 if (POSIXCC(UCHARAT(RExC_parse))) {
10010 const char *s = RExC_parse;
10011 const char c = *s++;
10012
10013 while (isALNUM(*s))
10014 s++;
10015 if (*s && c == *s && s[1] == ']') {
10016 ckWARN3reg(s+2,
10017 "POSIX syntax [%c %c] belongs inside character classes",
10018 c, c);
10019
10020 /* [[=foo=]] and [[.foo.]] are still future. */
10021 if (POSIXCC_NOTYET(c)) {
10022 /* adjust RExC_parse so the error shows after
10023 the class closes */
10024 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10025 NOOP;
10026 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10027 }
10028 }
10029 }
10030}
10031
10032/* Generate the code to add a full posix character <class> to the bracketed
10033 * character class given by <node>. (<node> is needed only under locale rules)
10034 * destlist is the inversion list for non-locale rules that this class is
10035 * to be added to
10036 * sourcelist is the ASCII-range inversion list to add under /a rules
10037 * Xsourcelist is the full Unicode range list to use otherwise. */
10038#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10039 if (LOC) { \
10040 SV* scratch_list = NULL; \
10041 \
10042 /* Set this class in the node for runtime matching */ \
10043 ANYOF_CLASS_SET(node, class); \
10044 \
10045 /* For above Latin1 code points, we use the full Unicode range */ \
10046 _invlist_intersection(PL_AboveLatin1, \
10047 Xsourcelist, \
10048 &scratch_list); \
10049 /* And set the output to it, adding instead if there already is an \
10050 * output. Checking if <destlist> is NULL first saves an extra \
10051 * clone. Its reference count will be decremented at the next \
10052 * union, etc, or if this is the only instance, at the end of the \
10053 * routine */ \
10054 if (! destlist) { \
10055 destlist = scratch_list; \
10056 } \
10057 else { \
10058 _invlist_union(destlist, scratch_list, &destlist); \
10059 SvREFCNT_dec(scratch_list); \
10060 } \
10061 } \
10062 else { \
10063 /* For non-locale, just add it to any existing list */ \
10064 _invlist_union(destlist, \
10065 (AT_LEAST_ASCII_RESTRICTED) \
10066 ? sourcelist \
10067 : Xsourcelist, \
10068 &destlist); \
10069 }
10070
10071/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10072 */
10073#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10074 if (LOC) { \
10075 SV* scratch_list = NULL; \
10076 ANYOF_CLASS_SET(node, class); \
10077 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10078 if (! destlist) { \
10079 destlist = scratch_list; \
10080 } \
10081 else { \
10082 _invlist_union(destlist, scratch_list, &destlist); \
10083 SvREFCNT_dec(scratch_list); \
10084 } \
10085 } \
10086 else { \
10087 _invlist_union_complement_2nd(destlist, \
10088 (AT_LEAST_ASCII_RESTRICTED) \
10089 ? sourcelist \
10090 : Xsourcelist, \
10091 &destlist); \
10092 /* Under /d, everything in the upper half of the Latin1 range \
10093 * matches this complement */ \
10094 if (DEPENDS_SEMANTICS) { \
10095 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10096 } \
10097 }
10098
10099/* Generate the code to add a posix character <class> to the bracketed
10100 * character class given by <node>. (<node> is needed only under locale rules)
10101 * destlist is the inversion list for non-locale rules that this class is
10102 * to be added to
10103 * sourcelist is the ASCII-range inversion list to add under /a rules
10104 * l1_sourcelist is the Latin1 range list to use otherwise.
10105 * Xpropertyname is the name to add to <run_time_list> of the property to
10106 * specify the code points above Latin1 that will have to be
10107 * determined at run-time
10108 * run_time_list is a SV* that contains text names of properties that are to
10109 * be computed at run time. This concatenates <Xpropertyname>
10110 * to it, apppropriately
10111 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10112 * time */
10113#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10114 l1_sourcelist, Xpropertyname, run_time_list) \
10115 /* If not /a matching, there are going to be code points we will have \
10116 * to defer to runtime to look-up */ \
10117 if (! AT_LEAST_ASCII_RESTRICTED) { \
10118 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10119 } \
10120 if (LOC) { \
10121 ANYOF_CLASS_SET(node, class); \
10122 } \
10123 else { \
10124 _invlist_union(destlist, \
10125 (AT_LEAST_ASCII_RESTRICTED) \
10126 ? sourcelist \
10127 : l1_sourcelist, \
10128 &destlist); \
10129 }
10130
10131/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10132 * this and DO_N_POSIX */
10133#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10134 l1_sourcelist, Xpropertyname, run_time_list) \
10135 if (AT_LEAST_ASCII_RESTRICTED) { \
10136 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10137 } \
10138 else { \
10139 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10140 if (LOC) { \
10141 ANYOF_CLASS_SET(node, namedclass); \
10142 } \
10143 else { \
10144 SV* scratch_list = NULL; \
10145 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10146 if (! destlist) { \
10147 destlist = scratch_list; \
10148 } \
10149 else { \
10150 _invlist_union(destlist, scratch_list, &destlist); \
10151 SvREFCNT_dec(scratch_list); \
10152 } \
10153 if (DEPENDS_SEMANTICS) { \
10154 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10155 } \
10156 } \
10157 }
10158
10159STATIC U8
10160S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10161{
10162
10163 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10164 * Locale folding is done at run-time, so this function should not be
10165 * called for nodes that are for locales.
10166 *
10167 * This function sets the bit corresponding to the fold of the input
10168 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10169 * 'F' is 'f'.
10170 *
10171 * It also knows about the characters that are in the bitmap that have
10172 * folds that are matchable only outside it, and sets the appropriate lists
10173 * and flags.
10174 *
10175 * It returns the number of bits that actually changed from 0 to 1 */
10176
10177 U8 stored = 0;
10178 U8 fold;
10179
10180 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10181
10182 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10183 : PL_fold[value];
10184
10185 /* It assumes the bit for 'value' has already been set */
10186 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10187 ANYOF_BITMAP_SET(node, fold);
10188 stored++;
10189 }
10190 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10191 /* Certain Latin1 characters have matches outside the bitmap. To get
10192 * here, 'value' is one of those characters. None of these matches is
10193 * valid for ASCII characters under /aa, which have been excluded by
10194 * the 'if' above. The matches fall into three categories:
10195 * 1) They are singly folded-to or -from an above 255 character, as
10196 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10197 * WITH DIAERESIS;
10198 * 2) They are part of a multi-char fold with another character in the
10199 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10200 * 3) They are part of a multi-char fold with a character not in the
10201 * bitmap, such as various ligatures.
10202 * We aren't dealing fully with multi-char folds, except we do deal
10203 * with the pattern containing a character that has a multi-char fold
10204 * (not so much the inverse).
10205 * For types 1) and 3), the matches only happen when the target string
10206 * is utf8; that's not true for 2), and we set a flag for it.
10207 *
10208 * The code below adds to the passed in inversion list the single fold
10209 * closures for 'value'. The values are hard-coded here so that an
10210 * innocent-looking character class, like /[ks]/i won't have to go out
10211 * to disk to find the possible matches. XXX It would be better to
10212 * generate these via regen, in case a new version of the Unicode
10213 * standard adds new mappings, though that is not really likely. */
10214 switch (value) {
10215 case 'k':
10216 case 'K':
10217 /* KELVIN SIGN */
10218 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10219 break;
10220 case 's':
10221 case 'S':
10222 /* LATIN SMALL LETTER LONG S */
10223 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10224 break;
10225 case MICRO_SIGN:
10226 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10227 GREEK_SMALL_LETTER_MU);
10228 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10229 GREEK_CAPITAL_LETTER_MU);
10230 break;
10231 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10232 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10233 /* ANGSTROM SIGN */
10234 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10235 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10236 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10237 PL_fold_latin1[value]);
10238 }
10239 break;
10240 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10241 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10242 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10243 break;
10244 case LATIN_SMALL_LETTER_SHARP_S:
10245 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10246 LATIN_CAPITAL_LETTER_SHARP_S);
10247
10248 /* Under /a, /d, and /u, this can match the two chars "ss" */
10249 if (! MORE_ASCII_RESTRICTED) {
10250 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10251
10252 /* And under /u or /a, it can match even if the target is
10253 * not utf8 */
10254 if (AT_LEAST_UNI_SEMANTICS) {
10255 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10256 }
10257 }
10258 break;
10259 case 'F': case 'f':
10260 case 'I': case 'i':
10261 case 'L': case 'l':
10262 case 'T': case 't':
10263 case 'A': case 'a':
10264 case 'H': case 'h':
10265 case 'J': case 'j':
10266 case 'N': case 'n':
10267 case 'W': case 'w':
10268 case 'Y': case 'y':
10269 /* These all are targets of multi-character folds from code
10270 * points that require UTF8 to express, so they can't match
10271 * unless the target string is in UTF-8, so no action here is
10272 * necessary, as regexec.c properly handles the general case
10273 * for UTF-8 matching */
10274 break;
10275 default:
10276 /* Use deprecated warning to increase the chances of this
10277 * being output */
10278 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10279 break;
10280 }
10281 }
10282 else if (DEPENDS_SEMANTICS
10283 && ! isASCII(value)
10284 && PL_fold_latin1[value] != value)
10285 {
10286 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10287 * folds only when the target string is in UTF-8. We add the fold
10288 * here to the list of things to match outside the bitmap, which
10289 * won't be looked at unless it is UTF8 (or else if something else
10290 * says to look even if not utf8, but those things better not happen
10291 * under DEPENDS semantics. */
10292 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10293 }
10294
10295 return stored;
10296}
10297
10298
10299PERL_STATIC_INLINE U8
10300S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10301{
10302 /* This inline function sets a bit in the bitmap if not already set, and if
10303 * appropriate, its fold, returning the number of bits that actually
10304 * changed from 0 to 1 */
10305
10306 U8 stored;
10307
10308 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10309
10310 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10311 return 0;
10312 }
10313
10314 ANYOF_BITMAP_SET(node, value);
10315 stored = 1;
10316
10317 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10318 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10319 }
10320
10321 return stored;
10322}
10323
10324STATIC void
10325S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10326{
10327 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10328 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10329 * the multi-character folds of characters in the node */
10330 SV *sv;
10331
10332 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10333
10334 if (! *alternate_ptr) {
10335 *alternate_ptr = newAV();
10336 }
10337 sv = newSVpvn_utf8((char*)string, len, TRUE);
10338 av_push(*alternate_ptr, sv);
10339 return;
10340}
10341
10342/*
10343 parse a class specification and produce either an ANYOF node that
10344 matches the pattern or perhaps will be optimized into an EXACTish node
10345 instead. The node contains a bit map for the first 256 characters, with the
10346 corresponding bit set if that character is in the list. For characters
10347 above 255, a range list is used */
10348
10349STATIC regnode *
10350S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10351{
10352 dVAR;
10353 register UV nextvalue;
10354 register IV prevvalue = OOB_UNICODE;
10355 register IV range = 0;
10356 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10357 register regnode *ret;
10358 STRLEN numlen;
10359 IV namedclass;
10360 char *rangebegin = NULL;
10361 bool need_class = 0;
10362 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10363 SV *listsv = NULL;
10364 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10365 than just initialized. */
10366 SV* properties = NULL; /* Code points that match \p{} \P{} */
10367 UV element_count = 0; /* Number of distinct elements in the class.
10368 Optimizations may be possible if this is tiny */
10369 UV n;
10370
10371 /* Unicode properties are stored in a swash; this holds the current one
10372 * being parsed. If this swash is the only above-latin1 component of the
10373 * character class, an optimization is to pass it directly on to the
10374 * execution engine. Otherwise, it is set to NULL to indicate that there
10375 * are other things in the class that have to be dealt with at execution
10376 * time */
10377 SV* swash = NULL; /* Code points that match \p{} \P{} */
10378
10379 /* Set if a component of this character class is user-defined; just passed
10380 * on to the engine */
10381 UV has_user_defined_property = 0;
10382
10383 /* code points this node matches that can't be stored in the bitmap */
10384 SV* nonbitmap = NULL;
10385
10386 /* The items that are to match that aren't stored in the bitmap, but are a
10387 * result of things that are stored there. This is the fold closure of
10388 * such a character, either because it has DEPENDS semantics and shouldn't
10389 * be matched unless the target string is utf8, or is a code point that is
10390 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10391 * above 255. This all is solely for performance reasons. By having this
10392 * code know the outside-the-bitmap folds that the bitmapped characters are
10393 * involved with, we don't have to go out to disk to find the list of
10394 * matches, unless the character class includes code points that aren't
10395 * storable in the bit map. That means that a character class with an 's'
10396 * in it, for example, doesn't need to go out to disk to find everything
10397 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10398 * empty unless there is something whose fold we don't know about, and will
10399 * have to go out to the disk to find. */
10400 SV* l1_fold_invlist = NULL;
10401
10402 /* List of multi-character folds that are matched by this node */
10403 AV* unicode_alternate = NULL;
10404#ifdef EBCDIC
10405 UV literal_endpoint = 0;
10406#endif
10407 UV stored = 0; /* how many chars stored in the bitmap */
10408
10409 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
10410 case we need to change the emitted regop to an EXACT. */
10411 const char * orig_parse = RExC_parse;
10412 GET_RE_DEBUG_FLAGS_DECL;
10413
10414 PERL_ARGS_ASSERT_REGCLASS;
10415#ifndef DEBUGGING
10416 PERL_UNUSED_ARG(depth);
10417#endif
10418
10419 DEBUG_PARSE("clas");
10420
10421 /* Assume we are going to generate an ANYOF node. */
10422 ret = reganode(pRExC_state, ANYOF, 0);
10423
10424
10425 if (!SIZE_ONLY) {
10426 ANYOF_FLAGS(ret) = 0;
10427 }
10428
10429 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
10430 RExC_naughty++;
10431 RExC_parse++;
10432 if (!SIZE_ONLY)
10433 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
10434
10435 /* We have decided to not allow multi-char folds in inverted character
10436 * classes, due to the confusion that can happen, especially with
10437 * classes that are designed for a non-Unicode world: You have the
10438 * peculiar case that:
10439 "s s" =~ /^[^\xDF]+$/i => Y
10440 "ss" =~ /^[^\xDF]+$/i => N
10441 *
10442 * See [perl #89750] */
10443 allow_full_fold = FALSE;
10444 }
10445
10446 if (SIZE_ONLY) {
10447 RExC_size += ANYOF_SKIP;
10448 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10449 }
10450 else {
10451 RExC_emit += ANYOF_SKIP;
10452 if (LOC) {
10453 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
10454 }
10455 ANYOF_BITMAP_ZERO(ret);
10456 listsv = newSVpvs("# comment\n");
10457 initial_listsv_len = SvCUR(listsv);
10458 }
10459
10460 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10461
10462 if (!SIZE_ONLY && POSIXCC(nextvalue))
10463 checkposixcc(pRExC_state);
10464
10465 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10466 if (UCHARAT(RExC_parse) == ']')
10467 goto charclassloop;
10468
10469parseit:
10470 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
10471
10472 charclassloop:
10473
10474 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10475
10476 if (!range) {
10477 rangebegin = RExC_parse;
10478 element_count++;
10479 }
10480 if (UTF) {
10481 value = utf8n_to_uvchr((U8*)RExC_parse,
10482 RExC_end - RExC_parse,
10483 &numlen, UTF8_ALLOW_DEFAULT);
10484 RExC_parse += numlen;
10485 }
10486 else
10487 value = UCHARAT(RExC_parse++);
10488
10489 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10490 if (value == '[' && POSIXCC(nextvalue))
10491 namedclass = regpposixcc(pRExC_state, value);
10492 else if (value == '\\') {
10493 if (UTF) {
10494 value = utf8n_to_uvchr((U8*)RExC_parse,
10495 RExC_end - RExC_parse,
10496 &numlen, UTF8_ALLOW_DEFAULT);
10497 RExC_parse += numlen;
10498 }
10499 else
10500 value = UCHARAT(RExC_parse++);
10501 /* Some compilers cannot handle switching on 64-bit integer
10502 * values, therefore value cannot be an UV. Yes, this will
10503 * be a problem later if we want switch on Unicode.
10504 * A similar issue a little bit later when switching on
10505 * namedclass. --jhi */
10506 switch ((I32)value) {
10507 case 'w': namedclass = ANYOF_ALNUM; break;
10508 case 'W': namedclass = ANYOF_NALNUM; break;
10509 case 's': namedclass = ANYOF_SPACE; break;
10510 case 'S': namedclass = ANYOF_NSPACE; break;
10511 case 'd': namedclass = ANYOF_DIGIT; break;
10512 case 'D': namedclass = ANYOF_NDIGIT; break;
10513 case 'v': namedclass = ANYOF_VERTWS; break;
10514 case 'V': namedclass = ANYOF_NVERTWS; break;
10515 case 'h': namedclass = ANYOF_HORIZWS; break;
10516 case 'H': namedclass = ANYOF_NHORIZWS; break;
10517 case 'N': /* Handle \N{NAME} in class */
10518 {
10519 /* We only pay attention to the first char of
10520 multichar strings being returned. I kinda wonder
10521 if this makes sense as it does change the behaviour
10522 from earlier versions, OTOH that behaviour was broken
10523 as well. */
10524 UV v; /* value is register so we cant & it /grrr */
10525 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
10526 goto parseit;
10527 }
10528 value= v;
10529 }
10530 break;
10531 case 'p':
10532 case 'P':
10533 {
10534 char *e;
10535 if (RExC_parse >= RExC_end)
10536 vFAIL2("Empty \\%c{}", (U8)value);
10537 if (*RExC_parse == '{') {
10538 const U8 c = (U8)value;
10539 e = strchr(RExC_parse++, '}');
10540 if (!e)
10541 vFAIL2("Missing right brace on \\%c{}", c);
10542 while (isSPACE(UCHARAT(RExC_parse)))
10543 RExC_parse++;
10544 if (e == RExC_parse)
10545 vFAIL2("Empty \\%c{}", c);
10546 n = e - RExC_parse;
10547 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10548 n--;
10549 }
10550 else {
10551 e = RExC_parse;
10552 n = 1;
10553 }
10554 if (!SIZE_ONLY) {
10555 SV** invlistsvp;
10556 SV* invlist;
10557 char* name;
10558 if (UCHARAT(RExC_parse) == '^') {
10559 RExC_parse++;
10560 n--;
10561 value = value == 'p' ? 'P' : 'p'; /* toggle */
10562 while (isSPACE(UCHARAT(RExC_parse))) {
10563 RExC_parse++;
10564 n--;
10565 }
10566 }
10567 /* Try to get the definition of the property into
10568 * <invlist>. If /i is in effect, the effective property
10569 * will have its name be <__NAME_i>. The design is
10570 * discussed in commit
10571 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10572 Newx(name, n + sizeof("_i__\n"), char);
10573
10574 sprintf(name, "%s%.*s%s\n",
10575 (FOLD) ? "__" : "",
10576 (int)n,
10577 RExC_parse,
10578 (FOLD) ? "_i" : ""
10579 );
10580
10581 /* Look up the property name, and get its swash and
10582 * inversion list, if the property is found */
10583 if (swash) {
10584 SvREFCNT_dec(swash);
10585 }
10586 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10587 1, /* binary */
10588 0, /* not tr/// */
10589 TRUE, /* this routine will handle
10590 undefined properties */
10591 NULL, FALSE /* No inversion list */
10592 );
10593 if ( ! swash
10594 || ! SvROK(swash)
10595 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10596 || ! (invlistsvp =
10597 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10598 "INVLIST", FALSE))
10599 || ! (invlist = *invlistsvp))
10600 {
10601 if (swash) {
10602 SvREFCNT_dec(swash);
10603 swash = NULL;
10604 }
10605
10606 /* Here didn't find it. It could be a user-defined
10607 * property that will be available at run-time. Add it
10608 * to the list to look up then */
10609 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10610 (value == 'p' ? '+' : '!'),
10611 name);
10612 has_user_defined_property = 1;
10613
10614 /* We don't know yet, so have to assume that the
10615 * property could match something in the Latin1 range,
10616 * hence something that isn't utf8 */
10617 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10618 }
10619 else {
10620
10621 /* Here, did get the swash and its inversion list. If
10622 * the swash is from a user-defined property, then this
10623 * whole character class should be regarded as such */
10624 SV** user_defined_svp =
10625 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10626 "USER_DEFINED", FALSE);
10627 if (user_defined_svp) {
10628 has_user_defined_property
10629 |= SvUV(*user_defined_svp);
10630 }
10631
10632 /* Invert if asking for the complement */
10633 if (value == 'P') {
10634 _invlist_union_complement_2nd(properties, invlist, &properties);
10635
10636 /* The swash can't be used as-is, because we've
10637 * inverted things; delay removing it to here after
10638 * have copied its invlist above */
10639 SvREFCNT_dec(swash);
10640 swash = NULL;
10641 }
10642 else {
10643 _invlist_union(properties, invlist, &properties);
10644 }
10645 }
10646 Safefree(name);
10647 }
10648 RExC_parse = e + 1;
10649 namedclass = ANYOF_MAX; /* no official name, but it's named */
10650
10651 /* \p means they want Unicode semantics */
10652 RExC_uni_semantics = 1;
10653 }
10654 break;
10655 case 'n': value = '\n'; break;
10656 case 'r': value = '\r'; break;
10657 case 't': value = '\t'; break;
10658 case 'f': value = '\f'; break;
10659 case 'b': value = '\b'; break;
10660 case 'e': value = ASCII_TO_NATIVE('\033');break;
10661 case 'a': value = ASCII_TO_NATIVE('\007');break;
10662 case 'o':
10663 RExC_parse--; /* function expects to be pointed at the 'o' */
10664 {
10665 const char* error_msg;
10666 bool valid = grok_bslash_o(RExC_parse,
10667 &value,
10668 &numlen,
10669 &error_msg,
10670 SIZE_ONLY);
10671 RExC_parse += numlen;
10672 if (! valid) {
10673 vFAIL(error_msg);
10674 }
10675 }
10676 if (PL_encoding && value < 0x100) {
10677 goto recode_encoding;
10678 }
10679 break;
10680 case 'x':
10681 if (*RExC_parse == '{') {
10682 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10683 | PERL_SCAN_DISALLOW_PREFIX;
10684 char * const e = strchr(RExC_parse++, '}');
10685 if (!e)
10686 vFAIL("Missing right brace on \\x{}");
10687
10688 numlen = e - RExC_parse;
10689 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10690 RExC_parse = e + 1;
10691 }
10692 else {
10693 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10694 numlen = 2;
10695 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
10696 RExC_parse += numlen;
10697 }
10698 if (PL_encoding && value < 0x100)
10699 goto recode_encoding;
10700 break;
10701 case 'c':
10702 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
10703 break;
10704 case '0': case '1': case '2': case '3': case '4':
10705 case '5': case '6': case '7':
10706 {
10707 /* Take 1-3 octal digits */
10708 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10709 numlen = 3;
10710 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10711 RExC_parse += numlen;
10712 if (PL_encoding && value < 0x100)
10713 goto recode_encoding;
10714 break;
10715 }
10716 recode_encoding:
10717 if (! RExC_override_recoding) {
10718 SV* enc = PL_encoding;
10719 value = reg_recode((const char)(U8)value, &enc);
10720 if (!enc && SIZE_ONLY)
10721 ckWARNreg(RExC_parse,
10722 "Invalid escape in the specified encoding");
10723 break;
10724 }
10725 default:
10726 /* Allow \_ to not give an error */
10727 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
10728 ckWARN2reg(RExC_parse,
10729 "Unrecognized escape \\%c in character class passed through",
10730 (int)value);
10731 }
10732 break;
10733 }
10734 } /* end of \blah */
10735#ifdef EBCDIC
10736 else
10737 literal_endpoint++;
10738#endif
10739
10740 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10741
10742 /* What matches in a locale is not known until runtime, so need to
10743 * (one time per class) allocate extra space to pass to regexec.
10744 * The space will contain a bit for each named class that is to be
10745 * matched against. This isn't needed for \p{} and pseudo-classes,
10746 * as they are not affected by locale, and hence are dealt with
10747 * separately */
10748 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10749 need_class = 1;
10750 if (SIZE_ONLY) {
10751 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10752 }
10753 else {
10754 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
10755 ANYOF_CLASS_ZERO(ret);
10756 }
10757 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
10758 }
10759
10760 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10761 * literal, as is the character that began the false range, i.e.
10762 * the 'a' in the examples */
10763 if (range) {
10764 if (!SIZE_ONLY) {
10765 const int w =
10766 RExC_parse >= rangebegin ?
10767 RExC_parse - rangebegin : 0;
10768 ckWARN4reg(RExC_parse,
10769 "False [] range \"%*.*s\"",
10770 w, w, rangebegin);
10771
10772 stored +=
10773 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
10774 if (prevvalue < 256) {
10775 stored +=
10776 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
10777 }
10778 else {
10779 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
10780 }
10781 }
10782
10783 range = 0; /* this was not a true range */
10784 }
10785
10786 if (!SIZE_ONLY) {
10787
10788 /* Possible truncation here but in some 64-bit environments
10789 * the compiler gets heartburn about switch on 64-bit values.
10790 * A similar issue a little earlier when switching on value.
10791 * --jhi */
10792 switch ((I32)namedclass) {
10793
10794 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10795 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10796 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10797 break;
10798 case ANYOF_NALNUMC:
10799 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10800 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10801 break;
10802 case ANYOF_ALPHA:
10803 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10804 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10805 break;
10806 case ANYOF_NALPHA:
10807 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10808 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10809 break;
10810 case ANYOF_ASCII:
10811 if (LOC) {
10812 ANYOF_CLASS_SET(ret, namedclass);
10813 }
10814 else {
10815 _invlist_union(properties, PL_ASCII, &properties);
10816 }
10817 break;
10818 case ANYOF_NASCII:
10819 if (LOC) {
10820 ANYOF_CLASS_SET(ret, namedclass);
10821 }
10822 else {
10823 _invlist_union_complement_2nd(properties,
10824 PL_ASCII, &properties);
10825 if (DEPENDS_SEMANTICS) {
10826 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10827 }
10828 }
10829 break;
10830 case ANYOF_BLANK:
10831 DO_POSIX(ret, namedclass, properties,
10832 PL_PosixBlank, PL_XPosixBlank);
10833 break;
10834 case ANYOF_NBLANK:
10835 DO_N_POSIX(ret, namedclass, properties,
10836 PL_PosixBlank, PL_XPosixBlank);
10837 break;
10838 case ANYOF_CNTRL:
10839 DO_POSIX(ret, namedclass, properties,
10840 PL_PosixCntrl, PL_XPosixCntrl);
10841 break;
10842 case ANYOF_NCNTRL:
10843 DO_N_POSIX(ret, namedclass, properties,
10844 PL_PosixCntrl, PL_XPosixCntrl);
10845 break;
10846 case ANYOF_DIGIT:
10847 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10848 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10849 break;
10850 case ANYOF_NDIGIT:
10851 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10852 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10853 break;
10854 case ANYOF_GRAPH:
10855 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10856 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10857 break;
10858 case ANYOF_NGRAPH:
10859 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10860 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10861 break;
10862 case ANYOF_HORIZWS:
10863 /* For these, we use the nonbitmap, as /d doesn't make a
10864 * difference in what these match. There would be problems
10865 * if these characters had folds other than themselves, as
10866 * nonbitmap is subject to folding. It turns out that \h
10867 * is just a synonym for XPosixBlank */
10868 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
10869 break;
10870 case ANYOF_NHORIZWS:
10871 _invlist_union_complement_2nd(nonbitmap,
10872 PL_XPosixBlank, &nonbitmap);
10873 break;
10874 case ANYOF_LOWER:
10875 case ANYOF_NLOWER:
10876 { /* These require special handling, as they differ under
10877 folding, matching Cased there (which in the ASCII range
10878 is the same as Alpha */
10879
10880 SV* ascii_source;
10881 SV* l1_source;
10882 const char *Xname;
10883
10884 if (FOLD && ! LOC) {
10885 ascii_source = PL_PosixAlpha;
10886 l1_source = PL_L1Cased;
10887 Xname = "Cased";
10888 }
10889 else {
10890 ascii_source = PL_PosixLower;
10891 l1_source = PL_L1PosixLower;
10892 Xname = "XPosixLower";
10893 }
10894 if (namedclass == ANYOF_LOWER) {
10895 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10896 ascii_source, l1_source, Xname, listsv);
10897 }
10898 else {
10899 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10900 properties, ascii_source, l1_source, Xname, listsv);
10901 }
10902 break;
10903 }
10904 case ANYOF_PRINT:
10905 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10906 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10907 break;
10908 case ANYOF_NPRINT:
10909 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10910 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10911 break;
10912 case ANYOF_PUNCT:
10913 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10914 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10915 break;
10916 case ANYOF_NPUNCT:
10917 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10918 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10919 break;
10920 case ANYOF_PSXSPC:
10921 DO_POSIX(ret, namedclass, properties,
10922 PL_PosixSpace, PL_XPosixSpace);
10923 break;
10924 case ANYOF_NPSXSPC:
10925 DO_N_POSIX(ret, namedclass, properties,
10926 PL_PosixSpace, PL_XPosixSpace);
10927 break;
10928 case ANYOF_SPACE:
10929 DO_POSIX(ret, namedclass, properties,
10930 PL_PerlSpace, PL_XPerlSpace);
10931 break;
10932 case ANYOF_NSPACE:
10933 DO_N_POSIX(ret, namedclass, properties,
10934 PL_PerlSpace, PL_XPerlSpace);
10935 break;
10936 case ANYOF_UPPER: /* Same as LOWER, above */
10937 case ANYOF_NUPPER:
10938 {
10939 SV* ascii_source;
10940 SV* l1_source;
10941 const char *Xname;
10942
10943 if (FOLD && ! LOC) {
10944 ascii_source = PL_PosixAlpha;
10945 l1_source = PL_L1Cased;
10946 Xname = "Cased";
10947 }
10948 else {
10949 ascii_source = PL_PosixUpper;
10950 l1_source = PL_L1PosixUpper;
10951 Xname = "XPosixUpper";
10952 }
10953 if (namedclass == ANYOF_UPPER) {
10954 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10955 ascii_source, l1_source, Xname, listsv);
10956 }
10957 else {
10958 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10959 properties, ascii_source, l1_source, Xname, listsv);
10960 }
10961 break;
10962 }
10963 case ANYOF_ALNUM: /* Really is 'Word' */
10964 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10965 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10966 break;
10967 case ANYOF_NALNUM:
10968 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10969 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10970 break;
10971 case ANYOF_VERTWS:
10972 /* For these, we use the nonbitmap, as /d doesn't make a
10973 * difference in what these match. There would be problems
10974 * if these characters had folds other than themselves, as
10975 * nonbitmap is subject to folding */
10976 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10977 break;
10978 case ANYOF_NVERTWS:
10979 _invlist_union_complement_2nd(nonbitmap,
10980 PL_VertSpace, &nonbitmap);
10981 break;
10982 case ANYOF_XDIGIT:
10983 DO_POSIX(ret, namedclass, properties,
10984 PL_PosixXDigit, PL_XPosixXDigit);
10985 break;
10986 case ANYOF_NXDIGIT:
10987 DO_N_POSIX(ret, namedclass, properties,
10988 PL_PosixXDigit, PL_XPosixXDigit);
10989 break;
10990 case ANYOF_MAX:
10991 /* this is to handle \p and \P */
10992 break;
10993 default:
10994 vFAIL("Invalid [::] class");
10995 break;
10996 }
10997
10998 continue;
10999 }
11000 } /* end of namedclass \blah */
11001
11002 if (range) {
11003 if (prevvalue > (IV)value) /* b-a */ {
11004 const int w = RExC_parse - rangebegin;
11005 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11006 range = 0; /* not a valid range */
11007 }
11008 }
11009 else {
11010 prevvalue = value; /* save the beginning of the range */
11011 if (RExC_parse+1 < RExC_end
11012 && *RExC_parse == '-'
11013 && RExC_parse[1] != ']')
11014 {
11015 RExC_parse++;
11016
11017 /* a bad range like \w-, [:word:]- ? */
11018 if (namedclass > OOB_NAMEDCLASS) {
11019 if (ckWARN(WARN_REGEXP)) {
11020 const int w =
11021 RExC_parse >= rangebegin ?
11022 RExC_parse - rangebegin : 0;
11023 vWARN4(RExC_parse,
11024 "False [] range \"%*.*s\"",
11025 w, w, rangebegin);
11026 }
11027 if (!SIZE_ONLY)
11028 stored +=
11029 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11030 } else
11031 range = 1; /* yeah, it's a range! */
11032 continue; /* but do it the next time */
11033 }
11034 }
11035
11036 /* non-Latin1 code point implies unicode semantics. Must be set in
11037 * pass1 so is there for the whole of pass 2 */
11038 if (value > 255) {
11039 RExC_uni_semantics = 1;
11040 }
11041
11042 /* now is the next time */
11043 if (!SIZE_ONLY) {
11044 if (prevvalue < 256) {
11045 const IV ceilvalue = value < 256 ? value : 255;
11046 IV i;
11047#ifdef EBCDIC
11048 /* In EBCDIC [\x89-\x91] should include
11049 * the \x8e but [i-j] should not. */
11050 if (literal_endpoint == 2 &&
11051 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11052 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11053 {
11054 if (isLOWER(prevvalue)) {
11055 for (i = prevvalue; i <= ceilvalue; i++)
11056 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11057 stored +=
11058 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11059 }
11060 } else {
11061 for (i = prevvalue; i <= ceilvalue; i++)
11062 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11063 stored +=
11064 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11065 }
11066 }
11067 }
11068 else
11069#endif
11070 for (i = prevvalue; i <= ceilvalue; i++) {
11071 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11072 }
11073 }
11074 if (value > 255) {
11075 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11076 const UV natvalue = NATIVE_TO_UNI(value);
11077 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11078 }
11079#ifdef EBCDIC
11080 literal_endpoint = 0;
11081#endif
11082 }
11083
11084 range = 0; /* this range (if it was one) is done now */
11085 }
11086
11087
11088
11089 if (SIZE_ONLY)
11090 return ret;
11091 /****** !SIZE_ONLY AFTER HERE *********/
11092
11093 /* If folding and there are code points above 255, we calculate all
11094 * characters that could fold to or from the ones already on the list */
11095 if (FOLD && nonbitmap) {
11096 UV start, end; /* End points of code point ranges */
11097
11098 SV* fold_intersection = NULL;
11099
11100 /* This is a list of all the characters that participate in folds
11101 * (except marks, etc in multi-char folds */
11102 if (! PL_utf8_foldable) {
11103 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11104 PL_utf8_foldable = _swash_to_invlist(swash);
11105 SvREFCNT_dec(swash);
11106 }
11107
11108 /* This is a hash that for a particular fold gives all characters
11109 * that are involved in it */
11110 if (! PL_utf8_foldclosures) {
11111
11112 /* If we were unable to find any folds, then we likely won't be
11113 * able to find the closures. So just create an empty list.
11114 * Folding will effectively be restricted to the non-Unicode rules
11115 * hard-coded into Perl. (This case happens legitimately during
11116 * compilation of Perl itself before the Unicode tables are
11117 * generated) */
11118 if (invlist_len(PL_utf8_foldable) == 0) {
11119 PL_utf8_foldclosures = newHV();
11120 } else {
11121 /* If the folds haven't been read in, call a fold function
11122 * to force that */
11123 if (! PL_utf8_tofold) {
11124 U8 dummy[UTF8_MAXBYTES+1];
11125 STRLEN dummy_len;
11126
11127 /* This particular string is above \xff in both UTF-8 and
11128 * UTFEBCDIC */
11129 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11130 assert(PL_utf8_tofold); /* Verify that worked */
11131 }
11132 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11133 }
11134 }
11135
11136 /* Only the characters in this class that participate in folds need be
11137 * checked. Get the intersection of this class and all the possible
11138 * characters that are foldable. This can quickly narrow down a large
11139 * class */
11140 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11141
11142 /* Now look at the foldable characters in this class individually */
11143 invlist_iterinit(fold_intersection);
11144 while (invlist_iternext(fold_intersection, &start, &end)) {
11145 UV j;
11146
11147 /* Look at every character in the range */
11148 for (j = start; j <= end; j++) {
11149
11150 /* Get its fold */
11151 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11152 STRLEN foldlen;
11153 const UV f =
11154 _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
11155
11156 if (foldlen > (STRLEN)UNISKIP(f)) {
11157
11158 /* Any multicharacter foldings (disallowed in lookbehind
11159 * patterns) require the following transform: [ABCDEF] ->
11160 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11161 * folds into "rst", all other characters fold to single
11162 * characters. We save away these multicharacter foldings,
11163 * to be later saved as part of the additional "s" data. */
11164 if (! RExC_in_lookbehind) {
11165 U8* loc = foldbuf;
11166 U8* e = foldbuf + foldlen;
11167
11168 /* If any of the folded characters of this are in the
11169 * Latin1 range, tell the regex engine that this can
11170 * match a non-utf8 target string. The only multi-byte
11171 * fold whose source is in the Latin1 range (U+00DF)
11172 * applies only when the target string is utf8, or
11173 * under unicode rules */
11174 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11175 while (loc < e) {
11176
11177 /* Can't mix ascii with non- under /aa */
11178 if (MORE_ASCII_RESTRICTED
11179 && (isASCII(*loc) != isASCII(j)))
11180 {
11181 goto end_multi_fold;
11182 }
11183 if (UTF8_IS_INVARIANT(*loc)
11184 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11185 {
11186 /* Can't mix above and below 256 under LOC
11187 */
11188 if (LOC) {
11189 goto end_multi_fold;
11190 }
11191 ANYOF_FLAGS(ret)
11192 |= ANYOF_NONBITMAP_NON_UTF8;
11193 break;
11194 }
11195 loc += UTF8SKIP(loc);
11196 }
11197 }
11198
11199 add_alternate(&unicode_alternate, foldbuf, foldlen);
11200 end_multi_fold: ;
11201 }
11202
11203 /* This is special-cased, as it is the only letter which
11204 * has both a multi-fold and single-fold in Latin1. All
11205 * the other chars that have single and multi-folds are
11206 * always in utf8, and the utf8 folding algorithm catches
11207 * them */
11208 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11209 stored += set_regclass_bit(pRExC_state,
11210 ret,
11211 LATIN_SMALL_LETTER_SHARP_S,
11212 &l1_fold_invlist, &unicode_alternate);
11213 }
11214 }
11215 else {
11216 /* Single character fold. Add everything in its fold
11217 * closure to the list that this node should match */
11218 SV** listp;
11219
11220 /* The fold closures data structure is a hash with the keys
11221 * being every character that is folded to, like 'k', and
11222 * the values each an array of everything that folds to its
11223 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11224 if ((listp = hv_fetch(PL_utf8_foldclosures,
11225 (char *) foldbuf, foldlen, FALSE)))
11226 {
11227 AV* list = (AV*) *listp;
11228 IV k;
11229 for (k = 0; k <= av_len(list); k++) {
11230 SV** c_p = av_fetch(list, k, FALSE);
11231 UV c;
11232 if (c_p == NULL) {
11233 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11234 }
11235 c = SvUV(*c_p);
11236
11237 /* /aa doesn't allow folds between ASCII and non-;
11238 * /l doesn't allow them between above and below
11239 * 256 */
11240 if ((MORE_ASCII_RESTRICTED
11241 && (isASCII(c) != isASCII(j)))
11242 || (LOC && ((c < 256) != (j < 256))))
11243 {
11244 continue;
11245 }
11246
11247 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11248 stored += set_regclass_bit(pRExC_state,
11249 ret,
11250 (U8) c,
11251 &l1_fold_invlist, &unicode_alternate);
11252 }
11253 /* It may be that the code point is already in
11254 * this range or already in the bitmap, in
11255 * which case we need do nothing */
11256 else if ((c < start || c > end)
11257 && (c > 255
11258 || ! ANYOF_BITMAP_TEST(ret, c)))
11259 {
11260 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11261 }
11262 }
11263 }
11264 }
11265 }
11266 }
11267 SvREFCNT_dec(fold_intersection);
11268 }
11269
11270 /* Combine the two lists into one. */
11271 if (l1_fold_invlist) {
11272 if (nonbitmap) {
11273 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11274 SvREFCNT_dec(l1_fold_invlist);
11275 }
11276 else {
11277 nonbitmap = l1_fold_invlist;
11278 }
11279 }
11280
11281 /* And combine the result (if any) with any inversion list from properties.
11282 * The lists are kept separate up to now because we don't want to fold the
11283 * properties */
11284 if (properties) {
11285 if (nonbitmap) {
11286 _invlist_union(nonbitmap, properties, &nonbitmap);
11287 SvREFCNT_dec(properties);
11288 }
11289 else {
11290 nonbitmap = properties;
11291 }
11292 }
11293
11294 /* Here, <nonbitmap> contains all the code points we can determine at
11295 * compile time that we haven't put into the bitmap. Go through it, and
11296 * for things that belong in the bitmap, put them there, and delete from
11297 * <nonbitmap> */
11298 if (nonbitmap) {
11299
11300 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11301 * possibly only should match when the target string is UTF-8 */
11302 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11303
11304 /* This gets set if we actually need to modify things */
11305 bool change_invlist = FALSE;
11306
11307 UV start, end;
11308
11309 /* Start looking through <nonbitmap> */
11310 invlist_iterinit(nonbitmap);
11311 while (invlist_iternext(nonbitmap, &start, &end)) {
11312 UV high;
11313 int i;
11314
11315 /* Quit if are above what we should change */
11316 if (start > max_cp_to_set) {
11317 break;
11318 }
11319
11320 change_invlist = TRUE;
11321
11322 /* Set all the bits in the range, up to the max that we are doing */
11323 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11324 for (i = start; i <= (int) high; i++) {
11325 if (! ANYOF_BITMAP_TEST(ret, i)) {
11326 ANYOF_BITMAP_SET(ret, i);
11327 stored++;
11328 prevvalue = value;
11329 value = i;
11330 }
11331 }
11332 }
11333
11334 /* Done with loop; remove any code points that are in the bitmap from
11335 * <nonbitmap> */
11336 if (change_invlist) {
11337 _invlist_subtract(nonbitmap,
11338 (DEPENDS_SEMANTICS)
11339 ? PL_ASCII
11340 : PL_Latin1,
11341 &nonbitmap);
11342 }
11343
11344 /* If have completely emptied it, remove it completely */
11345 if (invlist_len(nonbitmap) == 0) {
11346 SvREFCNT_dec(nonbitmap);
11347 nonbitmap = NULL;
11348 }
11349 }
11350
11351 /* Here, we have calculated what code points should be in the character
11352 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11353 * case of DEPENDS rules.
11354 *
11355 * Now we can see about various optimizations. Fold calculation (which we
11356 * did above) needs to take place before inversion. Otherwise /[^k]/i
11357 * would invert to include K, which under /i would match k, which it
11358 * shouldn't. */
11359
11360 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11361 * set the FOLD flag yet, so this does optimize those. It doesn't
11362 * optimize locale. Doing so perhaps could be done as long as there is
11363 * nothing like \w in it; some thought also would have to be given to the
11364 * interaction with above 0x100 chars */
11365 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11366 && ! LOC
11367 && ! unicode_alternate
11368 /* In case of /d, there are some things that should match only when in
11369 * not in the bitmap, i.e., they require UTF8 to match. These are
11370 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11371 * case, they don't require UTF8, so can invert here */
11372 && (! nonbitmap
11373 || ! DEPENDS_SEMANTICS
11374 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11375 && SvCUR(listsv) == initial_listsv_len)
11376 {
11377 int i;
11378 if (! nonbitmap) {
11379 for (i = 0; i < 256; ++i) {
11380 if (ANYOF_BITMAP_TEST(ret, i)) {
11381 ANYOF_BITMAP_CLEAR(ret, i);
11382 }
11383 else {
11384 ANYOF_BITMAP_SET(ret, i);
11385 prevvalue = value;
11386 value = i;
11387 }
11388 }
11389 /* The inversion means that everything above 255 is matched */
11390 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11391 }
11392 else {
11393 /* Here, also has things outside the bitmap that may overlap with
11394 * the bitmap. We have to sync them up, so that they get inverted
11395 * in both places. Earlier, we removed all overlaps except in the
11396 * case of /d rules, so no syncing is needed except for this case
11397 */
11398 SV *remove_list = NULL;
11399
11400 if (DEPENDS_SEMANTICS) {
11401 UV start, end;
11402
11403 /* Set the bits that correspond to the ones that aren't in the
11404 * bitmap. Otherwise, when we invert, we'll miss these.
11405 * Earlier, we removed from the nonbitmap all code points
11406 * < 128, so there is no extra work here */
11407 invlist_iterinit(nonbitmap);
11408 while (invlist_iternext(nonbitmap, &start, &end)) {
11409 if (start > 255) { /* The bit map goes to 255 */
11410 break;
11411 }
11412 if (end > 255) {
11413 end = 255;
11414 }
11415 for (i = start; i <= (int) end; ++i) {
11416 ANYOF_BITMAP_SET(ret, i);
11417 prevvalue = value;
11418 value = i;
11419 }
11420 }
11421 }
11422
11423 /* Now invert both the bitmap and the nonbitmap. Anything in the
11424 * bitmap has to also be removed from the non-bitmap, but again,
11425 * there should not be overlap unless is /d rules. */
11426 _invlist_invert(nonbitmap);
11427
11428 /* Any swash can't be used as-is, because we've inverted things */
11429 if (swash) {
11430 SvREFCNT_dec(swash);
11431 swash = NULL;
11432 }
11433
11434 for (i = 0; i < 256; ++i) {
11435 if (ANYOF_BITMAP_TEST(ret, i)) {
11436 ANYOF_BITMAP_CLEAR(ret, i);
11437 if (DEPENDS_SEMANTICS) {
11438 if (! remove_list) {
11439 remove_list = _new_invlist(2);
11440 }
11441 remove_list = add_cp_to_invlist(remove_list, i);
11442 }
11443 }
11444 else {
11445 ANYOF_BITMAP_SET(ret, i);
11446 prevvalue = value;
11447 value = i;
11448 }
11449 }
11450
11451 /* And do the removal */
11452 if (DEPENDS_SEMANTICS) {
11453 if (remove_list) {
11454 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11455 SvREFCNT_dec(remove_list);
11456 }
11457 }
11458 else {
11459 /* There is no overlap for non-/d, so just delete anything
11460 * below 256 */
11461 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
11462 }
11463 }
11464
11465 stored = 256 - stored;
11466
11467 /* Clear the invert flag since have just done it here */
11468 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
11469 }
11470
11471 /* Folding in the bitmap is taken care of above, but not for locale (for
11472 * which we have to wait to see what folding is in effect at runtime), and
11473 * for some things not in the bitmap (only the upper latin folds in this
11474 * case, as all other single-char folding has been set above). Set
11475 * run-time fold flag for these */
11476 if (FOLD && (LOC
11477 || (DEPENDS_SEMANTICS
11478 && nonbitmap
11479 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11480 || unicode_alternate))
11481 {
11482 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
11483 }
11484
11485 /* A single character class can be "optimized" into an EXACTish node.
11486 * Note that since we don't currently count how many characters there are
11487 * outside the bitmap, we are XXX missing optimization possibilities for
11488 * them. This optimization can't happen unless this is a truly single
11489 * character class, which means that it can't be an inversion into a
11490 * many-character class, and there must be no possibility of there being
11491 * things outside the bitmap. 'stored' (only) for locales doesn't include
11492 * \w, etc, so have to make a special test that they aren't present
11493 *
11494 * Similarly A 2-character class of the very special form like [bB] can be
11495 * optimized into an EXACTFish node, but only for non-locales, and for
11496 * characters which only have the two folds; so things like 'fF' and 'Ii'
11497 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11498 * FI'. */
11499 if (! nonbitmap
11500 && ! unicode_alternate
11501 && SvCUR(listsv) == initial_listsv_len
11502 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
11503 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11504 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11505 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11506 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11507 /* If the latest code point has a fold whose
11508 * bit is set, it must be the only other one */
11509 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
11510 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
11511 {
11512 /* Note that the information needed to decide to do this optimization
11513 * is not currently available until the 2nd pass, and that the actually
11514 * used EXACTish node takes less space than the calculated ANYOF node,
11515 * and hence the amount of space calculated in the first pass is larger
11516 * than actually used, so this optimization doesn't gain us any space.
11517 * But an EXACT node is faster than an ANYOF node, and can be combined
11518 * with any adjacent EXACT nodes later by the optimizer for further
11519 * gains. The speed of executing an EXACTF is similar to an ANYOF
11520 * node, so the optimization advantage comes from the ability to join
11521 * it to adjacent EXACT nodes */
11522
11523 const char * cur_parse= RExC_parse;
11524 U8 op;
11525 RExC_emit = (regnode *)orig_emit;
11526 RExC_parse = (char *)orig_parse;
11527
11528 if (stored == 1) {
11529
11530 /* A locale node with one point can be folded; all the other cases
11531 * with folding will have two points, since we calculate them above
11532 */
11533 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
11534 op = EXACTFL;
11535 }
11536 else {
11537 op = EXACT;
11538 }
11539 }
11540 else { /* else 2 chars in the bit map: the folds of each other */
11541
11542 /* Use the folded value, which for the cases where we get here,
11543 * is just the lower case of the current one (which may resolve to
11544 * itself, or to the other one */
11545 value = toLOWER_LATIN1(value);
11546
11547 /* To join adjacent nodes, they must be the exact EXACTish type.
11548 * Try to use the most likely type, by using EXACTFA if possible,
11549 * then EXACTFU if the regex calls for it, or is required because
11550 * the character is non-ASCII. (If <value> is ASCII, its fold is
11551 * also ASCII for the cases where we get here.) */
11552 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11553 op = EXACTFA;
11554 }
11555 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
11556 op = EXACTFU;
11557 }
11558 else { /* Otherwise, more likely to be EXACTF type */
11559 op = EXACTF;
11560 }
11561 }
11562
11563 ret = reg_node(pRExC_state, op);
11564 RExC_parse = (char *)cur_parse;
11565 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11566 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11567 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11568 STR_LEN(ret)= 2;
11569 RExC_emit += STR_SZ(2);
11570 }
11571 else {
11572 *STRING(ret)= (char)value;
11573 STR_LEN(ret)= 1;
11574 RExC_emit += STR_SZ(1);
11575 }
11576 SvREFCNT_dec(listsv);
11577 return ret;
11578 }
11579
11580 /* If there is a swash and more than one element, we can't use the swash in
11581 * the optimization below. */
11582 if (swash && element_count > 1) {
11583 SvREFCNT_dec(swash);
11584 swash = NULL;
11585 }
11586 if (! nonbitmap
11587 && SvCUR(listsv) == initial_listsv_len
11588 && ! unicode_alternate)
11589 {
11590 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11591 SvREFCNT_dec(listsv);
11592 SvREFCNT_dec(unicode_alternate);
11593 }
11594 else {
11595 /* av[0] stores the character class description in its textual form:
11596 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11597 * appropriate swash, and is also useful for dumping the regnode.
11598 * av[1] if NULL, is a placeholder to later contain the swash computed
11599 * from av[0]. But if no further computation need be done, the
11600 * swash is stored there now.
11601 * av[2] stores the multicharacter foldings, used later in
11602 * regexec.c:S_reginclass().
11603 * av[3] stores the nonbitmap inversion list for use in addition or
11604 * instead of av[0]; not used if av[1] isn't NULL
11605 * av[4] is set if any component of the class is from a user-defined
11606 * property; not used if av[1] isn't NULL */
11607 AV * const av = newAV();
11608 SV *rv;
11609
11610 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11611 ? &PL_sv_undef
11612 : listsv);
11613 if (swash) {
11614 av_store(av, 1, swash);
11615 SvREFCNT_dec(nonbitmap);
11616 }
11617 else {
11618 av_store(av, 1, NULL);
11619 if (nonbitmap) {
11620 av_store(av, 3, nonbitmap);
11621 av_store(av, 4, newSVuv(has_user_defined_property));
11622 }
11623 }
11624
11625 /* Store any computed multi-char folds only if we are allowing
11626 * them */
11627 if (allow_full_fold) {
11628 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11629 if (unicode_alternate) { /* This node is variable length */
11630 OP(ret) = ANYOFV;
11631 }
11632 }
11633 else {
11634 av_store(av, 2, NULL);
11635 }
11636 rv = newRV_noinc(MUTABLE_SV(av));
11637 n = add_data(pRExC_state, 1, "s");
11638 RExC_rxi->data->data[n] = (void*)rv;
11639 ARG_SET(ret, n);
11640 }
11641 return ret;
11642}
11643
11644
11645/* reg_skipcomment()
11646
11647 Absorbs an /x style # comments from the input stream.
11648 Returns true if there is more text remaining in the stream.
11649 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11650 terminates the pattern without including a newline.
11651
11652 Note its the callers responsibility to ensure that we are
11653 actually in /x mode
11654
11655*/
11656
11657STATIC bool
11658S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11659{
11660 bool ended = 0;
11661
11662 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11663
11664 while (RExC_parse < RExC_end)
11665 if (*RExC_parse++ == '\n') {
11666 ended = 1;
11667 break;
11668 }
11669 if (!ended) {
11670 /* we ran off the end of the pattern without ending
11671 the comment, so we have to add an \n when wrapping */
11672 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11673 return 0;
11674 } else
11675 return 1;
11676}
11677
11678/* nextchar()
11679
11680 Advances the parse position, and optionally absorbs
11681 "whitespace" from the inputstream.
11682
11683 Without /x "whitespace" means (?#...) style comments only,
11684 with /x this means (?#...) and # comments and whitespace proper.
11685
11686 Returns the RExC_parse point from BEFORE the scan occurs.
11687
11688 This is the /x friendly way of saying RExC_parse++.
11689*/
11690
11691STATIC char*
11692S_nextchar(pTHX_ RExC_state_t *pRExC_state)
11693{
11694 char* const retval = RExC_parse++;
11695
11696 PERL_ARGS_ASSERT_NEXTCHAR;
11697
11698 for (;;) {
11699 if (RExC_end - RExC_parse >= 3
11700 && *RExC_parse == '('
11701 && RExC_parse[1] == '?'
11702 && RExC_parse[2] == '#')
11703 {
11704 while (*RExC_parse != ')') {
11705 if (RExC_parse == RExC_end)
11706 FAIL("Sequence (?#... not terminated");
11707 RExC_parse++;
11708 }
11709 RExC_parse++;
11710 continue;
11711 }
11712 if (RExC_flags & RXf_PMf_EXTENDED) {
11713 if (isSPACE(*RExC_parse)) {
11714 RExC_parse++;
11715 continue;
11716 }
11717 else if (*RExC_parse == '#') {
11718 if ( reg_skipcomment( pRExC_state ) )
11719 continue;
11720 }
11721 }
11722 return retval;
11723 }
11724}
11725
11726/*
11727- reg_node - emit a node
11728*/
11729STATIC regnode * /* Location. */
11730S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
11731{
11732 dVAR;
11733 register regnode *ptr;
11734 regnode * const ret = RExC_emit;
11735 GET_RE_DEBUG_FLAGS_DECL;
11736
11737 PERL_ARGS_ASSERT_REG_NODE;
11738
11739 if (SIZE_ONLY) {
11740 SIZE_ALIGN(RExC_size);
11741 RExC_size += 1;
11742 return(ret);
11743 }
11744 if (RExC_emit >= RExC_emit_bound)
11745 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11746 op, RExC_emit, RExC_emit_bound);
11747
11748 NODE_ALIGN_FILL(ret);
11749 ptr = ret;
11750 FILL_ADVANCE_NODE(ptr, op);
11751#ifdef RE_TRACK_PATTERN_OFFSETS
11752 if (RExC_offsets) { /* MJD */
11753 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
11754 "reg_node", __LINE__,
11755 PL_reg_name[op],
11756 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11757 ? "Overwriting end of array!\n" : "OK",
11758 (UV)(RExC_emit - RExC_emit_start),
11759 (UV)(RExC_parse - RExC_start),
11760 (UV)RExC_offsets[0]));
11761 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
11762 }
11763#endif
11764 RExC_emit = ptr;
11765 return(ret);
11766}
11767
11768/*
11769- reganode - emit a node with an argument
11770*/
11771STATIC regnode * /* Location. */
11772S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
11773{
11774 dVAR;
11775 register regnode *ptr;
11776 regnode * const ret = RExC_emit;
11777 GET_RE_DEBUG_FLAGS_DECL;
11778
11779 PERL_ARGS_ASSERT_REGANODE;
11780
11781 if (SIZE_ONLY) {
11782 SIZE_ALIGN(RExC_size);
11783 RExC_size += 2;
11784 /*
11785 We can't do this:
11786
11787 assert(2==regarglen[op]+1);
11788
11789 Anything larger than this has to allocate the extra amount.
11790 If we changed this to be:
11791
11792 RExC_size += (1 + regarglen[op]);
11793
11794 then it wouldn't matter. Its not clear what side effect
11795 might come from that so its not done so far.
11796 -- dmq
11797 */
11798 return(ret);
11799 }
11800 if (RExC_emit >= RExC_emit_bound)
11801 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11802 op, RExC_emit, RExC_emit_bound);
11803
11804 NODE_ALIGN_FILL(ret);
11805 ptr = ret;
11806 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
11807#ifdef RE_TRACK_PATTERN_OFFSETS
11808 if (RExC_offsets) { /* MJD */
11809 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11810 "reganode",
11811 __LINE__,
11812 PL_reg_name[op],
11813 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
11814 "Overwriting end of array!\n" : "OK",
11815 (UV)(RExC_emit - RExC_emit_start),
11816 (UV)(RExC_parse - RExC_start),
11817 (UV)RExC_offsets[0]));
11818 Set_Cur_Node_Offset;
11819 }
11820#endif
11821 RExC_emit = ptr;
11822 return(ret);
11823}
11824
11825/*
11826- reguni - emit (if appropriate) a Unicode character
11827*/
11828STATIC STRLEN
11829S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
11830{
11831 dVAR;
11832
11833 PERL_ARGS_ASSERT_REGUNI;
11834
11835 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
11836}
11837
11838/*
11839- reginsert - insert an operator in front of already-emitted operand
11840*
11841* Means relocating the operand.
11842*/
11843STATIC void
11844S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
11845{
11846 dVAR;
11847 register regnode *src;
11848 register regnode *dst;
11849 register regnode *place;
11850 const int offset = regarglen[(U8)op];
11851 const int size = NODE_STEP_REGNODE + offset;
11852 GET_RE_DEBUG_FLAGS_DECL;
11853
11854 PERL_ARGS_ASSERT_REGINSERT;
11855 PERL_UNUSED_ARG(depth);
11856/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
11857 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
11858 if (SIZE_ONLY) {
11859 RExC_size += size;
11860 return;
11861 }
11862
11863 src = RExC_emit;
11864 RExC_emit += size;
11865 dst = RExC_emit;
11866 if (RExC_open_parens) {
11867 int paren;
11868 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
11869 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
11870 if ( RExC_open_parens[paren] >= opnd ) {
11871 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
11872 RExC_open_parens[paren] += size;
11873 } else {
11874 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
11875 }
11876 if ( RExC_close_parens[paren] >= opnd ) {
11877 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
11878 RExC_close_parens[paren] += size;
11879 } else {
11880 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
11881 }
11882 }
11883 }
11884
11885 while (src > opnd) {
11886 StructCopy(--src, --dst, regnode);
11887#ifdef RE_TRACK_PATTERN_OFFSETS
11888 if (RExC_offsets) { /* MJD 20010112 */
11889 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
11890 "reg_insert",
11891 __LINE__,
11892 PL_reg_name[op],
11893 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11894 ? "Overwriting end of array!\n" : "OK",
11895 (UV)(src - RExC_emit_start),
11896 (UV)(dst - RExC_emit_start),
11897 (UV)RExC_offsets[0]));
11898 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11899 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
11900 }
11901#endif
11902 }
11903
11904
11905 place = opnd; /* Op node, where operand used to be. */
11906#ifdef RE_TRACK_PATTERN_OFFSETS
11907 if (RExC_offsets) { /* MJD */
11908 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
11909 "reginsert",
11910 __LINE__,
11911 PL_reg_name[op],
11912 (UV)(place - RExC_emit_start) > RExC_offsets[0]
11913 ? "Overwriting end of array!\n" : "OK",
11914 (UV)(place - RExC_emit_start),
11915 (UV)(RExC_parse - RExC_start),
11916 (UV)RExC_offsets[0]));
11917 Set_Node_Offset(place, RExC_parse);
11918 Set_Node_Length(place, 1);
11919 }
11920#endif
11921 src = NEXTOPER(place);
11922 FILL_ADVANCE_NODE(place, op);
11923 Zero(src, offset, regnode);
11924}
11925
11926/*
11927- regtail - set the next-pointer at the end of a node chain of p to val.
11928- SEE ALSO: regtail_study
11929*/
11930/* TODO: All three parms should be const */
11931STATIC void
11932S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11933{
11934 dVAR;
11935 register regnode *scan;
11936 GET_RE_DEBUG_FLAGS_DECL;
11937
11938 PERL_ARGS_ASSERT_REGTAIL;
11939#ifndef DEBUGGING
11940 PERL_UNUSED_ARG(depth);
11941#endif
11942
11943 if (SIZE_ONLY)
11944 return;
11945
11946 /* Find last node. */
11947 scan = p;
11948 for (;;) {
11949 regnode * const temp = regnext(scan);
11950 DEBUG_PARSE_r({
11951 SV * const mysv=sv_newmortal();
11952 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11953 regprop(RExC_rx, mysv, scan);
11954 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11955 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11956 (temp == NULL ? "->" : ""),
11957 (temp == NULL ? PL_reg_name[OP(val)] : "")
11958 );
11959 });
11960 if (temp == NULL)
11961 break;
11962 scan = temp;
11963 }
11964
11965 if (reg_off_by_arg[OP(scan)]) {
11966 ARG_SET(scan, val - scan);
11967 }
11968 else {
11969 NEXT_OFF(scan) = val - scan;
11970 }
11971}
11972
11973#ifdef DEBUGGING
11974/*
11975- regtail_study - set the next-pointer at the end of a node chain of p to val.
11976- Look for optimizable sequences at the same time.
11977- currently only looks for EXACT chains.
11978
11979This is experimental code. The idea is to use this routine to perform
11980in place optimizations on branches and groups as they are constructed,
11981with the long term intention of removing optimization from study_chunk so
11982that it is purely analytical.
11983
11984Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11985to control which is which.
11986
11987*/
11988/* TODO: All four parms should be const */
11989
11990STATIC U8
11991S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11992{
11993 dVAR;
11994 register regnode *scan;
11995 U8 exact = PSEUDO;
11996#ifdef EXPERIMENTAL_INPLACESCAN
11997 I32 min = 0;
11998#endif
11999 GET_RE_DEBUG_FLAGS_DECL;
12000
12001 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12002
12003
12004 if (SIZE_ONLY)
12005 return exact;
12006
12007 /* Find last node. */
12008
12009 scan = p;
12010 for (;;) {
12011 regnode * const temp = regnext(scan);
12012#ifdef EXPERIMENTAL_INPLACESCAN
12013 if (PL_regkind[OP(scan)] == EXACT) {
12014 bool has_exactf_sharp_s; /* Unexamined in this routine */
12015 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12016 return EXACT;
12017 }
12018#endif
12019 if ( exact ) {
12020 switch (OP(scan)) {
12021 case EXACT:
12022 case EXACTF:
12023 case EXACTFA:
12024 case EXACTFU:
12025 case EXACTFU_SS:
12026 case EXACTFU_NO_TRIE:
12027 case EXACTFL:
12028 if( exact == PSEUDO )
12029 exact= OP(scan);
12030 else if ( exact != OP(scan) )
12031 exact= 0;
12032 case NOTHING:
12033 break;
12034 default:
12035 exact= 0;
12036 }
12037 }
12038 DEBUG_PARSE_r({
12039 SV * const mysv=sv_newmortal();
12040 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12041 regprop(RExC_rx, mysv, scan);
12042 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12043 SvPV_nolen_const(mysv),
12044 REG_NODE_NUM(scan),
12045 PL_reg_name[exact]);
12046 });
12047 if (temp == NULL)
12048 break;
12049 scan = temp;
12050 }
12051 DEBUG_PARSE_r({
12052 SV * const mysv_val=sv_newmortal();
12053 DEBUG_PARSE_MSG("");
12054 regprop(RExC_rx, mysv_val, val);
12055 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12056 SvPV_nolen_const(mysv_val),
12057 (IV)REG_NODE_NUM(val),
12058 (IV)(val - scan)
12059 );
12060 });
12061 if (reg_off_by_arg[OP(scan)]) {
12062 ARG_SET(scan, val - scan);
12063 }
12064 else {
12065 NEXT_OFF(scan) = val - scan;
12066 }
12067
12068 return exact;
12069}
12070#endif
12071
12072/*
12073 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12074 */
12075#ifdef DEBUGGING
12076static void
12077S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12078{
12079 int bit;
12080 int set=0;
12081 regex_charset cs;
12082
12083 for (bit=0; bit<32; bit++) {
12084 if (flags & (1<<bit)) {
12085 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12086 continue;
12087 }
12088 if (!set++ && lead)
12089 PerlIO_printf(Perl_debug_log, "%s",lead);
12090 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12091 }
12092 }
12093 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12094 if (!set++ && lead) {
12095 PerlIO_printf(Perl_debug_log, "%s",lead);
12096 }
12097 switch (cs) {
12098 case REGEX_UNICODE_CHARSET:
12099 PerlIO_printf(Perl_debug_log, "UNICODE");
12100 break;
12101 case REGEX_LOCALE_CHARSET:
12102 PerlIO_printf(Perl_debug_log, "LOCALE");
12103 break;
12104 case REGEX_ASCII_RESTRICTED_CHARSET:
12105 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12106 break;
12107 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12108 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12109 break;
12110 default:
12111 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12112 break;
12113 }
12114 }
12115 if (lead) {
12116 if (set)
12117 PerlIO_printf(Perl_debug_log, "\n");
12118 else
12119 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12120 }
12121}
12122#endif
12123
12124void
12125Perl_regdump(pTHX_ const regexp *r)
12126{
12127#ifdef DEBUGGING
12128 dVAR;
12129 SV * const sv = sv_newmortal();
12130 SV *dsv= sv_newmortal();
12131 RXi_GET_DECL(r,ri);
12132 GET_RE_DEBUG_FLAGS_DECL;
12133
12134 PERL_ARGS_ASSERT_REGDUMP;
12135
12136 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12137
12138 /* Header fields of interest. */
12139 if (r->anchored_substr) {
12140 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12141 RE_SV_DUMPLEN(r->anchored_substr), 30);
12142 PerlIO_printf(Perl_debug_log,
12143 "anchored %s%s at %"IVdf" ",
12144 s, RE_SV_TAIL(r->anchored_substr),
12145 (IV)r->anchored_offset);
12146 } else if (r->anchored_utf8) {
12147 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12148 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12149 PerlIO_printf(Perl_debug_log,
12150 "anchored utf8 %s%s at %"IVdf" ",
12151 s, RE_SV_TAIL(r->anchored_utf8),
12152 (IV)r->anchored_offset);
12153 }
12154 if (r->float_substr) {
12155 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12156 RE_SV_DUMPLEN(r->float_substr), 30);
12157 PerlIO_printf(Perl_debug_log,
12158 "floating %s%s at %"IVdf"..%"UVuf" ",
12159 s, RE_SV_TAIL(r->float_substr),
12160 (IV)r->float_min_offset, (UV)r->float_max_offset);
12161 } else if (r->float_utf8) {
12162 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12163 RE_SV_DUMPLEN(r->float_utf8), 30);
12164 PerlIO_printf(Perl_debug_log,
12165 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12166 s, RE_SV_TAIL(r->float_utf8),
12167 (IV)r->float_min_offset, (UV)r->float_max_offset);
12168 }
12169 if (r->check_substr || r->check_utf8)
12170 PerlIO_printf(Perl_debug_log,
12171 (const char *)
12172 (r->check_substr == r->float_substr
12173 && r->check_utf8 == r->float_utf8
12174 ? "(checking floating" : "(checking anchored"));
12175 if (r->extflags & RXf_NOSCAN)
12176 PerlIO_printf(Perl_debug_log, " noscan");
12177 if (r->extflags & RXf_CHECK_ALL)
12178 PerlIO_printf(Perl_debug_log, " isall");
12179 if (r->check_substr || r->check_utf8)
12180 PerlIO_printf(Perl_debug_log, ") ");
12181
12182 if (ri->regstclass) {
12183 regprop(r, sv, ri->regstclass);
12184 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12185 }
12186 if (r->extflags & RXf_ANCH) {
12187 PerlIO_printf(Perl_debug_log, "anchored");
12188 if (r->extflags & RXf_ANCH_BOL)
12189 PerlIO_printf(Perl_debug_log, "(BOL)");
12190 if (r->extflags & RXf_ANCH_MBOL)
12191 PerlIO_printf(Perl_debug_log, "(MBOL)");
12192 if (r->extflags & RXf_ANCH_SBOL)
12193 PerlIO_printf(Perl_debug_log, "(SBOL)");
12194 if (r->extflags & RXf_ANCH_GPOS)
12195 PerlIO_printf(Perl_debug_log, "(GPOS)");
12196 PerlIO_putc(Perl_debug_log, ' ');
12197 }
12198 if (r->extflags & RXf_GPOS_SEEN)
12199 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12200 if (r->intflags & PREGf_SKIP)
12201 PerlIO_printf(Perl_debug_log, "plus ");
12202 if (r->intflags & PREGf_IMPLICIT)
12203 PerlIO_printf(Perl_debug_log, "implicit ");
12204 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12205 if (r->extflags & RXf_EVAL_SEEN)
12206 PerlIO_printf(Perl_debug_log, "with eval ");
12207 PerlIO_printf(Perl_debug_log, "\n");
12208 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12209#else
12210 PERL_ARGS_ASSERT_REGDUMP;
12211 PERL_UNUSED_CONTEXT;
12212 PERL_UNUSED_ARG(r);
12213#endif /* DEBUGGING */
12214}
12215
12216/*
12217- regprop - printable representation of opcode
12218*/
12219#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12220STMT_START { \
12221 if (do_sep) { \
12222 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12223 if (flags & ANYOF_INVERT) \
12224 /*make sure the invert info is in each */ \
12225 sv_catpvs(sv, "^"); \
12226 do_sep = 0; \
12227 } \
12228} STMT_END
12229
12230void
12231Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12232{
12233#ifdef DEBUGGING
12234 dVAR;
12235 register int k;
12236 RXi_GET_DECL(prog,progi);
12237 GET_RE_DEBUG_FLAGS_DECL;
12238
12239 PERL_ARGS_ASSERT_REGPROP;
12240
12241 sv_setpvs(sv, "");
12242
12243 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12244 /* It would be nice to FAIL() here, but this may be called from
12245 regexec.c, and it would be hard to supply pRExC_state. */
12246 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12247 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12248
12249 k = PL_regkind[OP(o)];
12250
12251 if (k == EXACT) {
12252 sv_catpvs(sv, " ");
12253 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12254 * is a crude hack but it may be the best for now since
12255 * we have no flag "this EXACTish node was UTF-8"
12256 * --jhi */
12257 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12258 PERL_PV_ESCAPE_UNI_DETECT |
12259 PERL_PV_ESCAPE_NONASCII |
12260 PERL_PV_PRETTY_ELLIPSES |
12261 PERL_PV_PRETTY_LTGT |
12262 PERL_PV_PRETTY_NOCLEAR
12263 );
12264 } else if (k == TRIE) {
12265 /* print the details of the trie in dumpuntil instead, as
12266 * progi->data isn't available here */
12267 const char op = OP(o);
12268 const U32 n = ARG(o);
12269 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12270 (reg_ac_data *)progi->data->data[n] :
12271 NULL;
12272 const reg_trie_data * const trie
12273 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12274
12275 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12276 DEBUG_TRIE_COMPILE_r(
12277 Perl_sv_catpvf(aTHX_ sv,
12278 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12279 (UV)trie->startstate,
12280 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12281 (UV)trie->wordcount,
12282 (UV)trie->minlen,
12283 (UV)trie->maxlen,
12284 (UV)TRIE_CHARCOUNT(trie),
12285 (UV)trie->uniquecharcount
12286 )
12287 );
12288 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12289 int i;
12290 int rangestart = -1;
12291 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12292 sv_catpvs(sv, "[");
12293 for (i = 0; i <= 256; i++) {
12294 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12295 if (rangestart == -1)
12296 rangestart = i;
12297 } else if (rangestart != -1) {
12298 if (i <= rangestart + 3)
12299 for (; rangestart < i; rangestart++)
12300 put_byte(sv, rangestart);
12301 else {
12302 put_byte(sv, rangestart);
12303 sv_catpvs(sv, "-");
12304 put_byte(sv, i - 1);
12305 }
12306 rangestart = -1;
12307 }
12308 }
12309 sv_catpvs(sv, "]");
12310 }
12311
12312 } else if (k == CURLY) {
12313 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12314 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12315 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12316 }
12317 else if (k == WHILEM && o->flags) /* Ordinal/of */
12318 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12319 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12320 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12321 if ( RXp_PAREN_NAMES(prog) ) {
12322 if ( k != REF || (OP(o) < NREF)) {
12323 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12324 SV **name= av_fetch(list, ARG(o), 0 );
12325 if (name)
12326 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12327 }
12328 else {
12329 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12330 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12331 I32 *nums=(I32*)SvPVX(sv_dat);
12332 SV **name= av_fetch(list, nums[0], 0 );
12333 I32 n;
12334 if (name) {
12335 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12336 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12337 (n ? "," : ""), (IV)nums[n]);
12338 }
12339 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12340 }
12341 }
12342 }
12343 } else if (k == GOSUB)
12344 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12345 else if (k == VERB) {
12346 if (!o->flags)
12347 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12348 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12349 } else if (k == LOGICAL)
12350 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12351 else if (k == ANYOF) {
12352 int i, rangestart = -1;
12353 const U8 flags = ANYOF_FLAGS(o);
12354 int do_sep = 0;
12355
12356 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12357 static const char * const anyofs[] = {
12358 "\\w",
12359 "\\W",
12360 "\\s",
12361 "\\S",
12362 "\\d",
12363 "\\D",
12364 "[:alnum:]",
12365 "[:^alnum:]",
12366 "[:alpha:]",
12367 "[:^alpha:]",
12368 "[:ascii:]",
12369 "[:^ascii:]",
12370 "[:cntrl:]",
12371 "[:^cntrl:]",
12372 "[:graph:]",
12373 "[:^graph:]",
12374 "[:lower:]",
12375 "[:^lower:]",
12376 "[:print:]",
12377 "[:^print:]",
12378 "[:punct:]",
12379 "[:^punct:]",
12380 "[:upper:]",
12381 "[:^upper:]",
12382 "[:xdigit:]",
12383 "[:^xdigit:]",
12384 "[:space:]",
12385 "[:^space:]",
12386 "[:blank:]",
12387 "[:^blank:]"
12388 };
12389
12390 if (flags & ANYOF_LOCALE)
12391 sv_catpvs(sv, "{loc}");
12392 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
12393 sv_catpvs(sv, "{i}");
12394 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
12395 if (flags & ANYOF_INVERT)
12396 sv_catpvs(sv, "^");
12397
12398 /* output what the standard cp 0-255 bitmap matches */
12399 for (i = 0; i <= 256; i++) {
12400 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12401 if (rangestart == -1)
12402 rangestart = i;
12403 } else if (rangestart != -1) {
12404 if (i <= rangestart + 3)
12405 for (; rangestart < i; rangestart++)
12406 put_byte(sv, rangestart);
12407 else {
12408 put_byte(sv, rangestart);
12409 sv_catpvs(sv, "-");
12410 put_byte(sv, i - 1);
12411 }
12412 do_sep = 1;
12413 rangestart = -1;
12414 }
12415 }
12416
12417 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12418 /* output any special charclass tests (used entirely under use locale) */
12419 if (ANYOF_CLASS_TEST_ANY_SET(o))
12420 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
12421 if (ANYOF_CLASS_TEST(o,i)) {
12422 sv_catpv(sv, anyofs[i]);
12423 do_sep = 1;
12424 }
12425
12426 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12427
12428 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12429 sv_catpvs(sv, "{non-utf8-latin1-all}");
12430 }
12431
12432 /* output information about the unicode matching */
12433 if (flags & ANYOF_UNICODE_ALL)
12434 sv_catpvs(sv, "{unicode_all}");
12435 else if (ANYOF_NONBITMAP(o))
12436 sv_catpvs(sv, "{unicode}");
12437 if (flags & ANYOF_NONBITMAP_NON_UTF8)
12438 sv_catpvs(sv, "{outside bitmap}");
12439
12440 if (ANYOF_NONBITMAP(o)) {
12441 SV *lv; /* Set if there is something outside the bit map */
12442 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
12443 bool byte_output = FALSE; /* If something in the bitmap has been
12444 output */
12445
12446 if (lv && lv != &PL_sv_undef) {
12447 if (sw) {
12448 U8 s[UTF8_MAXBYTES_CASE+1];
12449
12450 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
12451 uvchr_to_utf8(s, i);
12452
12453 if (i < 256
12454 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12455 things already
12456 output as part
12457 of the bitmap */
12458 && swash_fetch(sw, s, TRUE))
12459 {
12460 if (rangestart == -1)
12461 rangestart = i;
12462 } else if (rangestart != -1) {
12463 byte_output = TRUE;
12464 if (i <= rangestart + 3)
12465 for (; rangestart < i; rangestart++) {
12466 put_byte(sv, rangestart);
12467 }
12468 else {
12469 put_byte(sv, rangestart);
12470 sv_catpvs(sv, "-");
12471 put_byte(sv, i-1);
12472 }
12473 rangestart = -1;
12474 }
12475 }
12476 }
12477
12478 {
12479 char *s = savesvpv(lv);
12480 char * const origs = s;
12481
12482 while (*s && *s != '\n')
12483 s++;
12484
12485 if (*s == '\n') {
12486 const char * const t = ++s;
12487
12488 if (byte_output) {
12489 sv_catpvs(sv, " ");
12490 }
12491
12492 while (*s) {
12493 if (*s == '\n') {
12494
12495 /* Truncate very long output */
12496 if (s - origs > 256) {
12497 Perl_sv_catpvf(aTHX_ sv,
12498 "%.*s...",
12499 (int) (s - origs - 1),
12500 t);
12501 goto out_dump;
12502 }
12503 *s = ' ';
12504 }
12505 else if (*s == '\t') {
12506 *s = '-';
12507 }
12508 s++;
12509 }
12510 if (s[-1] == ' ')
12511 s[-1] = 0;
12512
12513 sv_catpv(sv, t);
12514 }
12515
12516 out_dump:
12517
12518 Safefree(origs);
12519 }
12520 SvREFCNT_dec(lv);
12521 }
12522 }
12523
12524 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12525 }
12526 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
12527 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
12528#else
12529 PERL_UNUSED_CONTEXT;
12530 PERL_UNUSED_ARG(sv);
12531 PERL_UNUSED_ARG(o);
12532 PERL_UNUSED_ARG(prog);
12533#endif /* DEBUGGING */
12534}
12535
12536SV *
12537Perl_re_intuit_string(pTHX_ REGEXP * const r)
12538{ /* Assume that RE_INTUIT is set */
12539 dVAR;
12540 struct regexp *const prog = (struct regexp *)SvANY(r);
12541 GET_RE_DEBUG_FLAGS_DECL;
12542
12543 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
12544 PERL_UNUSED_CONTEXT;
12545
12546 DEBUG_COMPILE_r(
12547 {
12548 const char * const s = SvPV_nolen_const(prog->check_substr
12549 ? prog->check_substr : prog->check_utf8);
12550
12551 if (!PL_colorset) reginitcolors();
12552 PerlIO_printf(Perl_debug_log,
12553 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
12554 PL_colors[4],
12555 prog->check_substr ? "" : "utf8 ",
12556 PL_colors[5],PL_colors[0],
12557 s,
12558 PL_colors[1],
12559 (strlen(s) > 60 ? "..." : ""));
12560 } );
12561
12562 return prog->check_substr ? prog->check_substr : prog->check_utf8;
12563}
12564
12565/*
12566 pregfree()
12567
12568 handles refcounting and freeing the perl core regexp structure. When
12569 it is necessary to actually free the structure the first thing it
12570 does is call the 'free' method of the regexp_engine associated to
12571 the regexp, allowing the handling of the void *pprivate; member
12572 first. (This routine is not overridable by extensions, which is why
12573 the extensions free is called first.)
12574
12575 See regdupe and regdupe_internal if you change anything here.
12576*/
12577#ifndef PERL_IN_XSUB_RE
12578void
12579Perl_pregfree(pTHX_ REGEXP *r)
12580{
12581 SvREFCNT_dec(r);
12582}
12583
12584void
12585Perl_pregfree2(pTHX_ REGEXP *rx)
12586{
12587 dVAR;
12588 struct regexp *const r = (struct regexp *)SvANY(rx);
12589 GET_RE_DEBUG_FLAGS_DECL;
12590
12591 PERL_ARGS_ASSERT_PREGFREE2;
12592
12593 if (r->mother_re) {
12594 ReREFCNT_dec(r->mother_re);
12595 } else {
12596 CALLREGFREE_PVT(rx); /* free the private data */
12597 SvREFCNT_dec(RXp_PAREN_NAMES(r));
12598 }
12599 if (r->substrs) {
12600 SvREFCNT_dec(r->anchored_substr);
12601 SvREFCNT_dec(r->anchored_utf8);
12602 SvREFCNT_dec(r->float_substr);
12603 SvREFCNT_dec(r->float_utf8);
12604 Safefree(r->substrs);
12605 }
12606 RX_MATCH_COPY_FREE(rx);
12607#ifdef PERL_OLD_COPY_ON_WRITE
12608 SvREFCNT_dec(r->saved_copy);
12609#endif
12610 Safefree(r->offs);
12611}
12612
12613/* reg_temp_copy()
12614
12615 This is a hacky workaround to the structural issue of match results
12616 being stored in the regexp structure which is in turn stored in
12617 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12618 could be PL_curpm in multiple contexts, and could require multiple
12619 result sets being associated with the pattern simultaneously, such
12620 as when doing a recursive match with (??{$qr})
12621
12622 The solution is to make a lightweight copy of the regexp structure
12623 when a qr// is returned from the code executed by (??{$qr}) this
12624 lightweight copy doesn't actually own any of its data except for
12625 the starp/end and the actual regexp structure itself.
12626
12627*/
12628
12629
12630REGEXP *
12631Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
12632{
12633 struct regexp *ret;
12634 struct regexp *const r = (struct regexp *)SvANY(rx);
12635 register const I32 npar = r->nparens+1;
12636
12637 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12638
12639 if (!ret_x)
12640 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12641 ret = (struct regexp *)SvANY(ret_x);
12642
12643 (void)ReREFCNT_inc(rx);
12644 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12645 by pointing directly at the buffer, but flagging that the allocated
12646 space in the copy is zero. As we've just done a struct copy, it's now
12647 a case of zero-ing that, rather than copying the current length. */
12648 SvPV_set(ret_x, RX_WRAPPED(rx));
12649 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
12650 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12651 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
12652 SvLEN_set(ret_x, 0);
12653 SvSTASH_set(ret_x, NULL);
12654 SvMAGIC_set(ret_x, NULL);
12655 Newx(ret->offs, npar, regexp_paren_pair);
12656 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12657 if (r->substrs) {
12658 Newx(ret->substrs, 1, struct reg_substr_data);
12659 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12660
12661 SvREFCNT_inc_void(ret->anchored_substr);
12662 SvREFCNT_inc_void(ret->anchored_utf8);
12663 SvREFCNT_inc_void(ret->float_substr);
12664 SvREFCNT_inc_void(ret->float_utf8);
12665
12666 /* check_substr and check_utf8, if non-NULL, point to either their
12667 anchored or float namesakes, and don't hold a second reference. */
12668 }
12669 RX_MATCH_COPIED_off(ret_x);
12670#ifdef PERL_OLD_COPY_ON_WRITE
12671 ret->saved_copy = NULL;
12672#endif
12673 ret->mother_re = rx;
12674
12675 return ret_x;
12676}
12677#endif
12678
12679/* regfree_internal()
12680
12681 Free the private data in a regexp. This is overloadable by
12682 extensions. Perl takes care of the regexp structure in pregfree(),
12683 this covers the *pprivate pointer which technically perl doesn't
12684 know about, however of course we have to handle the
12685 regexp_internal structure when no extension is in use.
12686
12687 Note this is called before freeing anything in the regexp
12688 structure.
12689 */
12690
12691void
12692Perl_regfree_internal(pTHX_ REGEXP * const rx)
12693{
12694 dVAR;
12695 struct regexp *const r = (struct regexp *)SvANY(rx);
12696 RXi_GET_DECL(r,ri);
12697 GET_RE_DEBUG_FLAGS_DECL;
12698
12699 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12700
12701 DEBUG_COMPILE_r({
12702 if (!PL_colorset)
12703 reginitcolors();
12704 {
12705 SV *dsv= sv_newmortal();
12706 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
12707 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
12708 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12709 PL_colors[4],PL_colors[5],s);
12710 }
12711 });
12712#ifdef RE_TRACK_PATTERN_OFFSETS
12713 if (ri->u.offsets)
12714 Safefree(ri->u.offsets); /* 20010421 MJD */
12715#endif
12716 if (ri->data) {
12717 int n = ri->data->count;
12718 PAD* new_comppad = NULL;
12719 PAD* old_comppad;
12720 PADOFFSET refcnt;
12721
12722 while (--n >= 0) {
12723 /* If you add a ->what type here, update the comment in regcomp.h */
12724 switch (ri->data->what[n]) {
12725 case 'a':
12726 case 's':
12727 case 'S':
12728 case 'u':
12729 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
12730 break;
12731 case 'f':
12732 Safefree(ri->data->data[n]);
12733 break;
12734 case 'p':
12735 new_comppad = MUTABLE_AV(ri->data->data[n]);
12736 break;
12737 case 'o':
12738 if (new_comppad == NULL)
12739 Perl_croak(aTHX_ "panic: pregfree comppad");
12740 PAD_SAVE_LOCAL(old_comppad,
12741 /* Watch out for global destruction's random ordering. */
12742 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
12743 );
12744 OP_REFCNT_LOCK;
12745 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
12746 OP_REFCNT_UNLOCK;
12747 if (!refcnt)
12748 op_free((OP_4tree*)ri->data->data[n]);
12749
12750 PAD_RESTORE_LOCAL(old_comppad);
12751 SvREFCNT_dec(MUTABLE_SV(new_comppad));
12752 new_comppad = NULL;
12753 break;
12754 case 'n':
12755 break;
12756 case 'T':
12757 { /* Aho Corasick add-on structure for a trie node.
12758 Used in stclass optimization only */
12759 U32 refcount;
12760 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
12761 OP_REFCNT_LOCK;
12762 refcount = --aho->refcount;
12763 OP_REFCNT_UNLOCK;
12764 if ( !refcount ) {
12765 PerlMemShared_free(aho->states);
12766 PerlMemShared_free(aho->fail);
12767 /* do this last!!!! */
12768 PerlMemShared_free(ri->data->data[n]);
12769 PerlMemShared_free(ri->regstclass);
12770 }
12771 }
12772 break;
12773 case 't':
12774 {
12775 /* trie structure. */
12776 U32 refcount;
12777 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
12778 OP_REFCNT_LOCK;
12779 refcount = --trie->refcount;
12780 OP_REFCNT_UNLOCK;
12781 if ( !refcount ) {
12782 PerlMemShared_free(trie->charmap);
12783 PerlMemShared_free(trie->states);
12784 PerlMemShared_free(trie->trans);
12785 if (trie->bitmap)
12786 PerlMemShared_free(trie->bitmap);
12787 if (trie->jump)
12788 PerlMemShared_free(trie->jump);
12789 PerlMemShared_free(trie->wordinfo);
12790 /* do this last!!!! */
12791 PerlMemShared_free(ri->data->data[n]);
12792 }
12793 }
12794 break;
12795 default:
12796 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
12797 }
12798 }
12799 Safefree(ri->data->what);
12800 Safefree(ri->data);
12801 }
12802
12803 Safefree(ri);
12804}
12805
12806#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12807#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
12808#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12809
12810/*
12811 re_dup - duplicate a regexp.
12812
12813 This routine is expected to clone a given regexp structure. It is only
12814 compiled under USE_ITHREADS.
12815
12816 After all of the core data stored in struct regexp is duplicated
12817 the regexp_engine.dupe method is used to copy any private data
12818 stored in the *pprivate pointer. This allows extensions to handle
12819 any duplication it needs to do.
12820
12821 See pregfree() and regfree_internal() if you change anything here.
12822*/
12823#if defined(USE_ITHREADS)
12824#ifndef PERL_IN_XSUB_RE
12825void
12826Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
12827{
12828 dVAR;
12829 I32 npar;
12830 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12831 struct regexp *ret = (struct regexp *)SvANY(dstr);
12832
12833 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12834
12835 npar = r->nparens+1;
12836 Newx(ret->offs, npar, regexp_paren_pair);
12837 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
12838 if(ret->swap) {
12839 /* no need to copy these */
12840 Newx(ret->swap, npar, regexp_paren_pair);
12841 }
12842
12843 if (ret->substrs) {
12844 /* Do it this way to avoid reading from *r after the StructCopy().
12845 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12846 cache, it doesn't matter. */
12847 const bool anchored = r->check_substr
12848 ? r->check_substr == r->anchored_substr
12849 : r->check_utf8 == r->anchored_utf8;
12850 Newx(ret->substrs, 1, struct reg_substr_data);
12851 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12852
12853 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12854 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12855 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12856 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
12857
12858 /* check_substr and check_utf8, if non-NULL, point to either their
12859 anchored or float namesakes, and don't hold a second reference. */
12860
12861 if (ret->check_substr) {
12862 if (anchored) {
12863 assert(r->check_utf8 == r->anchored_utf8);
12864 ret->check_substr = ret->anchored_substr;
12865 ret->check_utf8 = ret->anchored_utf8;
12866 } else {
12867 assert(r->check_substr == r->float_substr);
12868 assert(r->check_utf8 == r->float_utf8);
12869 ret->check_substr = ret->float_substr;
12870 ret->check_utf8 = ret->float_utf8;
12871 }
12872 } else if (ret->check_utf8) {
12873 if (anchored) {
12874 ret->check_utf8 = ret->anchored_utf8;
12875 } else {
12876 ret->check_utf8 = ret->float_utf8;
12877 }
12878 }
12879 }
12880
12881 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
12882
12883 if (ret->pprivate)
12884 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
12885
12886 if (RX_MATCH_COPIED(dstr))
12887 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
12888 else
12889 ret->subbeg = NULL;
12890#ifdef PERL_OLD_COPY_ON_WRITE
12891 ret->saved_copy = NULL;
12892#endif
12893
12894 if (ret->mother_re) {
12895 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12896 /* Our storage points directly to our mother regexp, but that's
12897 1: a buffer in a different thread
12898 2: something we no longer hold a reference on
12899 so we need to copy it locally. */
12900 /* Note we need to use SvCUR(), rather than
12901 SvLEN(), on our mother_re, because it, in
12902 turn, may well be pointing to its own mother_re. */
12903 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12904 SvCUR(ret->mother_re)+1));
12905 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12906 }
12907 ret->mother_re = NULL;
12908 }
12909 ret->gofs = 0;
12910}
12911#endif /* PERL_IN_XSUB_RE */
12912
12913/*
12914 regdupe_internal()
12915
12916 This is the internal complement to regdupe() which is used to copy
12917 the structure pointed to by the *pprivate pointer in the regexp.
12918 This is the core version of the extension overridable cloning hook.
12919 The regexp structure being duplicated will be copied by perl prior
12920 to this and will be provided as the regexp *r argument, however
12921 with the /old/ structures pprivate pointer value. Thus this routine
12922 may override any copying normally done by perl.
12923
12924 It returns a pointer to the new regexp_internal structure.
12925*/
12926
12927void *
12928Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
12929{
12930 dVAR;
12931 struct regexp *const r = (struct regexp *)SvANY(rx);
12932 regexp_internal *reti;
12933 int len;
12934 RXi_GET_DECL(r,ri);
12935
12936 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
12937
12938 len = ProgLen(ri);
12939
12940 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
12941 Copy(ri->program, reti->program, len+1, regnode);
12942
12943
12944 reti->regstclass = NULL;
12945
12946 if (ri->data) {
12947 struct reg_data *d;
12948 const int count = ri->data->count;
12949 int i;
12950
12951 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12952 char, struct reg_data);
12953 Newx(d->what, count, U8);
12954
12955 d->count = count;
12956 for (i = 0; i < count; i++) {
12957 d->what[i] = ri->data->what[i];
12958 switch (d->what[i]) {
12959 /* legal options are one of: sSfpontTua
12960 see also regcomp.h and pregfree() */
12961 case 'a': /* actually an AV, but the dup function is identical. */
12962 case 's':
12963 case 'S':
12964 case 'p': /* actually an AV, but the dup function is identical. */
12965 case 'u': /* actually an HV, but the dup function is identical. */
12966 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
12967 break;
12968 case 'f':
12969 /* This is cheating. */
12970 Newx(d->data[i], 1, struct regnode_charclass_class);
12971 StructCopy(ri->data->data[i], d->data[i],
12972 struct regnode_charclass_class);
12973 reti->regstclass = (regnode*)d->data[i];
12974 break;
12975 case 'o':
12976 /* Compiled op trees are readonly and in shared memory,
12977 and can thus be shared without duplication. */
12978 OP_REFCNT_LOCK;
12979 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
12980 OP_REFCNT_UNLOCK;
12981 break;
12982 case 'T':
12983 /* Trie stclasses are readonly and can thus be shared
12984 * without duplication. We free the stclass in pregfree
12985 * when the corresponding reg_ac_data struct is freed.
12986 */
12987 reti->regstclass= ri->regstclass;
12988 /* Fall through */
12989 case 't':
12990 OP_REFCNT_LOCK;
12991 ((reg_trie_data*)ri->data->data[i])->refcount++;
12992 OP_REFCNT_UNLOCK;
12993 /* Fall through */
12994 case 'n':
12995 d->data[i] = ri->data->data[i];
12996 break;
12997 default:
12998 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
12999 }
13000 }
13001
13002 reti->data = d;
13003 }
13004 else
13005 reti->data = NULL;
13006
13007 reti->name_list_idx = ri->name_list_idx;
13008
13009#ifdef RE_TRACK_PATTERN_OFFSETS
13010 if (ri->u.offsets) {
13011 Newx(reti->u.offsets, 2*len+1, U32);
13012 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13013 }
13014#else
13015 SetProgLen(reti,len);
13016#endif
13017
13018 return (void*)reti;
13019}
13020
13021#endif /* USE_ITHREADS */
13022
13023#ifndef PERL_IN_XSUB_RE
13024
13025/*
13026 - regnext - dig the "next" pointer out of a node
13027 */
13028regnode *
13029Perl_regnext(pTHX_ register regnode *p)
13030{
13031 dVAR;
13032 register I32 offset;
13033
13034 if (!p)
13035 return(NULL);
13036
13037 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13038 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13039 }
13040
13041 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13042 if (offset == 0)
13043 return(NULL);
13044
13045 return(p+offset);
13046}
13047#endif
13048
13049STATIC void
13050S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13051{
13052 va_list args;
13053 STRLEN l1 = strlen(pat1);
13054 STRLEN l2 = strlen(pat2);
13055 char buf[512];
13056 SV *msv;
13057 const char *message;
13058
13059 PERL_ARGS_ASSERT_RE_CROAK2;
13060
13061 if (l1 > 510)
13062 l1 = 510;
13063 if (l1 + l2 > 510)
13064 l2 = 510 - l1;
13065 Copy(pat1, buf, l1 , char);
13066 Copy(pat2, buf + l1, l2 , char);
13067 buf[l1 + l2] = '\n';
13068 buf[l1 + l2 + 1] = '\0';
13069#ifdef I_STDARG
13070 /* ANSI variant takes additional second argument */
13071 va_start(args, pat2);
13072#else
13073 va_start(args);
13074#endif
13075 msv = vmess(buf, &args);
13076 va_end(args);
13077 message = SvPV_const(msv,l1);
13078 if (l1 > 512)
13079 l1 = 512;
13080 Copy(message, buf, l1 , char);
13081 buf[l1-1] = '\0'; /* Overwrite \n */
13082 Perl_croak(aTHX_ "%s", buf);
13083}
13084
13085/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13086
13087#ifndef PERL_IN_XSUB_RE
13088void
13089Perl_save_re_context(pTHX)
13090{
13091 dVAR;
13092
13093 struct re_save_state *state;
13094
13095 SAVEVPTR(PL_curcop);
13096 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13097
13098 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13099 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13100 SSPUSHUV(SAVEt_RE_STATE);
13101
13102 Copy(&PL_reg_state, state, 1, struct re_save_state);
13103
13104 PL_reg_start_tmp = 0;
13105 PL_reg_start_tmpl = 0;
13106 PL_reg_oldsaved = NULL;
13107 PL_reg_oldsavedlen = 0;
13108 PL_reg_maxiter = 0;
13109 PL_reg_leftiter = 0;
13110 PL_reg_poscache = NULL;
13111 PL_reg_poscache_size = 0;
13112#ifdef PERL_OLD_COPY_ON_WRITE
13113 PL_nrs = NULL;
13114#endif
13115
13116 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13117 if (PL_curpm) {
13118 const REGEXP * const rx = PM_GETRE(PL_curpm);
13119 if (rx) {
13120 U32 i;
13121 for (i = 1; i <= RX_NPARENS(rx); i++) {
13122 char digits[TYPE_CHARS(long)];
13123 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13124 GV *const *const gvp
13125 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13126
13127 if (gvp) {
13128 GV * const gv = *gvp;
13129 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13130 save_scalar(gv);
13131 }
13132 }
13133 }
13134 }
13135}
13136#endif
13137
13138static void
13139clear_re(pTHX_ void *r)
13140{
13141 dVAR;
13142 ReREFCNT_dec((REGEXP *)r);
13143}
13144
13145#ifdef DEBUGGING
13146
13147STATIC void
13148S_put_byte(pTHX_ SV *sv, int c)
13149{
13150 PERL_ARGS_ASSERT_PUT_BYTE;
13151
13152 /* Our definition of isPRINT() ignores locales, so only bytes that are
13153 not part of UTF-8 are considered printable. I assume that the same
13154 holds for UTF-EBCDIC.
13155 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13156 which Wikipedia says:
13157
13158 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13159 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13160 identical, to the ASCII delete (DEL) or rubout control character.
13161 ) So the old condition can be simplified to !isPRINT(c) */
13162 if (!isPRINT(c)) {
13163 if (c < 256) {
13164 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13165 }
13166 else {
13167 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13168 }
13169 }
13170 else {
13171 const char string = c;
13172 if (c == '-' || c == ']' || c == '\\' || c == '^')
13173 sv_catpvs(sv, "\\");
13174 sv_catpvn(sv, &string, 1);
13175 }
13176}
13177
13178
13179#define CLEAR_OPTSTART \
13180 if (optstart) STMT_START { \
13181 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13182 optstart=NULL; \
13183 } STMT_END
13184
13185#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13186
13187STATIC const regnode *
13188S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13189 const regnode *last, const regnode *plast,
13190 SV* sv, I32 indent, U32 depth)
13191{
13192 dVAR;
13193 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13194 register const regnode *next;
13195 const regnode *optstart= NULL;
13196
13197 RXi_GET_DECL(r,ri);
13198 GET_RE_DEBUG_FLAGS_DECL;
13199
13200 PERL_ARGS_ASSERT_DUMPUNTIL;
13201
13202#ifdef DEBUG_DUMPUNTIL
13203 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13204 last ? last-start : 0,plast ? plast-start : 0);
13205#endif
13206
13207 if (plast && plast < last)
13208 last= plast;
13209
13210 while (PL_regkind[op] != END && (!last || node < last)) {
13211 /* While that wasn't END last time... */
13212 NODE_ALIGN(node);
13213 op = OP(node);
13214 if (op == CLOSE || op == WHILEM)
13215 indent--;
13216 next = regnext((regnode *)node);
13217
13218 /* Where, what. */
13219 if (OP(node) == OPTIMIZED) {
13220 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13221 optstart = node;
13222 else
13223 goto after_print;
13224 } else
13225 CLEAR_OPTSTART;
13226
13227 regprop(r, sv, node);
13228 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13229 (int)(2*indent + 1), "", SvPVX_const(sv));
13230
13231 if (OP(node) != OPTIMIZED) {
13232 if (next == NULL) /* Next ptr. */
13233 PerlIO_printf(Perl_debug_log, " (0)");
13234 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13235 PerlIO_printf(Perl_debug_log, " (FAIL)");
13236 else
13237 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13238 (void)PerlIO_putc(Perl_debug_log, '\n');
13239 }
13240
13241 after_print:
13242 if (PL_regkind[(U8)op] == BRANCHJ) {
13243 assert(next);
13244 {
13245 register const regnode *nnode = (OP(next) == LONGJMP
13246 ? regnext((regnode *)next)
13247 : next);
13248 if (last && nnode > last)
13249 nnode = last;
13250 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13251 }
13252 }
13253 else if (PL_regkind[(U8)op] == BRANCH) {
13254 assert(next);
13255 DUMPUNTIL(NEXTOPER(node), next);
13256 }
13257 else if ( PL_regkind[(U8)op] == TRIE ) {
13258 const regnode *this_trie = node;
13259 const char op = OP(node);
13260 const U32 n = ARG(node);
13261 const reg_ac_data * const ac = op>=AHOCORASICK ?
13262 (reg_ac_data *)ri->data->data[n] :
13263 NULL;
13264 const reg_trie_data * const trie =
13265 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13266#ifdef DEBUGGING
13267 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13268#endif
13269 const regnode *nextbranch= NULL;
13270 I32 word_idx;
13271 sv_setpvs(sv, "");
13272 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13273 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13274
13275 PerlIO_printf(Perl_debug_log, "%*s%s ",
13276 (int)(2*(indent+3)), "",
13277 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13278 PL_colors[0], PL_colors[1],
13279 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13280 PERL_PV_PRETTY_ELLIPSES |
13281 PERL_PV_PRETTY_LTGT
13282 )
13283 : "???"
13284 );
13285 if (trie->jump) {
13286 U16 dist= trie->jump[word_idx+1];
13287 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13288 (UV)((dist ? this_trie + dist : next) - start));
13289 if (dist) {
13290 if (!nextbranch)
13291 nextbranch= this_trie + trie->jump[0];
13292 DUMPUNTIL(this_trie + dist, nextbranch);
13293 }
13294 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13295 nextbranch= regnext((regnode *)nextbranch);
13296 } else {
13297 PerlIO_printf(Perl_debug_log, "\n");
13298 }
13299 }
13300 if (last && next > last)
13301 node= last;
13302 else
13303 node= next;
13304 }
13305 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13306 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13307 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13308 }
13309 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13310 assert(next);
13311 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13312 }
13313 else if ( op == PLUS || op == STAR) {
13314 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13315 }
13316 else if (PL_regkind[(U8)op] == ANYOF) {
13317 /* arglen 1 + class block */
13318 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13319 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13320 node = NEXTOPER(node);
13321 }
13322 else if (PL_regkind[(U8)op] == EXACT) {
13323 /* Literal string, where present. */
13324 node += NODE_SZ_STR(node) - 1;
13325 node = NEXTOPER(node);
13326 }
13327 else {
13328 node = NEXTOPER(node);
13329 node += regarglen[(U8)op];
13330 }
13331 if (op == CURLYX || op == OPEN)
13332 indent++;
13333 }
13334 CLEAR_OPTSTART;
13335#ifdef DEBUG_DUMPUNTIL
13336 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13337#endif
13338 return node;
13339}
13340
13341#endif /* DEBUGGING */
13342
13343/*
13344 * Local variables:
13345 * c-indentation-style: bsd
13346 * c-basic-offset: 4
13347 * indent-tabs-mode: t
13348 * End:
13349 *
13350 * ex: set ts=8 sts=4 sw=4 noet:
13351 */