This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
.github - switch to v3 actions
[perl5.git] / regcomp_study.c
CommitLineData
85900e28
YO
1#ifdef PERL_EXT_RE_BUILD
2#include "re_top.h"
3#endif
4
5#include "EXTERN.h"
6#define PERL_IN_REGEX_ENGINE
7#define PERL_IN_REGCOMP_ANY
8#define PERL_IN_REGCOMP_STUDY_C
9#include "perl.h"
10
11#ifdef PERL_IN_XSUB_RE
12# include "re_comp.h"
13#else
14# include "regcomp.h"
15#endif
16
17#include "invlist_inline.h"
18#include "unicode_constants.h"
19#include "regcomp_internal.h"
20
21#define INIT_AND_WITHP \
22 assert(!and_withp); \
23 Newx(and_withp, 1, regnode_ssc); \
24 SAVEFREEPV(and_withp)
25
26
27STATIC void
28S_unwind_scan_frames(pTHX_ const void *p)
29{
30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31 scan_frame *f= (scan_frame *)p;
32 do {
33 scan_frame *n= f->next_frame;
34 Safefree(f);
35 f= n;
36 } while (f);
37}
38
39/* Follow the next-chain of the current node and optimize away
40 all the NOTHINGs from it.
41 */
42STATIC void
43S_rck_elide_nothing(pTHX_ regnode *node)
44{
45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
46
47 if (OP(node) != CURLYX) {
48 const int max = (REGNODE_OFF_BY_ARG(OP(node))
49 ? I32_MAX
50 /* I32 may be smaller than U16 on CRAYs! */
51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node));
53 int noff;
54 regnode *n = node;
55
56 /* Skip NOTHING and LONGJMP. */
57 while (
58 (n = regnext(n))
59 && (
60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
62 )
63 && off + noff < max
64 ) {
65 off += noff;
66 }
67 if (REGNODE_OFF_BY_ARG(OP(node)))
68 ARG(node) = off;
69 else
70 NEXT_OFF(node) = off;
71 }
72 return;
73}
74
75
76/*
77 * As best we can, determine the characters that can match the start of
78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
79 * can be false positive matches
80 *
81 * Returns the invlist as a new SV*; it is the caller's responsibility to
82 * call SvREFCNT_dec() when done with it.
83 */
84STATIC SV*
85S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
86{
87 const U8 * s = (U8*)STRING(node);
88 SSize_t bytelen = STR_LEN(node);
89 UV uc;
90 /* Start out big enough for 2 separate code points */
91 SV* invlist = _new_invlist(4);
92
93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
94
95 if (! UTF) {
96 uc = *s;
97
98 /* We punt and assume can match anything if the node begins
99 * with a multi-character fold. Things are complicated. For
100 * example, /ffi/i could match any of:
101 * "\N{LATIN SMALL LIGATURE FFI}"
102 * "\N{LATIN SMALL LIGATURE FF}I"
103 * "F\N{LATIN SMALL LIGATURE FI}"
104 * plus several other things; and making sure we have all the
105 * possibilities is hard. */
106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
108 }
109 else {
110 /* Any Latin1 range character can potentially match any
111 * other depending on the locale, and in Turkic locales, 'I' and
112 * 'i' can match U+130 and U+131 */
113 if (OP(node) == EXACTFL) {
114 _invlist_union(invlist, PL_Latin1, &invlist);
115 if (isALPHA_FOLD_EQ(uc, 'I')) {
116 invlist = add_cp_to_invlist(invlist,
117 LATIN_SMALL_LETTER_DOTLESS_I);
118 invlist = add_cp_to_invlist(invlist,
119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
120 }
121 }
122 else {
123 /* But otherwise, it matches at least itself. We can
124 * quickly tell if it has a distinct fold, and if so,
125 * it matches that as well */
126 invlist = add_cp_to_invlist(invlist, uc);
127 if (IS_IN_SOME_FOLD_L1(uc))
128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
129 }
130
131 /* Some characters match above-Latin1 ones under /i. This
132 * is true of EXACTFL ones when the locale is UTF-8 */
133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
135 EXACTFAA_NO_TRIE)))
136 {
137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
138 }
139 }
140 }
141 else { /* Pattern is UTF-8 */
142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143 const U8* e = s + bytelen;
144 IV fc;
145
146 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
147
148 /* The only code points that aren't folded in a UTF EXACTFish
149 * node are the problematic ones in EXACTFL nodes */
150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151 /* We need to check for the possibility that this EXACTFL
152 * node begins with a multi-char fold. Therefore we fold
153 * the first few characters of it so that we can make that
154 * check */
155 U8 *d = folded;
156 int i;
157
158 fc = -1;
159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
160 if (isASCII(*s)) {
161 *(d++) = (U8) toFOLD(*s);
162 if (fc < 0) { /* Save the first fold */
163 fc = *(d-1);
164 }
165 s++;
166 }
167 else {
168 STRLEN len;
169 UV fold = toFOLD_utf8_safe(s, e, d, &len);
170 if (fc < 0) { /* Save the first fold */
171 fc = fold;
172 }
173 d += len;
174 s += UTF8SKIP(s);
175 }
176 }
177
178 /* And set up so the code below that looks in this folded
179 * buffer instead of the node's string */
180 e = d;
181 s = folded;
182 }
183
184 /* When we reach here 's' points to the fold of the first
185 * character(s) of the node; and 'e' points to far enough along
186 * the folded string to be just past any possible multi-char
187 * fold.
188 *
189 * Like the non-UTF case above, we punt if the node begins with a
190 * multi-char fold */
191
192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
194 }
195 else { /* Single char fold */
196 unsigned int k;
197 U32 first_fold;
198 const U32 * remaining_folds;
199 Size_t folds_count;
200
201 /* It matches itself */
202 invlist = add_cp_to_invlist(invlist, fc);
203
204 /* ... plus all the things that fold to it, which are found in
205 * PL_utf8_foldclosures */
206 folds_count = _inverse_folds(fc, &first_fold,
207 &remaining_folds);
208 for (k = 0; k < folds_count; k++) {
209 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
210
211 /* /aa doesn't allow folds between ASCII and non- */
212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213 && isASCII(c) != isASCII(fc))
214 {
215 continue;
216 }
217
218 invlist = add_cp_to_invlist(invlist, c);
219 }
220
221 if (OP(node) == EXACTFL) {
222
223 /* If either [iI] are present in an EXACTFL node the above code
224 * should have added its normal case pair, but under a Turkish
225 * locale they could match instead the case pairs from it. Add
226 * those as potential matches as well */
227 if (isALPHA_FOLD_EQ(fc, 'I')) {
228 invlist = add_cp_to_invlist(invlist,
229 LATIN_SMALL_LETTER_DOTLESS_I);
230 invlist = add_cp_to_invlist(invlist,
231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
232 }
233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234 invlist = add_cp_to_invlist(invlist, 'I');
235 }
236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237 invlist = add_cp_to_invlist(invlist, 'i');
238 }
239 }
240 }
241 }
242
243 return invlist;
244}
245
246
247/* Mark that we cannot extend a found fixed substring at this point.
248 Update the longest found anchored substring or the longest found
249 floating substrings if needed. */
250
251void
252Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253 SSize_t *minlenp, int is_inf)
254{
255 const STRLEN l = CHR_SVLEN(data->last_found);
256 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257 const STRLEN old_l = CHR_SVLEN(longest_sv);
258 DECLARE_AND_GET_RE_DEBUG_FLAGS;
259
260 PERL_ARGS_ASSERT_SCAN_COMMIT;
261
262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263 const U8 i = data->cur_is_floating;
264 SvSetMagicSV(longest_sv, data->last_found);
265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
266
267 if (!i) /* fixed */
268 data->substrs[0].max_offset = data->substrs[0].min_offset;
269 else { /* float */
270 data->substrs[1].max_offset =
271 (is_inf)
272 ? OPTIMIZE_INFTY
273 : (l
274 ? data->last_start_max
275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
276 ? OPTIMIZE_INFTY
277 : data->pos_min + data->pos_delta));
278 }
279
280 data->substrs[i].flags &= ~SF_BEFORE_EOL;
281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282 data->substrs[i].minlenp = minlenp;
283 data->substrs[i].lookbehind = 0;
284 }
285
286 SvCUR_set(data->last_found, 0);
287 {
288 SV * const sv = data->last_found;
289 if (SvUTF8(sv) && SvMAGICAL(sv)) {
290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
291 if (mg)
292 mg->mg_len = 0;
293 }
294 }
295 data->last_end = -1;
296 data->flags &= ~SF_BEFORE_EOL;
297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
298}
299
300/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301 * list that describes which code points it matches */
302
303STATIC void
304S_ssc_anything(pTHX_ regnode_ssc *ssc)
305{
306 /* Set the SSC 'ssc' to match an empty string or any code point */
307
308 PERL_ARGS_ASSERT_SSC_ANYTHING;
309
310 assert(is_ANYOF_SYNTHETIC(ssc));
311
312 /* mortalize so won't leak */
313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
315}
316
317STATIC int
318S_ssc_is_anything(const regnode_ssc *ssc)
319{
320 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
321 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
322 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
323 * in any way, so there's no point in using it */
324
325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
326 bool ret;
327
328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
329
330 assert(is_ANYOF_SYNTHETIC(ssc));
331
332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
333 return FALSE;
334 }
335
336 /* See if the list consists solely of the range 0 - Infinity */
337 invlist_iterinit(ssc->invlist);
338 ret = invlist_iternext(ssc->invlist, &start, &end)
339 && start == 0
340 && end == UV_MAX;
341
342 invlist_iterfinish(ssc->invlist);
343
344 if (ret) {
345 return TRUE;
346 }
347
348 /* If e.g., both \w and \W are set, matches everything */
349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
350 int i;
351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
353 return TRUE;
354 }
355 }
356 }
357
358 return FALSE;
359}
360
361void
362Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
363{
364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
365 * string, any code point, or any posix class under locale */
366
367 PERL_ARGS_ASSERT_SSC_INIT;
368
369 Zero(ssc, 1, regnode_ssc);
370 set_ANYOF_SYNTHETIC(ssc);
371 ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
372 ssc_anything(ssc);
373
374 /* If any portion of the regex is to operate under locale rules that aren't
375 * fully known at compile time, initialization includes it. The reason
376 * this isn't done for all regexes is that the optimizer was written under
377 * the assumption that locale was all-or-nothing. Given the complexity and
378 * lack of documentation in the optimizer, and that there are inadequate
379 * test cases for locale, many parts of it may not work properly, it is
380 * safest to avoid locale unless necessary. */
381 if (RExC_contains_locale) {
382 ANYOF_POSIXL_SETALL(ssc);
383 }
384 else {
385 ANYOF_POSIXL_ZERO(ssc);
386 }
387}
388
389STATIC int
390S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391 const regnode_ssc *ssc)
392{
393 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
394 * to the list of code points matched, and locale posix classes; hence does
395 * not check its flags) */
396
397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
398 bool ret;
399
400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
401
402 assert(is_ANYOF_SYNTHETIC(ssc));
403
404 invlist_iterinit(ssc->invlist);
405 ret = invlist_iternext(ssc->invlist, &start, &end)
406 && start == 0
407 && end == UV_MAX;
408
409 invlist_iterfinish(ssc->invlist);
410
411 if (! ret) {
412 return FALSE;
413 }
414
415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
416 return FALSE;
417 }
418
419 return TRUE;
420}
421
422
423STATIC SV*
424S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425 const regnode_charclass* const node)
426{
427 /* Returns a mortal inversion list defining which code points are matched
428 * by 'node', which is of ANYOF-ish type . Handles complementing the
429 * result if appropriate. If some code points aren't knowable at this
430 * time, the returned list must, and will, contain every code point that is
431 * a possibility. */
432
433 SV* invlist = NULL;
434 SV* only_utf8_locale_invlist = NULL;
435 bool new_node_has_latin1 = FALSE;
436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
437 ? ANYOF_FLAGS(node)
438 : 0;
439
440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
441
442 /* Look at the data structure created by S_set_ANYOF_arg() */
443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444 invlist = sv_2mortal(_new_invlist(1));
445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
446 }
447 else if (ANYOF_HAS_AUX(node)) {
448 const U32 n = ARG(node);
449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450 AV * const av = MUTABLE_AV(SvRV(rv));
451 SV **const ary = AvARRAY(av);
452
453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
454
455 /* Here there are things that won't be known until runtime -- we
456 * have to assume it could be anything */
457 invlist = sv_2mortal(_new_invlist(1));
458 return _add_range_to_invlist(invlist, 0, UV_MAX);
459 }
460 else if (ary[INVLIST_INDEX]) {
461
462 /* Use the node's inversion list */
463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
464 }
465
466 /* Get the code points valid only under UTF-8 locales */
467 if ( (flags & ANYOFL_FOLD)
468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
469 {
470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
471 }
472 }
473
474 if (! invlist) {
475 invlist = sv_2mortal(_new_invlist(0));
476 }
477
478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479 * code points, and an inversion list for the others, but if there are code
480 * points that should match only conditionally on the target string being
481 * UTF-8, those are placed in the inversion list, and not the bitmap.
482 * Since there are circumstances under which they could match, they are
483 * included in the SSC. But if the ANYOF node is to be inverted, we have
484 * to exclude them here, so that when we invert below, the end result
485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
486 * have to do this here before we add the unconditionally matched code
487 * points */
488 if (flags & ANYOF_INVERT) {
489 _invlist_intersection_complement_2nd(invlist,
490 PL_UpperLatin1,
491 &invlist);
492 }
493
494 /* Add in the points from the bit map */
495 if (REGNODE_TYPE(OP(node)) == ANYOF){
496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497 if (ANYOF_BITMAP_TEST(node, i)) {
498 unsigned int start = i++;
499
500 for (; i < NUM_ANYOF_CODE_POINTS
501 && ANYOF_BITMAP_TEST(node, i); ++i)
502 {
503 /* empty */
504 }
505 invlist = _add_range_to_invlist(invlist, start, i-1);
506 new_node_has_latin1 = TRUE;
507 }
508 }
509 }
510
511 /* If this can match all upper Latin1 code points, have to add them
512 * as well. But don't add them if inverting, as when that gets done below,
513 * it would exclude all these characters, including the ones it shouldn't
514 * that were added just above */
515 if ( ! (flags & ANYOF_INVERT)
516 && OP(node) == ANYOFD
517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
518 {
519 _invlist_union(invlist, PL_UpperLatin1, &invlist);
520 }
521
522 /* Similarly for these */
523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
525 }
526
527 if (flags & ANYOF_INVERT) {
528 _invlist_invert(invlist);
529 }
530 else if (flags & ANYOFL_FOLD) {
531 if (new_node_has_latin1) {
532
533 /* These folds are potential in Turkic locales */
534 if (_invlist_contains_cp(invlist, 'i')) {
535 invlist = add_cp_to_invlist(invlist,
536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
537 }
538 if (_invlist_contains_cp(invlist, 'I')) {
539 invlist = add_cp_to_invlist(invlist,
540 LATIN_SMALL_LETTER_DOTLESS_I);
541 }
542
543 /* Under /li, any 0-255 could fold to any other 0-255, depending on
544 * the locale. We can skip this if there are no 0-255 at all. */
545 _invlist_union(invlist, PL_Latin1, &invlist);
546 }
547 else {
548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549 invlist = add_cp_to_invlist(invlist, 'I');
550 }
551 if (_invlist_contains_cp(invlist,
552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
553 {
554 invlist = add_cp_to_invlist(invlist, 'i');
555 }
556 }
557 }
558
559 /* Similarly add the UTF-8 locale possible matches. These have to be
560 * deferred until after the non-UTF-8 locale ones are taken care of just
561 * above, or it leads to wrong results under ANYOF_INVERT */
562 if (only_utf8_locale_invlist) {
563 _invlist_union_maybe_complement_2nd(invlist,
564 only_utf8_locale_invlist,
565 flags & ANYOF_INVERT,
566 &invlist);
567 }
568
569 return invlist;
570}
571
572/* 'AND' a given class with another one. Can create false positives. 'ssc'
573 * should not be inverted. */
574
575STATIC void
576S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577 const regnode_charclass *and_with)
578{
579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580 * another SSC or a regular ANYOF class. Can create false positives. */
581
582 SV* anded_cp_list;
583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584 ? ANYOF_FLAGS(and_with)
585 : 0;
586 U8 anded_flags;
587
588 PERL_ARGS_ASSERT_SSC_AND;
589
590 assert(is_ANYOF_SYNTHETIC(ssc));
591
592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593 * the code point inversion list and just the relevant flags */
594 if (is_ANYOF_SYNTHETIC(and_with)) {
595 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596 anded_flags = and_with_flags;
597
598 /* XXX This is a kludge around what appears to be deficiencies in the
599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
600 * there are paths through the optimizer where it doesn't get weeded
601 * out when it should. And if we don't make some extra provision for
602 * it like the code just below, it doesn't get added when it should.
603 * This solution is to add it only when AND'ing, which is here, and
604 * only when what is being AND'ed is the pristine, original node
605 * matching anything. Thus it is like adding it to ssc_anything() but
606 * only when the result is to be AND'ed. Probably the same solution
607 * could be adopted for the same problem we have with /l matching,
608 * which is solved differently in S_ssc_init(), and that would lead to
609 * fewer false positives than that solution has. But if this solution
610 * creates bugs, the consequences are only that a warning isn't raised
611 * that should be; while the consequences for having /l bugs is
612 * incorrect matches */
613 if (ssc_is_anything((regnode_ssc *)and_with)) {
614 anded_flags |= ANYOF_WARN_SUPER__shared;
615 }
616 }
617 else {
618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619 if (OP(and_with) == ANYOFD) {
620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
621 }
622 else {
623 anded_flags = and_with_flags
624 & ( ANYOF_COMMON_FLAGS
625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
629 }
630 }
631 }
632
633 ANYOF_FLAGS(ssc) &= anded_flags;
634
635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636 * C2 is the list of code points in 'and-with'; P2, its posix classes.
637 * 'and_with' may be inverted. When not inverted, we have the situation of
638 * computing:
639 * (C1 | P1) & (C2 | P2)
640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
643 * <= ((C1 & C2) | P1 | P2)
644 * Alternatively, the last few steps could be:
645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
647 * <= (C1 | C2 | (P1 & P2))
648 * We favor the second approach if either P1 or P2 is non-empty. This is
649 * because these components are a barrier to doing optimizations, as what
650 * they match cannot be known until the moment of matching as they are
651 * dependent on the current locale, 'AND"ing them likely will reduce or
652 * eliminate them.
653 * But we can do better if we know that C1,P1 are in their initial state (a
654 * frequent occurrence), each matching everything:
655 * (<everything>) & (C2 | P2) = C2 | P2
656 * Similarly, if C2,P2 are in their initial state (again a frequent
657 * occurrence), the result is a no-op
658 * (C1 | P1) & (<everything>) = C1 | P1
659 *
660 * Inverted, we have
661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663 * <= (C1 & ~C2) | (P1 & ~P2)
664 * */
665
666 if ((and_with_flags & ANYOF_INVERT)
667 && ! is_ANYOF_SYNTHETIC(and_with))
668 {
669 unsigned int i;
670
671 ssc_intersection(ssc,
672 anded_cp_list,
673 FALSE /* Has already been inverted */
674 );
675
676 /* If either P1 or P2 is empty, the intersection will be also; can skip
677 * the loop */
678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679 ANYOF_POSIXL_ZERO(ssc);
680 }
681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
682
683 /* Note that the Posix class component P from 'and_with' actually
684 * looks like:
685 * P = Pa | Pb | ... | Pn
686 * where each component is one posix class, such as in [\w\s].
687 * Thus
688 * ~P = ~(Pa | Pb | ... | Pn)
689 * = ~Pa & ~Pb & ... & ~Pn
690 * <= ~Pa | ~Pb | ... | ~Pn
691 * The last is something we can easily calculate, but unfortunately
692 * is likely to have many false positives. We could do better
693 * in some (but certainly not all) instances if two classes in
694 * P have known relationships. For example
695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
696 * So
697 * :lower: & :print: = :lower:
698 * And similarly for classes that must be disjoint. For example,
699 * since \s and \w can have no elements in common based on rules in
700 * the POSIX standard,
701 * \w & ^\S = nothing
702 * Unfortunately, some vendor locales do not meet the Posix
703 * standard, in particular almost everything by Microsoft.
704 * The loop below just changes e.g., \w into \W and vice versa */
705
706 regnode_charclass_posixl temp;
707 int add = 1; /* To calculate the index of the complement */
708
709 Zero(&temp, 1, regnode_charclass_posixl);
710 ANYOF_POSIXL_ZERO(&temp);
711 for (i = 0; i < ANYOF_MAX; i++) {
712 assert(i % 2 != 0
713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
715
716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717 ANYOF_POSIXL_SET(&temp, i + add);
718 }
719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
720 }
721 ANYOF_POSIXL_AND(&temp, ssc);
722
723 } /* else ssc already has no posixes */
724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
725 in its initial state */
726 else if (! is_ANYOF_SYNTHETIC(and_with)
727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
728 {
729 /* But if 'ssc' is in its initial state, the result is just 'and_with';
730 * copy it over 'ssc' */
731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732 if (is_ANYOF_SYNTHETIC(and_with)) {
733 StructCopy(and_with, ssc, regnode_ssc);
734 }
735 else {
736 ssc->invlist = anded_cp_list;
737 ANYOF_POSIXL_ZERO(ssc);
738 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
740 }
741 }
742 }
743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744 || (and_with_flags & ANYOF_MATCHES_POSIXL))
745 {
746 /* One or the other of P1, P2 is non-empty. */
747 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
749 }
750 ssc_union(ssc, anded_cp_list, FALSE);
751 }
752 else { /* P1 = P2 = empty */
753 ssc_intersection(ssc, anded_cp_list, FALSE);
754 }
755 }
756}
757
758STATIC void
759S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760 const regnode_charclass *or_with)
761{
762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763 * another SSC or a regular ANYOF class. Can create false positives if
764 * 'or_with' is to be inverted. */
765
766 SV* ored_cp_list;
767 U8 ored_flags;
768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769 ? ANYOF_FLAGS(or_with)
770 : 0;
771
772 PERL_ARGS_ASSERT_SSC_OR;
773
774 assert(is_ANYOF_SYNTHETIC(ssc));
775
776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777 * the code point inversion list and just the relevant flags */
778 if (is_ANYOF_SYNTHETIC(or_with)) {
779 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780 ored_flags = or_with_flags;
781 }
782 else {
783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785 if (OP(or_with) != ANYOFD) {
786 ored_flags |=
787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
791 }
792 }
793 }
794
795 ANYOF_FLAGS(ssc) |= ored_flags;
796
797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798 * C2 is the list of code points in 'or-with'; P2, its posix classes.
799 * 'or_with' may be inverted. When not inverted, we have the simple
800 * situation of computing:
801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
802 * If P1|P2 yields a situation with both a class and its complement are
803 * set, like having both \w and \W, this matches all code points, and we
804 * can delete these from the P component of the ssc going forward. XXX We
805 * might be able to delete all the P components, but I (khw) am not certain
806 * about this, and it is better to be safe.
807 *
808 * Inverted, we have
809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
810 * <= (C1 | P1) | ~C2
811 * <= (C1 | ~C2) | P1
812 * (which results in actually simpler code than the non-inverted case)
813 * */
814
815 if ((or_with_flags & ANYOF_INVERT)
816 && ! is_ANYOF_SYNTHETIC(or_with))
817 {
818 /* We ignore P2, leaving P1 going forward */
819 } /* else Not inverted */
820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
823 unsigned int i;
824 for (i = 0; i < ANYOF_MAX; i += 2) {
825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
826 {
827 ssc_match_all_cp(ssc);
828 ANYOF_POSIXL_CLEAR(ssc, i);
829 ANYOF_POSIXL_CLEAR(ssc, i+1);
830 }
831 }
832 }
833 }
834
835 ssc_union(ssc,
836 ored_cp_list,
837 FALSE /* Already has been inverted */
838 );
839}
840
841STATIC void
842S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
843{
844 PERL_ARGS_ASSERT_SSC_UNION;
845
846 assert(is_ANYOF_SYNTHETIC(ssc));
847
848 _invlist_union_maybe_complement_2nd(ssc->invlist,
849 invlist,
850 invert2nd,
851 &ssc->invlist);
852}
853
854STATIC void
855S_ssc_intersection(pTHX_ regnode_ssc *ssc,
856 SV* const invlist,
857 const bool invert2nd)
858{
859 PERL_ARGS_ASSERT_SSC_INTERSECTION;
860
861 assert(is_ANYOF_SYNTHETIC(ssc));
862
863 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
864 invlist,
865 invert2nd,
866 &ssc->invlist);
867}
868
869STATIC void
870S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
871{
872 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
873
874 assert(is_ANYOF_SYNTHETIC(ssc));
875
876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
877}
878
879STATIC void
880S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
881{
882 /* AND just the single code point 'cp' into the SSC 'ssc' */
883
884 SV* cp_list = _new_invlist(2);
885
886 PERL_ARGS_ASSERT_SSC_CP_AND;
887
888 assert(is_ANYOF_SYNTHETIC(ssc));
889
890 cp_list = add_cp_to_invlist(cp_list, cp);
891 ssc_intersection(ssc, cp_list,
892 FALSE /* Not inverted */
893 );
894 SvREFCNT_dec_NN(cp_list);
895}
896
897STATIC void
898S_ssc_clear_locale(regnode_ssc *ssc)
899{
900 /* Set the SSC 'ssc' to not match any locale things */
901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
902
903 assert(is_ANYOF_SYNTHETIC(ssc));
904
905 ANYOF_POSIXL_ZERO(ssc);
906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
907}
908
909bool
910Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
911{
912 /* The synthetic start class is used to hopefully quickly winnow down
913 * places where a pattern could start a match in the target string. If it
914 * doesn't really narrow things down that much, there isn't much point to
915 * having the overhead of using it. This function uses some very crude
916 * heuristics to decide if to use the ssc or not.
917 *
918 * It returns TRUE if 'ssc' rules out more than half what it considers to
919 * be the "likely" possible matches, but of course it doesn't know what the
920 * actual things being matched are going to be; these are only guesses
921 *
922 * For /l matches, it assumes that the only likely matches are going to be
923 * in the 0-255 range, uniformly distributed, so half of that is 127
924 * For /a and /d matches, it assumes that the likely matches will be just
925 * the ASCII range, so half of that is 63
926 * For /u and there isn't anything matching above the Latin1 range, it
927 * assumes that that is the only range likely to be matched, and uses
928 * half that as the cut-off: 127. If anything matches above Latin1,
929 * it assumes that all of Unicode could match (uniformly), except for
930 * non-Unicode code points and things in the General Category "Other"
931 * (unassigned, private use, surrogates, controls and formats). This
932 * is a much large number. */
933
934 U32 count = 0; /* Running total of number of code points matched by
935 'ssc' */
936 UV start, end; /* Start and end points of current range in inversion
937 XXX outdated. UTF-8 locales are common, what about invert? list */
938 const U32 max_code_points = (LOC)
939 ? 256
940 : (( ! UNI_SEMANTICS
941 || invlist_highest(ssc->invlist) < 256)
942 ? 128
943 : NON_OTHER_COUNT);
944 const U32 max_match = max_code_points / 2;
945
946 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
947
948 invlist_iterinit(ssc->invlist);
949 while (invlist_iternext(ssc->invlist, &start, &end)) {
950 if (start >= max_code_points) {
951 break;
952 }
953 end = MIN(end, max_code_points - 1);
954 count += end - start + 1;
955 if (count >= max_match) {
956 invlist_iterfinish(ssc->invlist);
957 return FALSE;
958 }
959 }
960
961 return TRUE;
962}
963
964
965void
966Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
967{
968 /* The inversion list in the SSC is marked mortal; now we need a more
969 * permanent copy, which is stored the same way that is done in a regular
970 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
971 * map */
972
973 SV* invlist = invlist_clone(ssc->invlist, NULL);
974
975 PERL_ARGS_ASSERT_SSC_FINALIZE;
976
977 assert(is_ANYOF_SYNTHETIC(ssc));
978
979 /* The code in this file assumes that all but these flags aren't relevant
980 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
981 * by the time we reach here */
982 assert(! (ANYOF_FLAGS(ssc)
983 & ~( ANYOF_COMMON_FLAGS
984 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
985 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
986
987 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
988
989 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
990 SvREFCNT_dec(invlist);
991
992 /* Make sure is clone-safe */
993 ssc->invlist = NULL;
994
995 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
996 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
997 OP(ssc) = ANYOFPOSIXL;
998 }
999 else if (RExC_contains_locale) {
1000 OP(ssc) = ANYOFL;
1001 }
1002
1003 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1004}
1005
1006/* The below joins as many adjacent EXACTish nodes as possible into a single
1007 * one. The regop may be changed if the node(s) contain certain sequences that
1008 * require special handling. The joining is only done if:
1009 * 1) there is room in the current conglomerated node to entirely contain the
1010 * next one.
1011 * 2) they are compatible node types
1012 *
1013 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
1014 * these get optimized out
1015 *
1016 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
1017 * as possible, even if that means splitting an existing node so that its first
1018 * part is moved to the preceding node. This would maximise the efficiency of
1019 * memEQ during matching.
1020 *
1021 * If a node is to match under /i (folded), the number of characters it matches
1022 * can be different than its character length if it contains a multi-character
1023 * fold. *min_subtract is set to the total delta number of characters of the
1024 * input nodes.
1025 *
1026 * And *unfolded_multi_char is set to indicate whether or not the node contains
1027 * an unfolded multi-char fold. This happens when it won't be known until
1028 * runtime whether the fold is valid or not; namely
1029 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
1030 * target string being matched against turns out to be UTF-8 is that fold
1031 * valid; or
1032 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
1033 * runtime.
1034 * (Multi-char folds whose components are all above the Latin1 range are not
1035 * run-time locale dependent, and have already been folded by the time this
1036 * function is called.)
1037 *
1038 * This is as good a place as any to discuss the design of handling these
1039 * multi-character fold sequences. It's been wrong in Perl for a very long
1040 * time. There are three code points in Unicode whose multi-character folds
1041 * were long ago discovered to mess things up. The previous designs for
1042 * dealing with these involved assigning a special node for them. This
1043 * approach doesn't always work, as evidenced by this example:
1044 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
1045 * Both sides fold to "sss", but if the pattern is parsed to create a node that
1046 * would match just the \xDF, it won't be able to handle the case where a
1047 * successful match would have to cross the node's boundary. The new approach
1048 * that hopefully generally solves the problem generates an EXACTFUP node
1049 * that is "sss" in this case.
1050 *
1051 * It turns out that there are problems with all multi-character folds, and not
1052 * just these three. Now the code is general, for all such cases. The
1053 * approach taken is:
1054 * 1) This routine examines each EXACTFish node that could contain multi-
1055 * character folded sequences. Since a single character can fold into
1056 * such a sequence, the minimum match length for this node is less than
1057 * the number of characters in the node. This routine returns in
1058 * *min_subtract how many characters to subtract from the actual
1059 * length of the string to get a real minimum match length; it is 0 if
1060 * there are no multi-char foldeds. This delta is used by the caller to
1061 * adjust the min length of the match, and the delta between min and max,
1062 * so that the optimizer doesn't reject these possibilities based on size
1063 * constraints.
1064 *
1065 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
1066 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
1067 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
1068 * EXACTFU nodes. The node type of such nodes is then changed to
1069 * EXACTFUP, indicating it is problematic, and needs careful handling.
1070 * (The procedures in step 1) above are sufficient to handle this case in
1071 * UTF-8 encoded nodes.) The reason this is problematic is that this is
1072 * the only case where there is a possible fold length change in non-UTF-8
1073 * patterns. By reserving a special node type for problematic cases, the
1074 * far more common regular EXACTFU nodes can be processed faster.
1075 * regexec.c takes advantage of this.
1076 *
1077 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
1078 * problematic cases. These all only occur when the pattern is not
1079 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
1080 * length change, it handles the situation where the string cannot be
1081 * entirely folded. The strings in an EXACTFish node are folded as much
1082 * as possible during compilation in regcomp.c. This saves effort in
1083 * regex matching. By using an EXACTFUP node when it is not possible to
1084 * fully fold at compile time, regexec.c can know that everything in an
1085 * EXACTFU node is folded, so folding can be skipped at runtime. The only
1086 * case where folding in EXACTFU nodes can't be done at compile time is
1087 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
1088 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
1089 * handle two very different cases. Alternatively, there could have been
1090 * a node type where there are length changes, one for unfolded, and one
1091 * for both. If yet another special case needed to be created, the number
1092 * of required node types would have to go to 7. khw figures that even
1093 * though there are plenty of node types to spare, that the maintenance
1094 * cost wasn't worth the small speedup of doing it that way, especially
1095 * since he thinks the MICRO SIGN is rarely encountered in practice.
1096 *
1097 * There are other cases where folding isn't done at compile time, but
1098 * none of them are under /u, and hence not for EXACTFU nodes. The folds
1099 * in EXACTFL nodes aren't known until runtime, and vary as the locale
1100 * changes. Some folds in EXACTF depend on if the runtime target string
1101 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
1102 * when no fold in it depends on the UTF-8ness of the target string.)
1103 *
1104 * 3) A problem remains for unfolded multi-char folds. (These occur when the
1105 * validity of the fold won't be known until runtime, and so must remain
1106 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
1107 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
1108 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
1109 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1110 * The reason this is a problem is that the optimizer part of regexec.c
1111 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1112 * that a character in the pattern corresponds to at most a single
1113 * character in the target string. (And I do mean character, and not byte
1114 * here, unlike other parts of the documentation that have never been
1115 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
1116 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1117 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
1118 * EXACTFL nodes, violate the assumption, and they are the only instances
1119 * where it is violated. I'm reluctant to try to change the assumption,
1120 * as the code involved is impenetrable to me (khw), so instead the code
1121 * here punts. This routine examines EXACTFL nodes, and (when the pattern
1122 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1123 * boolean indicating whether or not the node contains such a fold. When
1124 * it is true, the caller sets a flag that later causes the optimizer in
1125 * this file to not set values for the floating and fixed string lengths,
1126 * and thus avoids the optimizer code in regexec.c that makes the invalid
1127 * assumption. Thus, there is no optimization based on string lengths for
1128 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1129 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
1130 * assumption is wrong only in these cases is that all other non-UTF-8
1131 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1132 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
1133 * EXACTF nodes because we don't know at compile time if it actually
1134 * matches 'ss' or not. For EXACTF nodes it will match iff the target
1135 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
1136 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
1137 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1138 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1139 * string would require the pattern to be forced into UTF-8, the overhead
1140 * of which we want to avoid. Similarly the unfolded multi-char folds in
1141 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1142 * locale.)
1143 *
1144 * Similarly, the code that generates tries doesn't currently handle
1145 * not-already-folded multi-char folds, and it looks like a pain to change
1146 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
1147 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
1148 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
1149 * using /iaa matching will be doing so almost entirely with ASCII
1150 * strings, so this should rarely be encountered in practice */
1151
1152U32
1153Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1154 UV *min_subtract, bool *unfolded_multi_char,
1155 U32 flags, regnode *val, U32 depth)
1156{
1157 /* Merge several consecutive EXACTish nodes into one. */
1158
1159 regnode *n = regnext(scan);
1160 U32 stringok = 1;
1161 regnode *next = REGNODE_AFTER_varies(scan);
1162 U32 merged = 0;
1163 U32 stopnow = 0;
1164#ifdef DEBUGGING
1165 regnode *stop = scan;
1166 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1167#else
1168 PERL_UNUSED_ARG(depth);
1169#endif
1170
1171 PERL_ARGS_ASSERT_JOIN_EXACT;
1172#ifndef EXPERIMENTAL_INPLACESCAN
1173 PERL_UNUSED_ARG(flags);
1174 PERL_UNUSED_ARG(val);
1175#endif
1176 DEBUG_PEEP("join", scan, depth, 0);
1177
1178 assert(REGNODE_TYPE(OP(scan)) == EXACT);
1179
1180 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
1181 * EXACT ones that are mergeable to the current one. */
1182 while ( n
1183 && ( REGNODE_TYPE(OP(n)) == NOTHING
1184 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1185 && NEXT_OFF(n)
1186 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1187 {
1188
1189 if (OP(n) == TAIL || n > next)
1190 stringok = 0;
1191 if (REGNODE_TYPE(OP(n)) == NOTHING) {
1192 DEBUG_PEEP("skip:", n, depth, 0);
1193 NEXT_OFF(scan) += NEXT_OFF(n);
1194 next = n + NODE_STEP_REGNODE;
1195#ifdef DEBUGGING
1196 if (stringok)
1197 stop = n;
1198#endif
1199 n = regnext(n);
1200 }
1201 else if (stringok) {
1202 const unsigned int oldl = STR_LEN(scan);
1203 regnode * const nnext = regnext(n);
1204
1205 /* XXX I (khw) kind of doubt that this works on platforms (should
1206 * Perl ever run on one) where U8_MAX is above 255 because of lots
1207 * of other assumptions */
1208 /* Don't join if the sum can't fit into a single node */
1209 if (oldl + STR_LEN(n) > U8_MAX)
1210 break;
1211
1212 /* Joining something that requires UTF-8 with something that
1213 * doesn't, means the result requires UTF-8. */
1214 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1215 OP(scan) = EXACT_REQ8;
1216 }
1217 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1218 ; /* join is compatible, no need to change OP */
1219 }
1220 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1221 OP(scan) = EXACTFU_REQ8;
1222 }
1223 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1224 ; /* join is compatible, no need to change OP */
1225 }
1226 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1227 ; /* join is compatible, no need to change OP */
1228 }
1229 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1230
1231 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1232 * which can join with EXACTFU ones. We check for this case
1233 * here. These need to be resolved to either EXACTFU or
1234 * EXACTF at joining time. They have nothing in them that
1235 * would forbid them from being the more desirable EXACTFU
1236 * nodes except that they begin and/or end with a single [Ss].
1237 * The reason this is problematic is because they could be
1238 * joined in this loop with an adjacent node that ends and/or
1239 * begins with [Ss] which would then form the sequence 'ss',
1240 * which matches differently under /di than /ui, in which case
1241 * EXACTFU can't be used. If the 'ss' sequence doesn't get
1242 * formed, the nodes get absorbed into any adjacent EXACTFU
1243 * node. And if the only adjacent node is EXACTF, they get
1244 * absorbed into that, under the theory that a longer node is
1245 * better than two shorter ones, even if one is EXACTFU. Note
1246 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1247 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
1248
1249 if (STRING(n)[STR_LEN(n)-1] == 's') {
1250
1251 /* Here the joined node would end with 's'. If the node
1252 * following the combination is an EXACTF one, it's better to
1253 * join this trailing edge 's' node with that one, leaving the
1254 * current one in 'scan' be the more desirable EXACTFU */
1255 if (OP(nnext) == EXACTF) {
1256 break;
1257 }
1258
1259 OP(scan) = EXACTFU_S_EDGE;
1260
1261 } /* Otherwise, the beginning 's' of the 2nd node just
1262 becomes an interior 's' in 'scan' */
1263 }
1264 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1265 ; /* join is compatible, no need to change OP */
1266 }
1267 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1268
1269 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1270 * nodes. But the latter nodes can be also joined with EXACTFU
1271 * ones, and that is a better outcome, so if the node following
1272 * 'n' is EXACTFU, quit now so that those two can be joined
1273 * later */
1274 if (OP(nnext) == EXACTFU) {
1275 break;
1276 }
1277
1278 /* The join is compatible, and the combined node will be
1279 * EXACTF. (These don't care if they begin or end with 's' */
1280 }
1281 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1282 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
1283 && STRING(n)[0] == 's')
1284 {
1285 /* When combined, we have the sequence 'ss', which means we
1286 * have to remain /di */
1287 OP(scan) = EXACTF;
1288 }
1289 }
1290 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1291 if (STRING(n)[0] == 's') {
1292 ; /* Here the join is compatible and the combined node
1293 starts with 's', no need to change OP */
1294 }
1295 else { /* Now the trailing 's' is in the interior */
1296 OP(scan) = EXACTFU;
1297 }
1298 }
1299 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1300
1301 /* The join is compatible, and the combined node will be
1302 * EXACTF. (These don't care if they begin or end with 's' */
1303 OP(scan) = EXACTF;
1304 }
1305 else if (OP(scan) != OP(n)) {
1306
1307 /* The only other compatible joinings are the same node type */
1308 break;
1309 }
1310
1311 DEBUG_PEEP("merg", n, depth, 0);
1312 merged++;
1313
1314 next = REGNODE_AFTER_varies(n);
1315 NEXT_OFF(scan) += NEXT_OFF(n);
1316 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1317 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1318 /* Now we can overwrite *n : */
1319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1320#ifdef DEBUGGING
1321 stop = next - 1;
1322#endif
1323 n = nnext;
1324 if (stopnow) break;
1325 }
1326
1327#ifdef EXPERIMENTAL_INPLACESCAN
1328 if (flags && !NEXT_OFF(n)) {
1329 DEBUG_PEEP("atch", val, depth, 0);
1330 if (REGNODE_OFF_BY_ARG(OP(n))) {
1331 ARG_SET(n, val - n);
1332 }
1333 else {
1334 NEXT_OFF(n) = val - n;
1335 }
1336 stopnow = 1;
1337 }
1338#endif
1339 }
1340
1341 /* This temporary node can now be turned into EXACTFU, and must, as
1342 * regexec.c doesn't handle it */
1343 if (OP(scan) == EXACTFU_S_EDGE) {
1344 OP(scan) = EXACTFU;
1345 }
1346
1347 *min_subtract = 0;
1348 *unfolded_multi_char = FALSE;
1349
1350 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
1351 * can now analyze for sequences of problematic code points. (Prior to
1352 * this final joining, sequences could have been split over boundaries, and
1353 * hence missed). The sequences only happen in folding, hence for any
1354 * non-EXACT EXACTish node */
1355 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1356 U8* s0 = (U8*) STRING(scan);
1357 U8* s = s0;
1358 U8* s_end = s0 + STR_LEN(scan);
1359
1360 int total_count_delta = 0; /* Total delta number of characters that
1361 multi-char folds expand to */
1362
1363 /* One pass is made over the node's string looking for all the
1364 * possibilities. To avoid some tests in the loop, there are two main
1365 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1366 * non-UTF-8 */
1367 if (UTF) {
1368 U8* folded = NULL;
1369
1370 if (OP(scan) == EXACTFL) {
1371 U8 *d;
1372
1373 /* An EXACTFL node would already have been changed to another
1374 * node type unless there is at least one character in it that
1375 * is problematic; likely a character whose fold definition
1376 * won't be known until runtime, and so has yet to be folded.
1377 * For all but the UTF-8 locale, folds are 1-1 in length, but
1378 * to handle the UTF-8 case, we need to create a temporary
1379 * folded copy using UTF-8 locale rules in order to analyze it.
1380 * This is because our macros that look to see if a sequence is
1381 * a multi-char fold assume everything is folded (otherwise the
1382 * tests in those macros would be too complicated and slow).
1383 * Note that here, the non-problematic folds will have already
1384 * been done, so we can just copy such characters. We actually
1385 * don't completely fold the EXACTFL string. We skip the
1386 * unfolded multi-char folds, as that would just create work
1387 * below to figure out the size they already are */
1388
1389 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1390 d = folded;
1391 while (s < s_end) {
1392 STRLEN s_len = UTF8SKIP(s);
1393 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1394 Copy(s, d, s_len, U8);
1395 d += s_len;
1396 }
1397 else if (is_FOLDS_TO_MULTI_utf8(s)) {
1398 *unfolded_multi_char = TRUE;
1399 Copy(s, d, s_len, U8);
1400 d += s_len;
1401 }
1402 else if (isASCII(*s)) {
1403 *(d++) = toFOLD(*s);
1404 }
1405 else {
1406 STRLEN len;
1407 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1408 d += len;
1409 }
1410 s += s_len;
1411 }
1412
1413 /* Point the remainder of the routine to look at our temporary
1414 * folded copy */
1415 s = folded;
1416 s_end = d;
1417 } /* End of creating folded copy of EXACTFL string */
1418
1419 /* Examine the string for a multi-character fold sequence. UTF-8
1420 * patterns have all characters pre-folded by the time this code is
1421 * executed */
1422 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1423 length sequence we are looking for is 2 */
1424 {
1425 int count = 0; /* How many characters in a multi-char fold */
1426 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1427 if (! len) { /* Not a multi-char fold: get next char */
1428 s += UTF8SKIP(s);
1429 continue;
1430 }
1431
1432 { /* Here is a generic multi-char fold. */
1433 U8* multi_end = s + len;
1434
1435 /* Count how many characters are in it. In the case of
1436 * /aa, no folds which contain ASCII code points are
1437 * allowed, so check for those, and skip if found. */
1438 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1439 count = utf8_length(s, multi_end);
1440 s = multi_end;
1441 }
1442 else {
1443 while (s < multi_end) {
1444 if (isASCII(*s)) {
1445 s++;
1446 goto next_iteration;
1447 }
1448 else {
1449 s += UTF8SKIP(s);
1450 }
1451 count++;
1452 }
1453 }
1454 }
1455
1456 /* The delta is how long the sequence is minus 1 (1 is how long
1457 * the character that folds to the sequence is) */
1458 total_count_delta += count - 1;
1459 next_iteration: ;
1460 }
1461
1462 /* We created a temporary folded copy of the string in EXACTFL
1463 * nodes. Therefore we need to be sure it doesn't go below zero,
1464 * as the real string could be shorter */
1465 if (OP(scan) == EXACTFL) {
1466 int total_chars = utf8_length((U8*) STRING(scan),
1467 (U8*) STRING(scan) + STR_LEN(scan));
1468 if (total_count_delta > total_chars) {
1469 total_count_delta = total_chars;
1470 }
1471 }
1472
1473 *min_subtract += total_count_delta;
1474 Safefree(folded);
1475 }
1476 else if (OP(scan) == EXACTFAA) {
1477
1478 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
1479 * fold to the ASCII range (and there are no existing ones in the
1480 * upper latin1 range). But, as outlined in the comments preceding
1481 * this function, we need to flag any occurrences of the sharp s.
1482 * This character forbids trie formation (because of added
1483 * complexity) */
1484#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1485 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1486 || UNICODE_DOT_DOT_VERSION > 0)
1487 while (s < s_end) {
1488 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1489 OP(scan) = EXACTFAA_NO_TRIE;
1490 *unfolded_multi_char = TRUE;
1491 break;
1492 }
1493 s++;
1494 }
1495 }
1496 else if (OP(scan) != EXACTFAA_NO_TRIE) {
1497
1498 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
1499 * folds that are all Latin1. As explained in the comments
1500 * preceding this function, we look also for the sharp s in EXACTF
1501 * and EXACTFL nodes; it can be in the final position. Otherwise
1502 * we can stop looking 1 byte earlier because have to find at least
1503 * two characters for a multi-fold */
1504 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1505 ? s_end
1506 : s_end -1;
1507
1508 while (s < upper) {
1509 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1510 if (! len) { /* Not a multi-char fold. */
1511 if (*s == LATIN_SMALL_LETTER_SHARP_S
1512 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1513 {
1514 *unfolded_multi_char = TRUE;
1515 }
1516 s++;
1517 continue;
1518 }
1519
1520 if (len == 2
1521 && isALPHA_FOLD_EQ(*s, 's')
1522 && isALPHA_FOLD_EQ(*(s+1), 's'))
1523 {
1524
1525 /* EXACTF nodes need to know that the minimum length
1526 * changed so that a sharp s in the string can match this
1527 * ss in the pattern, but they remain EXACTF nodes, as they
1528 * won't match this unless the target string is in UTF-8,
1529 * which we don't know until runtime. EXACTFL nodes can't
1530 * transform into EXACTFU nodes */
1531 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1532 OP(scan) = EXACTFUP;
1533 }
1534 }
1535
1536 *min_subtract += len - 1;
1537 s += len;
1538 }
1539#endif
1540 }
1541 }
1542
1543#ifdef DEBUGGING
1544 /* Allow dumping but overwriting the collection of skipped
1545 * ops and/or strings with fake optimized ops */
1546 n = REGNODE_AFTER_varies(scan);
1547 while (n <= stop) {
1548 OP(n) = OPTIMIZED;
1549 FLAGS(n) = 0;
1550 NEXT_OFF(n) = 0;
1551 n++;
1552 }
1553#endif
1554 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1555 return stopnow;
1556}
1557
1558/* REx optimizer. Converts nodes into quicker variants "in place".
1559 Finds fixed substrings. */
1560
1561
1562/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1563 to the position after last scanned or to NULL. */
1564
1565/* the return from this sub is the minimum length that could possibly match */
1566SSize_t
1567Perl_study_chunk(pTHX_
1568 RExC_state_t *pRExC_state,
1569 regnode **scanp, /* Start here (read-write). */
1570 SSize_t *minlenp, /* used for the minlen of substrings? */
1571 SSize_t *deltap, /* Write maxlen-minlen here. */
1572 regnode *last, /* Stop before this one. */
1573 scan_data_t *data, /* string data about the pattern */
1574 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
1575 U32 recursed_depth, /* how deep have we recursed via GOSUB */
1576 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1577 U32 flags, /* flags controlling this call, see SCF_ flags */
1578 U32 depth, /* how deep have we recursed period */
1579 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
1580 FALSE only if the caller (recursively) was
1581 prohibited from modifying the regops, because
1582 a higher caller is holding a ptr to them. */
1583)
1584{
1585 /* vars about the regnodes we are working with */
1586 regnode *scan = *scanp; /* the current opcode we are inspecting */
1587 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
1588 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1589 the first non open regop, if the init
1590 val IS an OPEN then we will skip past
1591 it just after the var decls section */
1592 I32 code = 0; /* temp var used to hold the optype of a regop */
1593
1594 /* vars about the min and max length of the pattern */
1595 SSize_t min = 0; /* min length of this part of the pattern */
1596 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1597 this is adjusted down if we find
1598 an ACCEPT */
1599 SSize_t delta = 0; /* difference between min and max length
1600 (not accounting for stopmin) */
1601
1602 /* vars about capture buffers in the pattern */
1603 I32 pars = 0; /* count of OPEN opcodes */
1604 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1605
1606 /* vars about whether this pattern contains something that can match
1607 * infinitely long strings, eg, X* or X+ */
1608 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1609 int is_inf_internal = 0; /* The studied chunk is infinite */
1610
1611 /* scan_data_t (struct) is used to hold information about the substrings
1612 * and start class we have extracted from the string */
1613 scan_data_t data_fake; /* temp var used for recursing in some cases */
1614
1615 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1616 trie optimizations */
1617
1618 scan_frame *frame = NULL; /* used as part of fake recursion */
1619
1620 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1621
1622 PERL_ARGS_ASSERT_STUDY_CHUNK;
1623 RExC_study_started= 1;
1624
1625 Zero(&data_fake, 1, scan_data_t);
1626
1627 if ( depth == 0 ) {
1628 while (first_non_open && OP(first_non_open) == OPEN)
1629 first_non_open=regnext(first_non_open);
1630 }
1631
1632 fake_study_recurse:
1633 DEBUG_r(
1634 RExC_study_chunk_recursed_count++;
1635 );
1636 DEBUG_OPTIMISE_MORE_r(
1637 {
1638 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
1639 depth, (long)stopparen,
1640 (unsigned long)RExC_study_chunk_recursed_count,
1641 (unsigned long)depth, (unsigned long)recursed_depth,
1642 scan,
1643 last);
1644 if (recursed_depth) {
1645 U32 i;
1646 U32 j;
1647 for ( j = 0 ; j < recursed_depth ; j++ ) {
1648 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1649 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1650 Perl_re_printf( aTHX_ " %d",(int)i);
1651 break;
1652 }
1653 }
1654 if ( j + 1 < recursed_depth ) {
1655 Perl_re_printf( aTHX_ ",");
1656 }
1657 }
1658 }
1659 Perl_re_printf( aTHX_ "\n");
1660 }
1661 );
1662 while ( scan && OP(scan) != END && scan < last ){
1663 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
1664 node length to get a real minimum (because
1665 the folded version may be shorter) */
1666 bool unfolded_multi_char = FALSE;
1667 /* avoid mutating ops if we are anywhere within the recursed or
1668 * enframed handling for a GOSUB: the outermost level will handle it.
1669 */
1670 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1671 /* Peephole optimizer: */
1672 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1673 DEBUG_PEEP("Peep", scan, depth, flags);
1674
1675
1676 /* The reason we do this here is that we need to deal with things like
1677 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1678 * parsing code, as each (?:..) is handled by a different invocation of
1679 * reg() -- Yves
1680 */
1681 if (REGNODE_TYPE(OP(scan)) == EXACT
1682 && OP(scan) != LEXACT
1683 && OP(scan) != LEXACT_REQ8
1684 && mutate_ok
1685 ) {
1686 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1687 0, NULL, depth + 1);
1688 }
1689
1690 /* Follow the next-chain of the current node and optimize
1691 away all the NOTHINGs from it.
1692 */
1693 rck_elide_nothing(scan);
1694
1695 /* The principal pseudo-switch. Cannot be a switch, since we look into
1696 * several different things. */
1697 if ( OP(scan) == DEFINEP ) {
1698 SSize_t minlen = 0;
1699 SSize_t deltanext = 0;
1700 SSize_t fake_last_close = 0;
1701 regnode *fake_last_close_op = NULL;
1702 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1703
1704 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1705 scan = regnext(scan);
1706 assert( OP(scan) == IFTHEN );
1707 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1708
1709 data_fake.last_closep= &fake_last_close;
1710 data_fake.last_close_opp= &fake_last_close_op;
1711 minlen = *minlenp;
1712 next = regnext(scan);
1713 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1714 DEBUG_PEEP("scan", scan, depth, flags);
1715 DEBUG_PEEP("next", next, depth, flags);
1716
1717 /* we suppose the run is continuous, last=next...
1718 * NOTE we dont use the return here! */
1719 /* DEFINEP study_chunk() recursion */
1720 (void)study_chunk(pRExC_state, &scan, &minlen,
1721 &deltanext, next, &data_fake, stopparen,
1722 recursed_depth, NULL, f, depth+1, mutate_ok);
1723
1724 scan = next;
1725 } else
1726 if (
1727 OP(scan) == BRANCH ||
1728 OP(scan) == BRANCHJ ||
1729 OP(scan) == IFTHEN
1730 ) {
1731 next = regnext(scan);
1732 code = OP(scan);
1733
1734 /* The op(next)==code check below is to see if we
1735 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1736 * IFTHEN is special as it might not appear in pairs.
1737 * Not sure whether BRANCH-BRANCHJ is possible, regardless
1738 * we dont handle it cleanly. */
1739 if (OP(next) == code || code == IFTHEN) {
1740 /* NOTE - There is similar code to this block below for
1741 * handling TRIE nodes on a re-study. If you change stuff here
1742 * check there too. */
1743 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1744 regnode_ssc accum;
1745 regnode * const startbranch=scan;
1746
1747 if (flags & SCF_DO_SUBSTR) {
1748 /* Cannot merge strings after this. */
1749 scan_commit(pRExC_state, data, minlenp, is_inf);
1750 }
1751
1752 if (flags & SCF_DO_STCLASS)
1753 ssc_init_zero(pRExC_state, &accum);
1754
1755 while (OP(scan) == code) {
1756 SSize_t deltanext, minnext, fake_last_close = 0;
1757 regnode *fake_last_close_op = NULL;
1758 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1759 regnode_ssc this_class;
1760
1761 DEBUG_PEEP("Branch", scan, depth, flags);
1762
1763 num++;
1764 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1765 if (data) {
1766 data_fake.whilem_c = data->whilem_c;
1767 data_fake.last_closep = data->last_closep;
1768 data_fake.last_close_opp = data->last_close_opp;
1769 }
1770 else {
1771 data_fake.last_closep = &fake_last_close;
1772 data_fake.last_close_opp = &fake_last_close_op;
1773 }
1774
1775 data_fake.pos_delta = delta;
1776 next = regnext(scan);
1777
1778 scan = REGNODE_AFTER_opcode(scan, code);
1779
1780 if (flags & SCF_DO_STCLASS) {
1781 ssc_init(pRExC_state, &this_class);
1782 data_fake.start_class = &this_class;
1783 f |= SCF_DO_STCLASS_AND;
1784 }
1785 if (flags & SCF_WHILEM_VISITED_POS)
1786 f |= SCF_WHILEM_VISITED_POS;
1787
1788 /* we suppose the run is continuous, last=next...*/
1789 /* recurse study_chunk() for each BRANCH in an alternation */
1790 minnext = study_chunk(pRExC_state, &scan, minlenp,
1791 &deltanext, next, &data_fake, stopparen,
1792 recursed_depth, NULL, f, depth+1,
1793 mutate_ok);
1794
1795 if (min1 > minnext)
1796 min1 = minnext;
1797 if (deltanext == OPTIMIZE_INFTY) {
1798 is_inf = is_inf_internal = 1;
1799 max1 = OPTIMIZE_INFTY;
1800 } else if (max1 < minnext + deltanext)
1801 max1 = minnext + deltanext;
1802 scan = next;
1803 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1804 pars++;
1805 if (data_fake.flags & SCF_SEEN_ACCEPT) {
1806 if ( stopmin > minnext)
1807 stopmin = min + min1;
1808 flags &= ~SCF_DO_SUBSTR;
1809 if (data)
1810 data->flags |= SCF_SEEN_ACCEPT;
1811 }
1812 if (data) {
1813 if (data_fake.flags & SF_HAS_EVAL)
1814 data->flags |= SF_HAS_EVAL;
1815 data->whilem_c = data_fake.whilem_c;
1816 }
1817 if (flags & SCF_DO_STCLASS)
1818 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1819 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1820 }
1821 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1822 min1 = 0;
1823 if (flags & SCF_DO_SUBSTR) {
1824 data->pos_min += min1;
1825 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1826 data->pos_delta = OPTIMIZE_INFTY;
1827 else
1828 data->pos_delta += max1 - min1;
1829 if (max1 != min1 || is_inf)
1830 data->cur_is_floating = 1;
1831 }
1832 min += min1;
1833 if (delta == OPTIMIZE_INFTY
1834 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1835 delta = OPTIMIZE_INFTY;
1836 else
1837 delta += max1 - min1;
1838 if (flags & SCF_DO_STCLASS_OR) {
1839 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1840 if (min1) {
1841 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1842 flags &= ~SCF_DO_STCLASS;
1843 }
1844 }
1845 else if (flags & SCF_DO_STCLASS_AND) {
1846 if (min1) {
1847 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1848 flags &= ~SCF_DO_STCLASS;
1849 }
1850 else {
1851 /* Switch to OR mode: cache the old value of
1852 * data->start_class */
1853 INIT_AND_WITHP;
1854 StructCopy(data->start_class, and_withp, regnode_ssc);
1855 flags &= ~SCF_DO_STCLASS_AND;
1856 StructCopy(&accum, data->start_class, regnode_ssc);
1857 flags |= SCF_DO_STCLASS_OR;
1858 }
1859 }
1860 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1861
1862 if (PERL_ENABLE_TRIE_OPTIMISATION
1863 && OP(startbranch) == BRANCH
1864 && mutate_ok
1865 ) {
1866 /* demq.
1867
1868 Assuming this was/is a branch we are dealing with: 'scan'
1869 now points at the item that follows the branch sequence,
1870 whatever it is. We now start at the beginning of the
1871 sequence and look for subsequences of
1872
1873 BRANCH->EXACT=>x1
1874 BRANCH->EXACT=>x2
1875 tail
1876
1877 which would be constructed from a pattern like
1878 /A|LIST|OF|WORDS/
1879
1880 If we can find such a subsequence we need to turn the first
1881 element into a trie and then add the subsequent branch exact
1882 strings to the trie.
1883
1884 We have two cases
1885
1886 1. patterns where the whole set of branches can be
1887 converted.
1888
1889 2. patterns where only a subset can be converted.
1890
1891 In case 1 we can replace the whole set with a single regop
1892 for the trie. In case 2 we need to keep the start and end
1893 branches so
1894
1895 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1896 becomes BRANCH TRIE; BRANCH X;
1897
1898 There is an additional case, that being where there is a
1899 common prefix, which gets split out into an EXACT like node
1900 preceding the TRIE node.
1901
1902 If X(1..n)==tail then we can do a simple trie, if not we make
1903 a "jump" trie, such that when we match the appropriate word
1904 we "jump" to the appropriate tail node. Essentially we turn
1905 a nested if into a case structure of sorts.
1906
1907 */
1908
1909 int made=0;
1910 if (!re_trie_maxbuff) {
1911 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1912 if (!SvIOK(re_trie_maxbuff))
1913 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1914 }
1915 if ( SvIV(re_trie_maxbuff)>=0 ) {
1916 regnode *cur;
1917 regnode *first = (regnode *)NULL;
1918 regnode *prev = (regnode *)NULL;
1919 regnode *tail = scan;
1920 U8 trietype = 0;
1921 U32 count=0;
1922
1923 /* var tail is used because there may be a TAIL
1924 regop in the way. Ie, the exacts will point to the
1925 thing following the TAIL, but the last branch will
1926 point at the TAIL. So we advance tail. If we
1927 have nested (?:) we may have to move through several
1928 tails.
1929 */
1930
1931 while ( OP( tail ) == TAIL ) {
1932 /* this is the TAIL generated by (?:) */
1933 tail = regnext( tail );
1934 }
1935
1936
1937 DEBUG_TRIE_COMPILE_r({
1938 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1939 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
1940 depth+1,
1941 "Looking for TRIE'able sequences. Tail node is ",
1942 (UV) REGNODE_OFFSET(tail),
1943 SvPV_nolen_const( RExC_mysv )
1944 );
1945 });
1946
1947 /*
1948
1949 Step through the branches
1950 cur represents each branch,
1951 noper is the first thing to be matched as part
1952 of that branch
1953 noper_next is the regnext() of that node.
1954
1955 We normally handle a case like this
1956 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1957 support building with NOJUMPTRIE, which restricts
1958 the trie logic to structures like /FOO|BAR/.
1959
1960 If noper is a trieable nodetype then the branch is
1961 a possible optimization target. If we are building
1962 under NOJUMPTRIE then we require that noper_next is
1963 the same as scan (our current position in the regex
1964 program).
1965
1966 Once we have two or more consecutive such branches
1967 we can create a trie of the EXACT's contents and
1968 stitch it in place into the program.
1969
1970 If the sequence represents all of the branches in
1971 the alternation we replace the entire thing with a
1972 single TRIE node.
1973
1974 Otherwise when it is a subsequence we need to
1975 stitch it in place and replace only the relevant
1976 branches. This means the first branch has to remain
1977 as it is used by the alternation logic, and its
1978 next pointer, and needs to be repointed at the item
1979 on the branch chain following the last branch we
1980 have optimized away.
1981
1982 This could be either a BRANCH, in which case the
1983 subsequence is internal, or it could be the item
1984 following the branch sequence in which case the
1985 subsequence is at the end (which does not
1986 necessarily mean the first node is the start of the
1987 alternation).
1988
1989 TRIE_TYPE(X) is a define which maps the optype to a
1990 trietype.
1991
1992 optype | trietype
1993 ----------------+-----------
1994 NOTHING | NOTHING
1995 EXACT | EXACT
1996 EXACT_REQ8 | EXACT
1997 EXACTFU | EXACTFU
1998 EXACTFU_REQ8 | EXACTFU
1999 EXACTFUP | EXACTFU
2000 EXACTFAA | EXACTFAA
2001 EXACTL | EXACTL
2002 EXACTFLU8 | EXACTFLU8
2003
2004
2005 */
2006#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
2007 ? NOTHING \
2008 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
2009 ? EXACT \
2010 : ( EXACTFU == (X) \
2011 || EXACTFU_REQ8 == (X) \
2012 || EXACTFUP == (X) ) \
2013 ? EXACTFU \
2014 : ( EXACTFAA == (X) ) \
2015 ? EXACTFAA \
2016 : ( EXACTL == (X) ) \
2017 ? EXACTL \
2018 : ( EXACTFLU8 == (X) ) \
2019 ? EXACTFLU8 \
2020 : 0 )
2021
2022 /* dont use tail as the end marker for this traverse */
2023 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2024 regnode * const noper = REGNODE_AFTER( cur );
2025 U8 noper_type = OP( noper );
2026 U8 noper_trietype = TRIE_TYPE( noper_type );
2027#if defined(DEBUGGING) || defined(NOJUMPTRIE)
2028 regnode * const noper_next = regnext( noper );
2029 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2030 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
2031#endif
2032
2033 DEBUG_TRIE_COMPILE_r({
2034 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2035 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
2036 depth+1,
2037 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
2038
2039 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
2040 Perl_re_printf( aTHX_ " -> %d:%s",
2041 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
2042
2043 if ( noper_next ) {
2044 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
2045 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
2046 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
2047 }
2048 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
2049 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2050 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
2051 );
2052 });
2053
2054 /* Is noper a trieable nodetype that can be merged
2055 * with the current trie (if there is one)? */
2056 if ( noper_trietype
2057 &&
2058 (
2059 ( noper_trietype == NOTHING )
2060 || ( trietype == NOTHING )
2061 || ( trietype == noper_trietype )
2062 )
2063#ifdef NOJUMPTRIE
2064 && noper_next >= tail
2065#endif
2066 && count < U16_MAX)
2067 {
2068 /* Handle mergable triable node Either we are
2069 * the first node in a new trieable sequence,
2070 * in which case we do some bookkeeping,
2071 * otherwise we update the end pointer. */
2072 if ( !first ) {
2073 first = cur;
2074 if ( noper_trietype == NOTHING ) {
2075#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
2076 regnode * const noper_next = regnext( noper );
2077 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2078 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
2079#endif
2080
2081 if ( noper_next_trietype ) {
2082 trietype = noper_next_trietype;
2083 } else if (noper_next_type) {
2084 /* a NOTHING regop is 1 regop wide.
2085 * We need at least two for a trie
2086 * so we can't merge this in */
2087 first = NULL;
2088 }
2089 } else {
2090 trietype = noper_trietype;
2091 }
2092 } else {
2093 if ( trietype == NOTHING )
2094 trietype = noper_trietype;
2095 prev = cur;
2096 }
2097 if (first)
2098 count++;
2099 } /* end handle mergable triable node */
2100 else {
2101 /* handle unmergable node -
2102 * noper may either be a triable node which can
2103 * not be tried together with the current trie,
2104 * or a non triable node */
2105 if ( prev ) {
2106 /* If last is set and trietype is not
2107 * NOTHING then we have found at least two
2108 * triable branch sequences in a row of a
2109 * similar trietype so we can turn them
2110 * into a trie. If/when we allow NOTHING to
2111 * start a trie sequence this condition
2112 * will be required, and it isn't expensive
2113 * so we leave it in for now. */
2114 if ( trietype && trietype != NOTHING )
2115 make_trie( pRExC_state,
2116 startbranch, first, cur, tail,
2117 count, trietype, depth+1 );
2118 prev = NULL; /* note: we clear/update
2119 first, trietype etc below,
2120 so we dont do it here */
2121 }
2122 if ( noper_trietype
2123#ifdef NOJUMPTRIE
2124 && noper_next >= tail
2125#endif
2126 ){
2127 /* noper is triable, so we can start a new
2128 * trie sequence */
2129 count = 1;
2130 first = cur;
2131 trietype = noper_trietype;
2132 } else if (first) {
2133 /* if we already saw a first but the
2134 * current node is not triable then we have
2135 * to reset the first information. */
2136 count = 0;
2137 first = NULL;
2138 trietype = 0;
2139 }
2140 } /* end handle unmergable node */
2141 } /* loop over branches */
2142 DEBUG_TRIE_COMPILE_r({
2143 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2144 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
2145 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2146 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
2147 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2148 REGNODE_NAME(trietype)
2149 );
2150
2151 });
2152 if ( prev && trietype ) {
2153 if ( trietype != NOTHING ) {
2154 /* the last branch of the sequence was part of
2155 * a trie, so we have to construct it here
2156 * outside of the loop */
2157 made= make_trie( pRExC_state, startbranch,
2158 first, scan, tail, count,
2159 trietype, depth+1 );
2160#ifdef TRIE_STUDY_OPT
2161 if ( ((made == MADE_EXACT_TRIE &&
2162 startbranch == first)
2163 || ( first_non_open == first )) &&
2164 depth==0 ) {
2165 flags |= SCF_TRIE_RESTUDY;
2166 if ( startbranch == first
2167 && scan >= tail )
2168 {
2169 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2170 }
2171 }
2172#endif
2173 } else {
2174 /* at this point we know whatever we have is a
2175 * NOTHING sequence/branch AND if 'startbranch'
2176 * is 'first' then we can turn the whole thing
2177 * into a NOTHING
2178 */
2179 if ( startbranch == first ) {
2180 regnode *opt;
2181 /* the entire thing is a NOTHING sequence,
2182 * something like this: (?:|) So we can
2183 * turn it into a plain NOTHING op. */
2184 DEBUG_TRIE_COMPILE_r({
2185 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2186 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2187 depth+1,
2188 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2189
2190 });
2191 OP(startbranch)= NOTHING;
2192 NEXT_OFF(startbranch)= tail - startbranch;
2193 for ( opt= startbranch + 1; opt < tail ; opt++ )
2194 OP(opt)= OPTIMIZED;
2195 }
2196 }
2197 } /* end if ( prev) */
2198 } /* TRIE_MAXBUF is non zero */
2199 } /* do trie */
2200 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2201 }
2202 else
2203 scan = REGNODE_AFTER_opcode(scan,code);
2204 continue;
2205 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2206 I32 paren = 0;
2207 regnode *start = NULL;
2208 regnode *end = NULL;
2209 U32 my_recursed_depth= recursed_depth;
2210
2211 if (OP(scan) != SUSPEND) { /* GOSUB */
2212 /* Do setup, note this code has side effects beyond
2213 * the rest of this block. Specifically setting
2214 * RExC_recurse[] must happen at least once during
2215 * study_chunk(). */
2216 paren = ARG(scan);
2217 RExC_recurse[ARG2L(scan)] = scan;
2218 start = REGNODE_p(RExC_open_parens[paren]);
2219 end = REGNODE_p(RExC_close_parens[paren]);
2220
2221 /* NOTE we MUST always execute the above code, even
2222 * if we do nothing with a GOSUB */
2223 if (
2224 ( flags & SCF_IN_DEFINE )
2225 ||
2226 (
2227 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2228 &&
2229 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2230 )
2231 ) {
2232 /* no need to do anything here if we are in a define. */
2233 /* or we are after some kind of infinite construct
2234 * so we can skip recursing into this item.
2235 * Since it is infinite we will not change the maxlen
2236 * or delta, and if we miss something that might raise
2237 * the minlen it will merely pessimise a little.
2238 *
2239 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2240 * might result in a minlen of 1 and not of 4,
2241 * but this doesn't make us mismatch, just try a bit
2242 * harder than we should.
2243 *
2244 * However we must assume this GOSUB is infinite, to
2245 * avoid wrongly applying other optimizations in the
2246 * enclosing scope - see GH 18096, for example.
2247 */
2248 is_inf = is_inf_internal = 1;
2249 scan= regnext(scan);
2250 continue;
2251 }
2252
2253 if (
2254 !recursed_depth
2255 || !PAREN_TEST(recursed_depth - 1, paren)
2256 ) {
2257 /* it is quite possible that there are more efficient ways
2258 * to do this. We maintain a bitmap per level of recursion
2259 * of which patterns we have entered so we can detect if a
2260 * pattern creates a possible infinite loop. When we
2261 * recurse down a level we copy the previous levels bitmap
2262 * down. When we are at recursion level 0 we zero the top
2263 * level bitmap. It would be nice to implement a different
2264 * more efficient way of doing this. In particular the top
2265 * level bitmap may be unnecessary.
2266 */
2267 if (!recursed_depth) {
2268 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2269 } else {
2270 Copy(PAREN_OFFSET(recursed_depth - 1),
2271 PAREN_OFFSET(recursed_depth),
2272 RExC_study_chunk_recursed_bytes, U8);
2273 }
2274 /* we havent recursed into this paren yet, so recurse into it */
2275 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2276 PAREN_SET(recursed_depth, paren);
2277 my_recursed_depth= recursed_depth + 1;
2278 } else {
2279 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2280 /* some form of infinite recursion, assume infinite length
2281 * */
2282 if (flags & SCF_DO_SUBSTR) {
2283 scan_commit(pRExC_state, data, minlenp, is_inf);
2284 data->cur_is_floating = 1;
2285 }
2286 is_inf = is_inf_internal = 1;
2287 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2288 ssc_anything(data->start_class);
2289 flags &= ~SCF_DO_STCLASS;
2290
2291 start= NULL; /* reset start so we dont recurse later on. */
2292 }
2293 } else {
2294 paren = stopparen;
2295 start = scan + 2;
2296 end = regnext(scan);
2297 }
2298 if (start) {
2299 scan_frame *newframe;
2300 assert(end);
2301 if (!RExC_frame_last) {
2302 Newxz(newframe, 1, scan_frame);
2303 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2304 RExC_frame_head= newframe;
2305 RExC_frame_count++;
2306 } else if (!RExC_frame_last->next_frame) {
2307 Newxz(newframe, 1, scan_frame);
2308 RExC_frame_last->next_frame= newframe;
2309 newframe->prev_frame= RExC_frame_last;
2310 RExC_frame_count++;
2311 } else {
2312 newframe= RExC_frame_last->next_frame;
2313 }
2314 RExC_frame_last= newframe;
2315
2316 newframe->next_regnode = regnext(scan);
2317 newframe->last_regnode = last;
2318 newframe->stopparen = stopparen;
2319 newframe->prev_recursed_depth = recursed_depth;
2320 newframe->this_prev_frame= frame;
2321 newframe->in_gosub = (
2322 (frame && frame->in_gosub) || OP(scan) == GOSUB
2323 );
2324
2325 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2326 DEBUG_PEEP("fnew", scan, depth, flags);
2327
2328 frame = newframe;
2329 scan = start;
2330 stopparen = paren;
2331 last = end;
2332 depth = depth + 1;
2333 recursed_depth= my_recursed_depth;
2334
2335 continue;
2336 }
2337 }
2338 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2339 SSize_t bytelen = STR_LEN(scan), charlen;
2340 UV uc;
2341 assert(bytelen);
2342 if (UTF) {
2343 const U8 * const s = (U8*)STRING(scan);
2344 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
2345 charlen = utf8_length(s, s + bytelen);
2346 } else {
2347 uc = *((U8*)STRING(scan));
2348 charlen = bytelen;
2349 }
2350 min += charlen;
2351 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2352 /* The code below prefers earlier match for fixed
2353 offset, later match for variable offset. */
2354 if (data->last_end == -1) { /* Update the start info. */
2355 data->last_start_min = data->pos_min;
2356 data->last_start_max =
2357 is_inf ? OPTIMIZE_INFTY
2358 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2359 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2360 }
2361 sv_catpvn(data->last_found, STRING(scan), bytelen);
2362 if (UTF)
2363 SvUTF8_on(data->last_found);
2364 {
2365 SV * const sv = data->last_found;
2366 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2368 if (mg && mg->mg_len >= 0)
2369 mg->mg_len += charlen;
2370 }
2371 data->last_end = data->pos_min + charlen;
2372 data->pos_min += charlen; /* As in the first entry. */
2373 data->flags &= ~SF_BEFORE_EOL;
2374 }
2375
2376 /* ANDing the code point leaves at most it, and not in locale, and
2377 * can't match null string */
2378 if (flags & SCF_DO_STCLASS_AND) {
2379 ssc_cp_and(data->start_class, uc);
2380 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2381 ssc_clear_locale(data->start_class);
2382 }
2383 else if (flags & SCF_DO_STCLASS_OR) {
2384 ssc_add_cp(data->start_class, uc);
2385 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2386
2387 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2388 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2389 }
2390 flags &= ~SCF_DO_STCLASS;
2391 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2392 }
2393 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2394 /* But OP != EXACT!, so is EXACTFish */
2395 SSize_t bytelen = STR_LEN(scan), charlen;
2396 const U8 * s = (U8*)STRING(scan);
2397
2398 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2399 * with the mask set to the complement of the bit that differs
2400 * between upper and lower case, and the lowest code point of the
2401 * pair (which the '&' forces) */
2402 if ( bytelen == 1
2403 && isALPHA_A(*s)
2404 && ( OP(scan) == EXACTFAA
2405 || ( OP(scan) == EXACTFU
2406 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2407 && mutate_ok
2408 ) {
2409 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2410
2411 OP(scan) = ANYOFM;
2412 ARG_SET(scan, *s & mask);
2413 FLAGS(scan) = mask;
2414 /* We're not EXACTFish any more, so restudy.
2415 * Search for "restudy" in this file to find
2416 * a comment with details. */
2417 continue;
2418 }
2419
2420 /* Search for fixed substrings supports EXACT only. */
2421 if (flags & SCF_DO_SUBSTR) {
2422 assert(data);
2423 scan_commit(pRExC_state, data, minlenp, is_inf);
2424 }
2425 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2426 if (unfolded_multi_char) {
2427 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2428 }
2429 min += charlen - min_subtract;
2430 assert (min >= 0);
2431 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2432 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2433 ) {
2434 delta += min_subtract;
2435 } else {
2436 delta = OPTIMIZE_INFTY;
2437 }
2438 if (flags & SCF_DO_SUBSTR) {
2439 data->pos_min += charlen - min_subtract;
2440 if (data->pos_min < 0) {
2441 data->pos_min = 0;
2442 }
2443 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2444 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2445 ) {
2446 data->pos_delta += min_subtract;
2447 } else {
2448 data->pos_delta = OPTIMIZE_INFTY;
2449 }
2450 if (min_subtract) {
2451 data->cur_is_floating = 1; /* float */
2452 }
2453 }
2454
2455 if (flags & SCF_DO_STCLASS) {
2456 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2457
2458 assert(EXACTF_invlist);
2459 if (flags & SCF_DO_STCLASS_AND) {
2460 if (OP(scan) != EXACTFL)
2461 ssc_clear_locale(data->start_class);
2462 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2463 ANYOF_POSIXL_ZERO(data->start_class);
2464 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
2465 }
2466 else { /* SCF_DO_STCLASS_OR */
2467 ssc_union(data->start_class, EXACTF_invlist, FALSE);
2468 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2469
2470 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2471 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2472 }
2473 flags &= ~SCF_DO_STCLASS;
2474 SvREFCNT_dec(EXACTF_invlist);
2475 }
2476 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2477 }
2478 else if (REGNODE_VARIES(OP(scan))) {
2479 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2480 I32 fl = 0;
2481 U32 f = flags;
2482 regnode * const oscan = scan;
2483 regnode_ssc this_class;
2484 regnode_ssc *oclass = NULL;
2485 I32 next_is_eval = 0;
2486
2487 switch (REGNODE_TYPE(OP(scan))) {
2488 case WHILEM: /* End of (?:...)* . */
2489 scan = REGNODE_AFTER(scan);
2490 goto finish;
2491 case PLUS:
2492 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2493 next = REGNODE_AFTER(scan);
2494 if ( ( REGNODE_TYPE(OP(next)) == EXACT
2495 && ! isEXACTFish(OP(next)))
2496 || (flags & SCF_DO_STCLASS))
2497 {
2498 mincount = 1;
2499 maxcount = REG_INFTY;
2500 next = regnext(scan);
2501 scan = REGNODE_AFTER(scan);
2502 goto do_curly;
2503 }
2504 }
2505 if (flags & SCF_DO_SUBSTR)
2506 data->pos_min++;
2507 /* This will bypass the formal 'min += minnext * mincount'
2508 * calculation in the do_curly path, so assumes min width
2509 * of the PLUS payload is exactly one. */
2510 min++;
2511 /* FALLTHROUGH */
2512 case STAR:
2513 next = REGNODE_AFTER(scan);
2514
2515 /* This temporary node can now be turned into EXACTFU, and
2516 * must, as regexec.c doesn't handle it */
2517 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2518 OP(next) = EXACTFU;
2519 }
2520
2521 if ( STR_LEN(next) == 1
2522 && isALPHA_A(* STRING(next))
2523 && ( OP(next) == EXACTFAA
2524 || ( OP(next) == EXACTFU
2525 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2526 && mutate_ok
2527 ) {
2528 /* These differ in just one bit */
2529 U8 mask = ~ ('A' ^ 'a');
2530
2531 assert(isALPHA_A(* STRING(next)));
2532
2533 /* Then replace it by an ANYOFM node, with
2534 * the mask set to the complement of the
2535 * bit that differs between upper and lower
2536 * case, and the lowest code point of the
2537 * pair (which the '&' forces) */
2538 OP(next) = ANYOFM;
2539 ARG_SET(next, *STRING(next) & mask);
2540 FLAGS(next) = mask;
2541 }
2542
2543 if (flags & SCF_DO_STCLASS) {
2544 mincount = 0;
2545 maxcount = REG_INFTY;
2546 next = regnext(scan);
2547 scan = REGNODE_AFTER(scan);
2548 goto do_curly;
2549 }
2550 if (flags & SCF_DO_SUBSTR) {
2551 scan_commit(pRExC_state, data, minlenp, is_inf);
2552 /* Cannot extend fixed substrings */
2553 data->cur_is_floating = 1; /* float */
2554 }
2555 is_inf = is_inf_internal = 1;
2556 scan = regnext(scan);
2557 goto optimize_curly_tail;
2558 case CURLY:
2559 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2560 && (scan->flags == stopparen))
2561 {
2562 mincount = 1;
2563 maxcount = 1;
2564 } else {
2565 mincount = ARG1(scan);
2566 maxcount = ARG2(scan);
2567 }
2568 next = regnext(scan);
2569 if (OP(scan) == CURLYX) {
2570 I32 lp = (data ? *(data->last_closep) : 0);
2571 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2572 }
2573 scan = REGNODE_AFTER(scan);
2574 next_is_eval = (OP(scan) == EVAL);
2575 do_curly:
2576 if (flags & SCF_DO_SUBSTR) {
2577 if (mincount == 0)
2578 scan_commit(pRExC_state, data, minlenp, is_inf);
2579 /* Cannot extend fixed substrings */
2580 pos_before = data->pos_min;
2581 }
2582 if (data) {
2583 fl = data->flags;
2584 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2585 if (is_inf)
2586 data->flags |= SF_IS_INF;
2587 }
2588 if (flags & SCF_DO_STCLASS) {
2589 ssc_init(pRExC_state, &this_class);
2590 oclass = data->start_class;
2591 data->start_class = &this_class;
2592 f |= SCF_DO_STCLASS_AND;
2593 f &= ~SCF_DO_STCLASS_OR;
2594 }
2595 /* Exclude from super-linear cache processing any {n,m}
2596 regops for which the combination of input pos and regex
2597 pos is not enough information to determine if a match
2598 will be possible.
2599
2600 For example, in the regex /foo(bar\s*){4,8}baz/ with the
2601 regex pos at the \s*, the prospects for a match depend not
2602 only on the input position but also on how many (bar\s*)
2603 repeats into the {4,8} we are. */
2604 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2605 f &= ~SCF_WHILEM_VISITED_POS;
2606
2607 /* This will finish on WHILEM, setting scan, or on NULL: */
2608 /* recurse study_chunk() on loop bodies */
2609 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2610 last, data, stopparen, recursed_depth, NULL,
2611 (mincount == 0
2612 ? (f & ~SCF_DO_SUBSTR)
2613 : f)
2614 , depth+1, mutate_ok);
2615
2616 if (data && data->flags & SCF_SEEN_ACCEPT) {
2617 if (mincount > 1)
2618 mincount = 1;
2619 }
2620
2621 if (flags & SCF_DO_STCLASS)
2622 data->start_class = oclass;
2623 if (mincount == 0 || minnext == 0) {
2624 if (flags & SCF_DO_STCLASS_OR) {
2625 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2626 }
2627 else if (flags & SCF_DO_STCLASS_AND) {
2628 /* Switch to OR mode: cache the old value of
2629 * data->start_class */
2630 INIT_AND_WITHP;
2631 StructCopy(data->start_class, and_withp, regnode_ssc);
2632 flags &= ~SCF_DO_STCLASS_AND;
2633 StructCopy(&this_class, data->start_class, regnode_ssc);
2634 flags |= SCF_DO_STCLASS_OR;
2635 ANYOF_FLAGS(data->start_class)
2636 |= SSC_MATCHES_EMPTY_STRING;
2637 }
2638 } else { /* Non-zero len */
2639 if (flags & SCF_DO_STCLASS_OR) {
2640 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2641 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2642 }
2643 else if (flags & SCF_DO_STCLASS_AND)
2644 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2645 flags &= ~SCF_DO_STCLASS;
2646 }
2647 if (!scan) /* It was not CURLYX, but CURLY. */
2648 scan = next;
2649 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2650 /* ? quantifier ok, except for (?{ ... }) */
2651 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2652 && (minnext == 0) && (deltanext == 0)
2653 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2654 && maxcount <= REG_INFTY/3) /* Complement check for big
2655 count */
2656 {
2657 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2659 "Quantifier unexpected on zero-length expression "
2660 "in regex m/%" UTF8f "/",
2661 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2662 RExC_precomp)));
2663 }
2664
2665 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2666 || min >= SSize_t_MAX - minnext * mincount )
2667 {
2668 FAIL("Regexp out of space");
2669 }
2670
2671 min += minnext * mincount;
2672 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2673 || (maxcount == REG_INFTY && minnext + deltanext > 0);
2674 is_inf |= is_inf_internal;
2675 if (is_inf) {
2676 delta = OPTIMIZE_INFTY;
2677 } else {
2678 delta += (minnext + deltanext) * maxcount
2679 - minnext * mincount;
2680 }
2681
2682 if (data && data->flags & SCF_SEEN_ACCEPT) {
2683 if (flags & SCF_DO_SUBSTR) {
2684 scan_commit(pRExC_state, data, minlenp, is_inf);
2685 flags &= ~SCF_DO_SUBSTR;
2686 }
2687 if (stopmin > min)
2688 stopmin = min;
2689 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2690 }
2691 /* Try powerful optimization CURLYX => CURLYN. */
2692 if ( OP(oscan) == CURLYX && data
2693 && data->flags & SF_IN_PAR
2694 && !(data->flags & SF_HAS_EVAL)
2695 && !deltanext && minnext == 1
2696 && mutate_ok
2697 ) {
2698 /* Try to optimize to CURLYN. */
2699 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2700 regnode * const nxt1 = nxt;
2701#ifdef DEBUGGING
2702 regnode *nxt2;
2703#endif
2704
2705 /* Skip open. */
2706 nxt = regnext(nxt);
2707 if (!REGNODE_SIMPLE(OP(nxt))
2708 && !(REGNODE_TYPE(OP(nxt)) == EXACT
2709 && STR_LEN(nxt) == 1))
2710 goto nogo;
2711#ifdef DEBUGGING
2712 nxt2 = nxt;
2713#endif
2714 nxt = regnext(nxt);
2715 if (OP(nxt) != CLOSE)
2716 goto nogo;
2717 if (RExC_open_parens) {
2718
2719 /*open->CURLYM*/
2720 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2721
2722 /*close->while*/
2723 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2724 }
2725 /* Now we know that nxt2 is the only contents: */
2726 oscan->flags = (U8)PARNO(nxt);
2727 OP(oscan) = CURLYN;
2728 OP(nxt1) = NOTHING; /* was OPEN. */
2729
2730#ifdef DEBUGGING
2731 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2732 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2733 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2734 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2735 OP(nxt + 1) = OPTIMIZED; /* was count. */
2736 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2737#endif
2738 }
2739 nogo:
2740
2741 /* Try optimization CURLYX => CURLYM. */
2742 if ( OP(oscan) == CURLYX && data
2743 && !(data->flags & SF_HAS_PAR)
2744 && !(data->flags & SF_HAS_EVAL)
2745 && !deltanext /* atom is fixed width */
2746 && minnext != 0 /* CURLYM can't handle zero width */
2747 /* Nor characters whose fold at run-time may be
2748 * multi-character */
2749 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2750 && mutate_ok
2751 ) {
2752 /* XXXX How to optimize if data == 0? */
2753 /* Optimize to a simpler form. */
2754 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2755 regnode *nxt2;
2756
2757 OP(oscan) = CURLYM;
2758 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2759 && (OP(nxt2) != WHILEM))
2760 nxt = nxt2;
2761 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2762 /* Need to optimize away parenths. */
2763 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2764 /* Set the parenth number. */
2765 /* note that we have changed the type of oscan to CURLYM here */
2766 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2767
2768 oscan->flags = (U8)PARNO(nxt);
2769 if (RExC_open_parens) {
2770 /*open->CURLYM*/
2771 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2772
2773 /*close->NOTHING*/
2774 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2775 + 1;
2776 }
2777 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2778 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2779
2780#ifdef DEBUGGING
2781 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2782 OP(nxt + 1) = OPTIMIZED; /* was count. */
2783 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2784 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2785#endif
2786#if 0
2787 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2788 regnode *nnxt = regnext(nxt1);
2789 if (nnxt == nxt) {
2790 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2791 ARG_SET(nxt1, nxt2 - nxt1);
2792 else if (nxt2 - nxt1 < U16_MAX)
2793 NEXT_OFF(nxt1) = nxt2 - nxt1;
2794 else
2795 OP(nxt) = NOTHING; /* Cannot beautify */
2796 }
2797 nxt1 = nnxt;
2798 }
2799#endif
2800 /* Optimize again: */
2801 /* recurse study_chunk() on optimised CURLYX => CURLYM */
2802 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2803 NULL, stopparen, recursed_depth, NULL, 0,
2804 depth+1, mutate_ok);
2805 }
2806 else
2807 oscan->flags = 0;
2808 }
2809 else if ((OP(oscan) == CURLYX)
2810 && (flags & SCF_WHILEM_VISITED_POS)
2811 /* See the comment on a similar expression above.
2812 However, this time it's not a subexpression
2813 we care about, but the expression itself. */
2814 && (maxcount == REG_INFTY)
2815 && data) {
2816 /* This stays as CURLYX, we can put the count/of pair. */
2817 /* Find WHILEM (as in regexec.c) */
2818 regnode *nxt = oscan + NEXT_OFF(oscan);
2819
2820 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2821 nxt += ARG(nxt);
2822 nxt = REGNODE_BEFORE(nxt);
2823 if (nxt->flags & 0xf) {
2824 /* we've already set whilem count on this node */
2825 } else if (++data->whilem_c < 16) {
2826 assert(data->whilem_c <= RExC_whilem_seen);
2827 nxt->flags = (U8)(data->whilem_c
2828 | (RExC_whilem_seen << 4)); /* On WHILEM */
2829 }
2830 }
2831 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2832 pars++;
2833 if (flags & SCF_DO_SUBSTR) {
2834 SV *last_str = NULL;
2835 STRLEN last_chrs = 0;
2836 int counted = mincount != 0;
2837
2838 if (data->last_end > 0 && mincount != 0) { /* Ends with a
2839 string. */
2840 SSize_t b = pos_before >= data->last_start_min
2841 ? pos_before : data->last_start_min;
2842 STRLEN l;
2843 const char * const s = SvPV_const(data->last_found, l);
2844 SSize_t old = b - data->last_start_min;
2845 assert(old >= 0);
2846
2847 if (UTF)
2848 old = utf8_hop_forward((U8*)s, old,
2849 (U8 *) SvEND(data->last_found))
2850 - (U8*)s;
2851 l -= old;
2852 /* Get the added string: */
2853 last_str = newSVpvn_utf8(s + old, l, UTF);
2854 last_chrs = UTF ? utf8_length((U8*)(s + old),
2855 (U8*)(s + old + l)) : l;
2856 if (deltanext == 0 && pos_before == b) {
2857 /* What was added is a constant string */
2858 if (mincount > 1) {
2859
2860 SvGROW(last_str, (mincount * l) + 1);
2861 repeatcpy(SvPVX(last_str) + l,
2862 SvPVX_const(last_str), l,
2863 mincount - 1);
2864 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2865 /* Add additional parts. */
2866 SvCUR_set(data->last_found,
2867 SvCUR(data->last_found) - l);
2868 sv_catsv(data->last_found, last_str);
2869 {
2870 SV * sv = data->last_found;
2871 MAGIC *mg =
2872 SvUTF8(sv) && SvMAGICAL(sv) ?
2873 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2874 if (mg && mg->mg_len >= 0)
2875 mg->mg_len += last_chrs * (mincount-1);
2876 }
2877 last_chrs *= mincount;
2878 data->last_end += l * (mincount - 1);
2879 }
2880 } else {
2881 /* start offset must point into the last copy */
2882 data->last_start_min += minnext * (mincount - 1);
2883 data->last_start_max =
2884 is_inf
2885 ? OPTIMIZE_INFTY
2886 : data->last_start_max +
2887 (maxcount - 1) * (minnext + data->pos_delta);
2888 }
2889 }
2890 /* It is counted once already... */
2891 data->pos_min += minnext * (mincount - counted);
2892#if 0
2893 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
2894 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
2895 " maxcount=%" UVuf " mincount=%" UVuf
2896 " data->pos_delta=%" UVuf "\n",
2897 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2898 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2899 if (deltanext != OPTIMIZE_INFTY)
2900 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
2901 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2902 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2903#endif
2904 if (deltanext == OPTIMIZE_INFTY
2905 || data->pos_delta == OPTIMIZE_INFTY
2906 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2907 data->pos_delta = OPTIMIZE_INFTY;
2908 else
2909 data->pos_delta += - counted * deltanext +
2910 (minnext + deltanext) * maxcount - minnext * mincount;
2911 if (mincount != maxcount) {
2912 /* Cannot extend fixed substrings found inside
2913 the group. */
2914 scan_commit(pRExC_state, data, minlenp, is_inf);
2915 if (mincount && last_str) {
2916 SV * const sv = data->last_found;
2917 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2918 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2919
2920 if (mg)
2921 mg->mg_len = -1;
2922 sv_setsv(sv, last_str);
2923 data->last_end = data->pos_min;
2924 data->last_start_min = data->pos_min - last_chrs;
2925 data->last_start_max = is_inf
2926 ? OPTIMIZE_INFTY
2927 : data->pos_min + data->pos_delta - last_chrs;
2928 }
2929 data->cur_is_floating = 1; /* float */
2930 }
2931 SvREFCNT_dec(last_str);
2932 }
2933 if (data && (fl & SF_HAS_EVAL))
2934 data->flags |= SF_HAS_EVAL;
2935 optimize_curly_tail:
2936 rck_elide_nothing(oscan);
2937 continue;
2938
2939 default:
2940 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
2941 OP(scan));
2942 case REF:
2943 case CLUMP:
2944 if (flags & SCF_DO_SUBSTR) {
2945 /* Cannot expect anything... */
2946 scan_commit(pRExC_state, data, minlenp, is_inf);
2947 data->cur_is_floating = 1; /* float */
2948 }
2949 is_inf = is_inf_internal = 1;
2950 if (flags & SCF_DO_STCLASS_OR) {
2951 if (OP(scan) == CLUMP) {
2952 /* Actually is any start char, but very few code points
2953 * aren't start characters */
2954 ssc_match_all_cp(data->start_class);
2955 }
2956 else {
2957 ssc_anything(data->start_class);
2958 }
2959 }
2960 flags &= ~SCF_DO_STCLASS;
2961 break;
2962 }
2963 }
2964 else if (OP(scan) == LNBREAK) {
2965 if (flags & SCF_DO_STCLASS) {
2966 if (flags & SCF_DO_STCLASS_AND) {
2967 ssc_intersection(data->start_class,
2968 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
2969 ssc_clear_locale(data->start_class);
2970 ANYOF_FLAGS(data->start_class)
2971 &= ~SSC_MATCHES_EMPTY_STRING;
2972 }
2973 else if (flags & SCF_DO_STCLASS_OR) {
2974 ssc_union(data->start_class,
2975 PL_XPosix_ptrs[CC_VERTSPACE_],
2976 FALSE);
2977 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2978
2979 /* See commit msg for
2980 * 749e076fceedeb708a624933726e7989f2302f6a */
2981 ANYOF_FLAGS(data->start_class)
2982 &= ~SSC_MATCHES_EMPTY_STRING;
2983 }
2984 flags &= ~SCF_DO_STCLASS;
2985 }
2986 min++;
2987 if (delta != OPTIMIZE_INFTY)
2988 delta++; /* Because of the 2 char string cr-lf */
2989 if (flags & SCF_DO_SUBSTR) {
2990 /* Cannot expect anything... */
2991 scan_commit(pRExC_state, data, minlenp, is_inf);
2992 data->pos_min += 1;
2993 if (data->pos_delta != OPTIMIZE_INFTY) {
2994 data->pos_delta += 1;
2995 }
2996 data->cur_is_floating = 1; /* float */
2997 }
2998 }
2999 else if (REGNODE_SIMPLE(OP(scan))) {
3000
3001 if (flags & SCF_DO_SUBSTR) {
3002 scan_commit(pRExC_state, data, minlenp, is_inf);
3003 data->pos_min++;
3004 }
3005 min++;
3006 if (flags & SCF_DO_STCLASS) {
3007 bool invert = 0;
3008 SV* my_invlist = NULL;
3009 U8 namedclass;
3010
3011 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
3012 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
3013
3014 /* Some of the logic below assumes that switching
3015 locale on will only add false positives. */
3016 switch (OP(scan)) {
3017
3018 default:
3019#ifdef DEBUGGING
3020 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
3021 OP(scan));
3022#endif
3023 case SANY:
3024 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3025 ssc_match_all_cp(data->start_class);
3026 break;
3027
3028 case REG_ANY:
3029 {
3030 SV* REG_ANY_invlist = _new_invlist(2);
3031 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
3032 '\n');
3033 if (flags & SCF_DO_STCLASS_OR) {
3034 ssc_union(data->start_class,
3035 REG_ANY_invlist,
3036 TRUE /* TRUE => invert, hence all but \n
3037 */
3038 );
3039 }
3040 else if (flags & SCF_DO_STCLASS_AND) {
3041 ssc_intersection(data->start_class,
3042 REG_ANY_invlist,
3043 TRUE /* TRUE => invert */
3044 );
3045 ssc_clear_locale(data->start_class);
3046 }
3047 SvREFCNT_dec_NN(REG_ANY_invlist);
3048 }
3049 break;
3050
3051 case ANYOFD:
3052 case ANYOFL:
3053 case ANYOFPOSIXL:
3054 case ANYOFH:
3055 case ANYOFHb:
3056 case ANYOFHr:
3057 case ANYOFHs:
3058 case ANYOF:
3059 if (flags & SCF_DO_STCLASS_AND)
3060 ssc_and(pRExC_state, data->start_class,
3061 (regnode_charclass *) scan);
3062 else
3063 ssc_or(pRExC_state, data->start_class,
3064 (regnode_charclass *) scan);
3065 break;
3066
3067 case ANYOFHbbm:
3068 {
3069 SV* cp_list = get_ANYOFHbbm_contents(scan);
3070
3071 if (flags & SCF_DO_STCLASS_OR) {
3072 ssc_union(data->start_class, cp_list, invert);
3073 }
3074 else if (flags & SCF_DO_STCLASS_AND) {
3075 ssc_intersection(data->start_class, cp_list, invert);
3076 }
3077
3078 SvREFCNT_dec_NN(cp_list);
3079 break;
3080 }
3081
3082 case NANYOFM: /* NANYOFM already contains the inversion of the
3083 input ANYOF data, so, unlike things like
3084 NPOSIXA, don't change 'invert' to TRUE */
3085 /* FALLTHROUGH */
3086 case ANYOFM:
3087 {
3088 SV* cp_list = get_ANYOFM_contents(scan);
3089
3090 if (flags & SCF_DO_STCLASS_OR) {
3091 ssc_union(data->start_class, cp_list, invert);
3092 }
3093 else if (flags & SCF_DO_STCLASS_AND) {
3094 ssc_intersection(data->start_class, cp_list, invert);
3095 }
3096
3097 SvREFCNT_dec_NN(cp_list);
3098 break;
3099 }
3100
3101 case ANYOFR:
3102 case ANYOFRb:
3103 {
3104 SV* cp_list = NULL;
3105
3106 cp_list = _add_range_to_invlist(cp_list,
3107 ANYOFRbase(scan),
3108 ANYOFRbase(scan) + ANYOFRdelta(scan));
3109
3110 if (flags & SCF_DO_STCLASS_OR) {
3111 ssc_union(data->start_class, cp_list, invert);
3112 }
3113 else if (flags & SCF_DO_STCLASS_AND) {
3114 ssc_intersection(data->start_class, cp_list, invert);
3115 }
3116
3117 SvREFCNT_dec_NN(cp_list);
3118 break;
3119 }
3120
3121 case NPOSIXL:
3122 invert = 1;
3123 /* FALLTHROUGH */
3124
3125 case POSIXL:
3126 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3127 if (flags & SCF_DO_STCLASS_AND) {
3128 bool was_there = cBOOL(
3129 ANYOF_POSIXL_TEST(data->start_class,
3130 namedclass));
3131 ANYOF_POSIXL_ZERO(data->start_class);
3132 if (was_there) { /* Do an AND */
3133 ANYOF_POSIXL_SET(data->start_class, namedclass);
3134 }
3135 /* No individual code points can now match */
3136 data->start_class->invlist
3137 = sv_2mortal(_new_invlist(0));
3138 }
3139 else {
3140 int complement = namedclass + ((invert) ? -1 : 1);
3141
3142 assert(flags & SCF_DO_STCLASS_OR);
3143
3144 /* If the complement of this class was already there,
3145 * the result is that they match all code points,
3146 * (\d + \D == everything). Remove the classes from
3147 * future consideration. Locale is not relevant in
3148 * this case */
3149 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3150 ssc_match_all_cp(data->start_class);
3151 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3152 ANYOF_POSIXL_CLEAR(data->start_class, complement);
3153 }
3154 else { /* The usual case; just add this class to the
3155 existing set */
3156 ANYOF_POSIXL_SET(data->start_class, namedclass);
3157 }
3158 }
3159 break;
3160
3161 case NPOSIXA: /* For these, we always know the exact set of
3162 what's matched */
3163 invert = 1;
3164 /* FALLTHROUGH */
3165 case POSIXA:
3166 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3167 goto join_posix_and_ascii;
3168
3169 case NPOSIXD:
3170 case NPOSIXU:
3171 invert = 1;
3172 /* FALLTHROUGH */
3173 case POSIXD:
3174 case POSIXU:
3175 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3176
3177 /* NPOSIXD matches all upper Latin1 code points unless the
3178 * target string being matched is UTF-8, which is
3179 * unknowable until match time. Since we are going to
3180 * invert, we want to get rid of all of them so that the
3181 * inversion will match all */
3182 if (OP(scan) == NPOSIXD) {
3183 _invlist_subtract(my_invlist, PL_UpperLatin1,
3184 &my_invlist);
3185 }
3186
3187 join_posix_and_ascii:
3188
3189 if (flags & SCF_DO_STCLASS_AND) {
3190 ssc_intersection(data->start_class, my_invlist, invert);
3191 ssc_clear_locale(data->start_class);
3192 }
3193 else {
3194 assert(flags & SCF_DO_STCLASS_OR);
3195 ssc_union(data->start_class, my_invlist, invert);
3196 }
3197 SvREFCNT_dec(my_invlist);
3198 }
3199 if (flags & SCF_DO_STCLASS_OR)
3200 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3201 flags &= ~SCF_DO_STCLASS;
3202 }
3203 }
3204 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3205 data->flags |= (OP(scan) == MEOL
3206 ? SF_BEFORE_MEOL
3207 : SF_BEFORE_SEOL);
3208 scan_commit(pRExC_state, data, minlenp, is_inf);
3209
3210 }
3211 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
3212 /* Lookbehind, or need to calculate parens/evals/stclass: */
3213 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3214 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3215 {
3216 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3217 || OP(scan) == UNLESSM )
3218 {
3219 /* Negative Lookahead/lookbehind
3220 In this case we can't do fixed string optimisation.
3221 */
3222
3223 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3224 SSize_t deltanext, minnext;
3225 SSize_t fake_last_close = 0;
3226 regnode *fake_last_close_op = NULL;
3227 regnode *cur_last_close_op;
3228 regnode *nscan;
3229 regnode_ssc intrnl;
3230 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3231
3232 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3233 if (data) {
3234 data_fake.whilem_c = data->whilem_c;
3235 data_fake.last_closep = data->last_closep;
3236 data_fake.last_close_opp = data->last_close_opp;
3237 }
3238 else {
3239 data_fake.last_closep = &fake_last_close;
3240 data_fake.last_close_opp = &fake_last_close_op;
3241 }
3242
3243 /* remember the last_close_op we saw so we can see if
3244 * we are dealing with variable length lookbehind that
3245 * contains capturing buffers, which are considered
3246 * experimental */
3247 cur_last_close_op= *(data_fake.last_close_opp);
3248
3249 data_fake.pos_delta = delta;
3250 if ( flags & SCF_DO_STCLASS && !scan->flags
3251 && OP(scan) == IFMATCH ) { /* Lookahead */
3252 ssc_init(pRExC_state, &intrnl);
3253 data_fake.start_class = &intrnl;
3254 f |= SCF_DO_STCLASS_AND;
3255 }
3256 if (flags & SCF_WHILEM_VISITED_POS)
3257 f |= SCF_WHILEM_VISITED_POS;
3258 next = regnext(scan);
3259 nscan = REGNODE_AFTER(scan);
3260
3261 /* recurse study_chunk() for lookahead body */
3262 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3263 last, &data_fake, stopparen,
3264 recursed_depth, NULL, f, depth+1,
3265 mutate_ok);
3266
3267 if (scan->flags) {
3268 if ( deltanext < 0
3269 || deltanext > (I32) U8_MAX
3270 || minnext > (I32)U8_MAX
3271 || minnext + deltanext > (I32)U8_MAX)
3272 {
3273 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3274 (UV)U8_MAX);
3275 }
3276
3277 /* The 'next_off' field has been repurposed to count the
3278 * additional starting positions to try beyond the initial
3279 * one. (This leaves it at 0 for non-variable length
3280 * matches to avoid breakage for those not using this
3281 * extension) */
3282 if (deltanext) {
3283 scan->next_off = deltanext;
3284 if (
3285 /* See a CLOSE op inside this lookbehind? */
3286 cur_last_close_op != *(data_fake.last_close_opp)
3287 /* and not doing restudy. see: restudied */
3288 && !(flags & SCF_TRIE_DOING_RESTUDY)
3289 ) {
3290 /* this is positive variable length lookbehind with
3291 * capture buffers inside of it */
3292 ckWARNexperimental_with_arg(RExC_parse,
3293 WARN_EXPERIMENTAL__VLB,
3294 "Variable length %s lookbehind with capturing is experimental",
3295 is_positive ? "positive" : "negative");
3296 }
3297 }
3298 scan->flags = (U8)minnext + deltanext;
3299 }
3300 if (data) {
3301 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3302 pars++;
3303 if (data_fake.flags & SF_HAS_EVAL)
3304 data->flags |= SF_HAS_EVAL;
3305 data->whilem_c = data_fake.whilem_c;
3306 }
3307 if (f & SCF_DO_STCLASS_AND) {
3308 if (flags & SCF_DO_STCLASS_OR) {
3309 /* OR before, AND after: ideally we would recurse with
3310 * data_fake to get the AND applied by study of the
3311 * remainder of the pattern, and then derecurse;
3312 * *** HACK *** for now just treat as "no information".
3313 * See [perl #56690].
3314 */
3315 ssc_init(pRExC_state, data->start_class);
3316 } else {
3317 /* AND before and after: combine and continue. These
3318 * assertions are zero-length, so can match an EMPTY
3319 * string */
3320 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3321 ANYOF_FLAGS(data->start_class)
3322 |= SSC_MATCHES_EMPTY_STRING;
3323 }
3324 }
3325 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3326 }
3327#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3328 else {
3329 /* Positive Lookahead/lookbehind
3330 In this case we can do fixed string optimisation,
3331 but we must be careful about it. Note in the case of
3332 lookbehind the positions will be offset by the minimum
3333 length of the pattern, something we won't know about
3334 until after the recurse.
3335 */
3336 SSize_t deltanext, fake_last_close = 0;
3337 regnode *last_close_op = NULL;
3338 regnode *nscan;
3339 regnode_ssc intrnl;
3340 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3341 /* We use SAVEFREEPV so that when the full compile
3342 is finished perl will clean up the allocated
3343 minlens when it's all done. This way we don't
3344 have to worry about freeing them when we know
3345 they wont be used, which would be a pain.
3346 */
3347 SSize_t *minnextp;
3348 Newx( minnextp, 1, SSize_t );
3349 SAVEFREEPV(minnextp);
3350
3351 if (data) {
3352 StructCopy(data, &data_fake, scan_data_t);
3353 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3354 f |= SCF_DO_SUBSTR;
3355 if (scan->flags)
3356 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3357 data_fake.last_found=newSVsv(data->last_found);
3358 }
3359 }
3360 else {
3361 data_fake.last_closep = &fake_last_close;
3362 data_fake.last_close_opp = &fake_last_close_opp;
3363 }
3364 data_fake.flags = 0;
3365 data_fake.substrs[0].flags = 0;
3366 data_fake.substrs[1].flags = 0;
3367 data_fake.pos_delta = delta;
3368 if (is_inf)
3369 data_fake.flags |= SF_IS_INF;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 ssc_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
3375 }
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = REGNODE_AFTER(scan);
3380
3381 /* positive lookahead study_chunk() recursion */
3382 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3383 &deltanext, last, &data_fake,
3384 stopparen, recursed_depth, NULL,
3385 f, depth+1, mutate_ok);
3386 if (scan->flags) {
3387 assert(0); /* This code has never been tested since this
3388 is normally not compiled */
3389 if ( deltanext < 0
3390 || deltanext > (I32) U8_MAX
3391 || *minnextp > (I32)U8_MAX
3392 || *minnextp + deltanext > (I32)U8_MAX)
3393 {
3394 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3395 (UV)U8_MAX);
3396 }
3397
3398 if (deltanext) {
3399 scan->next_off = deltanext;
3400 }
3401 scan->flags = (U8)*minnextp + deltanext;
3402 }
3403
3404 *minnextp += min;
3405
3406 if (f & SCF_DO_STCLASS_AND) {
3407 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3408 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3409 }
3410 if (data) {
3411 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3412 pars++;
3413 if (data_fake.flags & SF_HAS_EVAL)
3414 data->flags |= SF_HAS_EVAL;
3415 data->whilem_c = data_fake.whilem_c;
3416 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3417 int i;
3418 if (RExC_rx->minlen < *minnextp)
3419 RExC_rx->minlen = *minnextp;
3420 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3421 SvREFCNT_dec_NN(data_fake.last_found);
3422
3423 for (i = 0; i < 2; i++) {
3424 if (data_fake.substrs[i].minlenp != minlenp) {
3425 data->substrs[i].min_offset =
3426 data_fake.substrs[i].min_offset;
3427 data->substrs[i].max_offset =
3428 data_fake.substrs[i].max_offset;
3429 data->substrs[i].minlenp =
3430 data_fake.substrs[i].minlenp;
3431 data->substrs[i].lookbehind += scan->flags;
3432 }
3433 }
3434 }
3435 }
3436 }
3437#endif
3438 }
3439 else if (OP(scan) == OPEN) {
3440 if (stopparen != (I32)PARNO(scan))
3441 pars++;
3442 }
3443 else if (OP(scan) == CLOSE) {
3444 if (stopparen == (I32)PARNO(scan)) {
3445 break;
3446 }
3447 if ((I32)PARNO(scan) == is_par) {
3448 next = regnext(scan);
3449
3450 if ( next && (OP(next) != WHILEM) && next < last)
3451 is_par = 0; /* Disable optimization */
3452 }
3453 if (data) {
3454 *(data->last_closep) = PARNO(scan);
3455 *(data->last_close_opp) = scan;
3456 }
3457 }
3458 else if (OP(scan) == EVAL) {
3459 if (data)
3460 data->flags |= SF_HAS_EVAL;
3461 }
3462 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3463 if (flags & SCF_DO_SUBSTR) {
3464 scan_commit(pRExC_state, data, minlenp, is_inf);
3465 flags &= ~SCF_DO_SUBSTR;
3466 }
3467 if (OP(scan)==ACCEPT) {
3468 /* m{(*ACCEPT)x} does not have to start with 'x' */
3469 flags &= ~SCF_DO_STCLASS;
3470 if (data)
3471 data->flags |= SCF_SEEN_ACCEPT;
3472 if (stopmin > min)
3473 stopmin = min;
3474 }
3475 }
3476 else if (OP(scan) == COMMIT) {
3477 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3478 * must not end up with "abcxyz" as a fixed substring else we'll
3479 * skip straight to attempting to match at offset 4.
3480 */
3481 if (flags & SCF_DO_SUBSTR) {
3482 scan_commit(pRExC_state, data, minlenp, is_inf);
3483 flags &= ~SCF_DO_SUBSTR;
3484 }
3485 }
3486 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3487 {
3488 if (flags & SCF_DO_SUBSTR) {
3489 scan_commit(pRExC_state, data, minlenp, is_inf);
3490 data->cur_is_floating = 1; /* float */
3491 }
3492 is_inf = is_inf_internal = 1;
3493 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3494 ssc_anything(data->start_class);
3495 flags &= ~SCF_DO_STCLASS;
3496 }
3497 else if (OP(scan) == GPOS) {
3498 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3499 !(delta || is_inf || (data && data->pos_delta)))
3500 {
3501 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3502 RExC_rx->intflags |= PREGf_ANCH_GPOS;
3503 if (RExC_rx->gofs < (STRLEN)min)
3504 RExC_rx->gofs = min;
3505 } else {
3506 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3507 RExC_rx->gofs = 0;
3508 }
3509 }
3510#ifdef TRIE_STUDY_OPT
3511#ifdef FULL_TRIE_STUDY
3512 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3513 /* NOTE - There is similar code to this block above for handling
3514 BRANCH nodes on the initial study. If you change stuff here
3515 check there too. */
3516 regnode *trie_node= scan;
3517 regnode *tail= regnext(scan);
3518 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3519 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3520 regnode_ssc accum;
3521
3522 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3523 /* Cannot merge strings after this. */
3524 scan_commit(pRExC_state, data, minlenp, is_inf);
3525 }
3526 if (flags & SCF_DO_STCLASS)
3527 ssc_init_zero(pRExC_state, &accum);
3528
3529 if (!trie->jump) {
3530 min1= trie->minlen;
3531 max1= trie->maxlen;
3532 } else {
3533 const regnode *nextbranch= NULL;
3534 U32 word;
3535
3536 for ( word=1 ; word <= trie->wordcount ; word++)
3537 {
3538 SSize_t deltanext = 0, minnext = 0;
3539 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3540 SSize_t fake_last_close = 0;
3541 regnode *fake_last_close_op = NULL;
3542 regnode_ssc this_class;
3543
3544 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3545 if (data) {
3546 data_fake.whilem_c = data->whilem_c;
3547 data_fake.last_closep = data->last_closep;
3548 data_fake.last_close_opp = data->last_close_opp;
3549 }
3550 else {
3551 data_fake.last_closep = &fake_last_close;
3552 data_fake.last_close_opp = &fake_last_close_op;
3553 }
3554 data_fake.pos_delta = delta;
3555 if (flags & SCF_DO_STCLASS) {
3556 ssc_init(pRExC_state, &this_class);
3557 data_fake.start_class = &this_class;
3558 f |= SCF_DO_STCLASS_AND;
3559 }
3560 if (flags & SCF_WHILEM_VISITED_POS)
3561 f |= SCF_WHILEM_VISITED_POS;
3562
3563 if (trie->jump[word]) {
3564 if (!nextbranch)
3565 nextbranch = trie_node + trie->jump[0];
3566 scan= trie_node + trie->jump[word];
3567 /* We go from the jump point to the branch that follows
3568 it. Note this means we need the vestigal unused
3569 branches even though they arent otherwise used. */
3570 /* optimise study_chunk() for TRIE */
3571 minnext = study_chunk(pRExC_state, &scan, minlenp,
3572 &deltanext, (regnode *)nextbranch, &data_fake,
3573 stopparen, recursed_depth, NULL, f, depth+1,
3574 mutate_ok);
3575 }
3576 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3577 nextbranch= regnext((regnode*)nextbranch);
3578
3579 if (min1 > (SSize_t)(minnext + trie->minlen))
3580 min1 = minnext + trie->minlen;
3581 if (deltanext == OPTIMIZE_INFTY) {
3582 is_inf = is_inf_internal = 1;
3583 max1 = OPTIMIZE_INFTY;
3584 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3585 max1 = minnext + deltanext + trie->maxlen;
3586
3587 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3588 pars++;
3589 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3590 if ( stopmin > min + min1)
3591 stopmin = min + min1;
3592 flags &= ~SCF_DO_SUBSTR;
3593 if (data)
3594 data->flags |= SCF_SEEN_ACCEPT;
3595 }
3596 if (data) {
3597 if (data_fake.flags & SF_HAS_EVAL)
3598 data->flags |= SF_HAS_EVAL;
3599 data->whilem_c = data_fake.whilem_c;
3600 }
3601 if (flags & SCF_DO_STCLASS)
3602 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3603 }
3604 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3605 }
3606 if (flags & SCF_DO_SUBSTR) {
3607 data->pos_min += min1;
3608 data->pos_delta += max1 - min1;
3609 if (max1 != min1 || is_inf)
3610 data->cur_is_floating = 1; /* float */
3611 }
3612 min += min1;
3613 if (delta != OPTIMIZE_INFTY) {
3614 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3615 delta += max1 - min1;
3616 else
3617 delta = OPTIMIZE_INFTY;
3618 }
3619 if (flags & SCF_DO_STCLASS_OR) {
3620 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3621 if (min1) {
3622 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3623 flags &= ~SCF_DO_STCLASS;
3624 }
3625 }
3626 else if (flags & SCF_DO_STCLASS_AND) {
3627 if (min1) {
3628 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3629 flags &= ~SCF_DO_STCLASS;
3630 }
3631 else {
3632 /* Switch to OR mode: cache the old value of
3633 * data->start_class */
3634 INIT_AND_WITHP;
3635 StructCopy(data->start_class, and_withp, regnode_ssc);
3636 flags &= ~SCF_DO_STCLASS_AND;
3637 StructCopy(&accum, data->start_class, regnode_ssc);
3638 flags |= SCF_DO_STCLASS_OR;
3639 }
3640 }
3641 scan= tail;
3642 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3643 continue;
3644 }
3645#else
3646 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3647 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3648 U8*bang=NULL;
3649
3650 min += trie->minlen;
3651 delta += (trie->maxlen - trie->minlen);
3652 flags &= ~SCF_DO_STCLASS; /* xxx */
3653 if (flags & SCF_DO_SUBSTR) {
3654 /* Cannot expect anything... */
3655 scan_commit(pRExC_state, data, minlenp, is_inf);
3656 data->pos_min += trie->minlen;
3657 data->pos_delta += (trie->maxlen - trie->minlen);
3658 if (trie->maxlen != trie->minlen)
3659 data->cur_is_floating = 1; /* float */
3660 }
3661 if (trie->jump) /* no more substrings -- for now /grr*/
3662 flags &= ~SCF_DO_SUBSTR;
3663 }
3664
3665#endif /* old or new */
3666#endif /* TRIE_STUDY_OPT */
3667
3668 else if (OP(scan) == REGEX_SET) {
3669 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
3670 " before optimization", REGNODE_NAME(REGEX_SET));
3671 }
3672
3673 /* Else: zero-length, ignore. */
3674 scan = regnext(scan);
3675 }
3676
3677 finish:
3678 if (frame) {
3679 /* we need to unwind recursion. */
3680 depth = depth - 1;
3681
3682 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3683 DEBUG_PEEP("fend", scan, depth, flags);
3684
3685 /* restore previous context */
3686 last = frame->last_regnode;
3687 scan = frame->next_regnode;
3688 stopparen = frame->stopparen;
3689 recursed_depth = frame->prev_recursed_depth;
3690
3691 RExC_frame_last = frame->prev_frame;
3692 frame = frame->this_prev_frame;
3693 goto fake_study_recurse;
3694 }
3695
3696 assert(!frame);
3697 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3698
3699 /* is this pattern infinite? Eg, consider /(a|b+)/ */
3700 if (is_inf_internal)
3701 delta = OPTIMIZE_INFTY;
3702
3703 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3704 if (min > stopmin) {
3705 /*
3706 At this point 'min' represents the minimum length string we can
3707 match while *ignoring* the implication of ACCEPT, and 'delta'
3708 represents the difference between the minimum length and maximum
3709 length, and if the pattern matches an infinitely long string
3710 (consider the + and * quantifiers) then we use the special delta
3711 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3712 minimum length that can be matched *and* accepted.
3713
3714 A pattern is accepted when matching was successful *and*
3715 complete, and thus there is no further matching needing to be
3716 done, no backtracking to occur, etc. Prior to the introduction
3717 of ACCEPT the only opcode that signaled acceptance was the END
3718 opcode, which is always the very last opcode in a regex program.
3719 ACCEPT is thus conceptually an early successful return out of
3720 the matching process. stopmin starts out as OPTIMIZE_INFTY to
3721 represent "the entire pattern", and is ratched down to the
3722 "current min" if necessary when an ACCEPT opcode is encountered.
3723
3724 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3725 and we now need to account for it in both min and delta.
3726 Consider that in a pattern /AB/ normally the min length it can
3727 match can be computed as min(A)+min(B). But (*ACCEPT) means
3728 that it might be something else, not even neccesarily min(A) at
3729 all. Consider
3730
3731 A = /(foo(*ACCEPT)|x+)/
3732 B = /whop/
3733 AB = /(foo(*ACCEPT)|x+)whop/
3734
3735 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3736 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3737 "whop", and the delta of 0 as the pattern is of fixed length, the
3738 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3739 When handling AB we expect to see a min of 5 for "xwhop", and a
3740 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3741 for "foo". This should result in a final min of 3 for "foo", and
3742 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3743
3744 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3745 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3746 stop min would be 4 for "dude". This should result in a final
3747 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3748
3749 When min is smaller than stopmin then we can ignore it. In the
3750 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3751 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3752 the ACCEPT doesn't reduce the minimum length of the string that
3753 might be matched, nor affect the maximum length.
3754
3755 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3756 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3757 "foo". We currently turn this into a min of 3 for "foo" and a
3758 delta of 3 for "foobar" even though technically "foobar" isn't
3759 possible. ACCEPT affects some aspects of the optimizer, like
3760 length computations and mandatory substring optimizations, but
3761 there are other optimzations this routine perfoms that are not
3762 affected and this compromise simplifies implementation.
3763
3764 It might be helpful to consider that this C function is called
3765 recursively on the pattern in a bottom up fashion, and that the
3766 min returned by a nested call may be marked as coming from an
3767 ACCEPT, causing its callers to treat the returned min as a
3768 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3769 multiple calls into this function in different ways.
3770 */
3771
3772 if (OPTIMIZE_INFTY - delta >= min - stopmin)
3773 delta += min - stopmin;
3774 else
3775 delta = OPTIMIZE_INFTY;
3776 min = stopmin;
3777 }
3778
3779 *scanp = scan;
3780 *deltap = delta;
3781
3782 if (flags & SCF_DO_SUBSTR && is_inf)
3783 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3784 if (is_par > (I32)U8_MAX)
3785 is_par = 0;
3786 if (is_par && pars==1 && data) {
3787 data->flags |= SF_IN_PAR;
3788 data->flags &= ~SF_HAS_PAR;
3789 }
3790 else if (pars && data) {
3791 data->flags |= SF_HAS_PAR;
3792 data->flags &= ~SF_IN_PAR;
3793 }
3794 if (flags & SCF_DO_STCLASS_OR)
3795 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3796 if (flags & SCF_TRIE_RESTUDY)
3797 data->flags |= SCF_TRIE_RESTUDY;
3798
3799
3800 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3801 if (min > OPTIMIZE_INFTY - delta)
3802 RExC_maxlen = OPTIMIZE_INFTY;
3803 else if (RExC_maxlen < min + delta)
3804 RExC_maxlen = min + delta;
3805 }
3806 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);
3807 return min;
3808}