Commit | Line | Data |
---|---|---|
85900e28 YO |
1 | #ifdef PERL_EXT_RE_BUILD |
2 | #include "re_top.h" | |
3 | #endif | |
4 | ||
5 | #include "EXTERN.h" | |
6 | #define PERL_IN_REGEX_ENGINE | |
7 | #define PERL_IN_REGCOMP_ANY | |
8 | #define PERL_IN_REGCOMP_STUDY_C | |
9 | #include "perl.h" | |
10 | ||
11 | #ifdef PERL_IN_XSUB_RE | |
12 | # include "re_comp.h" | |
13 | #else | |
14 | # include "regcomp.h" | |
15 | #endif | |
16 | ||
17 | #include "invlist_inline.h" | |
18 | #include "unicode_constants.h" | |
19 | #include "regcomp_internal.h" | |
20 | ||
21 | #define INIT_AND_WITHP \ | |
22 | assert(!and_withp); \ | |
23 | Newx(and_withp, 1, regnode_ssc); \ | |
24 | SAVEFREEPV(and_withp) | |
25 | ||
26 | ||
27 | STATIC void | |
28 | S_unwind_scan_frames(pTHX_ const void *p) | |
29 | { | |
30 | PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES; | |
31 | scan_frame *f= (scan_frame *)p; | |
32 | do { | |
33 | scan_frame *n= f->next_frame; | |
34 | Safefree(f); | |
35 | f= n; | |
36 | } while (f); | |
37 | } | |
38 | ||
39 | /* Follow the next-chain of the current node and optimize away | |
40 | all the NOTHINGs from it. | |
41 | */ | |
42 | STATIC void | |
43 | S_rck_elide_nothing(pTHX_ regnode *node) | |
44 | { | |
45 | PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING; | |
46 | ||
47 | if (OP(node) != CURLYX) { | |
48 | const int max = (REGNODE_OFF_BY_ARG(OP(node)) | |
49 | ? I32_MAX | |
50 | /* I32 may be smaller than U16 on CRAYs! */ | |
51 | : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); | |
52 | int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node)); | |
53 | int noff; | |
54 | regnode *n = node; | |
55 | ||
56 | /* Skip NOTHING and LONGJMP. */ | |
57 | while ( | |
58 | (n = regnext(n)) | |
59 | && ( | |
60 | (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n))) | |
61 | || ((OP(n) == LONGJMP) && (noff = ARG(n))) | |
62 | ) | |
63 | && off + noff < max | |
64 | ) { | |
65 | off += noff; | |
66 | } | |
67 | if (REGNODE_OFF_BY_ARG(OP(node))) | |
68 | ARG(node) = off; | |
69 | else | |
70 | NEXT_OFF(node) = off; | |
71 | } | |
72 | return; | |
73 | } | |
74 | ||
75 | ||
76 | /* | |
77 | * As best we can, determine the characters that can match the start of | |
78 | * the given EXACTF-ish node. This is for use in creating ssc nodes, so there | |
79 | * can be false positive matches | |
80 | * | |
81 | * Returns the invlist as a new SV*; it is the caller's responsibility to | |
82 | * call SvREFCNT_dec() when done with it. | |
83 | */ | |
84 | STATIC SV* | |
85 | S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) | |
86 | { | |
87 | const U8 * s = (U8*)STRING(node); | |
88 | SSize_t bytelen = STR_LEN(node); | |
89 | UV uc; | |
90 | /* Start out big enough for 2 separate code points */ | |
91 | SV* invlist = _new_invlist(4); | |
92 | ||
93 | PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST; | |
94 | ||
95 | if (! UTF) { | |
96 | uc = *s; | |
97 | ||
98 | /* We punt and assume can match anything if the node begins | |
99 | * with a multi-character fold. Things are complicated. For | |
100 | * example, /ffi/i could match any of: | |
101 | * "\N{LATIN SMALL LIGATURE FFI}" | |
102 | * "\N{LATIN SMALL LIGATURE FF}I" | |
103 | * "F\N{LATIN SMALL LIGATURE FI}" | |
104 | * plus several other things; and making sure we have all the | |
105 | * possibilities is hard. */ | |
106 | if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { | |
107 | invlist = _add_range_to_invlist(invlist, 0, UV_MAX); | |
108 | } | |
109 | else { | |
110 | /* Any Latin1 range character can potentially match any | |
111 | * other depending on the locale, and in Turkic locales, 'I' and | |
112 | * 'i' can match U+130 and U+131 */ | |
113 | if (OP(node) == EXACTFL) { | |
114 | _invlist_union(invlist, PL_Latin1, &invlist); | |
115 | if (isALPHA_FOLD_EQ(uc, 'I')) { | |
116 | invlist = add_cp_to_invlist(invlist, | |
117 | LATIN_SMALL_LETTER_DOTLESS_I); | |
118 | invlist = add_cp_to_invlist(invlist, | |
119 | LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); | |
120 | } | |
121 | } | |
122 | else { | |
123 | /* But otherwise, it matches at least itself. We can | |
124 | * quickly tell if it has a distinct fold, and if so, | |
125 | * it matches that as well */ | |
126 | invlist = add_cp_to_invlist(invlist, uc); | |
127 | if (IS_IN_SOME_FOLD_L1(uc)) | |
128 | invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); | |
129 | } | |
130 | ||
131 | /* Some characters match above-Latin1 ones under /i. This | |
132 | * is true of EXACTFL ones when the locale is UTF-8 */ | |
133 | if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) | |
134 | && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA, | |
135 | EXACTFAA_NO_TRIE))) | |
136 | { | |
137 | add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); | |
138 | } | |
139 | } | |
140 | } | |
141 | else { /* Pattern is UTF-8 */ | |
142 | U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; | |
143 | const U8* e = s + bytelen; | |
144 | IV fc; | |
145 | ||
146 | fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); | |
147 | ||
148 | /* The only code points that aren't folded in a UTF EXACTFish | |
149 | * node are the problematic ones in EXACTFL nodes */ | |
150 | if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { | |
151 | /* We need to check for the possibility that this EXACTFL | |
152 | * node begins with a multi-char fold. Therefore we fold | |
153 | * the first few characters of it so that we can make that | |
154 | * check */ | |
155 | U8 *d = folded; | |
156 | int i; | |
157 | ||
158 | fc = -1; | |
159 | for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { | |
160 | if (isASCII(*s)) { | |
161 | *(d++) = (U8) toFOLD(*s); | |
162 | if (fc < 0) { /* Save the first fold */ | |
163 | fc = *(d-1); | |
164 | } | |
165 | s++; | |
166 | } | |
167 | else { | |
168 | STRLEN len; | |
169 | UV fold = toFOLD_utf8_safe(s, e, d, &len); | |
170 | if (fc < 0) { /* Save the first fold */ | |
171 | fc = fold; | |
172 | } | |
173 | d += len; | |
174 | s += UTF8SKIP(s); | |
175 | } | |
176 | } | |
177 | ||
178 | /* And set up so the code below that looks in this folded | |
179 | * buffer instead of the node's string */ | |
180 | e = d; | |
181 | s = folded; | |
182 | } | |
183 | ||
184 | /* When we reach here 's' points to the fold of the first | |
185 | * character(s) of the node; and 'e' points to far enough along | |
186 | * the folded string to be just past any possible multi-char | |
187 | * fold. | |
188 | * | |
189 | * Like the non-UTF case above, we punt if the node begins with a | |
190 | * multi-char fold */ | |
191 | ||
192 | if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { | |
193 | invlist = _add_range_to_invlist(invlist, 0, UV_MAX); | |
194 | } | |
195 | else { /* Single char fold */ | |
196 | unsigned int k; | |
197 | U32 first_fold; | |
198 | const U32 * remaining_folds; | |
199 | Size_t folds_count; | |
200 | ||
201 | /* It matches itself */ | |
202 | invlist = add_cp_to_invlist(invlist, fc); | |
203 | ||
204 | /* ... plus all the things that fold to it, which are found in | |
205 | * PL_utf8_foldclosures */ | |
206 | folds_count = _inverse_folds(fc, &first_fold, | |
207 | &remaining_folds); | |
208 | for (k = 0; k < folds_count; k++) { | |
209 | UV c = (k == 0) ? first_fold : remaining_folds[k-1]; | |
210 | ||
211 | /* /aa doesn't allow folds between ASCII and non- */ | |
212 | if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE) | |
213 | && isASCII(c) != isASCII(fc)) | |
214 | { | |
215 | continue; | |
216 | } | |
217 | ||
218 | invlist = add_cp_to_invlist(invlist, c); | |
219 | } | |
220 | ||
221 | if (OP(node) == EXACTFL) { | |
222 | ||
223 | /* If either [iI] are present in an EXACTFL node the above code | |
224 | * should have added its normal case pair, but under a Turkish | |
225 | * locale they could match instead the case pairs from it. Add | |
226 | * those as potential matches as well */ | |
227 | if (isALPHA_FOLD_EQ(fc, 'I')) { | |
228 | invlist = add_cp_to_invlist(invlist, | |
229 | LATIN_SMALL_LETTER_DOTLESS_I); | |
230 | invlist = add_cp_to_invlist(invlist, | |
231 | LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); | |
232 | } | |
233 | else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { | |
234 | invlist = add_cp_to_invlist(invlist, 'I'); | |
235 | } | |
236 | else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { | |
237 | invlist = add_cp_to_invlist(invlist, 'i'); | |
238 | } | |
239 | } | |
240 | } | |
241 | } | |
242 | ||
243 | return invlist; | |
244 | } | |
245 | ||
246 | ||
247 | /* Mark that we cannot extend a found fixed substring at this point. | |
248 | Update the longest found anchored substring or the longest found | |
249 | floating substrings if needed. */ | |
250 | ||
251 | void | |
252 | Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, | |
253 | SSize_t *minlenp, int is_inf) | |
254 | { | |
255 | const STRLEN l = CHR_SVLEN(data->last_found); | |
256 | SV * const longest_sv = data->substrs[data->cur_is_floating].str; | |
257 | const STRLEN old_l = CHR_SVLEN(longest_sv); | |
258 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
259 | ||
260 | PERL_ARGS_ASSERT_SCAN_COMMIT; | |
261 | ||
262 | if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) { | |
263 | const U8 i = data->cur_is_floating; | |
264 | SvSetMagicSV(longest_sv, data->last_found); | |
265 | data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min; | |
266 | ||
267 | if (!i) /* fixed */ | |
268 | data->substrs[0].max_offset = data->substrs[0].min_offset; | |
269 | else { /* float */ | |
270 | data->substrs[1].max_offset = | |
271 | (is_inf) | |
272 | ? OPTIMIZE_INFTY | |
273 | : (l | |
274 | ? data->last_start_max | |
275 | : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min | |
276 | ? OPTIMIZE_INFTY | |
277 | : data->pos_min + data->pos_delta)); | |
278 | } | |
279 | ||
280 | data->substrs[i].flags &= ~SF_BEFORE_EOL; | |
281 | data->substrs[i].flags |= data->flags & SF_BEFORE_EOL; | |
282 | data->substrs[i].minlenp = minlenp; | |
283 | data->substrs[i].lookbehind = 0; | |
284 | } | |
285 | ||
286 | SvCUR_set(data->last_found, 0); | |
287 | { | |
288 | SV * const sv = data->last_found; | |
289 | if (SvUTF8(sv) && SvMAGICAL(sv)) { | |
290 | MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8); | |
291 | if (mg) | |
292 | mg->mg_len = 0; | |
293 | } | |
294 | } | |
295 | data->last_end = -1; | |
296 | data->flags &= ~SF_BEFORE_EOL; | |
297 | DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1); | |
298 | } | |
299 | ||
300 | /* An SSC is just a regnode_charclass_posix with an extra field: the inversion | |
301 | * list that describes which code points it matches */ | |
302 | ||
303 | STATIC void | |
304 | S_ssc_anything(pTHX_ regnode_ssc *ssc) | |
305 | { | |
306 | /* Set the SSC 'ssc' to match an empty string or any code point */ | |
307 | ||
308 | PERL_ARGS_ASSERT_SSC_ANYTHING; | |
309 | ||
310 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
311 | ||
312 | /* mortalize so won't leak */ | |
313 | ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); | |
314 | ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ | |
315 | } | |
316 | ||
317 | STATIC int | |
318 | S_ssc_is_anything(const regnode_ssc *ssc) | |
319 | { | |
320 | /* Returns TRUE if the SSC 'ssc' can match the empty string and any code | |
321 | * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys | |
322 | * us anything: if the function returns TRUE, 'ssc' hasn't been restricted | |
323 | * in any way, so there's no point in using it */ | |
324 | ||
325 | UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ | |
326 | bool ret; | |
327 | ||
328 | PERL_ARGS_ASSERT_SSC_IS_ANYTHING; | |
329 | ||
330 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
331 | ||
332 | if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { | |
333 | return FALSE; | |
334 | } | |
335 | ||
336 | /* See if the list consists solely of the range 0 - Infinity */ | |
337 | invlist_iterinit(ssc->invlist); | |
338 | ret = invlist_iternext(ssc->invlist, &start, &end) | |
339 | && start == 0 | |
340 | && end == UV_MAX; | |
341 | ||
342 | invlist_iterfinish(ssc->invlist); | |
343 | ||
344 | if (ret) { | |
345 | return TRUE; | |
346 | } | |
347 | ||
348 | /* If e.g., both \w and \W are set, matches everything */ | |
349 | if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { | |
350 | int i; | |
351 | for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) { | |
352 | if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) { | |
353 | return TRUE; | |
354 | } | |
355 | } | |
356 | } | |
357 | ||
358 | return FALSE; | |
359 | } | |
360 | ||
361 | void | |
362 | Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) | |
363 | { | |
364 | /* Initializes the SSC 'ssc'. This includes setting it to match an empty | |
365 | * string, any code point, or any posix class under locale */ | |
366 | ||
367 | PERL_ARGS_ASSERT_SSC_INIT; | |
368 | ||
369 | Zero(ssc, 1, regnode_ssc); | |
370 | set_ANYOF_SYNTHETIC(ssc); | |
371 | ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE); | |
372 | ssc_anything(ssc); | |
373 | ||
374 | /* If any portion of the regex is to operate under locale rules that aren't | |
375 | * fully known at compile time, initialization includes it. The reason | |
376 | * this isn't done for all regexes is that the optimizer was written under | |
377 | * the assumption that locale was all-or-nothing. Given the complexity and | |
378 | * lack of documentation in the optimizer, and that there are inadequate | |
379 | * test cases for locale, many parts of it may not work properly, it is | |
380 | * safest to avoid locale unless necessary. */ | |
381 | if (RExC_contains_locale) { | |
382 | ANYOF_POSIXL_SETALL(ssc); | |
383 | } | |
384 | else { | |
385 | ANYOF_POSIXL_ZERO(ssc); | |
386 | } | |
387 | } | |
388 | ||
389 | STATIC int | |
390 | S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, | |
391 | const regnode_ssc *ssc) | |
392 | { | |
393 | /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only | |
394 | * to the list of code points matched, and locale posix classes; hence does | |
395 | * not check its flags) */ | |
396 | ||
397 | UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */ | |
398 | bool ret; | |
399 | ||
400 | PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT; | |
401 | ||
402 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
403 | ||
404 | invlist_iterinit(ssc->invlist); | |
405 | ret = invlist_iternext(ssc->invlist, &start, &end) | |
406 | && start == 0 | |
407 | && end == UV_MAX; | |
408 | ||
409 | invlist_iterfinish(ssc->invlist); | |
410 | ||
411 | if (! ret) { | |
412 | return FALSE; | |
413 | } | |
414 | ||
415 | if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) { | |
416 | return FALSE; | |
417 | } | |
418 | ||
419 | return TRUE; | |
420 | } | |
421 | ||
422 | ||
423 | STATIC SV* | |
424 | S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, | |
425 | const regnode_charclass* const node) | |
426 | { | |
427 | /* Returns a mortal inversion list defining which code points are matched | |
428 | * by 'node', which is of ANYOF-ish type . Handles complementing the | |
429 | * result if appropriate. If some code points aren't knowable at this | |
430 | * time, the returned list must, and will, contain every code point that is | |
431 | * a possibility. */ | |
432 | ||
433 | SV* invlist = NULL; | |
434 | SV* only_utf8_locale_invlist = NULL; | |
435 | bool new_node_has_latin1 = FALSE; | |
436 | const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF) | |
437 | ? ANYOF_FLAGS(node) | |
438 | : 0; | |
439 | ||
440 | PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; | |
441 | ||
442 | /* Look at the data structure created by S_set_ANYOF_arg() */ | |
443 | if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { | |
444 | invlist = sv_2mortal(_new_invlist(1)); | |
445 | invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX); | |
446 | } | |
447 | else if (ANYOF_HAS_AUX(node)) { | |
448 | const U32 n = ARG(node); | |
449 | SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); | |
450 | AV * const av = MUTABLE_AV(SvRV(rv)); | |
451 | SV **const ary = AvARRAY(av); | |
452 | ||
453 | if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { | |
454 | ||
455 | /* Here there are things that won't be known until runtime -- we | |
456 | * have to assume it could be anything */ | |
457 | invlist = sv_2mortal(_new_invlist(1)); | |
458 | return _add_range_to_invlist(invlist, 0, UV_MAX); | |
459 | } | |
460 | else if (ary[INVLIST_INDEX]) { | |
461 | ||
462 | /* Use the node's inversion list */ | |
463 | invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); | |
464 | } | |
465 | ||
466 | /* Get the code points valid only under UTF-8 locales */ | |
467 | if ( (flags & ANYOFL_FOLD) | |
468 | && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) | |
469 | { | |
470 | only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; | |
471 | } | |
472 | } | |
473 | ||
474 | if (! invlist) { | |
475 | invlist = sv_2mortal(_new_invlist(0)); | |
476 | } | |
477 | ||
478 | /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS | |
479 | * code points, and an inversion list for the others, but if there are code | |
480 | * points that should match only conditionally on the target string being | |
481 | * UTF-8, those are placed in the inversion list, and not the bitmap. | |
482 | * Since there are circumstances under which they could match, they are | |
483 | * included in the SSC. But if the ANYOF node is to be inverted, we have | |
484 | * to exclude them here, so that when we invert below, the end result | |
485 | * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We | |
486 | * have to do this here before we add the unconditionally matched code | |
487 | * points */ | |
488 | if (flags & ANYOF_INVERT) { | |
489 | _invlist_intersection_complement_2nd(invlist, | |
490 | PL_UpperLatin1, | |
491 | &invlist); | |
492 | } | |
493 | ||
494 | /* Add in the points from the bit map */ | |
495 | if (REGNODE_TYPE(OP(node)) == ANYOF){ | |
496 | for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { | |
497 | if (ANYOF_BITMAP_TEST(node, i)) { | |
498 | unsigned int start = i++; | |
499 | ||
500 | for (; i < NUM_ANYOF_CODE_POINTS | |
501 | && ANYOF_BITMAP_TEST(node, i); ++i) | |
502 | { | |
503 | /* empty */ | |
504 | } | |
505 | invlist = _add_range_to_invlist(invlist, start, i-1); | |
506 | new_node_has_latin1 = TRUE; | |
507 | } | |
508 | } | |
509 | } | |
510 | ||
511 | /* If this can match all upper Latin1 code points, have to add them | |
512 | * as well. But don't add them if inverting, as when that gets done below, | |
513 | * it would exclude all these characters, including the ones it shouldn't | |
514 | * that were added just above */ | |
515 | if ( ! (flags & ANYOF_INVERT) | |
516 | && OP(node) == ANYOFD | |
517 | && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)) | |
518 | { | |
519 | _invlist_union(invlist, PL_UpperLatin1, &invlist); | |
520 | } | |
521 | ||
522 | /* Similarly for these */ | |
523 | if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) { | |
524 | _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); | |
525 | } | |
526 | ||
527 | if (flags & ANYOF_INVERT) { | |
528 | _invlist_invert(invlist); | |
529 | } | |
530 | else if (flags & ANYOFL_FOLD) { | |
531 | if (new_node_has_latin1) { | |
532 | ||
533 | /* These folds are potential in Turkic locales */ | |
534 | if (_invlist_contains_cp(invlist, 'i')) { | |
535 | invlist = add_cp_to_invlist(invlist, | |
536 | LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); | |
537 | } | |
538 | if (_invlist_contains_cp(invlist, 'I')) { | |
539 | invlist = add_cp_to_invlist(invlist, | |
540 | LATIN_SMALL_LETTER_DOTLESS_I); | |
541 | } | |
542 | ||
543 | /* Under /li, any 0-255 could fold to any other 0-255, depending on | |
544 | * the locale. We can skip this if there are no 0-255 at all. */ | |
545 | _invlist_union(invlist, PL_Latin1, &invlist); | |
546 | } | |
547 | else { | |
548 | if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { | |
549 | invlist = add_cp_to_invlist(invlist, 'I'); | |
550 | } | |
551 | if (_invlist_contains_cp(invlist, | |
552 | LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) | |
553 | { | |
554 | invlist = add_cp_to_invlist(invlist, 'i'); | |
555 | } | |
556 | } | |
557 | } | |
558 | ||
559 | /* Similarly add the UTF-8 locale possible matches. These have to be | |
560 | * deferred until after the non-UTF-8 locale ones are taken care of just | |
561 | * above, or it leads to wrong results under ANYOF_INVERT */ | |
562 | if (only_utf8_locale_invlist) { | |
563 | _invlist_union_maybe_complement_2nd(invlist, | |
564 | only_utf8_locale_invlist, | |
565 | flags & ANYOF_INVERT, | |
566 | &invlist); | |
567 | } | |
568 | ||
569 | return invlist; | |
570 | } | |
571 | ||
572 | /* 'AND' a given class with another one. Can create false positives. 'ssc' | |
573 | * should not be inverted. */ | |
574 | ||
575 | STATIC void | |
576 | S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, | |
577 | const regnode_charclass *and_with) | |
578 | { | |
579 | /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either | |
580 | * another SSC or a regular ANYOF class. Can create false positives. */ | |
581 | ||
582 | SV* anded_cp_list; | |
583 | U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF) | |
584 | ? ANYOF_FLAGS(and_with) | |
585 | : 0; | |
586 | U8 anded_flags; | |
587 | ||
588 | PERL_ARGS_ASSERT_SSC_AND; | |
589 | ||
590 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
591 | ||
592 | /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract | |
593 | * the code point inversion list and just the relevant flags */ | |
594 | if (is_ANYOF_SYNTHETIC(and_with)) { | |
595 | anded_cp_list = ((regnode_ssc *)and_with)->invlist; | |
596 | anded_flags = and_with_flags; | |
597 | ||
598 | /* XXX This is a kludge around what appears to be deficiencies in the | |
599 | * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, | |
600 | * there are paths through the optimizer where it doesn't get weeded | |
601 | * out when it should. And if we don't make some extra provision for | |
602 | * it like the code just below, it doesn't get added when it should. | |
603 | * This solution is to add it only when AND'ing, which is here, and | |
604 | * only when what is being AND'ed is the pristine, original node | |
605 | * matching anything. Thus it is like adding it to ssc_anything() but | |
606 | * only when the result is to be AND'ed. Probably the same solution | |
607 | * could be adopted for the same problem we have with /l matching, | |
608 | * which is solved differently in S_ssc_init(), and that would lead to | |
609 | * fewer false positives than that solution has. But if this solution | |
610 | * creates bugs, the consequences are only that a warning isn't raised | |
611 | * that should be; while the consequences for having /l bugs is | |
612 | * incorrect matches */ | |
613 | if (ssc_is_anything((regnode_ssc *)and_with)) { | |
614 | anded_flags |= ANYOF_WARN_SUPER__shared; | |
615 | } | |
616 | } | |
617 | else { | |
618 | anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); | |
619 | if (OP(and_with) == ANYOFD) { | |
620 | anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; | |
621 | } | |
622 | else { | |
623 | anded_flags = and_with_flags | |
624 | & ( ANYOF_COMMON_FLAGS | |
625 | |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared | |
626 | |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); | |
627 | if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) { | |
628 | anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; | |
629 | } | |
630 | } | |
631 | } | |
632 | ||
633 | ANYOF_FLAGS(ssc) &= anded_flags; | |
634 | ||
635 | /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. | |
636 | * C2 is the list of code points in 'and-with'; P2, its posix classes. | |
637 | * 'and_with' may be inverted. When not inverted, we have the situation of | |
638 | * computing: | |
639 | * (C1 | P1) & (C2 | P2) | |
640 | * = (C1 & (C2 | P2)) | (P1 & (C2 | P2)) | |
641 | * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) | |
642 | * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2)) | |
643 | * <= ((C1 & C2) | P1 | P2) | |
644 | * Alternatively, the last few steps could be: | |
645 | * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2)) | |
646 | * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2)) | |
647 | * <= (C1 | C2 | (P1 & P2)) | |
648 | * We favor the second approach if either P1 or P2 is non-empty. This is | |
649 | * because these components are a barrier to doing optimizations, as what | |
650 | * they match cannot be known until the moment of matching as they are | |
651 | * dependent on the current locale, 'AND"ing them likely will reduce or | |
652 | * eliminate them. | |
653 | * But we can do better if we know that C1,P1 are in their initial state (a | |
654 | * frequent occurrence), each matching everything: | |
655 | * (<everything>) & (C2 | P2) = C2 | P2 | |
656 | * Similarly, if C2,P2 are in their initial state (again a frequent | |
657 | * occurrence), the result is a no-op | |
658 | * (C1 | P1) & (<everything>) = C1 | P1 | |
659 | * | |
660 | * Inverted, we have | |
661 | * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2) | |
662 | * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2)) | |
663 | * <= (C1 & ~C2) | (P1 & ~P2) | |
664 | * */ | |
665 | ||
666 | if ((and_with_flags & ANYOF_INVERT) | |
667 | && ! is_ANYOF_SYNTHETIC(and_with)) | |
668 | { | |
669 | unsigned int i; | |
670 | ||
671 | ssc_intersection(ssc, | |
672 | anded_cp_list, | |
673 | FALSE /* Has already been inverted */ | |
674 | ); | |
675 | ||
676 | /* If either P1 or P2 is empty, the intersection will be also; can skip | |
677 | * the loop */ | |
678 | if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { | |
679 | ANYOF_POSIXL_ZERO(ssc); | |
680 | } | |
681 | else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { | |
682 | ||
683 | /* Note that the Posix class component P from 'and_with' actually | |
684 | * looks like: | |
685 | * P = Pa | Pb | ... | Pn | |
686 | * where each component is one posix class, such as in [\w\s]. | |
687 | * Thus | |
688 | * ~P = ~(Pa | Pb | ... | Pn) | |
689 | * = ~Pa & ~Pb & ... & ~Pn | |
690 | * <= ~Pa | ~Pb | ... | ~Pn | |
691 | * The last is something we can easily calculate, but unfortunately | |
692 | * is likely to have many false positives. We could do better | |
693 | * in some (but certainly not all) instances if two classes in | |
694 | * P have known relationships. For example | |
695 | * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print: | |
696 | * So | |
697 | * :lower: & :print: = :lower: | |
698 | * And similarly for classes that must be disjoint. For example, | |
699 | * since \s and \w can have no elements in common based on rules in | |
700 | * the POSIX standard, | |
701 | * \w & ^\S = nothing | |
702 | * Unfortunately, some vendor locales do not meet the Posix | |
703 | * standard, in particular almost everything by Microsoft. | |
704 | * The loop below just changes e.g., \w into \W and vice versa */ | |
705 | ||
706 | regnode_charclass_posixl temp; | |
707 | int add = 1; /* To calculate the index of the complement */ | |
708 | ||
709 | Zero(&temp, 1, regnode_charclass_posixl); | |
710 | ANYOF_POSIXL_ZERO(&temp); | |
711 | for (i = 0; i < ANYOF_MAX; i++) { | |
712 | assert(i % 2 != 0 | |
713 | || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i) | |
714 | || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1)); | |
715 | ||
716 | if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) { | |
717 | ANYOF_POSIXL_SET(&temp, i + add); | |
718 | } | |
719 | add = 0 - add; /* 1 goes to -1; -1 goes to 1 */ | |
720 | } | |
721 | ANYOF_POSIXL_AND(&temp, ssc); | |
722 | ||
723 | } /* else ssc already has no posixes */ | |
724 | } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC | |
725 | in its initial state */ | |
726 | else if (! is_ANYOF_SYNTHETIC(and_with) | |
727 | || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with)) | |
728 | { | |
729 | /* But if 'ssc' is in its initial state, the result is just 'and_with'; | |
730 | * copy it over 'ssc' */ | |
731 | if (ssc_is_cp_posixl_init(pRExC_state, ssc)) { | |
732 | if (is_ANYOF_SYNTHETIC(and_with)) { | |
733 | StructCopy(and_with, ssc, regnode_ssc); | |
734 | } | |
735 | else { | |
736 | ssc->invlist = anded_cp_list; | |
737 | ANYOF_POSIXL_ZERO(ssc); | |
738 | if (and_with_flags & ANYOF_MATCHES_POSIXL) { | |
739 | ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); | |
740 | } | |
741 | } | |
742 | } | |
743 | else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) | |
744 | || (and_with_flags & ANYOF_MATCHES_POSIXL)) | |
745 | { | |
746 | /* One or the other of P1, P2 is non-empty. */ | |
747 | if (and_with_flags & ANYOF_MATCHES_POSIXL) { | |
748 | ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); | |
749 | } | |
750 | ssc_union(ssc, anded_cp_list, FALSE); | |
751 | } | |
752 | else { /* P1 = P2 = empty */ | |
753 | ssc_intersection(ssc, anded_cp_list, FALSE); | |
754 | } | |
755 | } | |
756 | } | |
757 | ||
758 | STATIC void | |
759 | S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, | |
760 | const regnode_charclass *or_with) | |
761 | { | |
762 | /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either | |
763 | * another SSC or a regular ANYOF class. Can create false positives if | |
764 | * 'or_with' is to be inverted. */ | |
765 | ||
766 | SV* ored_cp_list; | |
767 | U8 ored_flags; | |
768 | U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF) | |
769 | ? ANYOF_FLAGS(or_with) | |
770 | : 0; | |
771 | ||
772 | PERL_ARGS_ASSERT_SSC_OR; | |
773 | ||
774 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
775 | ||
776 | /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract | |
777 | * the code point inversion list and just the relevant flags */ | |
778 | if (is_ANYOF_SYNTHETIC(or_with)) { | |
779 | ored_cp_list = ((regnode_ssc*) or_with)->invlist; | |
780 | ored_flags = or_with_flags; | |
781 | } | |
782 | else { | |
783 | ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); | |
784 | ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; | |
785 | if (OP(or_with) != ANYOFD) { | |
786 | ored_flags |= | |
787 | or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared | |
788 | |ANYOF_HAS_EXTRA_RUNTIME_MATCHES); | |
789 | if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) { | |
790 | ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES; | |
791 | } | |
792 | } | |
793 | } | |
794 | ||
795 | ANYOF_FLAGS(ssc) |= ored_flags; | |
796 | ||
797 | /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes. | |
798 | * C2 is the list of code points in 'or-with'; P2, its posix classes. | |
799 | * 'or_with' may be inverted. When not inverted, we have the simple | |
800 | * situation of computing: | |
801 | * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2) | |
802 | * If P1|P2 yields a situation with both a class and its complement are | |
803 | * set, like having both \w and \W, this matches all code points, and we | |
804 | * can delete these from the P component of the ssc going forward. XXX We | |
805 | * might be able to delete all the P components, but I (khw) am not certain | |
806 | * about this, and it is better to be safe. | |
807 | * | |
808 | * Inverted, we have | |
809 | * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2) | |
810 | * <= (C1 | P1) | ~C2 | |
811 | * <= (C1 | ~C2) | P1 | |
812 | * (which results in actually simpler code than the non-inverted case) | |
813 | * */ | |
814 | ||
815 | if ((or_with_flags & ANYOF_INVERT) | |
816 | && ! is_ANYOF_SYNTHETIC(or_with)) | |
817 | { | |
818 | /* We ignore P2, leaving P1 going forward */ | |
819 | } /* else Not inverted */ | |
820 | else if (or_with_flags & ANYOF_MATCHES_POSIXL) { | |
821 | ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); | |
822 | if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { | |
823 | unsigned int i; | |
824 | for (i = 0; i < ANYOF_MAX; i += 2) { | |
825 | if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1)) | |
826 | { | |
827 | ssc_match_all_cp(ssc); | |
828 | ANYOF_POSIXL_CLEAR(ssc, i); | |
829 | ANYOF_POSIXL_CLEAR(ssc, i+1); | |
830 | } | |
831 | } | |
832 | } | |
833 | } | |
834 | ||
835 | ssc_union(ssc, | |
836 | ored_cp_list, | |
837 | FALSE /* Already has been inverted */ | |
838 | ); | |
839 | } | |
840 | ||
841 | STATIC void | |
842 | S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd) | |
843 | { | |
844 | PERL_ARGS_ASSERT_SSC_UNION; | |
845 | ||
846 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
847 | ||
848 | _invlist_union_maybe_complement_2nd(ssc->invlist, | |
849 | invlist, | |
850 | invert2nd, | |
851 | &ssc->invlist); | |
852 | } | |
853 | ||
854 | STATIC void | |
855 | S_ssc_intersection(pTHX_ regnode_ssc *ssc, | |
856 | SV* const invlist, | |
857 | const bool invert2nd) | |
858 | { | |
859 | PERL_ARGS_ASSERT_SSC_INTERSECTION; | |
860 | ||
861 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
862 | ||
863 | _invlist_intersection_maybe_complement_2nd(ssc->invlist, | |
864 | invlist, | |
865 | invert2nd, | |
866 | &ssc->invlist); | |
867 | } | |
868 | ||
869 | STATIC void | |
870 | S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end) | |
871 | { | |
872 | PERL_ARGS_ASSERT_SSC_ADD_RANGE; | |
873 | ||
874 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
875 | ||
876 | ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end); | |
877 | } | |
878 | ||
879 | STATIC void | |
880 | S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) | |
881 | { | |
882 | /* AND just the single code point 'cp' into the SSC 'ssc' */ | |
883 | ||
884 | SV* cp_list = _new_invlist(2); | |
885 | ||
886 | PERL_ARGS_ASSERT_SSC_CP_AND; | |
887 | ||
888 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
889 | ||
890 | cp_list = add_cp_to_invlist(cp_list, cp); | |
891 | ssc_intersection(ssc, cp_list, | |
892 | FALSE /* Not inverted */ | |
893 | ); | |
894 | SvREFCNT_dec_NN(cp_list); | |
895 | } | |
896 | ||
897 | STATIC void | |
898 | S_ssc_clear_locale(regnode_ssc *ssc) | |
899 | { | |
900 | /* Set the SSC 'ssc' to not match any locale things */ | |
901 | PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; | |
902 | ||
903 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
904 | ||
905 | ANYOF_POSIXL_ZERO(ssc); | |
906 | ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; | |
907 | } | |
908 | ||
909 | bool | |
910 | Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) | |
911 | { | |
912 | /* The synthetic start class is used to hopefully quickly winnow down | |
913 | * places where a pattern could start a match in the target string. If it | |
914 | * doesn't really narrow things down that much, there isn't much point to | |
915 | * having the overhead of using it. This function uses some very crude | |
916 | * heuristics to decide if to use the ssc or not. | |
917 | * | |
918 | * It returns TRUE if 'ssc' rules out more than half what it considers to | |
919 | * be the "likely" possible matches, but of course it doesn't know what the | |
920 | * actual things being matched are going to be; these are only guesses | |
921 | * | |
922 | * For /l matches, it assumes that the only likely matches are going to be | |
923 | * in the 0-255 range, uniformly distributed, so half of that is 127 | |
924 | * For /a and /d matches, it assumes that the likely matches will be just | |
925 | * the ASCII range, so half of that is 63 | |
926 | * For /u and there isn't anything matching above the Latin1 range, it | |
927 | * assumes that that is the only range likely to be matched, and uses | |
928 | * half that as the cut-off: 127. If anything matches above Latin1, | |
929 | * it assumes that all of Unicode could match (uniformly), except for | |
930 | * non-Unicode code points and things in the General Category "Other" | |
931 | * (unassigned, private use, surrogates, controls and formats). This | |
932 | * is a much large number. */ | |
933 | ||
934 | U32 count = 0; /* Running total of number of code points matched by | |
935 | 'ssc' */ | |
936 | UV start, end; /* Start and end points of current range in inversion | |
937 | XXX outdated. UTF-8 locales are common, what about invert? list */ | |
938 | const U32 max_code_points = (LOC) | |
939 | ? 256 | |
940 | : (( ! UNI_SEMANTICS | |
941 | || invlist_highest(ssc->invlist) < 256) | |
942 | ? 128 | |
943 | : NON_OTHER_COUNT); | |
944 | const U32 max_match = max_code_points / 2; | |
945 | ||
946 | PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; | |
947 | ||
948 | invlist_iterinit(ssc->invlist); | |
949 | while (invlist_iternext(ssc->invlist, &start, &end)) { | |
950 | if (start >= max_code_points) { | |
951 | break; | |
952 | } | |
953 | end = MIN(end, max_code_points - 1); | |
954 | count += end - start + 1; | |
955 | if (count >= max_match) { | |
956 | invlist_iterfinish(ssc->invlist); | |
957 | return FALSE; | |
958 | } | |
959 | } | |
960 | ||
961 | return TRUE; | |
962 | } | |
963 | ||
964 | ||
965 | void | |
966 | Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) | |
967 | { | |
968 | /* The inversion list in the SSC is marked mortal; now we need a more | |
969 | * permanent copy, which is stored the same way that is done in a regular | |
970 | * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit | |
971 | * map */ | |
972 | ||
973 | SV* invlist = invlist_clone(ssc->invlist, NULL); | |
974 | ||
975 | PERL_ARGS_ASSERT_SSC_FINALIZE; | |
976 | ||
977 | assert(is_ANYOF_SYNTHETIC(ssc)); | |
978 | ||
979 | /* The code in this file assumes that all but these flags aren't relevant | |
980 | * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared | |
981 | * by the time we reach here */ | |
982 | assert(! (ANYOF_FLAGS(ssc) | |
983 | & ~( ANYOF_COMMON_FLAGS | |
984 | |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared | |
985 | |ANYOF_HAS_EXTRA_RUNTIME_MATCHES))); | |
986 | ||
987 | populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist); | |
988 | ||
989 | set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); | |
990 | SvREFCNT_dec(invlist); | |
991 | ||
992 | /* Make sure is clone-safe */ | |
993 | ssc->invlist = NULL; | |
994 | ||
995 | if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { | |
996 | ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; | |
997 | OP(ssc) = ANYOFPOSIXL; | |
998 | } | |
999 | else if (RExC_contains_locale) { | |
1000 | OP(ssc) = ANYOFL; | |
1001 | } | |
1002 | ||
1003 | assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); | |
1004 | } | |
1005 | ||
1006 | /* The below joins as many adjacent EXACTish nodes as possible into a single | |
1007 | * one. The regop may be changed if the node(s) contain certain sequences that | |
1008 | * require special handling. The joining is only done if: | |
1009 | * 1) there is room in the current conglomerated node to entirely contain the | |
1010 | * next one. | |
1011 | * 2) they are compatible node types | |
1012 | * | |
1013 | * The adjacent nodes actually may be separated by NOTHING-kind nodes, and | |
1014 | * these get optimized out | |
1015 | * | |
1016 | * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full | |
1017 | * as possible, even if that means splitting an existing node so that its first | |
1018 | * part is moved to the preceding node. This would maximise the efficiency of | |
1019 | * memEQ during matching. | |
1020 | * | |
1021 | * If a node is to match under /i (folded), the number of characters it matches | |
1022 | * can be different than its character length if it contains a multi-character | |
1023 | * fold. *min_subtract is set to the total delta number of characters of the | |
1024 | * input nodes. | |
1025 | * | |
1026 | * And *unfolded_multi_char is set to indicate whether or not the node contains | |
1027 | * an unfolded multi-char fold. This happens when it won't be known until | |
1028 | * runtime whether the fold is valid or not; namely | |
1029 | * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the | |
1030 | * target string being matched against turns out to be UTF-8 is that fold | |
1031 | * valid; or | |
1032 | * 2) for EXACTFL nodes whose folding rules depend on the locale in force at | |
1033 | * runtime. | |
1034 | * (Multi-char folds whose components are all above the Latin1 range are not | |
1035 | * run-time locale dependent, and have already been folded by the time this | |
1036 | * function is called.) | |
1037 | * | |
1038 | * This is as good a place as any to discuss the design of handling these | |
1039 | * multi-character fold sequences. It's been wrong in Perl for a very long | |
1040 | * time. There are three code points in Unicode whose multi-character folds | |
1041 | * were long ago discovered to mess things up. The previous designs for | |
1042 | * dealing with these involved assigning a special node for them. This | |
1043 | * approach doesn't always work, as evidenced by this example: | |
1044 | * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches | |
1045 | * Both sides fold to "sss", but if the pattern is parsed to create a node that | |
1046 | * would match just the \xDF, it won't be able to handle the case where a | |
1047 | * successful match would have to cross the node's boundary. The new approach | |
1048 | * that hopefully generally solves the problem generates an EXACTFUP node | |
1049 | * that is "sss" in this case. | |
1050 | * | |
1051 | * It turns out that there are problems with all multi-character folds, and not | |
1052 | * just these three. Now the code is general, for all such cases. The | |
1053 | * approach taken is: | |
1054 | * 1) This routine examines each EXACTFish node that could contain multi- | |
1055 | * character folded sequences. Since a single character can fold into | |
1056 | * such a sequence, the minimum match length for this node is less than | |
1057 | * the number of characters in the node. This routine returns in | |
1058 | * *min_subtract how many characters to subtract from the actual | |
1059 | * length of the string to get a real minimum match length; it is 0 if | |
1060 | * there are no multi-char foldeds. This delta is used by the caller to | |
1061 | * adjust the min length of the match, and the delta between min and max, | |
1062 | * so that the optimizer doesn't reject these possibilities based on size | |
1063 | * constraints. | |
1064 | * | |
1065 | * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF) | |
1066 | * under /u, we fold it to 'ss' in regatom(), and in this routine, after | |
1067 | * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8 | |
1068 | * EXACTFU nodes. The node type of such nodes is then changed to | |
1069 | * EXACTFUP, indicating it is problematic, and needs careful handling. | |
1070 | * (The procedures in step 1) above are sufficient to handle this case in | |
1071 | * UTF-8 encoded nodes.) The reason this is problematic is that this is | |
1072 | * the only case where there is a possible fold length change in non-UTF-8 | |
1073 | * patterns. By reserving a special node type for problematic cases, the | |
1074 | * far more common regular EXACTFU nodes can be processed faster. | |
1075 | * regexec.c takes advantage of this. | |
1076 | * | |
1077 | * EXACTFUP has been created as a grab-bag for (hopefully uncommon) | |
1078 | * problematic cases. These all only occur when the pattern is not | |
1079 | * UTF-8. In addition to the 'ss' sequence where there is a possible fold | |
1080 | * length change, it handles the situation where the string cannot be | |
1081 | * entirely folded. The strings in an EXACTFish node are folded as much | |
1082 | * as possible during compilation in regcomp.c. This saves effort in | |
1083 | * regex matching. By using an EXACTFUP node when it is not possible to | |
1084 | * fully fold at compile time, regexec.c can know that everything in an | |
1085 | * EXACTFU node is folded, so folding can be skipped at runtime. The only | |
1086 | * case where folding in EXACTFU nodes can't be done at compile time is | |
1087 | * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This | |
1088 | * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes | |
1089 | * handle two very different cases. Alternatively, there could have been | |
1090 | * a node type where there are length changes, one for unfolded, and one | |
1091 | * for both. If yet another special case needed to be created, the number | |
1092 | * of required node types would have to go to 7. khw figures that even | |
1093 | * though there are plenty of node types to spare, that the maintenance | |
1094 | * cost wasn't worth the small speedup of doing it that way, especially | |
1095 | * since he thinks the MICRO SIGN is rarely encountered in practice. | |
1096 | * | |
1097 | * There are other cases where folding isn't done at compile time, but | |
1098 | * none of them are under /u, and hence not for EXACTFU nodes. The folds | |
1099 | * in EXACTFL nodes aren't known until runtime, and vary as the locale | |
1100 | * changes. Some folds in EXACTF depend on if the runtime target string | |
1101 | * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di | |
1102 | * when no fold in it depends on the UTF-8ness of the target string.) | |
1103 | * | |
1104 | * 3) A problem remains for unfolded multi-char folds. (These occur when the | |
1105 | * validity of the fold won't be known until runtime, and so must remain | |
1106 | * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA | |
1107 | * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot | |
1108 | * be an EXACTF node with a UTF-8 pattern.) They also occur for various | |
1109 | * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.) | |
1110 | * The reason this is a problem is that the optimizer part of regexec.c | |
1111 | * (probably unwittingly, in Perl_regexec_flags()) makes an assumption | |
1112 | * that a character in the pattern corresponds to at most a single | |
1113 | * character in the target string. (And I do mean character, and not byte | |
1114 | * here, unlike other parts of the documentation that have never been | |
1115 | * updated to account for multibyte Unicode.) Sharp s in EXACTF and | |
1116 | * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA | |
1117 | * nodes it can match "\x{17F}\x{17F}". These, along with other ones in | |
1118 | * EXACTFL nodes, violate the assumption, and they are the only instances | |
1119 | * where it is violated. I'm reluctant to try to change the assumption, | |
1120 | * as the code involved is impenetrable to me (khw), so instead the code | |
1121 | * here punts. This routine examines EXACTFL nodes, and (when the pattern | |
1122 | * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a | |
1123 | * boolean indicating whether or not the node contains such a fold. When | |
1124 | * it is true, the caller sets a flag that later causes the optimizer in | |
1125 | * this file to not set values for the floating and fixed string lengths, | |
1126 | * and thus avoids the optimizer code in regexec.c that makes the invalid | |
1127 | * assumption. Thus, there is no optimization based on string lengths for | |
1128 | * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern | |
1129 | * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the | |
1130 | * assumption is wrong only in these cases is that all other non-UTF-8 | |
1131 | * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to | |
1132 | * their expanded versions. (Again, we can't prefold sharp s to 'ss' in | |
1133 | * EXACTF nodes because we don't know at compile time if it actually | |
1134 | * matches 'ss' or not. For EXACTF nodes it will match iff the target | |
1135 | * string is in UTF-8. This is in contrast to EXACTFU nodes, where it | |
1136 | * always matches; and EXACTFAA where it never does. In an EXACTFAA node | |
1137 | * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the | |
1138 | * problem; but in a non-UTF8 pattern, folding it to that above-Latin1 | |
1139 | * string would require the pattern to be forced into UTF-8, the overhead | |
1140 | * of which we want to avoid. Similarly the unfolded multi-char folds in | |
1141 | * EXACTFL nodes will match iff the locale at the time of match is a UTF-8 | |
1142 | * locale.) | |
1143 | * | |
1144 | * Similarly, the code that generates tries doesn't currently handle | |
1145 | * not-already-folded multi-char folds, and it looks like a pain to change | |
1146 | * that. Therefore, trie generation of EXACTFAA nodes with the sharp s | |
1147 | * doesn't work. Instead, such an EXACTFAA is turned into a new regnode, | |
1148 | * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people | |
1149 | * using /iaa matching will be doing so almost entirely with ASCII | |
1150 | * strings, so this should rarely be encountered in practice */ | |
1151 | ||
1152 | U32 | |
1153 | Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, | |
1154 | UV *min_subtract, bool *unfolded_multi_char, | |
1155 | U32 flags, regnode *val, U32 depth) | |
1156 | { | |
1157 | /* Merge several consecutive EXACTish nodes into one. */ | |
1158 | ||
1159 | regnode *n = regnext(scan); | |
1160 | U32 stringok = 1; | |
1161 | regnode *next = REGNODE_AFTER_varies(scan); | |
1162 | U32 merged = 0; | |
1163 | U32 stopnow = 0; | |
1164 | #ifdef DEBUGGING | |
1165 | regnode *stop = scan; | |
1166 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
1167 | #else | |
1168 | PERL_UNUSED_ARG(depth); | |
1169 | #endif | |
1170 | ||
1171 | PERL_ARGS_ASSERT_JOIN_EXACT; | |
1172 | #ifndef EXPERIMENTAL_INPLACESCAN | |
1173 | PERL_UNUSED_ARG(flags); | |
1174 | PERL_UNUSED_ARG(val); | |
1175 | #endif | |
1176 | DEBUG_PEEP("join", scan, depth, 0); | |
1177 | ||
1178 | assert(REGNODE_TYPE(OP(scan)) == EXACT); | |
1179 | ||
1180 | /* Look through the subsequent nodes in the chain. Skip NOTHING, merge | |
1181 | * EXACT ones that are mergeable to the current one. */ | |
1182 | while ( n | |
1183 | && ( REGNODE_TYPE(OP(n)) == NOTHING | |
1184 | || (stringok && REGNODE_TYPE(OP(n)) == EXACT)) | |
1185 | && NEXT_OFF(n) | |
1186 | && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) | |
1187 | { | |
1188 | ||
1189 | if (OP(n) == TAIL || n > next) | |
1190 | stringok = 0; | |
1191 | if (REGNODE_TYPE(OP(n)) == NOTHING) { | |
1192 | DEBUG_PEEP("skip:", n, depth, 0); | |
1193 | NEXT_OFF(scan) += NEXT_OFF(n); | |
1194 | next = n + NODE_STEP_REGNODE; | |
1195 | #ifdef DEBUGGING | |
1196 | if (stringok) | |
1197 | stop = n; | |
1198 | #endif | |
1199 | n = regnext(n); | |
1200 | } | |
1201 | else if (stringok) { | |
1202 | const unsigned int oldl = STR_LEN(scan); | |
1203 | regnode * const nnext = regnext(n); | |
1204 | ||
1205 | /* XXX I (khw) kind of doubt that this works on platforms (should | |
1206 | * Perl ever run on one) where U8_MAX is above 255 because of lots | |
1207 | * of other assumptions */ | |
1208 | /* Don't join if the sum can't fit into a single node */ | |
1209 | if (oldl + STR_LEN(n) > U8_MAX) | |
1210 | break; | |
1211 | ||
1212 | /* Joining something that requires UTF-8 with something that | |
1213 | * doesn't, means the result requires UTF-8. */ | |
1214 | if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) { | |
1215 | OP(scan) = EXACT_REQ8; | |
1216 | } | |
1217 | else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) { | |
1218 | ; /* join is compatible, no need to change OP */ | |
1219 | } | |
1220 | else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) { | |
1221 | OP(scan) = EXACTFU_REQ8; | |
1222 | } | |
1223 | else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) { | |
1224 | ; /* join is compatible, no need to change OP */ | |
1225 | } | |
1226 | else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) { | |
1227 | ; /* join is compatible, no need to change OP */ | |
1228 | } | |
1229 | else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) { | |
1230 | ||
1231 | /* Under /di, temporary EXACTFU_S_EDGE nodes are generated, | |
1232 | * which can join with EXACTFU ones. We check for this case | |
1233 | * here. These need to be resolved to either EXACTFU or | |
1234 | * EXACTF at joining time. They have nothing in them that | |
1235 | * would forbid them from being the more desirable EXACTFU | |
1236 | * nodes except that they begin and/or end with a single [Ss]. | |
1237 | * The reason this is problematic is because they could be | |
1238 | * joined in this loop with an adjacent node that ends and/or | |
1239 | * begins with [Ss] which would then form the sequence 'ss', | |
1240 | * which matches differently under /di than /ui, in which case | |
1241 | * EXACTFU can't be used. If the 'ss' sequence doesn't get | |
1242 | * formed, the nodes get absorbed into any adjacent EXACTFU | |
1243 | * node. And if the only adjacent node is EXACTF, they get | |
1244 | * absorbed into that, under the theory that a longer node is | |
1245 | * better than two shorter ones, even if one is EXACTFU. Note | |
1246 | * that EXACTFU_REQ8 is generated only for UTF-8 patterns, | |
1247 | * and the EXACTFU_S_EDGE ones only for non-UTF-8. */ | |
1248 | ||
1249 | if (STRING(n)[STR_LEN(n)-1] == 's') { | |
1250 | ||
1251 | /* Here the joined node would end with 's'. If the node | |
1252 | * following the combination is an EXACTF one, it's better to | |
1253 | * join this trailing edge 's' node with that one, leaving the | |
1254 | * current one in 'scan' be the more desirable EXACTFU */ | |
1255 | if (OP(nnext) == EXACTF) { | |
1256 | break; | |
1257 | } | |
1258 | ||
1259 | OP(scan) = EXACTFU_S_EDGE; | |
1260 | ||
1261 | } /* Otherwise, the beginning 's' of the 2nd node just | |
1262 | becomes an interior 's' in 'scan' */ | |
1263 | } | |
1264 | else if (OP(scan) == EXACTF && OP(n) == EXACTF) { | |
1265 | ; /* join is compatible, no need to change OP */ | |
1266 | } | |
1267 | else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) { | |
1268 | ||
1269 | /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE | |
1270 | * nodes. But the latter nodes can be also joined with EXACTFU | |
1271 | * ones, and that is a better outcome, so if the node following | |
1272 | * 'n' is EXACTFU, quit now so that those two can be joined | |
1273 | * later */ | |
1274 | if (OP(nnext) == EXACTFU) { | |
1275 | break; | |
1276 | } | |
1277 | ||
1278 | /* The join is compatible, and the combined node will be | |
1279 | * EXACTF. (These don't care if they begin or end with 's' */ | |
1280 | } | |
1281 | else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) { | |
1282 | if ( STRING(scan)[STR_LEN(scan)-1] == 's' | |
1283 | && STRING(n)[0] == 's') | |
1284 | { | |
1285 | /* When combined, we have the sequence 'ss', which means we | |
1286 | * have to remain /di */ | |
1287 | OP(scan) = EXACTF; | |
1288 | } | |
1289 | } | |
1290 | else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) { | |
1291 | if (STRING(n)[0] == 's') { | |
1292 | ; /* Here the join is compatible and the combined node | |
1293 | starts with 's', no need to change OP */ | |
1294 | } | |
1295 | else { /* Now the trailing 's' is in the interior */ | |
1296 | OP(scan) = EXACTFU; | |
1297 | } | |
1298 | } | |
1299 | else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) { | |
1300 | ||
1301 | /* The join is compatible, and the combined node will be | |
1302 | * EXACTF. (These don't care if they begin or end with 's' */ | |
1303 | OP(scan) = EXACTF; | |
1304 | } | |
1305 | else if (OP(scan) != OP(n)) { | |
1306 | ||
1307 | /* The only other compatible joinings are the same node type */ | |
1308 | break; | |
1309 | } | |
1310 | ||
1311 | DEBUG_PEEP("merg", n, depth, 0); | |
1312 | merged++; | |
1313 | ||
1314 | next = REGNODE_AFTER_varies(n); | |
1315 | NEXT_OFF(scan) += NEXT_OFF(n); | |
1316 | assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 ); | |
1317 | setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n))); | |
1318 | /* Now we can overwrite *n : */ | |
1319 | Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); | |
1320 | #ifdef DEBUGGING | |
1321 | stop = next - 1; | |
1322 | #endif | |
1323 | n = nnext; | |
1324 | if (stopnow) break; | |
1325 | } | |
1326 | ||
1327 | #ifdef EXPERIMENTAL_INPLACESCAN | |
1328 | if (flags && !NEXT_OFF(n)) { | |
1329 | DEBUG_PEEP("atch", val, depth, 0); | |
1330 | if (REGNODE_OFF_BY_ARG(OP(n))) { | |
1331 | ARG_SET(n, val - n); | |
1332 | } | |
1333 | else { | |
1334 | NEXT_OFF(n) = val - n; | |
1335 | } | |
1336 | stopnow = 1; | |
1337 | } | |
1338 | #endif | |
1339 | } | |
1340 | ||
1341 | /* This temporary node can now be turned into EXACTFU, and must, as | |
1342 | * regexec.c doesn't handle it */ | |
1343 | if (OP(scan) == EXACTFU_S_EDGE) { | |
1344 | OP(scan) = EXACTFU; | |
1345 | } | |
1346 | ||
1347 | *min_subtract = 0; | |
1348 | *unfolded_multi_char = FALSE; | |
1349 | ||
1350 | /* Here, all the adjacent mergeable EXACTish nodes have been merged. We | |
1351 | * can now analyze for sequences of problematic code points. (Prior to | |
1352 | * this final joining, sequences could have been split over boundaries, and | |
1353 | * hence missed). The sequences only happen in folding, hence for any | |
1354 | * non-EXACT EXACTish node */ | |
1355 | if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) { | |
1356 | U8* s0 = (U8*) STRING(scan); | |
1357 | U8* s = s0; | |
1358 | U8* s_end = s0 + STR_LEN(scan); | |
1359 | ||
1360 | int total_count_delta = 0; /* Total delta number of characters that | |
1361 | multi-char folds expand to */ | |
1362 | ||
1363 | /* One pass is made over the node's string looking for all the | |
1364 | * possibilities. To avoid some tests in the loop, there are two main | |
1365 | * cases, for UTF-8 patterns (which can't have EXACTF nodes) and | |
1366 | * non-UTF-8 */ | |
1367 | if (UTF) { | |
1368 | U8* folded = NULL; | |
1369 | ||
1370 | if (OP(scan) == EXACTFL) { | |
1371 | U8 *d; | |
1372 | ||
1373 | /* An EXACTFL node would already have been changed to another | |
1374 | * node type unless there is at least one character in it that | |
1375 | * is problematic; likely a character whose fold definition | |
1376 | * won't be known until runtime, and so has yet to be folded. | |
1377 | * For all but the UTF-8 locale, folds are 1-1 in length, but | |
1378 | * to handle the UTF-8 case, we need to create a temporary | |
1379 | * folded copy using UTF-8 locale rules in order to analyze it. | |
1380 | * This is because our macros that look to see if a sequence is | |
1381 | * a multi-char fold assume everything is folded (otherwise the | |
1382 | * tests in those macros would be too complicated and slow). | |
1383 | * Note that here, the non-problematic folds will have already | |
1384 | * been done, so we can just copy such characters. We actually | |
1385 | * don't completely fold the EXACTFL string. We skip the | |
1386 | * unfolded multi-char folds, as that would just create work | |
1387 | * below to figure out the size they already are */ | |
1388 | ||
1389 | Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8); | |
1390 | d = folded; | |
1391 | while (s < s_end) { | |
1392 | STRLEN s_len = UTF8SKIP(s); | |
1393 | if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) { | |
1394 | Copy(s, d, s_len, U8); | |
1395 | d += s_len; | |
1396 | } | |
1397 | else if (is_FOLDS_TO_MULTI_utf8(s)) { | |
1398 | *unfolded_multi_char = TRUE; | |
1399 | Copy(s, d, s_len, U8); | |
1400 | d += s_len; | |
1401 | } | |
1402 | else if (isASCII(*s)) { | |
1403 | *(d++) = toFOLD(*s); | |
1404 | } | |
1405 | else { | |
1406 | STRLEN len; | |
1407 | _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL); | |
1408 | d += len; | |
1409 | } | |
1410 | s += s_len; | |
1411 | } | |
1412 | ||
1413 | /* Point the remainder of the routine to look at our temporary | |
1414 | * folded copy */ | |
1415 | s = folded; | |
1416 | s_end = d; | |
1417 | } /* End of creating folded copy of EXACTFL string */ | |
1418 | ||
1419 | /* Examine the string for a multi-character fold sequence. UTF-8 | |
1420 | * patterns have all characters pre-folded by the time this code is | |
1421 | * executed */ | |
1422 | while (s < s_end - 1) /* Can stop 1 before the end, as minimum | |
1423 | length sequence we are looking for is 2 */ | |
1424 | { | |
1425 | int count = 0; /* How many characters in a multi-char fold */ | |
1426 | int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); | |
1427 | if (! len) { /* Not a multi-char fold: get next char */ | |
1428 | s += UTF8SKIP(s); | |
1429 | continue; | |
1430 | } | |
1431 | ||
1432 | { /* Here is a generic multi-char fold. */ | |
1433 | U8* multi_end = s + len; | |
1434 | ||
1435 | /* Count how many characters are in it. In the case of | |
1436 | * /aa, no folds which contain ASCII code points are | |
1437 | * allowed, so check for those, and skip if found. */ | |
1438 | if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) { | |
1439 | count = utf8_length(s, multi_end); | |
1440 | s = multi_end; | |
1441 | } | |
1442 | else { | |
1443 | while (s < multi_end) { | |
1444 | if (isASCII(*s)) { | |
1445 | s++; | |
1446 | goto next_iteration; | |
1447 | } | |
1448 | else { | |
1449 | s += UTF8SKIP(s); | |
1450 | } | |
1451 | count++; | |
1452 | } | |
1453 | } | |
1454 | } | |
1455 | ||
1456 | /* The delta is how long the sequence is minus 1 (1 is how long | |
1457 | * the character that folds to the sequence is) */ | |
1458 | total_count_delta += count - 1; | |
1459 | next_iteration: ; | |
1460 | } | |
1461 | ||
1462 | /* We created a temporary folded copy of the string in EXACTFL | |
1463 | * nodes. Therefore we need to be sure it doesn't go below zero, | |
1464 | * as the real string could be shorter */ | |
1465 | if (OP(scan) == EXACTFL) { | |
1466 | int total_chars = utf8_length((U8*) STRING(scan), | |
1467 | (U8*) STRING(scan) + STR_LEN(scan)); | |
1468 | if (total_count_delta > total_chars) { | |
1469 | total_count_delta = total_chars; | |
1470 | } | |
1471 | } | |
1472 | ||
1473 | *min_subtract += total_count_delta; | |
1474 | Safefree(folded); | |
1475 | } | |
1476 | else if (OP(scan) == EXACTFAA) { | |
1477 | ||
1478 | /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char | |
1479 | * fold to the ASCII range (and there are no existing ones in the | |
1480 | * upper latin1 range). But, as outlined in the comments preceding | |
1481 | * this function, we need to flag any occurrences of the sharp s. | |
1482 | * This character forbids trie formation (because of added | |
1483 | * complexity) */ | |
1484 | #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ | |
1485 | || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ | |
1486 | || UNICODE_DOT_DOT_VERSION > 0) | |
1487 | while (s < s_end) { | |
1488 | if (*s == LATIN_SMALL_LETTER_SHARP_S) { | |
1489 | OP(scan) = EXACTFAA_NO_TRIE; | |
1490 | *unfolded_multi_char = TRUE; | |
1491 | break; | |
1492 | } | |
1493 | s++; | |
1494 | } | |
1495 | } | |
1496 | else if (OP(scan) != EXACTFAA_NO_TRIE) { | |
1497 | ||
1498 | /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char | |
1499 | * folds that are all Latin1. As explained in the comments | |
1500 | * preceding this function, we look also for the sharp s in EXACTF | |
1501 | * and EXACTFL nodes; it can be in the final position. Otherwise | |
1502 | * we can stop looking 1 byte earlier because have to find at least | |
1503 | * two characters for a multi-fold */ | |
1504 | const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL) | |
1505 | ? s_end | |
1506 | : s_end -1; | |
1507 | ||
1508 | while (s < upper) { | |
1509 | int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); | |
1510 | if (! len) { /* Not a multi-char fold. */ | |
1511 | if (*s == LATIN_SMALL_LETTER_SHARP_S | |
1512 | && (OP(scan) == EXACTF || OP(scan) == EXACTFL)) | |
1513 | { | |
1514 | *unfolded_multi_char = TRUE; | |
1515 | } | |
1516 | s++; | |
1517 | continue; | |
1518 | } | |
1519 | ||
1520 | if (len == 2 | |
1521 | && isALPHA_FOLD_EQ(*s, 's') | |
1522 | && isALPHA_FOLD_EQ(*(s+1), 's')) | |
1523 | { | |
1524 | ||
1525 | /* EXACTF nodes need to know that the minimum length | |
1526 | * changed so that a sharp s in the string can match this | |
1527 | * ss in the pattern, but they remain EXACTF nodes, as they | |
1528 | * won't match this unless the target string is in UTF-8, | |
1529 | * which we don't know until runtime. EXACTFL nodes can't | |
1530 | * transform into EXACTFU nodes */ | |
1531 | if (OP(scan) != EXACTF && OP(scan) != EXACTFL) { | |
1532 | OP(scan) = EXACTFUP; | |
1533 | } | |
1534 | } | |
1535 | ||
1536 | *min_subtract += len - 1; | |
1537 | s += len; | |
1538 | } | |
1539 | #endif | |
1540 | } | |
1541 | } | |
1542 | ||
1543 | #ifdef DEBUGGING | |
1544 | /* Allow dumping but overwriting the collection of skipped | |
1545 | * ops and/or strings with fake optimized ops */ | |
1546 | n = REGNODE_AFTER_varies(scan); | |
1547 | while (n <= stop) { | |
1548 | OP(n) = OPTIMIZED; | |
1549 | FLAGS(n) = 0; | |
1550 | NEXT_OFF(n) = 0; | |
1551 | n++; | |
1552 | } | |
1553 | #endif | |
1554 | DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);}); | |
1555 | return stopnow; | |
1556 | } | |
1557 | ||
1558 | /* REx optimizer. Converts nodes into quicker variants "in place". | |
1559 | Finds fixed substrings. */ | |
1560 | ||
1561 | ||
1562 | /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set | |
1563 | to the position after last scanned or to NULL. */ | |
1564 | ||
1565 | /* the return from this sub is the minimum length that could possibly match */ | |
1566 | SSize_t | |
1567 | Perl_study_chunk(pTHX_ | |
1568 | RExC_state_t *pRExC_state, | |
1569 | regnode **scanp, /* Start here (read-write). */ | |
1570 | SSize_t *minlenp, /* used for the minlen of substrings? */ | |
1571 | SSize_t *deltap, /* Write maxlen-minlen here. */ | |
1572 | regnode *last, /* Stop before this one. */ | |
1573 | scan_data_t *data, /* string data about the pattern */ | |
1574 | I32 stopparen, /* treat CLOSE-N as END, see GOSUB */ | |
1575 | U32 recursed_depth, /* how deep have we recursed via GOSUB */ | |
1576 | regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */ | |
1577 | U32 flags, /* flags controlling this call, see SCF_ flags */ | |
1578 | U32 depth, /* how deep have we recursed period */ | |
1579 | bool was_mutate_ok /* TRUE if in-place optimizations are allowed. | |
1580 | FALSE only if the caller (recursively) was | |
1581 | prohibited from modifying the regops, because | |
1582 | a higher caller is holding a ptr to them. */ | |
1583 | ) | |
1584 | { | |
1585 | /* vars about the regnodes we are working with */ | |
1586 | regnode *scan = *scanp; /* the current opcode we are inspecting */ | |
1587 | regnode *next = NULL; /* the next opcode beyond scan, tmp var */ | |
1588 | regnode *first_non_open = scan; /* FIXME: should this init to NULL? | |
1589 | the first non open regop, if the init | |
1590 | val IS an OPEN then we will skip past | |
1591 | it just after the var decls section */ | |
1592 | I32 code = 0; /* temp var used to hold the optype of a regop */ | |
1593 | ||
1594 | /* vars about the min and max length of the pattern */ | |
1595 | SSize_t min = 0; /* min length of this part of the pattern */ | |
1596 | SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT | |
1597 | this is adjusted down if we find | |
1598 | an ACCEPT */ | |
1599 | SSize_t delta = 0; /* difference between min and max length | |
1600 | (not accounting for stopmin) */ | |
1601 | ||
1602 | /* vars about capture buffers in the pattern */ | |
1603 | I32 pars = 0; /* count of OPEN opcodes */ | |
1604 | I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */ | |
1605 | ||
1606 | /* vars about whether this pattern contains something that can match | |
1607 | * infinitely long strings, eg, X* or X+ */ | |
1608 | int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); | |
1609 | int is_inf_internal = 0; /* The studied chunk is infinite */ | |
1610 | ||
1611 | /* scan_data_t (struct) is used to hold information about the substrings | |
1612 | * and start class we have extracted from the string */ | |
1613 | scan_data_t data_fake; /* temp var used for recursing in some cases */ | |
1614 | ||
1615 | SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do | |
1616 | trie optimizations */ | |
1617 | ||
1618 | scan_frame *frame = NULL; /* used as part of fake recursion */ | |
1619 | ||
1620 | DECLARE_AND_GET_RE_DEBUG_FLAGS; | |
1621 | ||
1622 | PERL_ARGS_ASSERT_STUDY_CHUNK; | |
1623 | RExC_study_started= 1; | |
1624 | ||
1625 | Zero(&data_fake, 1, scan_data_t); | |
1626 | ||
1627 | if ( depth == 0 ) { | |
1628 | while (first_non_open && OP(first_non_open) == OPEN) | |
1629 | first_non_open=regnext(first_non_open); | |
1630 | } | |
1631 | ||
1632 | fake_study_recurse: | |
1633 | DEBUG_r( | |
1634 | RExC_study_chunk_recursed_count++; | |
1635 | ); | |
1636 | DEBUG_OPTIMISE_MORE_r( | |
1637 | { | |
1638 | Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", | |
1639 | depth, (long)stopparen, | |
1640 | (unsigned long)RExC_study_chunk_recursed_count, | |
1641 | (unsigned long)depth, (unsigned long)recursed_depth, | |
1642 | scan, | |
1643 | last); | |
1644 | if (recursed_depth) { | |
1645 | U32 i; | |
1646 | U32 j; | |
1647 | for ( j = 0 ; j < recursed_depth ; j++ ) { | |
1648 | for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) { | |
1649 | if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) { | |
1650 | Perl_re_printf( aTHX_ " %d",(int)i); | |
1651 | break; | |
1652 | } | |
1653 | } | |
1654 | if ( j + 1 < recursed_depth ) { | |
1655 | Perl_re_printf( aTHX_ ","); | |
1656 | } | |
1657 | } | |
1658 | } | |
1659 | Perl_re_printf( aTHX_ "\n"); | |
1660 | } | |
1661 | ); | |
1662 | while ( scan && OP(scan) != END && scan < last ){ | |
1663 | UV min_subtract = 0; /* How mmany chars to subtract from the minimum | |
1664 | node length to get a real minimum (because | |
1665 | the folded version may be shorter) */ | |
1666 | bool unfolded_multi_char = FALSE; | |
1667 | /* avoid mutating ops if we are anywhere within the recursed or | |
1668 | * enframed handling for a GOSUB: the outermost level will handle it. | |
1669 | */ | |
1670 | bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub); | |
1671 | /* Peephole optimizer: */ | |
1672 | DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta); | |
1673 | DEBUG_PEEP("Peep", scan, depth, flags); | |
1674 | ||
1675 | ||
1676 | /* The reason we do this here is that we need to deal with things like | |
1677 | * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT | |
1678 | * parsing code, as each (?:..) is handled by a different invocation of | |
1679 | * reg() -- Yves | |
1680 | */ | |
1681 | if (REGNODE_TYPE(OP(scan)) == EXACT | |
1682 | && OP(scan) != LEXACT | |
1683 | && OP(scan) != LEXACT_REQ8 | |
1684 | && mutate_ok | |
1685 | ) { | |
1686 | join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char, | |
1687 | 0, NULL, depth + 1); | |
1688 | } | |
1689 | ||
1690 | /* Follow the next-chain of the current node and optimize | |
1691 | away all the NOTHINGs from it. | |
1692 | */ | |
1693 | rck_elide_nothing(scan); | |
1694 | ||
1695 | /* The principal pseudo-switch. Cannot be a switch, since we look into | |
1696 | * several different things. */ | |
1697 | if ( OP(scan) == DEFINEP ) { | |
1698 | SSize_t minlen = 0; | |
1699 | SSize_t deltanext = 0; | |
1700 | SSize_t fake_last_close = 0; | |
1701 | regnode *fake_last_close_op = NULL; | |
1702 | U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY); | |
1703 | ||
1704 | StructCopy(&zero_scan_data, &data_fake, scan_data_t); | |
1705 | scan = regnext(scan); | |
1706 | assert( OP(scan) == IFTHEN ); | |
1707 | DEBUG_PEEP("expect IFTHEN", scan, depth, flags); | |
1708 | ||
1709 | data_fake.last_closep= &fake_last_close; | |
1710 | data_fake.last_close_opp= &fake_last_close_op; | |
1711 | minlen = *minlenp; | |
1712 | next = regnext(scan); | |
1713 | scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN); | |
1714 | DEBUG_PEEP("scan", scan, depth, flags); | |
1715 | DEBUG_PEEP("next", next, depth, flags); | |
1716 | ||
1717 | /* we suppose the run is continuous, last=next... | |
1718 | * NOTE we dont use the return here! */ | |
1719 | /* DEFINEP study_chunk() recursion */ | |
1720 | (void)study_chunk(pRExC_state, &scan, &minlen, | |
1721 | &deltanext, next, &data_fake, stopparen, | |
1722 | recursed_depth, NULL, f, depth+1, mutate_ok); | |
1723 | ||
1724 | scan = next; | |
1725 | } else | |
1726 | if ( | |
1727 | OP(scan) == BRANCH || | |
1728 | OP(scan) == BRANCHJ || | |
1729 | OP(scan) == IFTHEN | |
1730 | ) { | |
1731 | next = regnext(scan); | |
1732 | code = OP(scan); | |
1733 | ||
1734 | /* The op(next)==code check below is to see if we | |
1735 | * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" | |
1736 | * IFTHEN is special as it might not appear in pairs. | |
1737 | * Not sure whether BRANCH-BRANCHJ is possible, regardless | |
1738 | * we dont handle it cleanly. */ | |
1739 | if (OP(next) == code || code == IFTHEN) { | |
1740 | /* NOTE - There is similar code to this block below for | |
1741 | * handling TRIE nodes on a re-study. If you change stuff here | |
1742 | * check there too. */ | |
1743 | SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0; | |
1744 | regnode_ssc accum; | |
1745 | regnode * const startbranch=scan; | |
1746 | ||
1747 | if (flags & SCF_DO_SUBSTR) { | |
1748 | /* Cannot merge strings after this. */ | |
1749 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
1750 | } | |
1751 | ||
1752 | if (flags & SCF_DO_STCLASS) | |
1753 | ssc_init_zero(pRExC_state, &accum); | |
1754 | ||
1755 | while (OP(scan) == code) { | |
1756 | SSize_t deltanext, minnext, fake_last_close = 0; | |
1757 | regnode *fake_last_close_op = NULL; | |
1758 | U32 f = (flags & SCF_TRIE_DOING_RESTUDY); | |
1759 | regnode_ssc this_class; | |
1760 | ||
1761 | DEBUG_PEEP("Branch", scan, depth, flags); | |
1762 | ||
1763 | num++; | |
1764 | StructCopy(&zero_scan_data, &data_fake, scan_data_t); | |
1765 | if (data) { | |
1766 | data_fake.whilem_c = data->whilem_c; | |
1767 | data_fake.last_closep = data->last_closep; | |
1768 | data_fake.last_close_opp = data->last_close_opp; | |
1769 | } | |
1770 | else { | |
1771 | data_fake.last_closep = &fake_last_close; | |
1772 | data_fake.last_close_opp = &fake_last_close_op; | |
1773 | } | |
1774 | ||
1775 | data_fake.pos_delta = delta; | |
1776 | next = regnext(scan); | |
1777 | ||
1778 | scan = REGNODE_AFTER_opcode(scan, code); | |
1779 | ||
1780 | if (flags & SCF_DO_STCLASS) { | |
1781 | ssc_init(pRExC_state, &this_class); | |
1782 | data_fake.start_class = &this_class; | |
1783 | f |= SCF_DO_STCLASS_AND; | |
1784 | } | |
1785 | if (flags & SCF_WHILEM_VISITED_POS) | |
1786 | f |= SCF_WHILEM_VISITED_POS; | |
1787 | ||
1788 | /* we suppose the run is continuous, last=next...*/ | |
1789 | /* recurse study_chunk() for each BRANCH in an alternation */ | |
1790 | minnext = study_chunk(pRExC_state, &scan, minlenp, | |
1791 | &deltanext, next, &data_fake, stopparen, | |
1792 | recursed_depth, NULL, f, depth+1, | |
1793 | mutate_ok); | |
1794 | ||
1795 | if (min1 > minnext) | |
1796 | min1 = minnext; | |
1797 | if (deltanext == OPTIMIZE_INFTY) { | |
1798 | is_inf = is_inf_internal = 1; | |
1799 | max1 = OPTIMIZE_INFTY; | |
1800 | } else if (max1 < minnext + deltanext) | |
1801 | max1 = minnext + deltanext; | |
1802 | scan = next; | |
1803 | if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) | |
1804 | pars++; | |
1805 | if (data_fake.flags & SCF_SEEN_ACCEPT) { | |
1806 | if ( stopmin > minnext) | |
1807 | stopmin = min + min1; | |
1808 | flags &= ~SCF_DO_SUBSTR; | |
1809 | if (data) | |
1810 | data->flags |= SCF_SEEN_ACCEPT; | |
1811 | } | |
1812 | if (data) { | |
1813 | if (data_fake.flags & SF_HAS_EVAL) | |
1814 | data->flags |= SF_HAS_EVAL; | |
1815 | data->whilem_c = data_fake.whilem_c; | |
1816 | } | |
1817 | if (flags & SCF_DO_STCLASS) | |
1818 | ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class); | |
1819 | DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta); | |
1820 | } | |
1821 | if (code == IFTHEN && num < 2) /* Empty ELSE branch */ | |
1822 | min1 = 0; | |
1823 | if (flags & SCF_DO_SUBSTR) { | |
1824 | data->pos_min += min1; | |
1825 | if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1)) | |
1826 | data->pos_delta = OPTIMIZE_INFTY; | |
1827 | else | |
1828 | data->pos_delta += max1 - min1; | |
1829 | if (max1 != min1 || is_inf) | |
1830 | data->cur_is_floating = 1; | |
1831 | } | |
1832 | min += min1; | |
1833 | if (delta == OPTIMIZE_INFTY | |
1834 | || OPTIMIZE_INFTY - delta - (max1 - min1) < 0) | |
1835 | delta = OPTIMIZE_INFTY; | |
1836 | else | |
1837 | delta += max1 - min1; | |
1838 | if (flags & SCF_DO_STCLASS_OR) { | |
1839 | ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum); | |
1840 | if (min1) { | |
1841 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
1842 | flags &= ~SCF_DO_STCLASS; | |
1843 | } | |
1844 | } | |
1845 | else if (flags & SCF_DO_STCLASS_AND) { | |
1846 | if (min1) { | |
1847 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); | |
1848 | flags &= ~SCF_DO_STCLASS; | |
1849 | } | |
1850 | else { | |
1851 | /* Switch to OR mode: cache the old value of | |
1852 | * data->start_class */ | |
1853 | INIT_AND_WITHP; | |
1854 | StructCopy(data->start_class, and_withp, regnode_ssc); | |
1855 | flags &= ~SCF_DO_STCLASS_AND; | |
1856 | StructCopy(&accum, data->start_class, regnode_ssc); | |
1857 | flags |= SCF_DO_STCLASS_OR; | |
1858 | } | |
1859 | } | |
1860 | DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta); | |
1861 | ||
1862 | if (PERL_ENABLE_TRIE_OPTIMISATION | |
1863 | && OP(startbranch) == BRANCH | |
1864 | && mutate_ok | |
1865 | ) { | |
1866 | /* demq. | |
1867 | ||
1868 | Assuming this was/is a branch we are dealing with: 'scan' | |
1869 | now points at the item that follows the branch sequence, | |
1870 | whatever it is. We now start at the beginning of the | |
1871 | sequence and look for subsequences of | |
1872 | ||
1873 | BRANCH->EXACT=>x1 | |
1874 | BRANCH->EXACT=>x2 | |
1875 | tail | |
1876 | ||
1877 | which would be constructed from a pattern like | |
1878 | /A|LIST|OF|WORDS/ | |
1879 | ||
1880 | If we can find such a subsequence we need to turn the first | |
1881 | element into a trie and then add the subsequent branch exact | |
1882 | strings to the trie. | |
1883 | ||
1884 | We have two cases | |
1885 | ||
1886 | 1. patterns where the whole set of branches can be | |
1887 | converted. | |
1888 | ||
1889 | 2. patterns where only a subset can be converted. | |
1890 | ||
1891 | In case 1 we can replace the whole set with a single regop | |
1892 | for the trie. In case 2 we need to keep the start and end | |
1893 | branches so | |
1894 | ||
1895 | 'BRANCH EXACT; BRANCH EXACT; BRANCH X' | |
1896 | becomes BRANCH TRIE; BRANCH X; | |
1897 | ||
1898 | There is an additional case, that being where there is a | |
1899 | common prefix, which gets split out into an EXACT like node | |
1900 | preceding the TRIE node. | |
1901 | ||
1902 | If X(1..n)==tail then we can do a simple trie, if not we make | |
1903 | a "jump" trie, such that when we match the appropriate word | |
1904 | we "jump" to the appropriate tail node. Essentially we turn | |
1905 | a nested if into a case structure of sorts. | |
1906 | ||
1907 | */ | |
1908 | ||
1909 | int made=0; | |
1910 | if (!re_trie_maxbuff) { | |
1911 | re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); | |
1912 | if (!SvIOK(re_trie_maxbuff)) | |
1913 | sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); | |
1914 | } | |
1915 | if ( SvIV(re_trie_maxbuff)>=0 ) { | |
1916 | regnode *cur; | |
1917 | regnode *first = (regnode *)NULL; | |
1918 | regnode *prev = (regnode *)NULL; | |
1919 | regnode *tail = scan; | |
1920 | U8 trietype = 0; | |
1921 | U32 count=0; | |
1922 | ||
1923 | /* var tail is used because there may be a TAIL | |
1924 | regop in the way. Ie, the exacts will point to the | |
1925 | thing following the TAIL, but the last branch will | |
1926 | point at the TAIL. So we advance tail. If we | |
1927 | have nested (?:) we may have to move through several | |
1928 | tails. | |
1929 | */ | |
1930 | ||
1931 | while ( OP( tail ) == TAIL ) { | |
1932 | /* this is the TAIL generated by (?:) */ | |
1933 | tail = regnext( tail ); | |
1934 | } | |
1935 | ||
1936 | ||
1937 | DEBUG_TRIE_COMPILE_r({ | |
1938 | regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); | |
1939 | Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n", | |
1940 | depth+1, | |
1941 | "Looking for TRIE'able sequences. Tail node is ", | |
1942 | (UV) REGNODE_OFFSET(tail), | |
1943 | SvPV_nolen_const( RExC_mysv ) | |
1944 | ); | |
1945 | }); | |
1946 | ||
1947 | /* | |
1948 | ||
1949 | Step through the branches | |
1950 | cur represents each branch, | |
1951 | noper is the first thing to be matched as part | |
1952 | of that branch | |
1953 | noper_next is the regnext() of that node. | |
1954 | ||
1955 | We normally handle a case like this | |
1956 | /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also | |
1957 | support building with NOJUMPTRIE, which restricts | |
1958 | the trie logic to structures like /FOO|BAR/. | |
1959 | ||
1960 | If noper is a trieable nodetype then the branch is | |
1961 | a possible optimization target. If we are building | |
1962 | under NOJUMPTRIE then we require that noper_next is | |
1963 | the same as scan (our current position in the regex | |
1964 | program). | |
1965 | ||
1966 | Once we have two or more consecutive such branches | |
1967 | we can create a trie of the EXACT's contents and | |
1968 | stitch it in place into the program. | |
1969 | ||
1970 | If the sequence represents all of the branches in | |
1971 | the alternation we replace the entire thing with a | |
1972 | single TRIE node. | |
1973 | ||
1974 | Otherwise when it is a subsequence we need to | |
1975 | stitch it in place and replace only the relevant | |
1976 | branches. This means the first branch has to remain | |
1977 | as it is used by the alternation logic, and its | |
1978 | next pointer, and needs to be repointed at the item | |
1979 | on the branch chain following the last branch we | |
1980 | have optimized away. | |
1981 | ||
1982 | This could be either a BRANCH, in which case the | |
1983 | subsequence is internal, or it could be the item | |
1984 | following the branch sequence in which case the | |
1985 | subsequence is at the end (which does not | |
1986 | necessarily mean the first node is the start of the | |
1987 | alternation). | |
1988 | ||
1989 | TRIE_TYPE(X) is a define which maps the optype to a | |
1990 | trietype. | |
1991 | ||
1992 | optype | trietype | |
1993 | ----------------+----------- | |
1994 | NOTHING | NOTHING | |
1995 | EXACT | EXACT | |
1996 | EXACT_REQ8 | EXACT | |
1997 | EXACTFU | EXACTFU | |
1998 | EXACTFU_REQ8 | EXACTFU | |
1999 | EXACTFUP | EXACTFU | |
2000 | EXACTFAA | EXACTFAA | |
2001 | EXACTL | EXACTL | |
2002 | EXACTFLU8 | EXACTFLU8 | |
2003 | ||
2004 | ||
2005 | */ | |
2006 | #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ | |
2007 | ? NOTHING \ | |
2008 | : ( EXACT == (X) || EXACT_REQ8 == (X) ) \ | |
2009 | ? EXACT \ | |
2010 | : ( EXACTFU == (X) \ | |
2011 | || EXACTFU_REQ8 == (X) \ | |
2012 | || EXACTFUP == (X) ) \ | |
2013 | ? EXACTFU \ | |
2014 | : ( EXACTFAA == (X) ) \ | |
2015 | ? EXACTFAA \ | |
2016 | : ( EXACTL == (X) ) \ | |
2017 | ? EXACTL \ | |
2018 | : ( EXACTFLU8 == (X) ) \ | |
2019 | ? EXACTFLU8 \ | |
2020 | : 0 ) | |
2021 | ||
2022 | /* dont use tail as the end marker for this traverse */ | |
2023 | for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { | |
2024 | regnode * const noper = REGNODE_AFTER( cur ); | |
2025 | U8 noper_type = OP( noper ); | |
2026 | U8 noper_trietype = TRIE_TYPE( noper_type ); | |
2027 | #if defined(DEBUGGING) || defined(NOJUMPTRIE) | |
2028 | regnode * const noper_next = regnext( noper ); | |
2029 | U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; | |
2030 | U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; | |
2031 | #endif | |
2032 | ||
2033 | DEBUG_TRIE_COMPILE_r({ | |
2034 | regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); | |
2035 | Perl_re_indentf( aTHX_ "- %d:%s (%d)", | |
2036 | depth+1, | |
2037 | REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); | |
2038 | ||
2039 | regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); | |
2040 | Perl_re_printf( aTHX_ " -> %d:%s", | |
2041 | REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); | |
2042 | ||
2043 | if ( noper_next ) { | |
2044 | regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); | |
2045 | Perl_re_printf( aTHX_ "\t=> %d:%s\t", | |
2046 | REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); | |
2047 | } | |
2048 | Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", | |
2049 | REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), | |
2050 | REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype) | |
2051 | ); | |
2052 | }); | |
2053 | ||
2054 | /* Is noper a trieable nodetype that can be merged | |
2055 | * with the current trie (if there is one)? */ | |
2056 | if ( noper_trietype | |
2057 | && | |
2058 | ( | |
2059 | ( noper_trietype == NOTHING ) | |
2060 | || ( trietype == NOTHING ) | |
2061 | || ( trietype == noper_trietype ) | |
2062 | ) | |
2063 | #ifdef NOJUMPTRIE | |
2064 | && noper_next >= tail | |
2065 | #endif | |
2066 | && count < U16_MAX) | |
2067 | { | |
2068 | /* Handle mergable triable node Either we are | |
2069 | * the first node in a new trieable sequence, | |
2070 | * in which case we do some bookkeeping, | |
2071 | * otherwise we update the end pointer. */ | |
2072 | if ( !first ) { | |
2073 | first = cur; | |
2074 | if ( noper_trietype == NOTHING ) { | |
2075 | #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) | |
2076 | regnode * const noper_next = regnext( noper ); | |
2077 | U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; | |
2078 | U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; | |
2079 | #endif | |
2080 | ||
2081 | if ( noper_next_trietype ) { | |
2082 | trietype = noper_next_trietype; | |
2083 | } else if (noper_next_type) { | |
2084 | /* a NOTHING regop is 1 regop wide. | |
2085 | * We need at least two for a trie | |
2086 | * so we can't merge this in */ | |
2087 | first = NULL; | |
2088 | } | |
2089 | } else { | |
2090 | trietype = noper_trietype; | |
2091 | } | |
2092 | } else { | |
2093 | if ( trietype == NOTHING ) | |
2094 | trietype = noper_trietype; | |
2095 | prev = cur; | |
2096 | } | |
2097 | if (first) | |
2098 | count++; | |
2099 | } /* end handle mergable triable node */ | |
2100 | else { | |
2101 | /* handle unmergable node - | |
2102 | * noper may either be a triable node which can | |
2103 | * not be tried together with the current trie, | |
2104 | * or a non triable node */ | |
2105 | if ( prev ) { | |
2106 | /* If last is set and trietype is not | |
2107 | * NOTHING then we have found at least two | |
2108 | * triable branch sequences in a row of a | |
2109 | * similar trietype so we can turn them | |
2110 | * into a trie. If/when we allow NOTHING to | |
2111 | * start a trie sequence this condition | |
2112 | * will be required, and it isn't expensive | |
2113 | * so we leave it in for now. */ | |
2114 | if ( trietype && trietype != NOTHING ) | |
2115 | make_trie( pRExC_state, | |
2116 | startbranch, first, cur, tail, | |
2117 | count, trietype, depth+1 ); | |
2118 | prev = NULL; /* note: we clear/update | |
2119 | first, trietype etc below, | |
2120 | so we dont do it here */ | |
2121 | } | |
2122 | if ( noper_trietype | |
2123 | #ifdef NOJUMPTRIE | |
2124 | && noper_next >= tail | |
2125 | #endif | |
2126 | ){ | |
2127 | /* noper is triable, so we can start a new | |
2128 | * trie sequence */ | |
2129 | count = 1; | |
2130 | first = cur; | |
2131 | trietype = noper_trietype; | |
2132 | } else if (first) { | |
2133 | /* if we already saw a first but the | |
2134 | * current node is not triable then we have | |
2135 | * to reset the first information. */ | |
2136 | count = 0; | |
2137 | first = NULL; | |
2138 | trietype = 0; | |
2139 | } | |
2140 | } /* end handle unmergable node */ | |
2141 | } /* loop over branches */ | |
2142 | DEBUG_TRIE_COMPILE_r({ | |
2143 | regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); | |
2144 | Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", | |
2145 | depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); | |
2146 | Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", | |
2147 | REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur), | |
2148 | REGNODE_NAME(trietype) | |
2149 | ); | |
2150 | ||
2151 | }); | |
2152 | if ( prev && trietype ) { | |
2153 | if ( trietype != NOTHING ) { | |
2154 | /* the last branch of the sequence was part of | |
2155 | * a trie, so we have to construct it here | |
2156 | * outside of the loop */ | |
2157 | made= make_trie( pRExC_state, startbranch, | |
2158 | first, scan, tail, count, | |
2159 | trietype, depth+1 ); | |
2160 | #ifdef TRIE_STUDY_OPT | |
2161 | if ( ((made == MADE_EXACT_TRIE && | |
2162 | startbranch == first) | |
2163 | || ( first_non_open == first )) && | |
2164 | depth==0 ) { | |
2165 | flags |= SCF_TRIE_RESTUDY; | |
2166 | if ( startbranch == first | |
2167 | && scan >= tail ) | |
2168 | { | |
2169 | RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; | |
2170 | } | |
2171 | } | |
2172 | #endif | |
2173 | } else { | |
2174 | /* at this point we know whatever we have is a | |
2175 | * NOTHING sequence/branch AND if 'startbranch' | |
2176 | * is 'first' then we can turn the whole thing | |
2177 | * into a NOTHING | |
2178 | */ | |
2179 | if ( startbranch == first ) { | |
2180 | regnode *opt; | |
2181 | /* the entire thing is a NOTHING sequence, | |
2182 | * something like this: (?:|) So we can | |
2183 | * turn it into a plain NOTHING op. */ | |
2184 | DEBUG_TRIE_COMPILE_r({ | |
2185 | regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); | |
2186 | Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", | |
2187 | depth+1, | |
2188 | SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur)); | |
2189 | ||
2190 | }); | |
2191 | OP(startbranch)= NOTHING; | |
2192 | NEXT_OFF(startbranch)= tail - startbranch; | |
2193 | for ( opt= startbranch + 1; opt < tail ; opt++ ) | |
2194 | OP(opt)= OPTIMIZED; | |
2195 | } | |
2196 | } | |
2197 | } /* end if ( prev) */ | |
2198 | } /* TRIE_MAXBUF is non zero */ | |
2199 | } /* do trie */ | |
2200 | DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta); | |
2201 | } | |
2202 | else | |
2203 | scan = REGNODE_AFTER_opcode(scan,code); | |
2204 | continue; | |
2205 | } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { | |
2206 | I32 paren = 0; | |
2207 | regnode *start = NULL; | |
2208 | regnode *end = NULL; | |
2209 | U32 my_recursed_depth= recursed_depth; | |
2210 | ||
2211 | if (OP(scan) != SUSPEND) { /* GOSUB */ | |
2212 | /* Do setup, note this code has side effects beyond | |
2213 | * the rest of this block. Specifically setting | |
2214 | * RExC_recurse[] must happen at least once during | |
2215 | * study_chunk(). */ | |
2216 | paren = ARG(scan); | |
2217 | RExC_recurse[ARG2L(scan)] = scan; | |
2218 | start = REGNODE_p(RExC_open_parens[paren]); | |
2219 | end = REGNODE_p(RExC_close_parens[paren]); | |
2220 | ||
2221 | /* NOTE we MUST always execute the above code, even | |
2222 | * if we do nothing with a GOSUB */ | |
2223 | if ( | |
2224 | ( flags & SCF_IN_DEFINE ) | |
2225 | || | |
2226 | ( | |
2227 | (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) | |
2228 | && | |
2229 | ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) | |
2230 | ) | |
2231 | ) { | |
2232 | /* no need to do anything here if we are in a define. */ | |
2233 | /* or we are after some kind of infinite construct | |
2234 | * so we can skip recursing into this item. | |
2235 | * Since it is infinite we will not change the maxlen | |
2236 | * or delta, and if we miss something that might raise | |
2237 | * the minlen it will merely pessimise a little. | |
2238 | * | |
2239 | * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ | |
2240 | * might result in a minlen of 1 and not of 4, | |
2241 | * but this doesn't make us mismatch, just try a bit | |
2242 | * harder than we should. | |
2243 | * | |
2244 | * However we must assume this GOSUB is infinite, to | |
2245 | * avoid wrongly applying other optimizations in the | |
2246 | * enclosing scope - see GH 18096, for example. | |
2247 | */ | |
2248 | is_inf = is_inf_internal = 1; | |
2249 | scan= regnext(scan); | |
2250 | continue; | |
2251 | } | |
2252 | ||
2253 | if ( | |
2254 | !recursed_depth | |
2255 | || !PAREN_TEST(recursed_depth - 1, paren) | |
2256 | ) { | |
2257 | /* it is quite possible that there are more efficient ways | |
2258 | * to do this. We maintain a bitmap per level of recursion | |
2259 | * of which patterns we have entered so we can detect if a | |
2260 | * pattern creates a possible infinite loop. When we | |
2261 | * recurse down a level we copy the previous levels bitmap | |
2262 | * down. When we are at recursion level 0 we zero the top | |
2263 | * level bitmap. It would be nice to implement a different | |
2264 | * more efficient way of doing this. In particular the top | |
2265 | * level bitmap may be unnecessary. | |
2266 | */ | |
2267 | if (!recursed_depth) { | |
2268 | Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); | |
2269 | } else { | |
2270 | Copy(PAREN_OFFSET(recursed_depth - 1), | |
2271 | PAREN_OFFSET(recursed_depth), | |
2272 | RExC_study_chunk_recursed_bytes, U8); | |
2273 | } | |
2274 | /* we havent recursed into this paren yet, so recurse into it */ | |
2275 | DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta); | |
2276 | PAREN_SET(recursed_depth, paren); | |
2277 | my_recursed_depth= recursed_depth + 1; | |
2278 | } else { | |
2279 | DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta); | |
2280 | /* some form of infinite recursion, assume infinite length | |
2281 | * */ | |
2282 | if (flags & SCF_DO_SUBSTR) { | |
2283 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2284 | data->cur_is_floating = 1; | |
2285 | } | |
2286 | is_inf = is_inf_internal = 1; | |
2287 | if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ | |
2288 | ssc_anything(data->start_class); | |
2289 | flags &= ~SCF_DO_STCLASS; | |
2290 | ||
2291 | start= NULL; /* reset start so we dont recurse later on. */ | |
2292 | } | |
2293 | } else { | |
2294 | paren = stopparen; | |
2295 | start = scan + 2; | |
2296 | end = regnext(scan); | |
2297 | } | |
2298 | if (start) { | |
2299 | scan_frame *newframe; | |
2300 | assert(end); | |
2301 | if (!RExC_frame_last) { | |
2302 | Newxz(newframe, 1, scan_frame); | |
2303 | SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); | |
2304 | RExC_frame_head= newframe; | |
2305 | RExC_frame_count++; | |
2306 | } else if (!RExC_frame_last->next_frame) { | |
2307 | Newxz(newframe, 1, scan_frame); | |
2308 | RExC_frame_last->next_frame= newframe; | |
2309 | newframe->prev_frame= RExC_frame_last; | |
2310 | RExC_frame_count++; | |
2311 | } else { | |
2312 | newframe= RExC_frame_last->next_frame; | |
2313 | } | |
2314 | RExC_frame_last= newframe; | |
2315 | ||
2316 | newframe->next_regnode = regnext(scan); | |
2317 | newframe->last_regnode = last; | |
2318 | newframe->stopparen = stopparen; | |
2319 | newframe->prev_recursed_depth = recursed_depth; | |
2320 | newframe->this_prev_frame= frame; | |
2321 | newframe->in_gosub = ( | |
2322 | (frame && frame->in_gosub) || OP(scan) == GOSUB | |
2323 | ); | |
2324 | ||
2325 | DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta); | |
2326 | DEBUG_PEEP("fnew", scan, depth, flags); | |
2327 | ||
2328 | frame = newframe; | |
2329 | scan = start; | |
2330 | stopparen = paren; | |
2331 | last = end; | |
2332 | depth = depth + 1; | |
2333 | recursed_depth= my_recursed_depth; | |
2334 | ||
2335 | continue; | |
2336 | } | |
2337 | } | |
2338 | else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) { | |
2339 | SSize_t bytelen = STR_LEN(scan), charlen; | |
2340 | UV uc; | |
2341 | assert(bytelen); | |
2342 | if (UTF) { | |
2343 | const U8 * const s = (U8*)STRING(scan); | |
2344 | uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); | |
2345 | charlen = utf8_length(s, s + bytelen); | |
2346 | } else { | |
2347 | uc = *((U8*)STRING(scan)); | |
2348 | charlen = bytelen; | |
2349 | } | |
2350 | min += charlen; | |
2351 | if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ | |
2352 | /* The code below prefers earlier match for fixed | |
2353 | offset, later match for variable offset. */ | |
2354 | if (data->last_end == -1) { /* Update the start info. */ | |
2355 | data->last_start_min = data->pos_min; | |
2356 | data->last_start_max = | |
2357 | is_inf ? OPTIMIZE_INFTY | |
2358 | : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min) | |
2359 | ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta; | |
2360 | } | |
2361 | sv_catpvn(data->last_found, STRING(scan), bytelen); | |
2362 | if (UTF) | |
2363 | SvUTF8_on(data->last_found); | |
2364 | { | |
2365 | SV * const sv = data->last_found; | |
2366 | MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? | |
2367 | mg_find(sv, PERL_MAGIC_utf8) : NULL; | |
2368 | if (mg && mg->mg_len >= 0) | |
2369 | mg->mg_len += charlen; | |
2370 | } | |
2371 | data->last_end = data->pos_min + charlen; | |
2372 | data->pos_min += charlen; /* As in the first entry. */ | |
2373 | data->flags &= ~SF_BEFORE_EOL; | |
2374 | } | |
2375 | ||
2376 | /* ANDing the code point leaves at most it, and not in locale, and | |
2377 | * can't match null string */ | |
2378 | if (flags & SCF_DO_STCLASS_AND) { | |
2379 | ssc_cp_and(data->start_class, uc); | |
2380 | ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; | |
2381 | ssc_clear_locale(data->start_class); | |
2382 | } | |
2383 | else if (flags & SCF_DO_STCLASS_OR) { | |
2384 | ssc_add_cp(data->start_class, uc); | |
2385 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
2386 | ||
2387 | /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ | |
2388 | ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; | |
2389 | } | |
2390 | flags &= ~SCF_DO_STCLASS; | |
2391 | DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta); | |
2392 | } | |
2393 | else if (REGNODE_TYPE(OP(scan)) == EXACT) { | |
2394 | /* But OP != EXACT!, so is EXACTFish */ | |
2395 | SSize_t bytelen = STR_LEN(scan), charlen; | |
2396 | const U8 * s = (U8*)STRING(scan); | |
2397 | ||
2398 | /* Replace a length 1 ASCII fold pair node with an ANYOFM node, | |
2399 | * with the mask set to the complement of the bit that differs | |
2400 | * between upper and lower case, and the lowest code point of the | |
2401 | * pair (which the '&' forces) */ | |
2402 | if ( bytelen == 1 | |
2403 | && isALPHA_A(*s) | |
2404 | && ( OP(scan) == EXACTFAA | |
2405 | || ( OP(scan) == EXACTFU | |
2406 | && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))) | |
2407 | && mutate_ok | |
2408 | ) { | |
2409 | U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ | |
2410 | ||
2411 | OP(scan) = ANYOFM; | |
2412 | ARG_SET(scan, *s & mask); | |
2413 | FLAGS(scan) = mask; | |
2414 | /* We're not EXACTFish any more, so restudy. | |
2415 | * Search for "restudy" in this file to find | |
2416 | * a comment with details. */ | |
2417 | continue; | |
2418 | } | |
2419 | ||
2420 | /* Search for fixed substrings supports EXACT only. */ | |
2421 | if (flags & SCF_DO_SUBSTR) { | |
2422 | assert(data); | |
2423 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2424 | } | |
2425 | charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen; | |
2426 | if (unfolded_multi_char) { | |
2427 | RExC_seen |= REG_UNFOLDED_MULTI_SEEN; | |
2428 | } | |
2429 | min += charlen - min_subtract; | |
2430 | assert (min >= 0); | |
2431 | if ((SSize_t)min_subtract < OPTIMIZE_INFTY | |
2432 | && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract | |
2433 | ) { | |
2434 | delta += min_subtract; | |
2435 | } else { | |
2436 | delta = OPTIMIZE_INFTY; | |
2437 | } | |
2438 | if (flags & SCF_DO_SUBSTR) { | |
2439 | data->pos_min += charlen - min_subtract; | |
2440 | if (data->pos_min < 0) { | |
2441 | data->pos_min = 0; | |
2442 | } | |
2443 | if ((SSize_t)min_subtract < OPTIMIZE_INFTY | |
2444 | && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract | |
2445 | ) { | |
2446 | data->pos_delta += min_subtract; | |
2447 | } else { | |
2448 | data->pos_delta = OPTIMIZE_INFTY; | |
2449 | } | |
2450 | if (min_subtract) { | |
2451 | data->cur_is_floating = 1; /* float */ | |
2452 | } | |
2453 | } | |
2454 | ||
2455 | if (flags & SCF_DO_STCLASS) { | |
2456 | SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan); | |
2457 | ||
2458 | assert(EXACTF_invlist); | |
2459 | if (flags & SCF_DO_STCLASS_AND) { | |
2460 | if (OP(scan) != EXACTFL) | |
2461 | ssc_clear_locale(data->start_class); | |
2462 | ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; | |
2463 | ANYOF_POSIXL_ZERO(data->start_class); | |
2464 | ssc_intersection(data->start_class, EXACTF_invlist, FALSE); | |
2465 | } | |
2466 | else { /* SCF_DO_STCLASS_OR */ | |
2467 | ssc_union(data->start_class, EXACTF_invlist, FALSE); | |
2468 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
2469 | ||
2470 | /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ | |
2471 | ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; | |
2472 | } | |
2473 | flags &= ~SCF_DO_STCLASS; | |
2474 | SvREFCNT_dec(EXACTF_invlist); | |
2475 | } | |
2476 | DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta); | |
2477 | } | |
2478 | else if (REGNODE_VARIES(OP(scan))) { | |
2479 | SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; | |
2480 | I32 fl = 0; | |
2481 | U32 f = flags; | |
2482 | regnode * const oscan = scan; | |
2483 | regnode_ssc this_class; | |
2484 | regnode_ssc *oclass = NULL; | |
2485 | I32 next_is_eval = 0; | |
2486 | ||
2487 | switch (REGNODE_TYPE(OP(scan))) { | |
2488 | case WHILEM: /* End of (?:...)* . */ | |
2489 | scan = REGNODE_AFTER(scan); | |
2490 | goto finish; | |
2491 | case PLUS: | |
2492 | if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { | |
2493 | next = REGNODE_AFTER(scan); | |
2494 | if ( ( REGNODE_TYPE(OP(next)) == EXACT | |
2495 | && ! isEXACTFish(OP(next))) | |
2496 | || (flags & SCF_DO_STCLASS)) | |
2497 | { | |
2498 | mincount = 1; | |
2499 | maxcount = REG_INFTY; | |
2500 | next = regnext(scan); | |
2501 | scan = REGNODE_AFTER(scan); | |
2502 | goto do_curly; | |
2503 | } | |
2504 | } | |
2505 | if (flags & SCF_DO_SUBSTR) | |
2506 | data->pos_min++; | |
2507 | /* This will bypass the formal 'min += minnext * mincount' | |
2508 | * calculation in the do_curly path, so assumes min width | |
2509 | * of the PLUS payload is exactly one. */ | |
2510 | min++; | |
2511 | /* FALLTHROUGH */ | |
2512 | case STAR: | |
2513 | next = REGNODE_AFTER(scan); | |
2514 | ||
2515 | /* This temporary node can now be turned into EXACTFU, and | |
2516 | * must, as regexec.c doesn't handle it */ | |
2517 | if (OP(next) == EXACTFU_S_EDGE && mutate_ok) { | |
2518 | OP(next) = EXACTFU; | |
2519 | } | |
2520 | ||
2521 | if ( STR_LEN(next) == 1 | |
2522 | && isALPHA_A(* STRING(next)) | |
2523 | && ( OP(next) == EXACTFAA | |
2524 | || ( OP(next) == EXACTFU | |
2525 | && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))) | |
2526 | && mutate_ok | |
2527 | ) { | |
2528 | /* These differ in just one bit */ | |
2529 | U8 mask = ~ ('A' ^ 'a'); | |
2530 | ||
2531 | assert(isALPHA_A(* STRING(next))); | |
2532 | ||
2533 | /* Then replace it by an ANYOFM node, with | |
2534 | * the mask set to the complement of the | |
2535 | * bit that differs between upper and lower | |
2536 | * case, and the lowest code point of the | |
2537 | * pair (which the '&' forces) */ | |
2538 | OP(next) = ANYOFM; | |
2539 | ARG_SET(next, *STRING(next) & mask); | |
2540 | FLAGS(next) = mask; | |
2541 | } | |
2542 | ||
2543 | if (flags & SCF_DO_STCLASS) { | |
2544 | mincount = 0; | |
2545 | maxcount = REG_INFTY; | |
2546 | next = regnext(scan); | |
2547 | scan = REGNODE_AFTER(scan); | |
2548 | goto do_curly; | |
2549 | } | |
2550 | if (flags & SCF_DO_SUBSTR) { | |
2551 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2552 | /* Cannot extend fixed substrings */ | |
2553 | data->cur_is_floating = 1; /* float */ | |
2554 | } | |
2555 | is_inf = is_inf_internal = 1; | |
2556 | scan = regnext(scan); | |
2557 | goto optimize_curly_tail; | |
2558 | case CURLY: | |
2559 | if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM) | |
2560 | && (scan->flags == stopparen)) | |
2561 | { | |
2562 | mincount = 1; | |
2563 | maxcount = 1; | |
2564 | } else { | |
2565 | mincount = ARG1(scan); | |
2566 | maxcount = ARG2(scan); | |
2567 | } | |
2568 | next = regnext(scan); | |
2569 | if (OP(scan) == CURLYX) { | |
2570 | I32 lp = (data ? *(data->last_closep) : 0); | |
2571 | scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX); | |
2572 | } | |
2573 | scan = REGNODE_AFTER(scan); | |
2574 | next_is_eval = (OP(scan) == EVAL); | |
2575 | do_curly: | |
2576 | if (flags & SCF_DO_SUBSTR) { | |
2577 | if (mincount == 0) | |
2578 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2579 | /* Cannot extend fixed substrings */ | |
2580 | pos_before = data->pos_min; | |
2581 | } | |
2582 | if (data) { | |
2583 | fl = data->flags; | |
2584 | data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL); | |
2585 | if (is_inf) | |
2586 | data->flags |= SF_IS_INF; | |
2587 | } | |
2588 | if (flags & SCF_DO_STCLASS) { | |
2589 | ssc_init(pRExC_state, &this_class); | |
2590 | oclass = data->start_class; | |
2591 | data->start_class = &this_class; | |
2592 | f |= SCF_DO_STCLASS_AND; | |
2593 | f &= ~SCF_DO_STCLASS_OR; | |
2594 | } | |
2595 | /* Exclude from super-linear cache processing any {n,m} | |
2596 | regops for which the combination of input pos and regex | |
2597 | pos is not enough information to determine if a match | |
2598 | will be possible. | |
2599 | ||
2600 | For example, in the regex /foo(bar\s*){4,8}baz/ with the | |
2601 | regex pos at the \s*, the prospects for a match depend not | |
2602 | only on the input position but also on how many (bar\s*) | |
2603 | repeats into the {4,8} we are. */ | |
2604 | if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY)) | |
2605 | f &= ~SCF_WHILEM_VISITED_POS; | |
2606 | ||
2607 | /* This will finish on WHILEM, setting scan, or on NULL: */ | |
2608 | /* recurse study_chunk() on loop bodies */ | |
2609 | minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, | |
2610 | last, data, stopparen, recursed_depth, NULL, | |
2611 | (mincount == 0 | |
2612 | ? (f & ~SCF_DO_SUBSTR) | |
2613 | : f) | |
2614 | , depth+1, mutate_ok); | |
2615 | ||
2616 | if (data && data->flags & SCF_SEEN_ACCEPT) { | |
2617 | if (mincount > 1) | |
2618 | mincount = 1; | |
2619 | } | |
2620 | ||
2621 | if (flags & SCF_DO_STCLASS) | |
2622 | data->start_class = oclass; | |
2623 | if (mincount == 0 || minnext == 0) { | |
2624 | if (flags & SCF_DO_STCLASS_OR) { | |
2625 | ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); | |
2626 | } | |
2627 | else if (flags & SCF_DO_STCLASS_AND) { | |
2628 | /* Switch to OR mode: cache the old value of | |
2629 | * data->start_class */ | |
2630 | INIT_AND_WITHP; | |
2631 | StructCopy(data->start_class, and_withp, regnode_ssc); | |
2632 | flags &= ~SCF_DO_STCLASS_AND; | |
2633 | StructCopy(&this_class, data->start_class, regnode_ssc); | |
2634 | flags |= SCF_DO_STCLASS_OR; | |
2635 | ANYOF_FLAGS(data->start_class) | |
2636 | |= SSC_MATCHES_EMPTY_STRING; | |
2637 | } | |
2638 | } else { /* Non-zero len */ | |
2639 | if (flags & SCF_DO_STCLASS_OR) { | |
2640 | ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class); | |
2641 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
2642 | } | |
2643 | else if (flags & SCF_DO_STCLASS_AND) | |
2644 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class); | |
2645 | flags &= ~SCF_DO_STCLASS; | |
2646 | } | |
2647 | if (!scan) /* It was not CURLYX, but CURLY. */ | |
2648 | scan = next; | |
2649 | if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR) | |
2650 | /* ? quantifier ok, except for (?{ ... }) */ | |
2651 | && (next_is_eval || !(mincount == 0 && maxcount == 1)) | |
2652 | && (minnext == 0) && (deltanext == 0) | |
2653 | && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) | |
2654 | && maxcount <= REG_INFTY/3) /* Complement check for big | |
2655 | count */ | |
2656 | { | |
2657 | _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP), | |
2658 | Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), | |
2659 | "Quantifier unexpected on zero-length expression " | |
2660 | "in regex m/%" UTF8f "/", | |
2661 | UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, | |
2662 | RExC_precomp))); | |
2663 | } | |
2664 | ||
2665 | if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext ) | |
2666 | || min >= SSize_t_MAX - minnext * mincount ) | |
2667 | { | |
2668 | FAIL("Regexp out of space"); | |
2669 | } | |
2670 | ||
2671 | min += minnext * mincount; | |
2672 | is_inf_internal |= deltanext == OPTIMIZE_INFTY | |
2673 | || (maxcount == REG_INFTY && minnext + deltanext > 0); | |
2674 | is_inf |= is_inf_internal; | |
2675 | if (is_inf) { | |
2676 | delta = OPTIMIZE_INFTY; | |
2677 | } else { | |
2678 | delta += (minnext + deltanext) * maxcount | |
2679 | - minnext * mincount; | |
2680 | } | |
2681 | ||
2682 | if (data && data->flags & SCF_SEEN_ACCEPT) { | |
2683 | if (flags & SCF_DO_SUBSTR) { | |
2684 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2685 | flags &= ~SCF_DO_SUBSTR; | |
2686 | } | |
2687 | if (stopmin > min) | |
2688 | stopmin = min; | |
2689 | DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta); | |
2690 | } | |
98ce67cb | 2691 | DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta); |
85900e28 | 2692 | /* Try powerful optimization CURLYX => CURLYN. */ |
c5b1c090 YO |
2693 | if ( RE_OPTIMIZE_CURLYX_TO_CURLYN |
2694 | && OP(oscan) == CURLYX | |
2695 | && data | |
c224bbd5 YO |
2696 | && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a |
2697 | non optimistic eval is seen | |
2698 | anywhere.*/ | |
2699 | && ( data->flags & SF_IN_PAR ) /* has parens */ | |
c5b1c090 YO |
2700 | && !deltanext |
2701 | && minnext == 1 | |
2702 | && mutate_ok | |
85900e28 | 2703 | ) { |
98ce67cb | 2704 | DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta); |
85900e28 YO |
2705 | /* Try to optimize to CURLYN. */ |
2706 | regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); | |
2707 | regnode * const nxt1 = nxt; | |
2708 | #ifdef DEBUGGING | |
2709 | regnode *nxt2; | |
2710 | #endif | |
85900e28 YO |
2711 | /* Skip open. */ |
2712 | nxt = regnext(nxt); | |
2713 | if (!REGNODE_SIMPLE(OP(nxt)) | |
2714 | && !(REGNODE_TYPE(OP(nxt)) == EXACT | |
2715 | && STR_LEN(nxt) == 1)) | |
2716 | goto nogo; | |
2717 | #ifdef DEBUGGING | |
2718 | nxt2 = nxt; | |
2719 | #endif | |
2720 | nxt = regnext(nxt); | |
2721 | if (OP(nxt) != CLOSE) | |
2722 | goto nogo; | |
2723 | if (RExC_open_parens) { | |
2724 | ||
2725 | /*open->CURLYM*/ | |
2726 | RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); | |
2727 | ||
2728 | /*close->while*/ | |
2729 | RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2; | |
2730 | } | |
2731 | /* Now we know that nxt2 is the only contents: */ | |
2732 | oscan->flags = (U8)PARNO(nxt); | |
2733 | OP(oscan) = CURLYN; | |
2734 | OP(nxt1) = NOTHING; /* was OPEN. */ | |
2735 | ||
2736 | #ifdef DEBUGGING | |
2737 | OP(nxt1 + 1) = OPTIMIZED; /* was count. */ | |
2738 | NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */ | |
2739 | NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */ | |
2740 | OP(nxt) = OPTIMIZED; /* was CLOSE. */ | |
2741 | OP(nxt + 1) = OPTIMIZED; /* was count. */ | |
2742 | NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */ | |
2743 | #endif | |
2744 | } | |
2745 | nogo: | |
2746 | ||
98ce67cb YO |
2747 | DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta); |
2748 | ||
85900e28 | 2749 | /* Try optimization CURLYX => CURLYM. */ |
c5b1c090 YO |
2750 | if ( RE_OPTIMIZE_CURLYX_TO_CURLYM |
2751 | && OP(oscan) == CURLYX | |
2752 | && data | |
c224bbd5 YO |
2753 | && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a |
2754 | non optimistic eval is seen | |
2755 | anywhere.*/ | |
2756 | && !(data->flags & SF_HAS_PAR) /* no parens! */ | |
c5b1c090 YO |
2757 | && !deltanext /* atom is fixed width */ |
2758 | && minnext != 0 /* CURLYM can't handle zero width */ | |
85900e28 YO |
2759 | /* Nor characters whose fold at run-time may be |
2760 | * multi-character */ | |
c5b1c090 YO |
2761 | && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN) |
2762 | && mutate_ok | |
85900e28 | 2763 | ) { |
98ce67cb | 2764 | DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta); |
85900e28 YO |
2765 | /* XXXX How to optimize if data == 0? */ |
2766 | /* Optimize to a simpler form. */ | |
2767 | regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */ | |
2768 | regnode *nxt2; | |
2769 | ||
2770 | OP(oscan) = CURLYM; | |
2771 | while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/ | |
2772 | && (OP(nxt2) != WHILEM)) | |
2773 | nxt = nxt2; | |
2774 | OP(nxt2) = SUCCEED; /* Whas WHILEM */ | |
2775 | /* Need to optimize away parenths. */ | |
2776 | if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) { | |
2777 | /* Set the parenth number. */ | |
2778 | /* note that we have changed the type of oscan to CURLYM here */ | |
2779 | regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/ | |
2780 | ||
2781 | oscan->flags = (U8)PARNO(nxt); | |
2782 | if (RExC_open_parens) { | |
2783 | /*open->CURLYM*/ | |
2784 | RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan); | |
2785 | ||
2786 | /*close->NOTHING*/ | |
2787 | RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2) | |
2788 | + 1; | |
2789 | } | |
2790 | OP(nxt1) = OPTIMIZED; /* was OPEN. */ | |
2791 | OP(nxt) = OPTIMIZED; /* was CLOSE. */ | |
2792 | ||
2793 | #ifdef DEBUGGING | |
2794 | OP(nxt1 + 1) = OPTIMIZED; /* was count. */ | |
2795 | OP(nxt + 1) = OPTIMIZED; /* was count. */ | |
2796 | NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */ | |
2797 | NEXT_OFF(nxt + 1) = 0; /* just for consistency. */ | |
2798 | #endif | |
2799 | #if 0 | |
2800 | while ( nxt1 && (OP(nxt1) != WHILEM)) { | |
2801 | regnode *nnxt = regnext(nxt1); | |
2802 | if (nnxt == nxt) { | |
2803 | if (REGNODE_OFF_BY_ARG(OP(nxt1))) | |
2804 | ARG_SET(nxt1, nxt2 - nxt1); | |
2805 | else if (nxt2 - nxt1 < U16_MAX) | |
2806 | NEXT_OFF(nxt1) = nxt2 - nxt1; | |
2807 | else | |
2808 | OP(nxt) = NOTHING; /* Cannot beautify */ | |
2809 | } | |
2810 | nxt1 = nnxt; | |
2811 | } | |
2812 | #endif | |
2813 | /* Optimize again: */ | |
2814 | /* recurse study_chunk() on optimised CURLYX => CURLYM */ | |
2815 | study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt, | |
2816 | NULL, stopparen, recursed_depth, NULL, 0, | |
2817 | depth+1, mutate_ok); | |
2818 | } | |
2819 | else | |
2820 | oscan->flags = 0; | |
2821 | } | |
2822 | else if ((OP(oscan) == CURLYX) | |
2823 | && (flags & SCF_WHILEM_VISITED_POS) | |
2824 | /* See the comment on a similar expression above. | |
2825 | However, this time it's not a subexpression | |
2826 | we care about, but the expression itself. */ | |
2827 | && (maxcount == REG_INFTY) | |
2828 | && data) { | |
2829 | /* This stays as CURLYX, we can put the count/of pair. */ | |
2830 | /* Find WHILEM (as in regexec.c) */ | |
2831 | regnode *nxt = oscan + NEXT_OFF(oscan); | |
2832 | ||
2833 | if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */ | |
2834 | nxt += ARG(nxt); | |
2835 | nxt = REGNODE_BEFORE(nxt); | |
2836 | if (nxt->flags & 0xf) { | |
2837 | /* we've already set whilem count on this node */ | |
2838 | } else if (++data->whilem_c < 16) { | |
2839 | assert(data->whilem_c <= RExC_whilem_seen); | |
2840 | nxt->flags = (U8)(data->whilem_c | |
2841 | | (RExC_whilem_seen << 4)); /* On WHILEM */ | |
2842 | } | |
2843 | } | |
2844 | if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) | |
2845 | pars++; | |
2846 | if (flags & SCF_DO_SUBSTR) { | |
2847 | SV *last_str = NULL; | |
2848 | STRLEN last_chrs = 0; | |
2849 | int counted = mincount != 0; | |
2850 | ||
2851 | if (data->last_end > 0 && mincount != 0) { /* Ends with a | |
2852 | string. */ | |
2853 | SSize_t b = pos_before >= data->last_start_min | |
2854 | ? pos_before : data->last_start_min; | |
2855 | STRLEN l; | |
2856 | const char * const s = SvPV_const(data->last_found, l); | |
2857 | SSize_t old = b - data->last_start_min; | |
2858 | assert(old >= 0); | |
2859 | ||
2860 | if (UTF) | |
2861 | old = utf8_hop_forward((U8*)s, old, | |
2862 | (U8 *) SvEND(data->last_found)) | |
2863 | - (U8*)s; | |
2864 | l -= old; | |
2865 | /* Get the added string: */ | |
2866 | last_str = newSVpvn_utf8(s + old, l, UTF); | |
2867 | last_chrs = UTF ? utf8_length((U8*)(s + old), | |
2868 | (U8*)(s + old + l)) : l; | |
2869 | if (deltanext == 0 && pos_before == b) { | |
2870 | /* What was added is a constant string */ | |
2871 | if (mincount > 1) { | |
2872 | ||
2873 | SvGROW(last_str, (mincount * l) + 1); | |
2874 | repeatcpy(SvPVX(last_str) + l, | |
2875 | SvPVX_const(last_str), l, | |
2876 | mincount - 1); | |
2877 | SvCUR_set(last_str, SvCUR(last_str) * mincount); | |
2878 | /* Add additional parts. */ | |
2879 | SvCUR_set(data->last_found, | |
2880 | SvCUR(data->last_found) - l); | |
2881 | sv_catsv(data->last_found, last_str); | |
2882 | { | |
2883 | SV * sv = data->last_found; | |
2884 | MAGIC *mg = | |
2885 | SvUTF8(sv) && SvMAGICAL(sv) ? | |
2886 | mg_find(sv, PERL_MAGIC_utf8) : NULL; | |
2887 | if (mg && mg->mg_len >= 0) | |
2888 | mg->mg_len += last_chrs * (mincount-1); | |
2889 | } | |
2890 | last_chrs *= mincount; | |
2891 | data->last_end += l * (mincount - 1); | |
2892 | } | |
2893 | } else { | |
2894 | /* start offset must point into the last copy */ | |
2895 | data->last_start_min += minnext * (mincount - 1); | |
2896 | data->last_start_max = | |
2897 | is_inf | |
2898 | ? OPTIMIZE_INFTY | |
2899 | : data->last_start_max + | |
2900 | (maxcount - 1) * (minnext + data->pos_delta); | |
2901 | } | |
2902 | } | |
2903 | /* It is counted once already... */ | |
2904 | data->pos_min += minnext * (mincount - counted); | |
2905 | #if 0 | |
2906 | Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf | |
2907 | " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf | |
2908 | " maxcount=%" UVuf " mincount=%" UVuf | |
2909 | " data->pos_delta=%" UVuf "\n", | |
2910 | (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, | |
2911 | (UV)maxcount, (UV)mincount, (UV)data->pos_delta); | |
2912 | if (deltanext != OPTIMIZE_INFTY) | |
2913 | Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", | |
2914 | (UV)(-counted * deltanext + (minnext + deltanext) * maxcount | |
2915 | - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta)); | |
2916 | #endif | |
2917 | if (deltanext == OPTIMIZE_INFTY | |
2918 | || data->pos_delta == OPTIMIZE_INFTY | |
2919 | || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta) | |
2920 | data->pos_delta = OPTIMIZE_INFTY; | |
2921 | else | |
2922 | data->pos_delta += - counted * deltanext + | |
2923 | (minnext + deltanext) * maxcount - minnext * mincount; | |
2924 | if (mincount != maxcount) { | |
2925 | /* Cannot extend fixed substrings found inside | |
2926 | the group. */ | |
2927 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2928 | if (mincount && last_str) { | |
2929 | SV * const sv = data->last_found; | |
2930 | MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ? | |
2931 | mg_find(sv, PERL_MAGIC_utf8) : NULL; | |
2932 | ||
2933 | if (mg) | |
2934 | mg->mg_len = -1; | |
2935 | sv_setsv(sv, last_str); | |
2936 | data->last_end = data->pos_min; | |
2937 | data->last_start_min = data->pos_min - last_chrs; | |
2938 | data->last_start_max = is_inf | |
2939 | ? OPTIMIZE_INFTY | |
2940 | : data->pos_min + data->pos_delta - last_chrs; | |
2941 | } | |
2942 | data->cur_is_floating = 1; /* float */ | |
2943 | } | |
2944 | SvREFCNT_dec(last_str); | |
2945 | } | |
2946 | if (data && (fl & SF_HAS_EVAL)) | |
2947 | data->flags |= SF_HAS_EVAL; | |
2948 | optimize_curly_tail: | |
2949 | rck_elide_nothing(oscan); | |
2950 | continue; | |
2951 | ||
2952 | default: | |
2953 | Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d", | |
2954 | OP(scan)); | |
2955 | case REF: | |
2956 | case CLUMP: | |
2957 | if (flags & SCF_DO_SUBSTR) { | |
2958 | /* Cannot expect anything... */ | |
2959 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
2960 | data->cur_is_floating = 1; /* float */ | |
2961 | } | |
2962 | is_inf = is_inf_internal = 1; | |
2963 | if (flags & SCF_DO_STCLASS_OR) { | |
2964 | if (OP(scan) == CLUMP) { | |
2965 | /* Actually is any start char, but very few code points | |
2966 | * aren't start characters */ | |
2967 | ssc_match_all_cp(data->start_class); | |
2968 | } | |
2969 | else { | |
2970 | ssc_anything(data->start_class); | |
2971 | } | |
2972 | } | |
2973 | flags &= ~SCF_DO_STCLASS; | |
2974 | break; | |
2975 | } | |
2976 | } | |
2977 | else if (OP(scan) == LNBREAK) { | |
2978 | if (flags & SCF_DO_STCLASS) { | |
2979 | if (flags & SCF_DO_STCLASS_AND) { | |
2980 | ssc_intersection(data->start_class, | |
2981 | PL_XPosix_ptrs[CC_VERTSPACE_], FALSE); | |
2982 | ssc_clear_locale(data->start_class); | |
2983 | ANYOF_FLAGS(data->start_class) | |
2984 | &= ~SSC_MATCHES_EMPTY_STRING; | |
2985 | } | |
2986 | else if (flags & SCF_DO_STCLASS_OR) { | |
2987 | ssc_union(data->start_class, | |
2988 | PL_XPosix_ptrs[CC_VERTSPACE_], | |
2989 | FALSE); | |
2990 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
2991 | ||
2992 | /* See commit msg for | |
2993 | * 749e076fceedeb708a624933726e7989f2302f6a */ | |
2994 | ANYOF_FLAGS(data->start_class) | |
2995 | &= ~SSC_MATCHES_EMPTY_STRING; | |
2996 | } | |
2997 | flags &= ~SCF_DO_STCLASS; | |
2998 | } | |
2999 | min++; | |
3000 | if (delta != OPTIMIZE_INFTY) | |
3001 | delta++; /* Because of the 2 char string cr-lf */ | |
3002 | if (flags & SCF_DO_SUBSTR) { | |
3003 | /* Cannot expect anything... */ | |
3004 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3005 | data->pos_min += 1; | |
3006 | if (data->pos_delta != OPTIMIZE_INFTY) { | |
3007 | data->pos_delta += 1; | |
3008 | } | |
3009 | data->cur_is_floating = 1; /* float */ | |
3010 | } | |
3011 | } | |
3012 | else if (REGNODE_SIMPLE(OP(scan))) { | |
3013 | ||
3014 | if (flags & SCF_DO_SUBSTR) { | |
3015 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3016 | data->pos_min++; | |
3017 | } | |
3018 | min++; | |
3019 | if (flags & SCF_DO_STCLASS) { | |
3020 | bool invert = 0; | |
3021 | SV* my_invlist = NULL; | |
3022 | U8 namedclass; | |
3023 | ||
3024 | /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ | |
3025 | ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; | |
3026 | ||
3027 | /* Some of the logic below assumes that switching | |
3028 | locale on will only add false positives. */ | |
3029 | switch (OP(scan)) { | |
3030 | ||
3031 | default: | |
3032 | #ifdef DEBUGGING | |
3033 | Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", | |
3034 | OP(scan)); | |
3035 | #endif | |
3036 | case SANY: | |
3037 | if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ | |
3038 | ssc_match_all_cp(data->start_class); | |
3039 | break; | |
3040 | ||
3041 | case REG_ANY: | |
3042 | { | |
3043 | SV* REG_ANY_invlist = _new_invlist(2); | |
3044 | REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist, | |
3045 | '\n'); | |
3046 | if (flags & SCF_DO_STCLASS_OR) { | |
3047 | ssc_union(data->start_class, | |
3048 | REG_ANY_invlist, | |
3049 | TRUE /* TRUE => invert, hence all but \n | |
3050 | */ | |
3051 | ); | |
3052 | } | |
3053 | else if (flags & SCF_DO_STCLASS_AND) { | |
3054 | ssc_intersection(data->start_class, | |
3055 | REG_ANY_invlist, | |
3056 | TRUE /* TRUE => invert */ | |
3057 | ); | |
3058 | ssc_clear_locale(data->start_class); | |
3059 | } | |
3060 | SvREFCNT_dec_NN(REG_ANY_invlist); | |
3061 | } | |
3062 | break; | |
3063 | ||
3064 | case ANYOFD: | |
3065 | case ANYOFL: | |
3066 | case ANYOFPOSIXL: | |
3067 | case ANYOFH: | |
3068 | case ANYOFHb: | |
3069 | case ANYOFHr: | |
3070 | case ANYOFHs: | |
3071 | case ANYOF: | |
3072 | if (flags & SCF_DO_STCLASS_AND) | |
3073 | ssc_and(pRExC_state, data->start_class, | |
3074 | (regnode_charclass *) scan); | |
3075 | else | |
3076 | ssc_or(pRExC_state, data->start_class, | |
3077 | (regnode_charclass *) scan); | |
3078 | break; | |
3079 | ||
3080 | case ANYOFHbbm: | |
3081 | { | |
3082 | SV* cp_list = get_ANYOFHbbm_contents(scan); | |
3083 | ||
3084 | if (flags & SCF_DO_STCLASS_OR) { | |
3085 | ssc_union(data->start_class, cp_list, invert); | |
3086 | } | |
3087 | else if (flags & SCF_DO_STCLASS_AND) { | |
3088 | ssc_intersection(data->start_class, cp_list, invert); | |
3089 | } | |
3090 | ||
3091 | SvREFCNT_dec_NN(cp_list); | |
3092 | break; | |
3093 | } | |
3094 | ||
3095 | case NANYOFM: /* NANYOFM already contains the inversion of the | |
3096 | input ANYOF data, so, unlike things like | |
3097 | NPOSIXA, don't change 'invert' to TRUE */ | |
3098 | /* FALLTHROUGH */ | |
3099 | case ANYOFM: | |
3100 | { | |
3101 | SV* cp_list = get_ANYOFM_contents(scan); | |
3102 | ||
3103 | if (flags & SCF_DO_STCLASS_OR) { | |
3104 | ssc_union(data->start_class, cp_list, invert); | |
3105 | } | |
3106 | else if (flags & SCF_DO_STCLASS_AND) { | |
3107 | ssc_intersection(data->start_class, cp_list, invert); | |
3108 | } | |
3109 | ||
3110 | SvREFCNT_dec_NN(cp_list); | |
3111 | break; | |
3112 | } | |
3113 | ||
3114 | case ANYOFR: | |
3115 | case ANYOFRb: | |
3116 | { | |
3117 | SV* cp_list = NULL; | |
3118 | ||
3119 | cp_list = _add_range_to_invlist(cp_list, | |
3120 | ANYOFRbase(scan), | |
3121 | ANYOFRbase(scan) + ANYOFRdelta(scan)); | |
3122 | ||
3123 | if (flags & SCF_DO_STCLASS_OR) { | |
3124 | ssc_union(data->start_class, cp_list, invert); | |
3125 | } | |
3126 | else if (flags & SCF_DO_STCLASS_AND) { | |
3127 | ssc_intersection(data->start_class, cp_list, invert); | |
3128 | } | |
3129 | ||
3130 | SvREFCNT_dec_NN(cp_list); | |
3131 | break; | |
3132 | } | |
3133 | ||
3134 | case NPOSIXL: | |
3135 | invert = 1; | |
3136 | /* FALLTHROUGH */ | |
3137 | ||
3138 | case POSIXL: | |
3139 | namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; | |
3140 | if (flags & SCF_DO_STCLASS_AND) { | |
3141 | bool was_there = cBOOL( | |
3142 | ANYOF_POSIXL_TEST(data->start_class, | |
3143 | namedclass)); | |
3144 | ANYOF_POSIXL_ZERO(data->start_class); | |
3145 | if (was_there) { /* Do an AND */ | |
3146 | ANYOF_POSIXL_SET(data->start_class, namedclass); | |
3147 | } | |
3148 | /* No individual code points can now match */ | |
3149 | data->start_class->invlist | |
3150 | = sv_2mortal(_new_invlist(0)); | |
3151 | } | |
3152 | else { | |
3153 | int complement = namedclass + ((invert) ? -1 : 1); | |
3154 | ||
3155 | assert(flags & SCF_DO_STCLASS_OR); | |
3156 | ||
3157 | /* If the complement of this class was already there, | |
3158 | * the result is that they match all code points, | |
3159 | * (\d + \D == everything). Remove the classes from | |
3160 | * future consideration. Locale is not relevant in | |
3161 | * this case */ | |
3162 | if (ANYOF_POSIXL_TEST(data->start_class, complement)) { | |
3163 | ssc_match_all_cp(data->start_class); | |
3164 | ANYOF_POSIXL_CLEAR(data->start_class, namedclass); | |
3165 | ANYOF_POSIXL_CLEAR(data->start_class, complement); | |
3166 | } | |
3167 | else { /* The usual case; just add this class to the | |
3168 | existing set */ | |
3169 | ANYOF_POSIXL_SET(data->start_class, namedclass); | |
3170 | } | |
3171 | } | |
3172 | break; | |
3173 | ||
3174 | case NPOSIXA: /* For these, we always know the exact set of | |
3175 | what's matched */ | |
3176 | invert = 1; | |
3177 | /* FALLTHROUGH */ | |
3178 | case POSIXA: | |
3179 | my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); | |
3180 | goto join_posix_and_ascii; | |
3181 | ||
3182 | case NPOSIXD: | |
3183 | case NPOSIXU: | |
3184 | invert = 1; | |
3185 | /* FALLTHROUGH */ | |
3186 | case POSIXD: | |
3187 | case POSIXU: | |
3188 | my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL); | |
3189 | ||
3190 | /* NPOSIXD matches all upper Latin1 code points unless the | |
3191 | * target string being matched is UTF-8, which is | |
3192 | * unknowable until match time. Since we are going to | |
3193 | * invert, we want to get rid of all of them so that the | |
3194 | * inversion will match all */ | |
3195 | if (OP(scan) == NPOSIXD) { | |
3196 | _invlist_subtract(my_invlist, PL_UpperLatin1, | |
3197 | &my_invlist); | |
3198 | } | |
3199 | ||
3200 | join_posix_and_ascii: | |
3201 | ||
3202 | if (flags & SCF_DO_STCLASS_AND) { | |
3203 | ssc_intersection(data->start_class, my_invlist, invert); | |
3204 | ssc_clear_locale(data->start_class); | |
3205 | } | |
3206 | else { | |
3207 | assert(flags & SCF_DO_STCLASS_OR); | |
3208 | ssc_union(data->start_class, my_invlist, invert); | |
3209 | } | |
3210 | SvREFCNT_dec(my_invlist); | |
3211 | } | |
3212 | if (flags & SCF_DO_STCLASS_OR) | |
3213 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
3214 | flags &= ~SCF_DO_STCLASS; | |
3215 | } | |
3216 | } | |
3217 | else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) { | |
3218 | data->flags |= (OP(scan) == MEOL | |
3219 | ? SF_BEFORE_MEOL | |
3220 | : SF_BEFORE_SEOL); | |
3221 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3222 | ||
3223 | } | |
3224 | else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ | |
3225 | /* Lookbehind, or need to calculate parens/evals/stclass: */ | |
3226 | && (scan->flags || data || (flags & SCF_DO_STCLASS)) | |
3227 | && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) | |
3228 | { | |
3229 | if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY | |
3230 | || OP(scan) == UNLESSM ) | |
3231 | { | |
3232 | /* Negative Lookahead/lookbehind | |
3233 | In this case we can't do fixed string optimisation. | |
3234 | */ | |
3235 | ||
3236 | bool is_positive = OP(scan) == IFMATCH ? 1 : 0; | |
3237 | SSize_t deltanext, minnext; | |
3238 | SSize_t fake_last_close = 0; | |
3239 | regnode *fake_last_close_op = NULL; | |
3240 | regnode *cur_last_close_op; | |
3241 | regnode *nscan; | |
3242 | regnode_ssc intrnl; | |
3243 | U32 f = (flags & SCF_TRIE_DOING_RESTUDY); | |
3244 | ||
3245 | StructCopy(&zero_scan_data, &data_fake, scan_data_t); | |
3246 | if (data) { | |
3247 | data_fake.whilem_c = data->whilem_c; | |
3248 | data_fake.last_closep = data->last_closep; | |
3249 | data_fake.last_close_opp = data->last_close_opp; | |
3250 | } | |
3251 | else { | |
3252 | data_fake.last_closep = &fake_last_close; | |
3253 | data_fake.last_close_opp = &fake_last_close_op; | |
3254 | } | |
3255 | ||
3256 | /* remember the last_close_op we saw so we can see if | |
3257 | * we are dealing with variable length lookbehind that | |
3258 | * contains capturing buffers, which are considered | |
3259 | * experimental */ | |
3260 | cur_last_close_op= *(data_fake.last_close_opp); | |
3261 | ||
3262 | data_fake.pos_delta = delta; | |
3263 | if ( flags & SCF_DO_STCLASS && !scan->flags | |
3264 | && OP(scan) == IFMATCH ) { /* Lookahead */ | |
3265 | ssc_init(pRExC_state, &intrnl); | |
3266 | data_fake.start_class = &intrnl; | |
3267 | f |= SCF_DO_STCLASS_AND; | |
3268 | } | |
3269 | if (flags & SCF_WHILEM_VISITED_POS) | |
3270 | f |= SCF_WHILEM_VISITED_POS; | |
3271 | next = regnext(scan); | |
3272 | nscan = REGNODE_AFTER(scan); | |
3273 | ||
3274 | /* recurse study_chunk() for lookahead body */ | |
3275 | minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, | |
3276 | last, &data_fake, stopparen, | |
3277 | recursed_depth, NULL, f, depth+1, | |
3278 | mutate_ok); | |
3279 | ||
3280 | if (scan->flags) { | |
3281 | if ( deltanext < 0 | |
3282 | || deltanext > (I32) U8_MAX | |
3283 | || minnext > (I32)U8_MAX | |
3284 | || minnext + deltanext > (I32)U8_MAX) | |
3285 | { | |
3286 | FAIL2("Lookbehind longer than %" UVuf " not implemented", | |
3287 | (UV)U8_MAX); | |
3288 | } | |
3289 | ||
3290 | /* The 'next_off' field has been repurposed to count the | |
3291 | * additional starting positions to try beyond the initial | |
3292 | * one. (This leaves it at 0 for non-variable length | |
3293 | * matches to avoid breakage for those not using this | |
3294 | * extension) */ | |
3295 | if (deltanext) { | |
3296 | scan->next_off = deltanext; | |
3297 | if ( | |
3298 | /* See a CLOSE op inside this lookbehind? */ | |
3299 | cur_last_close_op != *(data_fake.last_close_opp) | |
3300 | /* and not doing restudy. see: restudied */ | |
3301 | && !(flags & SCF_TRIE_DOING_RESTUDY) | |
3302 | ) { | |
3303 | /* this is positive variable length lookbehind with | |
3304 | * capture buffers inside of it */ | |
3305 | ckWARNexperimental_with_arg(RExC_parse, | |
3306 | WARN_EXPERIMENTAL__VLB, | |
3307 | "Variable length %s lookbehind with capturing is experimental", | |
3308 | is_positive ? "positive" : "negative"); | |
3309 | } | |
3310 | } | |
3311 | scan->flags = (U8)minnext + deltanext; | |
3312 | } | |
3313 | if (data) { | |
3314 | if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) | |
3315 | pars++; | |
3316 | if (data_fake.flags & SF_HAS_EVAL) | |
3317 | data->flags |= SF_HAS_EVAL; | |
3318 | data->whilem_c = data_fake.whilem_c; | |
3319 | } | |
3320 | if (f & SCF_DO_STCLASS_AND) { | |
3321 | if (flags & SCF_DO_STCLASS_OR) { | |
3322 | /* OR before, AND after: ideally we would recurse with | |
3323 | * data_fake to get the AND applied by study of the | |
3324 | * remainder of the pattern, and then derecurse; | |
3325 | * *** HACK *** for now just treat as "no information". | |
3326 | * See [perl #56690]. | |
3327 | */ | |
3328 | ssc_init(pRExC_state, data->start_class); | |
3329 | } else { | |
3330 | /* AND before and after: combine and continue. These | |
3331 | * assertions are zero-length, so can match an EMPTY | |
3332 | * string */ | |
3333 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); | |
3334 | ANYOF_FLAGS(data->start_class) | |
3335 | |= SSC_MATCHES_EMPTY_STRING; | |
3336 | } | |
3337 | } | |
3338 | DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta); | |
3339 | } | |
3340 | #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY | |
3341 | else { | |
3342 | /* Positive Lookahead/lookbehind | |
3343 | In this case we can do fixed string optimisation, | |
3344 | but we must be careful about it. Note in the case of | |
3345 | lookbehind the positions will be offset by the minimum | |
3346 | length of the pattern, something we won't know about | |
3347 | until after the recurse. | |
3348 | */ | |
3349 | SSize_t deltanext, fake_last_close = 0; | |
3350 | regnode *last_close_op = NULL; | |
3351 | regnode *nscan; | |
3352 | regnode_ssc intrnl; | |
3353 | U32 f = (flags & SCF_TRIE_DOING_RESTUDY); | |
3354 | /* We use SAVEFREEPV so that when the full compile | |
3355 | is finished perl will clean up the allocated | |
3356 | minlens when it's all done. This way we don't | |
3357 | have to worry about freeing them when we know | |
3358 | they wont be used, which would be a pain. | |
3359 | */ | |
3360 | SSize_t *minnextp; | |
3361 | Newx( minnextp, 1, SSize_t ); | |
3362 | SAVEFREEPV(minnextp); | |
3363 | ||
3364 | if (data) { | |
3365 | StructCopy(data, &data_fake, scan_data_t); | |
3366 | if ((flags & SCF_DO_SUBSTR) && data->last_found) { | |
3367 | f |= SCF_DO_SUBSTR; | |
3368 | if (scan->flags) | |
3369 | scan_commit(pRExC_state, &data_fake, minlenp, is_inf); | |
3370 | data_fake.last_found=newSVsv(data->last_found); | |
3371 | } | |
3372 | } | |
3373 | else { | |
3374 | data_fake.last_closep = &fake_last_close; | |
3375 | data_fake.last_close_opp = &fake_last_close_opp; | |
3376 | } | |
3377 | data_fake.flags = 0; | |
3378 | data_fake.substrs[0].flags = 0; | |
3379 | data_fake.substrs[1].flags = 0; | |
3380 | data_fake.pos_delta = delta; | |
3381 | if (is_inf) | |
3382 | data_fake.flags |= SF_IS_INF; | |
3383 | if ( flags & SCF_DO_STCLASS && !scan->flags | |
3384 | && OP(scan) == IFMATCH ) { /* Lookahead */ | |
3385 | ssc_init(pRExC_state, &intrnl); | |
3386 | data_fake.start_class = &intrnl; | |
3387 | f |= SCF_DO_STCLASS_AND; | |
3388 | } | |
3389 | if (flags & SCF_WHILEM_VISITED_POS) | |
3390 | f |= SCF_WHILEM_VISITED_POS; | |
3391 | next = regnext(scan); | |
3392 | nscan = REGNODE_AFTER(scan); | |
3393 | ||
3394 | /* positive lookahead study_chunk() recursion */ | |
3395 | *minnextp = study_chunk(pRExC_state, &nscan, minnextp, | |
3396 | &deltanext, last, &data_fake, | |
3397 | stopparen, recursed_depth, NULL, | |
3398 | f, depth+1, mutate_ok); | |
3399 | if (scan->flags) { | |
3400 | assert(0); /* This code has never been tested since this | |
3401 | is normally not compiled */ | |
3402 | if ( deltanext < 0 | |
3403 | || deltanext > (I32) U8_MAX | |
3404 | || *minnextp > (I32)U8_MAX | |
3405 | || *minnextp + deltanext > (I32)U8_MAX) | |
3406 | { | |
3407 | FAIL2("Lookbehind longer than %" UVuf " not implemented", | |
3408 | (UV)U8_MAX); | |
3409 | } | |
3410 | ||
3411 | if (deltanext) { | |
3412 | scan->next_off = deltanext; | |
3413 | } | |
3414 | scan->flags = (U8)*minnextp + deltanext; | |
3415 | } | |
3416 | ||
3417 | *minnextp += min; | |
3418 | ||
3419 | if (f & SCF_DO_STCLASS_AND) { | |
3420 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); | |
3421 | ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; | |
3422 | } | |
3423 | if (data) { | |
3424 | if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) | |
3425 | pars++; | |
3426 | if (data_fake.flags & SF_HAS_EVAL) | |
3427 | data->flags |= SF_HAS_EVAL; | |
3428 | data->whilem_c = data_fake.whilem_c; | |
3429 | if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) { | |
3430 | int i; | |
3431 | if (RExC_rx->minlen < *minnextp) | |
3432 | RExC_rx->minlen = *minnextp; | |
3433 | scan_commit(pRExC_state, &data_fake, minnextp, is_inf); | |
3434 | SvREFCNT_dec_NN(data_fake.last_found); | |
3435 | ||
3436 | for (i = 0; i < 2; i++) { | |
3437 | if (data_fake.substrs[i].minlenp != minlenp) { | |
3438 | data->substrs[i].min_offset = | |
3439 | data_fake.substrs[i].min_offset; | |
3440 | data->substrs[i].max_offset = | |
3441 | data_fake.substrs[i].max_offset; | |
3442 | data->substrs[i].minlenp = | |
3443 | data_fake.substrs[i].minlenp; | |
3444 | data->substrs[i].lookbehind += scan->flags; | |
3445 | } | |
3446 | } | |
3447 | } | |
3448 | } | |
3449 | } | |
3450 | #endif | |
3451 | } | |
3452 | else if (OP(scan) == OPEN) { | |
3453 | if (stopparen != (I32)PARNO(scan)) | |
3454 | pars++; | |
3455 | } | |
3456 | else if (OP(scan) == CLOSE) { | |
3457 | if (stopparen == (I32)PARNO(scan)) { | |
3458 | break; | |
3459 | } | |
3460 | if ((I32)PARNO(scan) == is_par) { | |
3461 | next = regnext(scan); | |
3462 | ||
3463 | if ( next && (OP(next) != WHILEM) && next < last) | |
3464 | is_par = 0; /* Disable optimization */ | |
3465 | } | |
3466 | if (data) { | |
3467 | *(data->last_closep) = PARNO(scan); | |
3468 | *(data->last_close_opp) = scan; | |
3469 | } | |
3470 | } | |
3471 | else if (OP(scan) == EVAL) { | |
c224bbd5 | 3472 | if (data && !(scan->flags & EVAL_OPTIMISTIC_FLAG) ) |
85900e28 YO |
3473 | data->flags |= SF_HAS_EVAL; |
3474 | } | |
3475 | else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) { | |
3476 | if (flags & SCF_DO_SUBSTR) { | |
3477 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3478 | flags &= ~SCF_DO_SUBSTR; | |
3479 | } | |
3480 | if (OP(scan)==ACCEPT) { | |
3481 | /* m{(*ACCEPT)x} does not have to start with 'x' */ | |
3482 | flags &= ~SCF_DO_STCLASS; | |
3483 | if (data) | |
3484 | data->flags |= SCF_SEEN_ACCEPT; | |
3485 | if (stopmin > min) | |
3486 | stopmin = min; | |
3487 | } | |
3488 | } | |
3489 | else if (OP(scan) == COMMIT) { | |
3490 | /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we | |
3491 | * must not end up with "abcxyz" as a fixed substring else we'll | |
3492 | * skip straight to attempting to match at offset 4. | |
3493 | */ | |
3494 | if (flags & SCF_DO_SUBSTR) { | |
3495 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3496 | flags &= ~SCF_DO_SUBSTR; | |
3497 | } | |
3498 | } | |
3499 | else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */ | |
3500 | { | |
3501 | if (flags & SCF_DO_SUBSTR) { | |
3502 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3503 | data->cur_is_floating = 1; /* float */ | |
3504 | } | |
3505 | is_inf = is_inf_internal = 1; | |
3506 | if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ | |
3507 | ssc_anything(data->start_class); | |
3508 | flags &= ~SCF_DO_STCLASS; | |
3509 | } | |
3510 | else if (OP(scan) == GPOS) { | |
3511 | if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) && | |
3512 | !(delta || is_inf || (data && data->pos_delta))) | |
3513 | { | |
3514 | if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR)) | |
3515 | RExC_rx->intflags |= PREGf_ANCH_GPOS; | |
3516 | if (RExC_rx->gofs < (STRLEN)min) | |
3517 | RExC_rx->gofs = min; | |
3518 | } else { | |
3519 | RExC_rx->intflags |= PREGf_GPOS_FLOAT; | |
3520 | RExC_rx->gofs = 0; | |
3521 | } | |
3522 | } | |
3523 | #ifdef TRIE_STUDY_OPT | |
3524 | #ifdef FULL_TRIE_STUDY | |
3525 | else if (REGNODE_TYPE(OP(scan)) == TRIE) { | |
3526 | /* NOTE - There is similar code to this block above for handling | |
3527 | BRANCH nodes on the initial study. If you change stuff here | |
3528 | check there too. */ | |
3529 | regnode *trie_node= scan; | |
3530 | regnode *tail= regnext(scan); | |
3531 | reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; | |
3532 | SSize_t max1 = 0, min1 = OPTIMIZE_INFTY; | |
3533 | regnode_ssc accum; | |
3534 | ||
3535 | if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */ | |
3536 | /* Cannot merge strings after this. */ | |
3537 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3538 | } | |
3539 | if (flags & SCF_DO_STCLASS) | |
3540 | ssc_init_zero(pRExC_state, &accum); | |
3541 | ||
3542 | if (!trie->jump) { | |
3543 | min1= trie->minlen; | |
3544 | max1= trie->maxlen; | |
3545 | } else { | |
3546 | const regnode *nextbranch= NULL; | |
3547 | U32 word; | |
3548 | ||
3549 | for ( word=1 ; word <= trie->wordcount ; word++) | |
3550 | { | |
3551 | SSize_t deltanext = 0, minnext = 0; | |
3552 | U32 f = (flags & SCF_TRIE_DOING_RESTUDY); | |
3553 | SSize_t fake_last_close = 0; | |
3554 | regnode *fake_last_close_op = NULL; | |
3555 | regnode_ssc this_class; | |
3556 | ||
3557 | StructCopy(&zero_scan_data, &data_fake, scan_data_t); | |
3558 | if (data) { | |
3559 | data_fake.whilem_c = data->whilem_c; | |
3560 | data_fake.last_closep = data->last_closep; | |
3561 | data_fake.last_close_opp = data->last_close_opp; | |
3562 | } | |
3563 | else { | |
3564 | data_fake.last_closep = &fake_last_close; | |
3565 | data_fake.last_close_opp = &fake_last_close_op; | |
3566 | } | |
3567 | data_fake.pos_delta = delta; | |
3568 | if (flags & SCF_DO_STCLASS) { | |
3569 | ssc_init(pRExC_state, &this_class); | |
3570 | data_fake.start_class = &this_class; | |
3571 | f |= SCF_DO_STCLASS_AND; | |
3572 | } | |
3573 | if (flags & SCF_WHILEM_VISITED_POS) | |
3574 | f |= SCF_WHILEM_VISITED_POS; | |
3575 | ||
3576 | if (trie->jump[word]) { | |
3577 | if (!nextbranch) | |
3578 | nextbranch = trie_node + trie->jump[0]; | |
3579 | scan= trie_node + trie->jump[word]; | |
3580 | /* We go from the jump point to the branch that follows | |
3581 | it. Note this means we need the vestigal unused | |
3582 | branches even though they arent otherwise used. */ | |
3583 | /* optimise study_chunk() for TRIE */ | |
3584 | minnext = study_chunk(pRExC_state, &scan, minlenp, | |
3585 | &deltanext, (regnode *)nextbranch, &data_fake, | |
3586 | stopparen, recursed_depth, NULL, f, depth+1, | |
3587 | mutate_ok); | |
3588 | } | |
3589 | if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH) | |
3590 | nextbranch= regnext((regnode*)nextbranch); | |
3591 | ||
3592 | if (min1 > (SSize_t)(minnext + trie->minlen)) | |
3593 | min1 = minnext + trie->minlen; | |
3594 | if (deltanext == OPTIMIZE_INFTY) { | |
3595 | is_inf = is_inf_internal = 1; | |
3596 | max1 = OPTIMIZE_INFTY; | |
3597 | } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen)) | |
3598 | max1 = minnext + deltanext + trie->maxlen; | |
3599 | ||
3600 | if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) | |
3601 | pars++; | |
3602 | if (data_fake.flags & SCF_SEEN_ACCEPT) { | |
3603 | if ( stopmin > min + min1) | |
3604 | stopmin = min + min1; | |
3605 | flags &= ~SCF_DO_SUBSTR; | |
3606 | if (data) | |
3607 | data->flags |= SCF_SEEN_ACCEPT; | |
3608 | } | |
3609 | if (data) { | |
3610 | if (data_fake.flags & SF_HAS_EVAL) | |
3611 | data->flags |= SF_HAS_EVAL; | |
3612 | data->whilem_c = data_fake.whilem_c; | |
3613 | } | |
3614 | if (flags & SCF_DO_STCLASS) | |
3615 | ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class); | |
3616 | } | |
3617 | DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta); | |
3618 | } | |
3619 | if (flags & SCF_DO_SUBSTR) { | |
3620 | data->pos_min += min1; | |
3621 | data->pos_delta += max1 - min1; | |
3622 | if (max1 != min1 || is_inf) | |
3623 | data->cur_is_floating = 1; /* float */ | |
3624 | } | |
3625 | min += min1; | |
3626 | if (delta != OPTIMIZE_INFTY) { | |
3627 | if (OPTIMIZE_INFTY - (max1 - min1) >= delta) | |
3628 | delta += max1 - min1; | |
3629 | else | |
3630 | delta = OPTIMIZE_INFTY; | |
3631 | } | |
3632 | if (flags & SCF_DO_STCLASS_OR) { | |
3633 | ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); | |
3634 | if (min1) { | |
3635 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
3636 | flags &= ~SCF_DO_STCLASS; | |
3637 | } | |
3638 | } | |
3639 | else if (flags & SCF_DO_STCLASS_AND) { | |
3640 | if (min1) { | |
3641 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum); | |
3642 | flags &= ~SCF_DO_STCLASS; | |
3643 | } | |
3644 | else { | |
3645 | /* Switch to OR mode: cache the old value of | |
3646 | * data->start_class */ | |
3647 | INIT_AND_WITHP; | |
3648 | StructCopy(data->start_class, and_withp, regnode_ssc); | |
3649 | flags &= ~SCF_DO_STCLASS_AND; | |
3650 | StructCopy(&accum, data->start_class, regnode_ssc); | |
3651 | flags |= SCF_DO_STCLASS_OR; | |
3652 | } | |
3653 | } | |
3654 | scan= tail; | |
3655 | DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta); | |
3656 | continue; | |
3657 | } | |
3658 | #else | |
3659 | else if (REGNODE_TYPE(OP(scan)) == TRIE) { | |
3660 | reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ]; | |
3661 | U8*bang=NULL; | |
3662 | ||
3663 | min += trie->minlen; | |
3664 | delta += (trie->maxlen - trie->minlen); | |
3665 | flags &= ~SCF_DO_STCLASS; /* xxx */ | |
3666 | if (flags & SCF_DO_SUBSTR) { | |
3667 | /* Cannot expect anything... */ | |
3668 | scan_commit(pRExC_state, data, minlenp, is_inf); | |
3669 | data->pos_min += trie->minlen; | |
3670 | data->pos_delta += (trie->maxlen - trie->minlen); | |
3671 | if (trie->maxlen != trie->minlen) | |
3672 | data->cur_is_floating = 1; /* float */ | |
3673 | } | |
3674 | if (trie->jump) /* no more substrings -- for now /grr*/ | |
3675 | flags &= ~SCF_DO_SUBSTR; | |
3676 | } | |
3677 | ||
3678 | #endif /* old or new */ | |
3679 | #endif /* TRIE_STUDY_OPT */ | |
3680 | ||
3681 | else if (OP(scan) == REGEX_SET) { | |
3682 | Perl_croak(aTHX_ "panic: %s regnode should be resolved" | |
3683 | " before optimization", REGNODE_NAME(REGEX_SET)); | |
3684 | } | |
3685 | ||
3686 | /* Else: zero-length, ignore. */ | |
3687 | scan = regnext(scan); | |
3688 | } | |
3689 | ||
3690 | finish: | |
3691 | if (frame) { | |
3692 | /* we need to unwind recursion. */ | |
3693 | depth = depth - 1; | |
3694 | ||
3695 | DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta); | |
3696 | DEBUG_PEEP("fend", scan, depth, flags); | |
3697 | ||
3698 | /* restore previous context */ | |
3699 | last = frame->last_regnode; | |
3700 | scan = frame->next_regnode; | |
3701 | stopparen = frame->stopparen; | |
3702 | recursed_depth = frame->prev_recursed_depth; | |
3703 | ||
3704 | RExC_frame_last = frame->prev_frame; | |
3705 | frame = frame->this_prev_frame; | |
3706 | goto fake_study_recurse; | |
3707 | } | |
3708 | ||
3709 | assert(!frame); | |
3710 | DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta); | |
3711 | ||
3712 | /* is this pattern infinite? Eg, consider /(a|b+)/ */ | |
3713 | if (is_inf_internal) | |
3714 | delta = OPTIMIZE_INFTY; | |
3715 | ||
3716 | /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */ | |
3717 | if (min > stopmin) { | |
3718 | /* | |
3719 | At this point 'min' represents the minimum length string we can | |
3720 | match while *ignoring* the implication of ACCEPT, and 'delta' | |
3721 | represents the difference between the minimum length and maximum | |
3722 | length, and if the pattern matches an infinitely long string | |
3723 | (consider the + and * quantifiers) then we use the special delta | |
3724 | value of OPTIMIZE_INFTY to represent it. 'stopmin' is the | |
3725 | minimum length that can be matched *and* accepted. | |
3726 | ||
3727 | A pattern is accepted when matching was successful *and* | |
3728 | complete, and thus there is no further matching needing to be | |
3729 | done, no backtracking to occur, etc. Prior to the introduction | |
3730 | of ACCEPT the only opcode that signaled acceptance was the END | |
3731 | opcode, which is always the very last opcode in a regex program. | |
3732 | ACCEPT is thus conceptually an early successful return out of | |
3733 | the matching process. stopmin starts out as OPTIMIZE_INFTY to | |
3734 | represent "the entire pattern", and is ratched down to the | |
3735 | "current min" if necessary when an ACCEPT opcode is encountered. | |
3736 | ||
3737 | Thus stopmin might be smaller than min if we saw an (*ACCEPT), | |
3738 | and we now need to account for it in both min and delta. | |
3739 | Consider that in a pattern /AB/ normally the min length it can | |
3740 | match can be computed as min(A)+min(B). But (*ACCEPT) means | |
3741 | that it might be something else, not even neccesarily min(A) at | |
3742 | all. Consider | |
3743 | ||
3744 | A = /(foo(*ACCEPT)|x+)/ | |
3745 | B = /whop/ | |
3746 | AB = /(foo(*ACCEPT)|x+)whop/ | |
3747 | ||
3748 | The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY | |
3749 | for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for | |
3750 | "whop", and the delta of 0 as the pattern is of fixed length, the | |
3751 | stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT. | |
3752 | When handling AB we expect to see a min of 5 for "xwhop", and a | |
3753 | delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3 | |
3754 | for "foo". This should result in a final min of 3 for "foo", and | |
3755 | a final delta of OPTIMIZE_INFTY for "xxxxx...whop". | |
3756 | ||
3757 | In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a | |
3758 | min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the | |
3759 | stop min would be 4 for "dude". This should result in a final | |
3760 | min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx". | |
3761 | ||
3762 | When min is smaller than stopmin then we can ignore it. In the | |
3763 | fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2, | |
3764 | and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously | |
3765 | the ACCEPT doesn't reduce the minimum length of the string that | |
3766 | might be matched, nor affect the maximum length. | |
3767 | ||
3768 | In something like /foo(*ACCEPT)ba?r/ we would have a min of 5 | |
3769 | for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for | |
3770 | "foo". We currently turn this into a min of 3 for "foo" and a | |
3771 | delta of 3 for "foobar" even though technically "foobar" isn't | |
3772 | possible. ACCEPT affects some aspects of the optimizer, like | |
3773 | length computations and mandatory substring optimizations, but | |
3774 | there are other optimzations this routine perfoms that are not | |
3775 | affected and this compromise simplifies implementation. | |
3776 | ||
3777 | It might be helpful to consider that this C function is called | |
3778 | recursively on the pattern in a bottom up fashion, and that the | |
3779 | min returned by a nested call may be marked as coming from an | |
3780 | ACCEPT, causing its callers to treat the returned min as a | |
3781 | stopmin as the recursion unwinds. Thus a single ACCEPT can affect | |
3782 | multiple calls into this function in different ways. | |
3783 | */ | |
3784 | ||
3785 | if (OPTIMIZE_INFTY - delta >= min - stopmin) | |
3786 | delta += min - stopmin; | |
3787 | else | |
3788 | delta = OPTIMIZE_INFTY; | |
3789 | min = stopmin; | |
3790 | } | |
3791 | ||
3792 | *scanp = scan; | |
3793 | *deltap = delta; | |
3794 | ||
3795 | if (flags & SCF_DO_SUBSTR && is_inf) | |
3796 | data->pos_delta = OPTIMIZE_INFTY - data->pos_min; | |
3797 | if (is_par > (I32)U8_MAX) | |
3798 | is_par = 0; | |
3799 | if (is_par && pars==1 && data) { | |
3800 | data->flags |= SF_IN_PAR; | |
3801 | data->flags &= ~SF_HAS_PAR; | |
3802 | } | |
3803 | else if (pars && data) { | |
3804 | data->flags |= SF_HAS_PAR; | |
3805 | data->flags &= ~SF_IN_PAR; | |
3806 | } | |
3807 | if (flags & SCF_DO_STCLASS_OR) | |
3808 | ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); | |
3809 | if (flags & SCF_TRIE_RESTUDY) | |
3810 | data->flags |= SCF_TRIE_RESTUDY; | |
3811 | ||
3812 | ||
3813 | if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { | |
3814 | if (min > OPTIMIZE_INFTY - delta) | |
3815 | RExC_maxlen = OPTIMIZE_INFTY; | |
3816 | else if (RExC_maxlen < min + delta) | |
3817 | RExC_maxlen = min + delta; | |
3818 | } | |
3819 | DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta); | |
3820 | return min; | |
3821 | } |