This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
new perldelta
[perl5.git] / regcomp.c
... / ...
CommitLineData
1/* regcomp.c
2 */
3
4/*
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
8 */
9
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
18 */
19
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
34/*
35 * pregcomp and pregexec -- regsub and regerror are not used in perl
36 *
37 * Copyright (c) 1986 by University of Toronto.
38 * Written by Henry Spencer. Not derived from licensed software.
39 *
40 * Permission is granted to anyone to use this software for any
41 * purpose on any computer system, and to redistribute it freely,
42 * subject to the following restrictions:
43 *
44 * 1. The author is not responsible for the consequences of use of
45 * this software, no matter how awful, even if they arise
46 * from defects in it.
47 *
48 * 2. The origin of this software must not be misrepresented, either
49 * by explicit claim or by omission.
50 *
51 * 3. Altered versions must be plainly marked as such, and must not
52 * be misrepresented as being the original software.
53 *
54 *
55 **** Alterations to Henry's code are...
56 ****
57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59 **** by Larry Wall and others
60 ****
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
63
64 *
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
68 */
69
70/* Note on debug output:
71 *
72 * This is set up so that -Dr turns on debugging like all other flags that are
73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
74 * all regular expressions encountered in a program, and gives a huge amount of
75 * output for all but the shortest programs.
76 *
77 * The ability to output pattern debugging information lexically, and with much
78 * finer grained control was added, with 'use re qw(Debug ....);' available even
79 * in non-DEBUGGING builds. This is accomplished by copying the contents of
80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81 * Those files are compiled and linked into the perl executable, and they are
82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
83 * to re.pm.
84 *
85 * That would normally mean linking errors when two functions of the same name
86 * are attempted to be placed into the same executable. That is solved in one
87 * of four ways:
88 * 1) Static functions aren't known outside the file they are in, so for the
89 * many functions of that type in this file, it just isn't a problem.
90 * 2) Most externally known functions are enclosed in
91 * #ifndef PERL_IN_XSUB_RE
92 * ...
93 * #endif
94 * blocks, so there is only one definition for them in the whole
95 * executable, the one in regcomp.c (or regexec.c). The implication of
96 * that is any debugging info that comes from them is controlled only by
97 * -Dr. Further, any static function they call will also be the version
98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
100 * have different names, so that what gets loaded in the executable is
101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104 * versions and their callees are under control of re.pm. The catch is
105 * that references to all these go through the regexp_engine structure,
106 * which is initialized in regcomp.h to the Perl_foo versions, and
107 * substituted out in lexical scopes where 'use re' is in effect to the
108 * 'my_foo' ones. That structure is public API, so it would be a hard
109 * sell to add any additional members.
110 * 4) For functions in regcomp.c and re_comp.c that are called only from,
111 * respectively, regexec.c and re_exec.c, they can have two different
112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
113 * embed.fnc.
114 *
115 * The bottom line is that if you add code to one of the public functions
116 * listed in ext/re/re_top.h, debugging automagically works. But if you write
117 * a new function that needs to do debugging or there is a chain of calls from
118 * it that need to do debugging, all functions in the chain should use options
119 * 2) or 4) above.
120 *
121 * A function may have to be split so that debugging stuff is static, but it
122 * calls out to some other function that only gets compiled in regcomp.c to
123 * access data that we don't want to duplicate.
124 */
125
126#ifdef PERL_EXT_RE_BUILD
127#include "re_top.h"
128#endif
129
130#include "EXTERN.h"
131#define PERL_IN_REGEX_ENGINE
132#define PERL_IN_REGCOMP_ANY
133#define PERL_IN_REGCOMP_C
134#include "perl.h"
135
136#ifdef PERL_IN_XSUB_RE
137# include "re_comp.h"
138EXTERN_C const struct regexp_engine my_reg_engine;
139EXTERN_C const struct regexp_engine wild_reg_engine;
140#else
141# include "regcomp.h"
142#endif
143
144#include "invlist_inline.h"
145#include "unicode_constants.h"
146#include "regcomp_internal.h"
147
148/* =========================================================
149 * BEGIN edit_distance stuff.
150 *
151 * This calculates how many single character changes of any type are needed to
152 * transform a string into another one. It is taken from version 3.1 of
153 *
154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
155 */
156
157/* Our unsorted dictionary linked list. */
158/* Note we use UVs, not chars. */
159
160struct dictionary{
161 UV key;
162 UV value;
163 struct dictionary* next;
164};
165typedef struct dictionary item;
166
167
168PERL_STATIC_INLINE item*
169push(UV key, item* curr)
170{
171 item* head;
172 Newx(head, 1, item);
173 head->key = key;
174 head->value = 0;
175 head->next = curr;
176 return head;
177}
178
179
180PERL_STATIC_INLINE item*
181find(item* head, UV key)
182{
183 item* iterator = head;
184 while (iterator){
185 if (iterator->key == key){
186 return iterator;
187 }
188 iterator = iterator->next;
189 }
190
191 return NULL;
192}
193
194PERL_STATIC_INLINE item*
195uniquePush(item* head, UV key)
196{
197 item* iterator = head;
198
199 while (iterator){
200 if (iterator->key == key) {
201 return head;
202 }
203 iterator = iterator->next;
204 }
205
206 return push(key, head);
207}
208
209PERL_STATIC_INLINE void
210dict_free(item* head)
211{
212 item* iterator = head;
213
214 while (iterator) {
215 item* temp = iterator;
216 iterator = iterator->next;
217 Safefree(temp);
218 }
219
220 head = NULL;
221}
222
223/* End of Dictionary Stuff */
224
225/* All calculations/work are done here */
226STATIC int
227S_edit_distance(const UV* src,
228 const UV* tgt,
229 const STRLEN x, /* length of src[] */
230 const STRLEN y, /* length of tgt[] */
231 const SSize_t maxDistance
232)
233{
234 item *head = NULL;
235 UV swapCount, swapScore, targetCharCount, i, j;
236 UV *scores;
237 UV score_ceil = x + y;
238
239 PERL_ARGS_ASSERT_EDIT_DISTANCE;
240
241 /* initialize matrix start values */
242 Newx(scores, ( (x + 2) * (y + 2)), UV);
243 scores[0] = score_ceil;
244 scores[1 * (y + 2) + 0] = score_ceil;
245 scores[0 * (y + 2) + 1] = score_ceil;
246 scores[1 * (y + 2) + 1] = 0;
247 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
248
249 /* work loops */
250 /* i = src index */
251 /* j = tgt index */
252 for (i=1;i<=x;i++) {
253 if (i < x)
254 head = uniquePush(head, src[i]);
255 scores[(i+1) * (y + 2) + 1] = i;
256 scores[(i+1) * (y + 2) + 0] = score_ceil;
257 swapCount = 0;
258
259 for (j=1;j<=y;j++) {
260 if (i == 1) {
261 if(j < y)
262 head = uniquePush(head, tgt[j]);
263 scores[1 * (y + 2) + (j + 1)] = j;
264 scores[0 * (y + 2) + (j + 1)] = score_ceil;
265 }
266
267 targetCharCount = find(head, tgt[j-1])->value;
268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
269
270 if (src[i-1] != tgt[j-1]){
271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
272 }
273 else {
274 swapCount = j;
275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
276 }
277 }
278
279 find(head, src[i-1])->value = i;
280 }
281
282 {
283 IV score = scores[(x+1) * (y + 2) + (y + 1)];
284 dict_free(head);
285 Safefree(scores);
286 return (maxDistance != 0 && maxDistance < score)?(-1):score;
287 }
288}
289
290/* END of edit_distance() stuff
291 * ========================================================= */
292
293#ifdef PERL_RE_BUILD_AUX
294/* add a data member to the struct reg_data attached to this regex, it should
295 * always return a non-zero return. the 's' argument is the type of the items
296 * being added and the n is the number of items. The length of 's' should match
297 * the number of items. */
298U32
299Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
300{
301 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
302
303 PERL_ARGS_ASSERT_REG_ADD_DATA;
304
305 /* in the below expression we have (count + n - 1), the minus one is there
306 * because the struct that we allocate already contains a slot for 1 data
307 * item, so we do not need to allocate it the first time. IOW, the
308 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
309 * to allocate. See struct reg_data in regcomp.h
310 */
311 Renewc(RExC_rxi->data,
312 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
313 char, struct reg_data);
314 /* however in the data->what expression we use (count + n) and do not
315 * subtract one from the result because the data structure contains a
316 * pointer to an array, and does not allocate the first element as part of
317 * the data struct. */
318 if (count > 1)
319 Renew(RExC_rxi->data->what, (count + n), U8);
320 else {
321 /* when count == 1 it means we have not initialized anything.
322 * we always fill the 0 slot of the data array with a '%' entry, which
323 * means "zero" (all the other types are letters) which exists purely
324 * so the return from reg_add_data is ALWAYS true, so we can tell it apart
325 * from a "no value" idx=0 in places where we would return an index
326 * into reg_add_data. This is particularly important with the new "single
327 * pass, usually, but not always" strategy that we use, where the code
328 * will use a 0 to represent "not able to compute this yet".
329 */
330 Newx(RExC_rxi->data->what, n+1, U8);
331 /* fill in the placeholder slot of 0 with a what of '%', we use
332 * this because it sorta looks like a zero (0/0) and it is not a letter
333 * like any of the other "whats", this type should never be created
334 * any other way but here. '%' happens to also not appear in this
335 * file for any other reason (at the time of writing this comment)*/
336 RExC_rxi->data->what[0]= '%';
337 RExC_rxi->data->data[0]= NULL;
338 }
339 RExC_rxi->data->count = count + n;
340 Copy(s, RExC_rxi->data->what + count, n, U8);
341 assert(count>0);
342 return count;
343}
344#endif /* PERL_RE_BUILD_AUX */
345
346/*XXX: todo make this not included in a non debugging perl, but appears to be
347 * used anyway there, in 'use re' */
348#ifndef PERL_IN_XSUB_RE
349void
350Perl_reginitcolors(pTHX)
351{
352 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
353 if (s) {
354 char *t = savepv(s);
355 int i = 0;
356 PL_colors[0] = t;
357 while (++i < 6) {
358 t = strchr(t, '\t');
359 if (t) {
360 *t = '\0';
361 PL_colors[i] = ++t;
362 }
363 else
364 PL_colors[i] = t = (char *)"";
365 }
366 } else {
367 int i = 0;
368 while (i < 6)
369 PL_colors[i++] = (char *)"";
370 }
371 PL_colorset = 1;
372}
373#endif
374
375
376#ifdef TRIE_STUDY_OPT
377/* search for "restudy" in this file for a detailed explanation */
378#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
379 STMT_START { \
380 if ( \
381 (data.flags & SCF_TRIE_RESTUDY) \
382 && ! restudied++ \
383 ) { \
384 dOsomething; \
385 goto reStudy; \
386 } \
387 } STMT_END
388#else
389#define CHECK_RESTUDY_GOTO_butfirst
390#endif
391
392/*
393 * pregcomp - compile a regular expression into internal code
394 *
395 * Decides which engine's compiler to call based on the hint currently in
396 * scope
397 */
398
399#ifndef PERL_IN_XSUB_RE
400
401/* return the currently in-scope regex engine (or the default if none) */
402
403regexp_engine const *
404Perl_current_re_engine(pTHX)
405{
406 if (IN_PERL_COMPILETIME) {
407 HV * const table = GvHV(PL_hintgv);
408 SV **ptr;
409
410 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
411 return &PL_core_reg_engine;
412 ptr = hv_fetchs(table, "regcomp", FALSE);
413 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
414 return &PL_core_reg_engine;
415 return INT2PTR(regexp_engine*, SvIV(*ptr));
416 }
417 else {
418 SV *ptr;
419 if (!PL_curcop->cop_hints_hash)
420 return &PL_core_reg_engine;
421 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
422 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
423 return &PL_core_reg_engine;
424 return INT2PTR(regexp_engine*, SvIV(ptr));
425 }
426}
427
428
429REGEXP *
430Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
431{
432 regexp_engine const *eng = current_re_engine();
433 DECLARE_AND_GET_RE_DEBUG_FLAGS;
434
435 PERL_ARGS_ASSERT_PREGCOMP;
436
437 /* Dispatch a request to compile a regexp to correct regexp engine. */
438 DEBUG_COMPILE_r({
439 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
440 PTR2UV(eng));
441 });
442 return CALLREGCOMP_ENG(eng, pattern, flags);
443}
444#endif
445
446/*
447=for apidoc re_compile
448
449Compile the regular expression pattern C<pattern>, returning a pointer to the
450compiled object for later matching with the internal regex engine.
451
452This function is typically used by a custom regexp engine C<.comp()> function
453to hand off to the core regexp engine those patterns it doesn't want to handle
454itself (typically passing through the same flags it was called with). In
455almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
456to compile using the currently active regexp engine.
457
458If C<pattern> is already a C<REGEXP>, this function does nothing but return a
459pointer to the input. Otherwise the PV is extracted and treated like a string
460representing a pattern. See L<perlre>.
461
462The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
463all begin with C<RXf_>.
464
465=cut
466
467 * public entry point for the perl core's own regex compiling code.
468 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
469 * pattern rather than a list of OPs, and uses the internal engine rather
470 * than the current one */
471
472REGEXP *
473Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
474{
475 SV *pat = pattern; /* defeat constness! */
476
477 PERL_ARGS_ASSERT_RE_COMPILE;
478
479 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
480#ifdef PERL_IN_XSUB_RE
481 &my_reg_engine,
482#else
483 &PL_core_reg_engine,
484#endif
485 NULL, NULL, rx_flags, 0);
486}
487
488static void
489S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
490{
491 int n;
492
493 if (--cbs->refcnt > 0)
494 return;
495 for (n = 0; n < cbs->count; n++) {
496 REGEXP *rx = cbs->cb[n].src_regex;
497 if (rx) {
498 cbs->cb[n].src_regex = NULL;
499 SvREFCNT_dec_NN(rx);
500 }
501 }
502 Safefree(cbs->cb);
503 Safefree(cbs);
504}
505
506
507static struct reg_code_blocks *
508S_alloc_code_blocks(pTHX_ int ncode)
509{
510 struct reg_code_blocks *cbs;
511 Newx(cbs, 1, struct reg_code_blocks);
512 cbs->count = ncode;
513 cbs->refcnt = 1;
514 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
515 if (ncode)
516 Newx(cbs->cb, ncode, struct reg_code_block);
517 else
518 cbs->cb = NULL;
519 return cbs;
520}
521
522
523/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
524 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
525 * point to the realloced string and length.
526 *
527 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
528 * stuff added */
529
530static void
531S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
532 char **pat_p, STRLEN *plen_p, int num_code_blocks)
533{
534 U8 *const src = (U8*)*pat_p;
535 U8 *dst, *d;
536 int n=0;
537 STRLEN s = 0;
538 bool do_end = 0;
539 DECLARE_AND_GET_RE_DEBUG_FLAGS;
540
541 DEBUG_PARSE_r(Perl_re_printf( aTHX_
542 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
543
544 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
545 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
546 d = dst;
547
548 while (s < *plen_p) {
549 append_utf8_from_native_byte(src[s], &d);
550
551 if (n < num_code_blocks) {
552 assert(pRExC_state->code_blocks);
553 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
554 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
555 assert(*(d - 1) == '(');
556 do_end = 1;
557 }
558 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
559 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
560 assert(*(d - 1) == ')');
561 do_end = 0;
562 n++;
563 }
564 }
565 s++;
566 }
567 *d = '\0';
568 *plen_p = d - dst;
569 *pat_p = (char*) dst;
570 SAVEFREEPV(*pat_p);
571 RExC_orig_utf8 = RExC_utf8 = 1;
572}
573
574
575
576/* S_concat_pat(): concatenate a list of args to the pattern string pat,
577 * while recording any code block indices, and handling overloading,
578 * nested qr// objects etc. If pat is null, it will allocate a new
579 * string, or just return the first arg, if there's only one.
580 *
581 * Returns the malloced/updated pat.
582 * patternp and pat_count is the array of SVs to be concatted;
583 * oplist is the optional list of ops that generated the SVs;
584 * recompile_p is a pointer to a boolean that will be set if
585 * the regex will need to be recompiled.
586 * delim, if non-null is an SV that will be inserted between each element
587 */
588
589static SV*
590S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
591 SV *pat, SV ** const patternp, int pat_count,
592 OP *oplist, bool *recompile_p, SV *delim)
593{
594 SV **svp;
595 int n = 0;
596 bool use_delim = FALSE;
597 bool alloced = FALSE;
598
599 /* if we know we have at least two args, create an empty string,
600 * then concatenate args to that. For no args, return an empty string */
601 if (!pat && pat_count != 1) {
602 pat = newSVpvs("");
603 SAVEFREESV(pat);
604 alloced = TRUE;
605 }
606
607 for (svp = patternp; svp < patternp + pat_count; svp++) {
608 SV *sv;
609 SV *rx = NULL;
610 STRLEN orig_patlen = 0;
611 bool code = 0;
612 SV *msv = use_delim ? delim : *svp;
613 if (!msv) msv = &PL_sv_undef;
614
615 /* if we've got a delimiter, we go round the loop twice for each
616 * svp slot (except the last), using the delimiter the second
617 * time round */
618 if (use_delim) {
619 svp--;
620 use_delim = FALSE;
621 }
622 else if (delim)
623 use_delim = TRUE;
624
625 if (SvTYPE(msv) == SVt_PVAV) {
626 /* we've encountered an interpolated array within
627 * the pattern, e.g. /...@a..../. Expand the list of elements,
628 * then recursively append elements.
629 * The code in this block is based on S_pushav() */
630
631 AV *const av = (AV*)msv;
632 const SSize_t maxarg = AvFILL(av) + 1;
633 SV **array;
634
635 if (oplist) {
636 assert(oplist->op_type == OP_PADAV
637 || oplist->op_type == OP_RV2AV);
638 oplist = OpSIBLING(oplist);
639 }
640
641 if (SvRMAGICAL(av)) {
642 SSize_t i;
643
644 Newx(array, maxarg, SV*);
645 SAVEFREEPV(array);
646 for (i=0; i < maxarg; i++) {
647 SV ** const svp = av_fetch(av, i, FALSE);
648 array[i] = svp ? *svp : &PL_sv_undef;
649 }
650 }
651 else
652 array = AvARRAY(av);
653
654 if (maxarg > 0) {
655 pat = S_concat_pat(aTHX_ pRExC_state, pat,
656 array, maxarg, NULL, recompile_p,
657 /* $" */
658 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
659 }
660 else if (!pat) {
661 pat = newSVpvs_flags("", SVs_TEMP);
662 }
663
664 continue;
665 }
666
667
668 /* we make the assumption here that each op in the list of
669 * op_siblings maps to one SV pushed onto the stack,
670 * except for code blocks, with have both an OP_NULL and
671 * an OP_CONST.
672 * This allows us to match up the list of SVs against the
673 * list of OPs to find the next code block.
674 *
675 * Note that PUSHMARK PADSV PADSV ..
676 * is optimised to
677 * PADRANGE PADSV PADSV ..
678 * so the alignment still works. */
679
680 if (oplist) {
681 if (oplist->op_type == OP_NULL
682 && (oplist->op_flags & OPf_SPECIAL))
683 {
684 assert(n < pRExC_state->code_blocks->count);
685 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
686 pRExC_state->code_blocks->cb[n].block = oplist;
687 pRExC_state->code_blocks->cb[n].src_regex = NULL;
688 n++;
689 code = 1;
690 oplist = OpSIBLING(oplist); /* skip CONST */
691 assert(oplist);
692 }
693 oplist = OpSIBLING(oplist);;
694 }
695
696 /* apply magic and QR overloading to arg */
697
698 SvGETMAGIC(msv);
699 if (SvROK(msv) && SvAMAGIC(msv)) {
700 SV *sv = AMG_CALLunary(msv, regexp_amg);
701 if (sv) {
702 if (SvROK(sv))
703 sv = SvRV(sv);
704 if (SvTYPE(sv) != SVt_REGEXP)
705 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
706 msv = sv;
707 }
708 }
709
710 /* try concatenation overload ... */
711 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
712 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
713 {
714 sv_setsv(pat, sv);
715 /* overloading involved: all bets are off over literal
716 * code. Pretend we haven't seen it */
717 if (n)
718 pRExC_state->code_blocks->count -= n;
719 n = 0;
720 }
721 else {
722 /* ... or failing that, try "" overload */
723 while (SvAMAGIC(msv)
724 && (sv = AMG_CALLunary(msv, string_amg))
725 && sv != msv
726 && !( SvROK(msv)
727 && SvROK(sv)
728 && SvRV(msv) == SvRV(sv))
729 ) {
730 msv = sv;
731 SvGETMAGIC(msv);
732 }
733 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
734 msv = SvRV(msv);
735
736 if (pat) {
737 /* this is a partially unrolled
738 * sv_catsv_nomg(pat, msv);
739 * that allows us to adjust code block indices if
740 * needed */
741 STRLEN dlen;
742 char *dst = SvPV_force_nomg(pat, dlen);
743 orig_patlen = dlen;
744 if (SvUTF8(msv) && !SvUTF8(pat)) {
745 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
746 sv_setpvn(pat, dst, dlen);
747 SvUTF8_on(pat);
748 }
749 sv_catsv_nomg(pat, msv);
750 rx = msv;
751 }
752 else {
753 /* We have only one SV to process, but we need to verify
754 * it is properly null terminated or we will fail asserts
755 * later. In theory we probably shouldn't get such SV's,
756 * but if we do we should handle it gracefully. */
757 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
758 /* not a string, or a string with a trailing null */
759 pat = msv;
760 } else {
761 /* a string with no trailing null, we need to copy it
762 * so it has a trailing null */
763 pat = sv_2mortal(newSVsv(msv));
764 }
765 }
766
767 if (code)
768 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
769 }
770
771 /* extract any code blocks within any embedded qr//'s */
772 if (rx && SvTYPE(rx) == SVt_REGEXP
773 && RX_ENGINE((REGEXP*)rx)->op_comp)
774 {
775
776 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
777 if (ri->code_blocks && ri->code_blocks->count) {
778 int i;
779 /* the presence of an embedded qr// with code means
780 * we should always recompile: the text of the
781 * qr// may not have changed, but it may be a
782 * different closure than last time */
783 *recompile_p = 1;
784 if (pRExC_state->code_blocks) {
785 int new_count = pRExC_state->code_blocks->count
786 + ri->code_blocks->count;
787 Renew(pRExC_state->code_blocks->cb,
788 new_count, struct reg_code_block);
789 pRExC_state->code_blocks->count = new_count;
790 }
791 else
792 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
793 ri->code_blocks->count);
794
795 for (i=0; i < ri->code_blocks->count; i++) {
796 struct reg_code_block *src, *dst;
797 STRLEN offset = orig_patlen
798 + ReANY((REGEXP *)rx)->pre_prefix;
799 assert(n < pRExC_state->code_blocks->count);
800 src = &ri->code_blocks->cb[i];
801 dst = &pRExC_state->code_blocks->cb[n];
802 dst->start = src->start + offset;
803 dst->end = src->end + offset;
804 dst->block = src->block;
805 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
806 src->src_regex
807 ? src->src_regex
808 : (REGEXP*)rx);
809 n++;
810 }
811 }
812 }
813 }
814 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
815 if (alloced)
816 SvSETMAGIC(pat);
817
818 return pat;
819}
820
821
822
823/* see if there are any run-time code blocks in the pattern.
824 * False positives are allowed */
825
826static bool
827S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
828 char *pat, STRLEN plen)
829{
830 int n = 0;
831 STRLEN s;
832
833 PERL_UNUSED_CONTEXT;
834
835 for (s = 0; s < plen; s++) {
836 if ( pRExC_state->code_blocks
837 && n < pRExC_state->code_blocks->count
838 && s == pRExC_state->code_blocks->cb[n].start)
839 {
840 s = pRExC_state->code_blocks->cb[n].end;
841 n++;
842 continue;
843 }
844 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
845 * positives here */
846 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
847 (pat[s+2] == '{'
848 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
849 )
850 return 1;
851 }
852 return 0;
853}
854
855/* Handle run-time code blocks. We will already have compiled any direct
856 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
857 * copy of it, but with any literal code blocks blanked out and
858 * appropriate chars escaped; then feed it into
859 *
860 * eval "qr'modified_pattern'"
861 *
862 * For example,
863 *
864 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
865 *
866 * becomes
867 *
868 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
869 *
870 * After eval_sv()-ing that, grab any new code blocks from the returned qr
871 * and merge them with any code blocks of the original regexp.
872 *
873 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
874 * instead, just save the qr and return FALSE; this tells our caller that
875 * the original pattern needs upgrading to utf8.
876 */
877
878static bool
879S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
880 char *pat, STRLEN plen)
881{
882 SV *qr;
883
884 DECLARE_AND_GET_RE_DEBUG_FLAGS;
885
886 if (pRExC_state->runtime_code_qr) {
887 /* this is the second time we've been called; this should
888 * only happen if the main pattern got upgraded to utf8
889 * during compilation; re-use the qr we compiled first time
890 * round (which should be utf8 too)
891 */
892 qr = pRExC_state->runtime_code_qr;
893 pRExC_state->runtime_code_qr = NULL;
894 assert(RExC_utf8 && SvUTF8(qr));
895 }
896 else {
897 int n = 0;
898 STRLEN s;
899 char *p, *newpat;
900 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
901 SV *sv, *qr_ref;
902 dSP;
903
904 /* determine how many extra chars we need for ' and \ escaping */
905 for (s = 0; s < plen; s++) {
906 if (pat[s] == '\'' || pat[s] == '\\')
907 newlen++;
908 }
909
910 Newx(newpat, newlen, char);
911 p = newpat;
912 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
913
914 for (s = 0; s < plen; s++) {
915 if ( pRExC_state->code_blocks
916 && n < pRExC_state->code_blocks->count
917 && s == pRExC_state->code_blocks->cb[n].start)
918 {
919 /* blank out literal code block so that they aren't
920 * recompiled: eg change from/to:
921 * /(?{xyz})/
922 * /(?=====)/
923 * and
924 * /(??{xyz})/
925 * /(?======)/
926 * and
927 * /(?(?{xyz}))/
928 * /(?(?=====))/
929 */
930 assert(pat[s] == '(');
931 assert(pat[s+1] == '?');
932 *p++ = '(';
933 *p++ = '?';
934 s += 2;
935 while (s < pRExC_state->code_blocks->cb[n].end) {
936 *p++ = '=';
937 s++;
938 }
939 *p++ = ')';
940 n++;
941 continue;
942 }
943 if (pat[s] == '\'' || pat[s] == '\\')
944 *p++ = '\\';
945 *p++ = pat[s];
946 }
947 *p++ = '\'';
948 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
949 *p++ = 'x';
950 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
951 *p++ = 'x';
952 }
953 }
954 *p++ = '\0';
955 DEBUG_COMPILE_r({
956 Perl_re_printf( aTHX_
957 "%sre-parsing pattern for runtime code:%s %s\n",
958 PL_colors[4], PL_colors[5], newpat);
959 });
960
961 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
962 Safefree(newpat);
963
964 ENTER;
965 SAVETMPS;
966 save_re_context();
967 PUSHSTACKi(PERLSI_REQUIRE);
968 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
969 * parsing qr''; normally only q'' does this. It also alters
970 * hints handling */
971 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
972 SvREFCNT_dec_NN(sv);
973 SPAGAIN;
974 qr_ref = POPs;
975 PUTBACK;
976 {
977 SV * const errsv = ERRSV;
978 if (SvTRUE_NN(errsv))
979 /* use croak_sv ? */
980 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
981 }
982 assert(SvROK(qr_ref));
983 qr = SvRV(qr_ref);
984 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
985 /* the leaving below frees the tmp qr_ref.
986 * Give qr a life of its own */
987 SvREFCNT_inc(qr);
988 POPSTACK;
989 FREETMPS;
990 LEAVE;
991
992 }
993
994 if (!RExC_utf8 && SvUTF8(qr)) {
995 /* first time through; the pattern got upgraded; save the
996 * qr for the next time through */
997 assert(!pRExC_state->runtime_code_qr);
998 pRExC_state->runtime_code_qr = qr;
999 return 0;
1000 }
1001
1002
1003 /* extract any code blocks within the returned qr// */
1004
1005
1006 /* merge the main (r1) and run-time (r2) code blocks into one */
1007 {
1008 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1009 struct reg_code_block *new_block, *dst;
1010 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1011 int i1 = 0, i2 = 0;
1012 int r1c, r2c;
1013
1014 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1015 {
1016 SvREFCNT_dec_NN(qr);
1017 return 1;
1018 }
1019
1020 if (!r1->code_blocks)
1021 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1022
1023 r1c = r1->code_blocks->count;
1024 r2c = r2->code_blocks->count;
1025
1026 Newx(new_block, r1c + r2c, struct reg_code_block);
1027
1028 dst = new_block;
1029
1030 while (i1 < r1c || i2 < r2c) {
1031 struct reg_code_block *src;
1032 bool is_qr = 0;
1033
1034 if (i1 == r1c) {
1035 src = &r2->code_blocks->cb[i2++];
1036 is_qr = 1;
1037 }
1038 else if (i2 == r2c)
1039 src = &r1->code_blocks->cb[i1++];
1040 else if ( r1->code_blocks->cb[i1].start
1041 < r2->code_blocks->cb[i2].start)
1042 {
1043 src = &r1->code_blocks->cb[i1++];
1044 assert(src->end < r2->code_blocks->cb[i2].start);
1045 }
1046 else {
1047 assert( r1->code_blocks->cb[i1].start
1048 > r2->code_blocks->cb[i2].start);
1049 src = &r2->code_blocks->cb[i2++];
1050 is_qr = 1;
1051 assert(src->end < r1->code_blocks->cb[i1].start);
1052 }
1053
1054 assert(pat[src->start] == '(');
1055 assert(pat[src->end] == ')');
1056 dst->start = src->start;
1057 dst->end = src->end;
1058 dst->block = src->block;
1059 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1060 : src->src_regex;
1061 dst++;
1062 }
1063 r1->code_blocks->count += r2c;
1064 Safefree(r1->code_blocks->cb);
1065 r1->code_blocks->cb = new_block;
1066 }
1067
1068 SvREFCNT_dec_NN(qr);
1069 return 1;
1070}
1071
1072
1073STATIC bool
1074S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1075 struct reg_substr_datum *rsd,
1076 struct scan_data_substrs *sub,
1077 STRLEN longest_length)
1078{
1079 /* This is the common code for setting up the floating and fixed length
1080 * string data extracted from Perl_re_op_compile() below. Returns a boolean
1081 * as to whether succeeded or not */
1082
1083 I32 t;
1084 SSize_t ml;
1085 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
1086 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1087
1088 if (! (longest_length
1089 || (eol /* Can't have SEOL and MULTI */
1090 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1091 )
1092 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1093 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1094 {
1095 return FALSE;
1096 }
1097
1098 /* copy the information about the longest from the reg_scan_data
1099 over to the program. */
1100 if (SvUTF8(sub->str)) {
1101 rsd->substr = NULL;
1102 rsd->utf8_substr = sub->str;
1103 } else {
1104 rsd->substr = sub->str;
1105 rsd->utf8_substr = NULL;
1106 }
1107 /* end_shift is how many chars that must be matched that
1108 follow this item. We calculate it ahead of time as once the
1109 lookbehind offset is added in we lose the ability to correctly
1110 calculate it.*/
1111 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1112 rsd->end_shift = ml - sub->min_offset
1113 - longest_length
1114 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1115 * intead? - DAPM
1116 + (SvTAIL(sub->str) != 0)
1117 */
1118 + sub->lookbehind;
1119
1120 t = (eol/* Can't have SEOL and MULTI */
1121 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1122 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1123
1124 return TRUE;
1125}
1126
1127STATIC void
1128S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1129{
1130 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1131 * properly wrapped with the right modifiers */
1132
1133 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1134 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1135 != REGEX_DEPENDS_CHARSET);
1136
1137 /* The caret is output if there are any defaults: if not all the STD
1138 * flags are set, or if no character set specifier is needed */
1139 bool has_default =
1140 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1141 || ! has_charset);
1142 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1143 == REG_RUN_ON_COMMENT_SEEN);
1144 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1145 >> RXf_PMf_STD_PMMOD_SHIFT);
1146 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
1147 char *p;
1148 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1149
1150 /* We output all the necessary flags; we never output a minus, as all
1151 * those are defaults, so are
1152 * covered by the caret */
1153 const STRLEN wraplen = pat_len + has_p + has_runon
1154 + has_default /* If needs a caret */
1155 + PL_bitcount[reganch] /* 1 char for each set standard flag */
1156
1157 /* If needs a character set specifier */
1158 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1159 + (sizeof("(?:)") - 1);
1160
1161 PERL_ARGS_ASSERT_SET_REGEX_PV;
1162
1163 /* make sure PL_bitcount bounds not exceeded */
1164 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1165
1166 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1167 SvPOK_on(Rx);
1168 if (RExC_utf8)
1169 SvFLAGS(Rx) |= SVf_UTF8;
1170 *p++='('; *p++='?';
1171
1172 /* If a default, cover it using the caret */
1173 if (has_default) {
1174 *p++= DEFAULT_PAT_MOD;
1175 }
1176 if (has_charset) {
1177 STRLEN len;
1178 const char* name;
1179
1180 name = get_regex_charset_name(RExC_rx->extflags, &len);
1181 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
1182 assert(RExC_utf8);
1183 name = UNICODE_PAT_MODS;
1184 len = sizeof(UNICODE_PAT_MODS) - 1;
1185 }
1186 Copy(name, p, len, char);
1187 p += len;
1188 }
1189 if (has_p)
1190 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1191 {
1192 char ch;
1193 while((ch = *fptr++)) {
1194 if(reganch & 1)
1195 *p++ = ch;
1196 reganch >>= 1;
1197 }
1198 }
1199
1200 *p++ = ':';
1201 Copy(RExC_precomp, p, pat_len, char);
1202 assert ((RX_WRAPPED(Rx) - p) < 16);
1203 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1204 p += pat_len;
1205
1206 /* Adding a trailing \n causes this to compile properly:
1207 my $R = qr / A B C # D E/x; /($R)/
1208 Otherwise the parens are considered part of the comment */
1209 if (has_runon)
1210 *p++ = '\n';
1211 *p++ = ')';
1212 *p = 0;
1213 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1214}
1215
1216/*
1217 * Perl_re_op_compile - the perl internal RE engine's function to compile a
1218 * regular expression into internal code.
1219 * The pattern may be passed either as:
1220 * a list of SVs (patternp plus pat_count)
1221 * a list of OPs (expr)
1222 * If both are passed, the SV list is used, but the OP list indicates
1223 * which SVs are actually pre-compiled code blocks
1224 *
1225 * The SVs in the list have magic and qr overloading applied to them (and
1226 * the list may be modified in-place with replacement SVs in the latter
1227 * case).
1228 *
1229 * If the pattern hasn't changed from old_re, then old_re will be
1230 * returned.
1231 *
1232 * eng is the current engine. If that engine has an op_comp method, then
1233 * handle directly (i.e. we assume that op_comp was us); otherwise, just
1234 * do the initial concatenation of arguments and pass on to the external
1235 * engine.
1236 *
1237 * If is_bare_re is not null, set it to a boolean indicating whether the
1238 * arg list reduced (after overloading) to a single bare regex which has
1239 * been returned (i.e. /$qr/).
1240 *
1241 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1242 *
1243 * pm_flags contains the PMf_* flags, typically based on those from the
1244 * pm_flags field of the related PMOP. Currently we're only interested in
1245 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1246 *
1247 * For many years this code had an initial sizing pass that calculated
1248 * (sometimes incorrectly, leading to security holes) the size needed for the
1249 * compiled pattern. That was changed by commit
1250 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1251 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
1252 * references to this sizing pass.
1253 *
1254 * Now, an initial crude guess as to the size needed is made, based on the
1255 * length of the pattern. Patches welcome to improve that guess. That amount
1256 * of space is malloc'd and then immediately freed, and then clawed back node
1257 * by node. This design is to minimize, to the extent possible, memory churn
1258 * when doing the reallocs.
1259 *
1260 * A separate parentheses counting pass may be needed in some cases.
1261 * (Previously the sizing pass did this.) Patches welcome to reduce the number
1262 * of these cases.
1263 *
1264 * The existence of a sizing pass necessitated design decisions that are no
1265 * longer needed. There are potential areas of simplification.
1266 *
1267 * Beware that the optimization-preparation code in here knows about some
1268 * of the structure of the compiled regexp. [I'll say.]
1269 */
1270
1271REGEXP *
1272Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1273 OP *expr, const regexp_engine* eng, REGEXP *old_re,
1274 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1275{
1276 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
1277 STRLEN plen;
1278 char *exp;
1279 regnode *scan;
1280 I32 flags;
1281 SSize_t minlen = 0;
1282 U32 rx_flags;
1283 SV *pat;
1284 SV** new_patternp = patternp;
1285
1286 /* these are all flags - maybe they should be turned
1287 * into a single int with different bit masks */
1288 I32 sawlookahead = 0;
1289 I32 sawplus = 0;
1290 I32 sawopen = 0;
1291 I32 sawminmod = 0;
1292
1293 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1294 bool recompile = 0;
1295 bool runtime_code = 0;
1296 scan_data_t data;
1297 RExC_state_t RExC_state;
1298 RExC_state_t * const pRExC_state = &RExC_state;
1299#ifdef TRIE_STUDY_OPT
1300 /* search for "restudy" in this file for a detailed explanation */
1301 int restudied = 0;
1302 RExC_state_t copyRExC_state;
1303#endif
1304 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1305
1306 PERL_ARGS_ASSERT_RE_OP_COMPILE;
1307
1308 DEBUG_r(if (!PL_colorset) reginitcolors());
1309
1310
1311 pRExC_state->warn_text = NULL;
1312 pRExC_state->unlexed_names = NULL;
1313 pRExC_state->code_blocks = NULL;
1314
1315 if (is_bare_re)
1316 *is_bare_re = FALSE;
1317
1318 if (expr && (expr->op_type == OP_LIST ||
1319 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1320 /* allocate code_blocks if needed */
1321 OP *o;
1322 int ncode = 0;
1323
1324 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1325 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1326 ncode++; /* count of DO blocks */
1327
1328 if (ncode)
1329 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1330 }
1331
1332 if (!pat_count) {
1333 /* compile-time pattern with just OP_CONSTs and DO blocks */
1334
1335 int n;
1336 OP *o;
1337
1338 /* find how many CONSTs there are */
1339 assert(expr);
1340 n = 0;
1341 if (expr->op_type == OP_CONST)
1342 n = 1;
1343 else
1344 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1345 if (o->op_type == OP_CONST)
1346 n++;
1347 }
1348
1349 /* fake up an SV array */
1350
1351 assert(!new_patternp);
1352 Newx(new_patternp, n, SV*);
1353 SAVEFREEPV(new_patternp);
1354 pat_count = n;
1355
1356 n = 0;
1357 if (expr->op_type == OP_CONST)
1358 new_patternp[n] = cSVOPx_sv(expr);
1359 else
1360 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1361 if (o->op_type == OP_CONST)
1362 new_patternp[n++] = cSVOPo_sv;
1363 }
1364
1365 }
1366
1367 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1368 "Assembling pattern from %d elements%s\n", pat_count,
1369 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1370
1371 /* set expr to the first arg op */
1372
1373 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1374 && expr->op_type != OP_CONST)
1375 {
1376 expr = cLISTOPx(expr)->op_first;
1377 assert( expr->op_type == OP_PUSHMARK
1378 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1379 || expr->op_type == OP_PADRANGE);
1380 expr = OpSIBLING(expr);
1381 }
1382
1383 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1384 expr, &recompile, NULL);
1385
1386 /* handle bare (possibly after overloading) regex: foo =~ $re */
1387 {
1388 SV *re = pat;
1389 if (SvROK(re))
1390 re = SvRV(re);
1391 if (SvTYPE(re) == SVt_REGEXP) {
1392 if (is_bare_re)
1393 *is_bare_re = TRUE;
1394 SvREFCNT_inc(re);
1395 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1396 "Precompiled pattern%s\n",
1397 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1398
1399 return (REGEXP*)re;
1400 }
1401 }
1402
1403 exp = SvPV_nomg(pat, plen);
1404
1405 if (!eng->op_comp) {
1406 if ((SvUTF8(pat) && IN_BYTES)
1407 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1408 {
1409 /* make a temporary copy; either to convert to bytes,
1410 * or to avoid repeating get-magic / overloaded stringify */
1411 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1412 (IN_BYTES ? 0 : SvUTF8(pat)));
1413 }
1414 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1415 }
1416
1417 /* ignore the utf8ness if the pattern is 0 length */
1418 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1419 RExC_uni_semantics = 0;
1420 RExC_contains_locale = 0;
1421 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1422 RExC_in_script_run = 0;
1423 RExC_study_started = 0;
1424 pRExC_state->runtime_code_qr = NULL;
1425 RExC_frame_head= NULL;
1426 RExC_frame_last= NULL;
1427 RExC_frame_count= 0;
1428 RExC_latest_warn_offset = 0;
1429 RExC_use_BRANCHJ = 0;
1430 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
1431 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
1432 RExC_logical_total_parens = 0;
1433 RExC_total_parens = 0;
1434 RExC_logical_to_parno = NULL;
1435 RExC_parno_to_logical = NULL;
1436 RExC_open_parens = NULL;
1437 RExC_close_parens = NULL;
1438 RExC_paren_names = NULL;
1439 RExC_size = 0;
1440 RExC_seen_d_op = FALSE;
1441#ifdef DEBUGGING
1442 RExC_paren_name_list = NULL;
1443#endif
1444
1445 DEBUG_r({
1446 RExC_mysv1= sv_newmortal();
1447 RExC_mysv2= sv_newmortal();
1448 });
1449
1450 DEBUG_COMPILE_r({
1451 SV *dsv= sv_newmortal();
1452 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1453 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
1454 PL_colors[4], PL_colors[5], s);
1455 });
1456
1457 /* we jump here if we have to recompile, e.g., from upgrading the pattern
1458 * to utf8 */
1459
1460 if ((pm_flags & PMf_USE_RE_EVAL)
1461 /* this second condition covers the non-regex literal case,
1462 * i.e. $foo =~ '(?{})'. */
1463 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1464 )
1465 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1466
1467 redo_parse:
1468 /* return old regex if pattern hasn't changed */
1469 /* XXX: note in the below we have to check the flags as well as the
1470 * pattern.
1471 *
1472 * Things get a touch tricky as we have to compare the utf8 flag
1473 * independently from the compile flags. */
1474
1475 if ( old_re
1476 && !recompile
1477 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1478 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1479 && RX_PRELEN(old_re) == plen
1480 && memEQ(RX_PRECOMP(old_re), exp, plen)
1481 && !runtime_code /* with runtime code, always recompile */ )
1482 {
1483 DEBUG_COMPILE_r({
1484 SV *dsv= sv_newmortal();
1485 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1486 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
1487 PL_colors[4], PL_colors[5], s);
1488 });
1489 return old_re;
1490 }
1491
1492 /* Allocate the pattern's SV */
1493 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1494 RExC_rx = ReANY(Rx);
1495 if ( RExC_rx == NULL )
1496 FAIL("Regexp out of space");
1497
1498 rx_flags = orig_rx_flags;
1499
1500 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
1501 && initial_charset == REGEX_DEPENDS_CHARSET)
1502 {
1503
1504 /* Set to use unicode semantics if the pattern is in utf8 and has the
1505 * 'depends' charset specified, as it means unicode when utf8 */
1506 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1507 RExC_uni_semantics = 1;
1508 }
1509
1510 RExC_pm_flags = pm_flags;
1511
1512 if (runtime_code) {
1513 assert(TAINTING_get || !TAINT_get);
1514 if (TAINT_get)
1515 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1516
1517 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1518 /* whoops, we have a non-utf8 pattern, whilst run-time code
1519 * got compiled as utf8. Try again with a utf8 pattern */
1520 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1521 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1522 goto redo_parse;
1523 }
1524 }
1525 assert(!pRExC_state->runtime_code_qr);
1526
1527 RExC_sawback = 0;
1528
1529 RExC_seen = 0;
1530 RExC_maxlen = 0;
1531 RExC_in_lookaround = 0;
1532 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1533 RExC_recode_x_to_native = 0;
1534 RExC_in_multi_char_class = 0;
1535
1536 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1537 RExC_precomp_end = RExC_end = exp + plen;
1538 RExC_nestroot = 0;
1539 RExC_whilem_seen = 0;
1540 RExC_end_op = NULL;
1541 RExC_recurse = NULL;
1542 RExC_study_chunk_recursed = NULL;
1543 RExC_study_chunk_recursed_bytes= 0;
1544 RExC_recurse_count = 0;
1545 RExC_sets_depth = 0;
1546 pRExC_state->code_index = 0;
1547
1548 /* Initialize the string in the compiled pattern. This is so that there is
1549 * something to output if necessary */
1550 set_regex_pv(pRExC_state, Rx);
1551
1552 DEBUG_PARSE_r({
1553 Perl_re_printf( aTHX_
1554 "Starting parse and generation\n");
1555 RExC_lastnum=0;
1556 RExC_lastparse=NULL;
1557 });
1558
1559 /* Allocate space and zero-initialize. Note, the two step process
1560 of zeroing when in debug mode, thus anything assigned has to
1561 happen after that */
1562 if (! RExC_size) {
1563
1564 /* On the first pass of the parse, we guess how big this will be. Then
1565 * we grow in one operation to that amount and then give it back. As
1566 * we go along, we re-allocate what we need.
1567 *
1568 * XXX Currently the guess is essentially that the pattern will be an
1569 * EXACT node with one byte input, one byte output. This is crude, and
1570 * better heuristics are welcome.
1571 *
1572 * On any subsequent passes, we guess what we actually computed in the
1573 * latest earlier pass. Such a pass probably didn't complete so is
1574 * missing stuff. We could improve those guesses by knowing where the
1575 * parse stopped, and use the length so far plus apply the above
1576 * assumption to what's left. */
1577 RExC_size = STR_SZ(RExC_end - RExC_start);
1578 }
1579
1580 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1581 if ( RExC_rxi == NULL )
1582 FAIL("Regexp out of space");
1583
1584 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1585 RXi_SET( RExC_rx, RExC_rxi );
1586
1587 /* We start from 0 (over from 0 in the case this is a reparse. The first
1588 * node parsed will give back any excess memory we have allocated so far).
1589 * */
1590 RExC_size = 0;
1591
1592 /* non-zero initialization begins here */
1593 RExC_rx->engine= eng;
1594 RExC_rx->extflags = rx_flags;
1595 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1596
1597 if (pm_flags & PMf_IS_QR) {
1598 RExC_rxi->code_blocks = pRExC_state->code_blocks;
1599 if (RExC_rxi->code_blocks) {
1600 RExC_rxi->code_blocks->refcnt++;
1601 }
1602 }
1603
1604 RExC_rx->intflags = 0;
1605
1606 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
1607 RExC_parse_set(exp);
1608
1609 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1610 * code makes sure the final byte is an uncounted NUL. But should this
1611 * ever not be the case, lots of things could read beyond the end of the
1612 * buffer: loops like
1613 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1614 * strchr(RExC_parse, "foo");
1615 * etc. So it is worth noting. */
1616 assert(*RExC_end == '\0');
1617
1618 RExC_naughty = 0;
1619 RExC_npar = 1;
1620 RExC_logical_npar = 1;
1621 RExC_parens_buf_size = 0;
1622 RExC_emit_start = RExC_rxi->program;
1623 pRExC_state->code_index = 0;
1624
1625 *((char*) RExC_emit_start) = (char) REG_MAGIC;
1626 RExC_emit = NODE_STEP_REGNODE;
1627
1628 /* Do the parse */
1629 if (reg(pRExC_state, 0, &flags, 1)) {
1630
1631 /* Success!, But we may need to redo the parse knowing how many parens
1632 * there actually are */
1633 if (IN_PARENS_PASS) {
1634 flags |= RESTART_PARSE;
1635 }
1636
1637 /* We have that number in RExC_npar */
1638 RExC_total_parens = RExC_npar;
1639 RExC_logical_total_parens = RExC_logical_npar;
1640 }
1641 else if (! MUST_RESTART(flags)) {
1642 ReREFCNT_dec(Rx);
1643 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1644 }
1645
1646 /* Here, we either have success, or we have to redo the parse for some reason */
1647 if (MUST_RESTART(flags)) {
1648
1649 /* It's possible to write a regexp in ascii that represents Unicode
1650 codepoints outside of the byte range, such as via \x{100}. If we
1651 detect such a sequence we have to convert the entire pattern to utf8
1652 and then recompile, as our sizing calculation will have been based
1653 on 1 byte == 1 character, but we will need to use utf8 to encode
1654 at least some part of the pattern, and therefore must convert the whole
1655 thing.
1656 -- dmq */
1657 if (flags & NEED_UTF8) {
1658
1659 /* We have stored the offset of the final warning output so far.
1660 * That must be adjusted. Any variant characters between the start
1661 * of the pattern and this warning count for 2 bytes in the final,
1662 * so just add them again */
1663 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1664 RExC_latest_warn_offset +=
1665 variant_under_utf8_count((U8 *) exp, (U8 *) exp
1666 + RExC_latest_warn_offset);
1667 }
1668 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1669 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1670 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1671 }
1672 else {
1673 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1674 }
1675
1676 if (ALL_PARENS_COUNTED) {
1677 /* Make enough room for all the known parens, and zero it */
1678 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1679 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1680 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
1681
1682 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1683 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1684 /* we do NOT reinitialize RExC_logical_to_parno and
1685 * RExC_parno_to_logical here. We need their data on the second
1686 * pass */
1687 }
1688 else { /* Parse did not complete. Reinitialize the parentheses
1689 structures */
1690 RExC_total_parens = 0;
1691 if (RExC_open_parens) {
1692 Safefree(RExC_open_parens);
1693 RExC_open_parens = NULL;
1694 }
1695 if (RExC_close_parens) {
1696 Safefree(RExC_close_parens);
1697 RExC_close_parens = NULL;
1698 }
1699 if (RExC_logical_to_parno) {
1700 Safefree(RExC_logical_to_parno);
1701 RExC_logical_to_parno = NULL;
1702 }
1703 if (RExC_parno_to_logical) {
1704 Safefree(RExC_parno_to_logical);
1705 RExC_parno_to_logical = NULL;
1706 }
1707 }
1708
1709 /* Clean up what we did in this parse */
1710 SvREFCNT_dec_NN(RExC_rx_sv);
1711
1712 goto redo_parse;
1713 }
1714
1715 /* Here, we have successfully parsed and generated the pattern's program
1716 * for the regex engine. We are ready to finish things up and look for
1717 * optimizations. */
1718
1719 /* Update the string to compile, with correct modifiers, etc */
1720 set_regex_pv(pRExC_state, Rx);
1721
1722 RExC_rx->nparens = RExC_total_parens - 1;
1723 RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1724
1725 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1726 if (RExC_whilem_seen > 15)
1727 RExC_whilem_seen = 15;
1728
1729 DEBUG_PARSE_r({
1730 Perl_re_printf( aTHX_
1731 "Required size %" IVdf " nodes\n", (IV)RExC_size);
1732 RExC_lastnum=0;
1733 RExC_lastparse=NULL;
1734 });
1735
1736 SetProgLen(RExC_rxi,RExC_size);
1737
1738 DEBUG_DUMP_PRE_OPTIMIZE_r({
1739 SV * const sv = sv_newmortal();
1740 RXi_GET_DECL(RExC_rx, ri);
1741 DEBUG_RExC_seen();
1742 Perl_re_printf( aTHX_ "Program before optimization:\n");
1743
1744 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1745 sv, 0, 0);
1746 });
1747
1748 DEBUG_OPTIMISE_r(
1749 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
1750 );
1751
1752 /* XXXX To minimize changes to RE engine we always allocate
1753 3-units-long substrs field. */
1754 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1755 if (RExC_recurse_count) {
1756 Newx(RExC_recurse, RExC_recurse_count, regnode *);
1757 SAVEFREEPV(RExC_recurse);
1758 }
1759
1760 if (RExC_seen & REG_RECURSE_SEEN) {
1761 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1762 * So its 1 if there are no parens. */
1763 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1764 ((RExC_total_parens & 0x07) != 0);
1765 Newx(RExC_study_chunk_recursed,
1766 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1767 SAVEFREEPV(RExC_study_chunk_recursed);
1768 }
1769
1770 reStudy:
1771 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1772 DEBUG_r(
1773 RExC_study_chunk_recursed_count= 0;
1774 );
1775 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1776 if (RExC_study_chunk_recursed) {
1777 Zero(RExC_study_chunk_recursed,
1778 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1779 }
1780
1781
1782#ifdef TRIE_STUDY_OPT
1783 /* search for "restudy" in this file for a detailed explanation */
1784 if (!restudied) {
1785 StructCopy(&zero_scan_data, &data, scan_data_t);
1786 copyRExC_state = RExC_state;
1787 } else {
1788 U32 seen=RExC_seen;
1789 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1790
1791 RExC_state = copyRExC_state;
1792 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1793 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1794 else
1795 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1796 StructCopy(&zero_scan_data, &data, scan_data_t);
1797 }
1798#else
1799 StructCopy(&zero_scan_data, &data, scan_data_t);
1800#endif
1801
1802 /* Dig out information for optimizations. */
1803 RExC_rx->extflags = RExC_flags; /* was pm_op */
1804 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1805
1806 if (UTF)
1807 SvUTF8_on(Rx); /* Unicode in it? */
1808 RExC_rxi->regstclass = NULL;
1809 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
1810 RExC_rx->intflags |= PREGf_NAUGHTY;
1811 scan = RExC_rxi->program + 1; /* First BRANCH. */
1812
1813 /* testing for BRANCH here tells us whether there is "must appear"
1814 data in the pattern. If there is then we can use it for optimisations */
1815 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
1816 */
1817 SSize_t fake_deltap;
1818 STRLEN longest_length[2];
1819 regnode_ssc ch_class; /* pointed to by data */
1820 int stclass_flag;
1821 SSize_t last_close = 0; /* pointed to by data */
1822 regnode *first= scan;
1823 regnode *first_next= regnext(first);
1824 regnode *last_close_op= NULL;
1825 int i;
1826
1827 /*
1828 * Skip introductions and multiplicators >= 1
1829 * so that we can extract the 'meat' of the pattern that must
1830 * match in the large if() sequence following.
1831 * NOTE that EXACT is NOT covered here, as it is normally
1832 * picked up by the optimiser separately.
1833 *
1834 * This is unfortunate as the optimiser isnt handling lookahead
1835 * properly currently.
1836 *
1837 */
1838 while (1)
1839 {
1840 if (OP(first) == OPEN)
1841 sawopen = 1;
1842 else
1843 if (OP(first) == IFMATCH && !FLAGS(first))
1844 /* for now we can't handle lookbehind IFMATCH */
1845 sawlookahead = 1;
1846 else
1847 if (OP(first) == PLUS)
1848 sawplus = 1;
1849 else
1850 if (OP(first) == MINMOD)
1851 sawminmod = 1;
1852 else
1853 if (!(
1854 /* An OR of *one* alternative - should not happen now. */
1855 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1856 /* An {n,m} with n>0 */
1857 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1858 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1859 )){
1860 break;
1861 }
1862
1863 first = REGNODE_AFTER(first);
1864 first_next= regnext(first);
1865 }
1866
1867 /* Starting-point info. */
1868 again:
1869 DEBUG_PEEP("first:", first, 0, 0);
1870 /* Ignore EXACT as we deal with it later. */
1871 if (REGNODE_TYPE(OP(first)) == EXACT) {
1872 if (! isEXACTFish(OP(first))) {
1873 NOOP; /* Empty, get anchored substr later. */
1874 }
1875 else
1876 RExC_rxi->regstclass = first;
1877 }
1878#ifdef TRIE_STCLASS
1879 else if (REGNODE_TYPE(OP(first)) == TRIE &&
1880 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1881 {
1882 /* this can happen only on restudy
1883 * Search for "restudy" in this file to find
1884 * a comment with details. */
1885 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1886 }
1887#endif
1888 else if (REGNODE_SIMPLE(OP(first)))
1889 RExC_rxi->regstclass = first;
1890 else if (REGNODE_TYPE(OP(first)) == BOUND ||
1891 REGNODE_TYPE(OP(first)) == NBOUND)
1892 RExC_rxi->regstclass = first;
1893 else if (REGNODE_TYPE(OP(first)) == BOL) {
1894 RExC_rx->intflags |= (OP(first) == MBOL
1895 ? PREGf_ANCH_MBOL
1896 : PREGf_ANCH_SBOL);
1897 first = REGNODE_AFTER(first);
1898 goto again;
1899 }
1900 else if (OP(first) == GPOS) {
1901 RExC_rx->intflags |= PREGf_ANCH_GPOS;
1902 first = REGNODE_AFTER_type(first,tregnode_GPOS);
1903 goto again;
1904 }
1905 else if ((!sawopen || !RExC_sawback) &&
1906 !sawlookahead &&
1907 (OP(first) == STAR &&
1908 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
1909 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
1910 {
1911 /* turn .* into ^.* with an implied $*=1 */
1912 const int type =
1913 (OP(REGNODE_AFTER(first)) == REG_ANY)
1914 ? PREGf_ANCH_MBOL
1915 : PREGf_ANCH_SBOL;
1916 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
1917 first = REGNODE_AFTER(first);
1918 goto again;
1919 }
1920 if (sawplus && !sawminmod && !sawlookahead
1921 && (!sawopen || !RExC_sawback)
1922 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
1923 /* x+ must match at the 1st pos of run of x's */
1924 RExC_rx->intflags |= PREGf_SKIP;
1925
1926 /* Scan is after the zeroth branch, first is atomic matcher. */
1927#ifdef TRIE_STUDY_OPT
1928 /* search for "restudy" in this file for a detailed explanation */
1929 DEBUG_PARSE_r(
1930 if (!restudied)
1931 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1932 (IV)(first - scan + 1))
1933 );
1934#else
1935 DEBUG_PARSE_r(
1936 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1937 (IV)(first - scan + 1))
1938 );
1939#endif
1940
1941
1942 /*
1943 * If there's something expensive in the r.e., find the
1944 * longest literal string that must appear and make it the
1945 * regmust. Resolve ties in favor of later strings, since
1946 * the regstart check works with the beginning of the r.e.
1947 * and avoiding duplication strengthens checking. Not a
1948 * strong reason, but sufficient in the absence of others.
1949 * [Now we resolve ties in favor of the earlier string if
1950 * it happens that c_offset_min has been invalidated, since the
1951 * earlier string may buy us something the later one won't.]
1952 */
1953
1954 data.substrs[0].str = newSVpvs("");
1955 data.substrs[1].str = newSVpvs("");
1956 data.last_found = newSVpvs("");
1957 data.cur_is_floating = 0; /* initially any found substring is fixed */
1958 ENTER_with_name("study_chunk");
1959 SAVEFREESV(data.substrs[0].str);
1960 SAVEFREESV(data.substrs[1].str);
1961 SAVEFREESV(data.last_found);
1962 first = scan;
1963 if (!RExC_rxi->regstclass) {
1964 ssc_init(pRExC_state, &ch_class);
1965 data.start_class = &ch_class;
1966 stclass_flag = SCF_DO_STCLASS_AND;
1967 } else /* XXXX Check for BOUND? */
1968 stclass_flag = 0;
1969 data.last_closep = &last_close;
1970 data.last_close_opp = &last_close_op;
1971
1972 DEBUG_RExC_seen();
1973 /*
1974 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
1975 * (NO top level branches)
1976 */
1977 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
1978 scan + RExC_size, /* Up to end */
1979 &data, -1, 0, NULL,
1980 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
1981 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
1982 0, TRUE);
1983 /* search for "restudy" in this file for a detailed explanation
1984 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
1985
1986
1987 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
1988
1989
1990 if ( RExC_total_parens == 1 && !data.cur_is_floating
1991 && data.last_start_min == 0 && data.last_end > 0
1992 && !RExC_seen_zerolen
1993 && !(RExC_seen & REG_VERBARG_SEEN)
1994 && !(RExC_seen & REG_GPOS_SEEN)
1995 ){
1996 RExC_rx->extflags |= RXf_CHECK_ALL;
1997 }
1998 scan_commit(pRExC_state, &data,&minlen, 0);
1999
2000
2001 /* XXX this is done in reverse order because that's the way the
2002 * code was before it was parameterised. Don't know whether it
2003 * actually needs doing in reverse order. DAPM */
2004 for (i = 1; i >= 0; i--) {
2005 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2006
2007 if ( !( i
2008 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
2009 && data.substrs[0].min_offset
2010 == data.substrs[1].min_offset
2011 && SvCUR(data.substrs[0].str)
2012 == SvCUR(data.substrs[1].str)
2013 )
2014 && S_setup_longest (aTHX_ pRExC_state,
2015 &(RExC_rx->substrs->data[i]),
2016 &(data.substrs[i]),
2017 longest_length[i]))
2018 {
2019 RExC_rx->substrs->data[i].min_offset =
2020 data.substrs[i].min_offset - data.substrs[i].lookbehind;
2021
2022 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2023 /* Don't offset infinity */
2024 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2025 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2026 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2027 }
2028 else {
2029 RExC_rx->substrs->data[i].substr = NULL;
2030 RExC_rx->substrs->data[i].utf8_substr = NULL;
2031 longest_length[i] = 0;
2032 }
2033 }
2034
2035 LEAVE_with_name("study_chunk");
2036
2037 if (RExC_rxi->regstclass
2038 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2039 RExC_rxi->regstclass = NULL;
2040
2041 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2042 || RExC_rx->substrs->data[0].min_offset)
2043 && stclass_flag
2044 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2045 && is_ssc_worth_it(pRExC_state, data.start_class))
2046 {
2047 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2048
2049 ssc_finalize(pRExC_state, data.start_class);
2050
2051 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2052 StructCopy(data.start_class,
2053 (regnode_ssc*)RExC_rxi->data->data[n],
2054 regnode_ssc);
2055 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2056 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2057 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2058 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2059 Perl_re_printf( aTHX_
2060 "synthetic stclass \"%s\".\n",
2061 SvPVX_const(sv));});
2062 data.start_class = NULL;
2063 }
2064
2065 /* A temporary algorithm prefers floated substr to fixed one of
2066 * same length to dig more info. */
2067 i = (longest_length[0] <= longest_length[1]);
2068 RExC_rx->substrs->check_ix = i;
2069 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
2070 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
2071 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
2072 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2073 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2074 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2075 RExC_rx->intflags |= PREGf_NOSCAN;
2076
2077 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2078 RExC_rx->extflags |= RXf_USE_INTUIT;
2079 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2080 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2081 }
2082
2083 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2084 if ( (STRLEN)minlen < longest_length[1] )
2085 minlen= longest_length[1];
2086 if ( (STRLEN)minlen < longest_length[0] )
2087 minlen= longest_length[0];
2088 */
2089 }
2090 else {
2091 /* Several toplevels. Best we can is to set minlen. */
2092 SSize_t fake_deltap;
2093 regnode_ssc ch_class;
2094 SSize_t last_close = 0;
2095 regnode *last_close_op = NULL;
2096
2097 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
2098
2099 scan = RExC_rxi->program + 1;
2100 ssc_init(pRExC_state, &ch_class);
2101 data.start_class = &ch_class;
2102 data.last_closep = &last_close;
2103 data.last_close_opp = &last_close_op;
2104
2105 DEBUG_RExC_seen();
2106 /*
2107 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2108 * (patterns WITH top level branches)
2109 */
2110 minlen = study_chunk(pRExC_state,
2111 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2112 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2113 ? SCF_TRIE_DOING_RESTUDY
2114 : 0),
2115 0, TRUE);
2116 /* search for "restudy" in this file for a detailed explanation
2117 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2118
2119 CHECK_RESTUDY_GOTO_butfirst(NOOP);
2120
2121 RExC_rx->check_substr = NULL;
2122 RExC_rx->check_utf8 = NULL;
2123 RExC_rx->substrs->data[0].substr = NULL;
2124 RExC_rx->substrs->data[0].utf8_substr = NULL;
2125 RExC_rx->substrs->data[1].substr = NULL;
2126 RExC_rx->substrs->data[1].utf8_substr = NULL;
2127
2128 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2129 && is_ssc_worth_it(pRExC_state, data.start_class))
2130 {
2131 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2132
2133 ssc_finalize(pRExC_state, data.start_class);
2134
2135 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2136 StructCopy(data.start_class,
2137 (regnode_ssc*)RExC_rxi->data->data[n],
2138 regnode_ssc);
2139 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2140 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2141 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2142 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2143 Perl_re_printf( aTHX_
2144 "synthetic stclass \"%s\".\n",
2145 SvPVX_const(sv));});
2146 data.start_class = NULL;
2147 }
2148 }
2149
2150 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2151 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2152 RExC_rx->maxlen = REG_INFTY;
2153 }
2154 else {
2155 RExC_rx->maxlen = RExC_maxlen;
2156 }
2157
2158 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2159 the "real" pattern. */
2160 DEBUG_OPTIMISE_r({
2161 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2162 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2163 });
2164 RExC_rx->minlenret = minlen;
2165 if (RExC_rx->minlen < minlen)
2166 RExC_rx->minlen = minlen;
2167
2168 if (RExC_seen & REG_RECURSE_SEEN ) {
2169 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2170 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2171 }
2172 if (RExC_seen & REG_GPOS_SEEN)
2173 RExC_rx->intflags |= PREGf_GPOS_SEEN;
2174
2175 if (RExC_seen & REG_PESSIMIZE_SEEN)
2176 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2177
2178 if (RExC_seen & REG_LOOKBEHIND_SEEN)
2179 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2180 lookbehind */
2181 if (pRExC_state->code_blocks)
2182 RExC_rx->extflags |= RXf_EVAL_SEEN;
2183
2184 if (RExC_seen & REG_VERBARG_SEEN) {
2185 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2186 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2187 }
2188
2189 if (RExC_seen & REG_CUTGROUP_SEEN)
2190 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2191
2192 if (pm_flags & PMf_USE_RE_EVAL)
2193 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2194
2195 if (RExC_paren_names)
2196 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2197 else
2198 RXp_PAREN_NAMES(RExC_rx) = NULL;
2199
2200 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2201 * so it can be used in pp.c */
2202 if (RExC_rx->intflags & PREGf_ANCH)
2203 RExC_rx->extflags |= RXf_IS_ANCHORED;
2204
2205
2206 {
2207 /* this is used to identify "special" patterns that might result
2208 * in Perl NOT calling the regex engine and instead doing the match "itself",
2209 * particularly special cases in split//. By having the regex compiler
2210 * do this pattern matching at a regop level (instead of by inspecting the pattern)
2211 * we avoid weird issues with equivalent patterns resulting in different behavior,
2212 * AND we allow non Perl engines to get the same optimizations by the setting the
2213 * flags appropriately - Yves */
2214 regnode *first = RExC_rxi->program + 1;
2215 U8 fop = OP(first);
2216 regnode *next = NULL;
2217 U8 nop = 0;
2218 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2219 next = REGNODE_AFTER(first);
2220 nop = OP(next);
2221 }
2222 /* It's safe to read through *next only if OP(first) is a regop of
2223 * the right type (not EXACT, for example).
2224 */
2225 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2226 RExC_rx->extflags |= RXf_NULL;
2227 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2228 /* when fop is SBOL first->flags will be true only when it was
2229 * produced by parsing /\A/, and not when parsing /^/. This is
2230 * very important for the split code as there we want to
2231 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2232 * See rt #122761 for more details. -- Yves */
2233 RExC_rx->extflags |= RXf_START_ONLY;
2234 else if (fop == PLUS
2235 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2236 && OP(regnext(first)) == END)
2237 RExC_rx->extflags |= RXf_WHITE;
2238 else if ( RExC_rx->extflags & RXf_SPLIT
2239 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2240 && STR_LEN(first) == 1
2241 && *(STRING(first)) == ' '
2242 && OP(regnext(first)) == END )
2243 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2244
2245 }
2246
2247 if (RExC_contains_locale) {
2248 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2249 }
2250
2251#ifdef DEBUGGING
2252 if (RExC_paren_names) {
2253 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2254 RExC_rxi->data->data[RExC_rxi->name_list_idx]
2255 = (void*)SvREFCNT_inc(RExC_paren_name_list);
2256 } else
2257#endif
2258 RExC_rxi->name_list_idx = 0;
2259
2260 while ( RExC_recurse_count > 0 ) {
2261 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2262 /*
2263 * This data structure is set up in study_chunk() and is used
2264 * to calculate the distance between a GOSUB regopcode and
2265 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2266 * it refers to.
2267 *
2268 * If for some reason someone writes code that optimises
2269 * away a GOSUB opcode then the assert should be changed to
2270 * an if(scan) to guard the ARG2i_SET() - Yves
2271 *
2272 */
2273 assert(scan && OP(scan) == GOSUB);
2274 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2275 }
2276 if (RExC_logical_total_parens != RExC_total_parens) {
2277 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2278 /* we rebuild this below */
2279 Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2280 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2281 int logical_parno= RExC_parno_to_logical[parno];
2282 assert(logical_parno);
2283 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2284 RExC_logical_to_parno[logical_parno] = parno;
2285 }
2286 RExC_rx->logical_to_parno = RExC_logical_to_parno;
2287 RExC_rx->parno_to_logical = RExC_parno_to_logical;
2288 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2289 RExC_logical_to_parno = NULL;
2290 RExC_parno_to_logical = NULL;
2291 RExC_parno_to_logical_next = NULL;
2292 } else {
2293 RExC_rx->logical_to_parno = NULL;
2294 RExC_rx->parno_to_logical = NULL;
2295 RExC_rx->parno_to_logical_next = NULL;
2296 }
2297
2298 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2299 /* assume we don't need to swap parens around before we match */
2300 DEBUG_TEST_r({
2301 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2302 (unsigned long)RExC_study_chunk_recursed_count);
2303 });
2304 DEBUG_DUMP_r({
2305 DEBUG_RExC_seen();
2306 Perl_re_printf( aTHX_ "Final program:\n");
2307 regdump(RExC_rx);
2308 });
2309
2310 if (RExC_open_parens) {
2311 Safefree(RExC_open_parens);
2312 RExC_open_parens = NULL;
2313 }
2314 if (RExC_close_parens) {
2315 Safefree(RExC_close_parens);
2316 RExC_close_parens = NULL;
2317 }
2318 if (RExC_logical_to_parno) {
2319 Safefree(RExC_logical_to_parno);
2320 RExC_logical_to_parno = NULL;
2321 }
2322 if (RExC_parno_to_logical) {
2323 Safefree(RExC_parno_to_logical);
2324 RExC_parno_to_logical = NULL;
2325 }
2326
2327#ifdef USE_ITHREADS
2328 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2329 * by setting the regexp SV to readonly-only instead. If the
2330 * pattern's been recompiled, the USEDness should remain. */
2331 if (old_re && SvREADONLY(old_re))
2332 SvREADONLY_on(Rx);
2333#endif
2334 return Rx;
2335}
2336
2337
2338
2339SV*
2340Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2341{
2342 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2343 PERL_UNUSED_ARG(rx);
2344 if (0)
2345 return NULL;
2346 else
2347 return newSVpvs("Regexp");
2348}
2349
2350/* Scans the name of a named buffer from the pattern.
2351 * If flags is REG_RSN_RETURN_NULL returns null.
2352 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2353 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2354 * to the parsed name as looked up in the RExC_paren_names hash.
2355 * If there is an error throws a vFAIL().. type exception.
2356 */
2357
2358#define REG_RSN_RETURN_NULL 0
2359#define REG_RSN_RETURN_NAME 1
2360#define REG_RSN_RETURN_DATA 2
2361
2362STATIC SV*
2363S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2364{
2365 char *name_start = RExC_parse;
2366 SV* sv_name;
2367
2368 PERL_ARGS_ASSERT_REG_SCAN_NAME;
2369
2370 assert (RExC_parse <= RExC_end);
2371 if (RExC_parse == RExC_end) NOOP;
2372 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2373 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
2374 * using do...while */
2375 if (UTF)
2376 do {
2377 RExC_parse_inc_utf8();
2378 } while ( RExC_parse < RExC_end
2379 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2380 else
2381 do {
2382 RExC_parse_inc_by(1);
2383 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2384 } else {
2385 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2386 character */
2387 vFAIL("Group name must start with a non-digit word character");
2388 }
2389 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2390 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2391 if ( flags == REG_RSN_RETURN_NAME)
2392 return sv_name;
2393 else if (flags==REG_RSN_RETURN_DATA) {
2394 HE *he_str = NULL;
2395 SV *sv_dat = NULL;
2396 if ( ! sv_name ) /* should not happen*/
2397 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2398 if (RExC_paren_names)
2399 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2400 if ( he_str )
2401 sv_dat = HeVAL(he_str);
2402 if ( ! sv_dat ) { /* Didn't find group */
2403
2404 /* It might be a forward reference; we can't fail until we
2405 * know, by completing the parse to get all the groups, and
2406 * then reparsing */
2407 if (ALL_PARENS_COUNTED) {
2408 vFAIL("Reference to nonexistent named group");
2409 }
2410 else {
2411 REQUIRE_PARENS_PASS;
2412 }
2413 }
2414 return sv_dat;
2415 }
2416
2417 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2418 (unsigned long) flags);
2419}
2420
2421#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
2422 if (RExC_lastparse!=RExC_parse) { \
2423 Perl_re_printf( aTHX_ "%s", \
2424 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
2425 RExC_end - RExC_parse, 16, \
2426 "", "", \
2427 PERL_PV_ESCAPE_UNI_DETECT | \
2428 PERL_PV_PRETTY_ELLIPSES | \
2429 PERL_PV_PRETTY_LTGT | \
2430 PERL_PV_ESCAPE_RE | \
2431 PERL_PV_PRETTY_EXACTSIZE \
2432 ) \
2433 ); \
2434 } else \
2435 Perl_re_printf( aTHX_ "%16s",""); \
2436 \
2437 if (RExC_lastnum!=RExC_emit) \
2438 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
2439 else \
2440 Perl_re_printf( aTHX_ "|%4s",""); \
2441 Perl_re_printf( aTHX_ "|%*s%-4s", \
2442 (int)((depth*2)), "", \
2443 (funcname) \
2444 ); \
2445 RExC_lastnum=RExC_emit; \
2446 RExC_lastparse=RExC_parse; \
2447})
2448
2449
2450
2451#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
2452 DEBUG_PARSE_MSG((funcname)); \
2453 Perl_re_printf( aTHX_ "%4s","\n"); \
2454})
2455#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
2456 DEBUG_PARSE_MSG((funcname)); \
2457 Perl_re_printf( aTHX_ fmt "\n",args); \
2458})
2459
2460
2461STATIC void
2462S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2463{
2464 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2465 * constructs, and updates RExC_flags with them. On input, RExC_parse
2466 * should point to the first flag; it is updated on output to point to the
2467 * final ')' or ':'. There needs to be at least one flag, or this will
2468 * abort */
2469
2470 /* for (?g), (?gc), and (?o) warnings; warning
2471 about (?c) will warn about (?g) -- japhy */
2472
2473#define WASTED_O 0x01
2474#define WASTED_G 0x02
2475#define WASTED_C 0x04
2476#define WASTED_GC (WASTED_G|WASTED_C)
2477 I32 wastedflags = 0x00;
2478 U32 posflags = 0, negflags = 0;
2479 U32 *flagsp = &posflags;
2480 char has_charset_modifier = '\0';
2481 regex_charset cs;
2482 bool has_use_defaults = FALSE;
2483 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2484 int x_mod_count = 0;
2485
2486 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2487
2488 /* '^' as an initial flag sets certain defaults */
2489 if (UCHARAT(RExC_parse) == '^') {
2490 RExC_parse_inc_by(1);
2491 has_use_defaults = TRUE;
2492 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2493 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2494 ? REGEX_UNICODE_CHARSET
2495 : REGEX_DEPENDS_CHARSET;
2496 set_regex_charset(&RExC_flags, cs);
2497 }
2498 else {
2499 cs = get_regex_charset(RExC_flags);
2500 if ( cs == REGEX_DEPENDS_CHARSET
2501 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2502 {
2503 cs = REGEX_UNICODE_CHARSET;
2504 }
2505 }
2506
2507 while (RExC_parse < RExC_end) {
2508 /* && memCHRs("iogcmsx", *RExC_parse) */
2509 /* (?g), (?gc) and (?o) are useless here
2510 and must be globally applied -- japhy */
2511 if ((RExC_pm_flags & PMf_WILDCARD)) {
2512 if (flagsp == & negflags) {
2513 if (*RExC_parse == 'm') {
2514 RExC_parse_inc_by(1);
2515 /* diag_listed_as: Use of %s is not allowed in Unicode
2516 property wildcard subpatterns in regex; marked by <--
2517 HERE in m/%s/ */
2518 vFAIL("Use of modifier '-m' is not allowed in Unicode"
2519 " property wildcard subpatterns");
2520 }
2521 }
2522 else {
2523 if (*RExC_parse == 's') {
2524 goto modifier_illegal_in_wildcard;
2525 }
2526 }
2527 }
2528
2529 switch (*RExC_parse) {
2530
2531 /* Code for the imsxn flags */
2532 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2533
2534 case LOCALE_PAT_MOD:
2535 if (has_charset_modifier) {
2536 goto excess_modifier;
2537 }
2538 else if (flagsp == &negflags) {
2539 goto neg_modifier;
2540 }
2541 cs = REGEX_LOCALE_CHARSET;
2542 has_charset_modifier = LOCALE_PAT_MOD;
2543 break;
2544 case UNICODE_PAT_MOD:
2545 if (has_charset_modifier) {
2546 goto excess_modifier;
2547 }
2548 else if (flagsp == &negflags) {
2549 goto neg_modifier;
2550 }
2551 cs = REGEX_UNICODE_CHARSET;
2552 has_charset_modifier = UNICODE_PAT_MOD;
2553 break;
2554 case ASCII_RESTRICT_PAT_MOD:
2555 if (flagsp == &negflags) {
2556 goto neg_modifier;
2557 }
2558 if (has_charset_modifier) {
2559 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2560 goto excess_modifier;
2561 }
2562 /* Doubled modifier implies more restricted */
2563 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2564 }
2565 else {
2566 cs = REGEX_ASCII_RESTRICTED_CHARSET;
2567 }
2568 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2569 break;
2570 case DEPENDS_PAT_MOD:
2571 if (has_use_defaults) {
2572 goto fail_modifiers;
2573 }
2574 else if (flagsp == &negflags) {
2575 goto neg_modifier;
2576 }
2577 else if (has_charset_modifier) {
2578 goto excess_modifier;
2579 }
2580
2581 /* The dual charset means unicode semantics if the
2582 * pattern (or target, not known until runtime) are
2583 * utf8, or something in the pattern indicates unicode
2584 * semantics */
2585 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2586 ? REGEX_UNICODE_CHARSET
2587 : REGEX_DEPENDS_CHARSET;
2588 has_charset_modifier = DEPENDS_PAT_MOD;
2589 break;
2590 excess_modifier:
2591 RExC_parse_inc_by(1);
2592 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2593 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2594 }
2595 else if (has_charset_modifier == *(RExC_parse - 1)) {
2596 vFAIL2("Regexp modifier \"%c\" may not appear twice",
2597 *(RExC_parse - 1));
2598 }
2599 else {
2600 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2601 }
2602 NOT_REACHED; /*NOTREACHED*/
2603 neg_modifier:
2604 RExC_parse_inc_by(1);
2605 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2606 *(RExC_parse - 1));
2607 NOT_REACHED; /*NOTREACHED*/
2608 case GLOBAL_PAT_MOD: /* 'g' */
2609 if (RExC_pm_flags & PMf_WILDCARD) {
2610 goto modifier_illegal_in_wildcard;
2611 }
2612 /*FALLTHROUGH*/
2613 case ONCE_PAT_MOD: /* 'o' */
2614 if (ckWARN(WARN_REGEXP)) {
2615 const I32 wflagbit = *RExC_parse == 'o'
2616 ? WASTED_O
2617 : WASTED_G;
2618 if (! (wastedflags & wflagbit) ) {
2619 wastedflags |= wflagbit;
2620 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2621 vWARN5(
2622 RExC_parse + 1,
2623 "Useless (%s%c) - %suse /%c modifier",
2624 flagsp == &negflags ? "?-" : "?",
2625 *RExC_parse,
2626 flagsp == &negflags ? "don't " : "",
2627 *RExC_parse
2628 );
2629 }
2630 }
2631 break;
2632
2633 case CONTINUE_PAT_MOD: /* 'c' */
2634 if (RExC_pm_flags & PMf_WILDCARD) {
2635 goto modifier_illegal_in_wildcard;
2636 }
2637 if (ckWARN(WARN_REGEXP)) {
2638 if (! (wastedflags & WASTED_C) ) {
2639 wastedflags |= WASTED_GC;
2640 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2641 vWARN3(
2642 RExC_parse + 1,
2643 "Useless (%sc) - %suse /gc modifier",
2644 flagsp == &negflags ? "?-" : "?",
2645 flagsp == &negflags ? "don't " : ""
2646 );
2647 }
2648 }
2649 break;
2650 case KEEPCOPY_PAT_MOD: /* 'p' */
2651 if (RExC_pm_flags & PMf_WILDCARD) {
2652 goto modifier_illegal_in_wildcard;
2653 }
2654 if (flagsp == &negflags) {
2655 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2656 } else {
2657 *flagsp |= RXf_PMf_KEEPCOPY;
2658 }
2659 break;
2660 case '-':
2661 /* A flag is a default iff it is following a minus, so
2662 * if there is a minus, it means will be trying to
2663 * re-specify a default which is an error */
2664 if (has_use_defaults || flagsp == &negflags) {
2665 goto fail_modifiers;
2666 }
2667 flagsp = &negflags;
2668 wastedflags = 0; /* reset so (?g-c) warns twice */
2669 x_mod_count = 0;
2670 break;
2671 case ':':
2672 case ')':
2673
2674 if ( (RExC_pm_flags & PMf_WILDCARD)
2675 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2676 {
2677 RExC_parse_inc_by(1);
2678 /* diag_listed_as: Use of %s is not allowed in Unicode
2679 property wildcard subpatterns in regex; marked by <--
2680 HERE in m/%s/ */
2681 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2682 " property wildcard subpatterns",
2683 has_charset_modifier);
2684 }
2685
2686 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2687 negflags |= RXf_PMf_EXTENDED_MORE;
2688 }
2689 RExC_flags |= posflags;
2690
2691 if (negflags & RXf_PMf_EXTENDED) {
2692 negflags |= RXf_PMf_EXTENDED_MORE;
2693 }
2694 RExC_flags &= ~negflags;
2695 set_regex_charset(&RExC_flags, cs);
2696
2697 return;
2698 default:
2699 fail_modifiers:
2700 RExC_parse_inc_if_char();
2701 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2702 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2703 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2704 NOT_REACHED; /*NOTREACHED*/
2705 }
2706
2707 RExC_parse_inc();
2708 }
2709
2710 vFAIL("Sequence (?... not terminated");
2711
2712 modifier_illegal_in_wildcard:
2713 RExC_parse_inc_by(1);
2714 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2715 subpatterns in regex; marked by <-- HERE in m/%s/ */
2716 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2717 " subpatterns", *(RExC_parse - 1));
2718}
2719
2720/*
2721 - reg - regular expression, i.e. main body or parenthesized thing
2722 *
2723 * Caller must absorb opening parenthesis.
2724 *
2725 * Combining parenthesis handling with the base level of regular expression
2726 * is a trifle forced, but the need to tie the tails of the branches to what
2727 * follows makes it hard to avoid.
2728 */
2729
2730STATIC regnode_offset
2731S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2732 I32 *flagp,
2733 char * backref_parse_start,
2734 char ch
2735 )
2736{
2737 regnode_offset ret;
2738 char* name_start = RExC_parse;
2739 U32 num = 0;
2740 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2741 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2742
2743 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2744
2745 if (RExC_parse != name_start && ch == '}') {
2746 while (isBLANK(*RExC_parse)) {
2747 RExC_parse_inc_by(1);
2748 }
2749 }
2750 if (RExC_parse == name_start || *RExC_parse != ch) {
2751 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2752 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2753 }
2754
2755 if (sv_dat) {
2756 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2757 RExC_rxi->data->data[num]=(void*)sv_dat;
2758 SvREFCNT_inc_simple_void_NN(sv_dat);
2759 }
2760 RExC_sawback = 1;
2761 ret = reg2node(pRExC_state,
2762 ((! FOLD)
2763 ? REFN
2764 : (ASCII_FOLD_RESTRICTED)
2765 ? REFFAN
2766 : (AT_LEAST_UNI_SEMANTICS)
2767 ? REFFUN
2768 : (LOC)
2769 ? REFFLN
2770 : REFFN),
2771 num, RExC_nestroot);
2772 if (RExC_nestroot && num >= (U32)RExC_nestroot)
2773 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2774 *flagp |= HASWIDTH;
2775
2776 nextchar(pRExC_state);
2777 return ret;
2778}
2779
2780/* reg_la_NOTHING()
2781 *
2782 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2783 * NOTHING regop when the construct is empty.
2784 *
2785 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2786 *
2787 * Checks for unterminated constructs and throws a "not terminated" error
2788 * with the appropriate type if necessary
2789 *
2790 * Assuming it does not throw an exception increments RExC_seen_zerolen.
2791 *
2792 * If the construct is empty generates a NOTHING op and returns its
2793 * regnode_offset, which the caller would then return to its caller.
2794 *
2795 * If the construct is not empty increments RExC_in_lookaround, and turns
2796 * on any flags provided in RExC_seen, and then returns 0 to signify
2797 * that parsing should continue.
2798 *
2799 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2800 * any use of it would have had to be broken onto multiple lines, hence
2801 * the abbreviation.
2802 */
2803STATIC regnode_offset
2804S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2805 const char *type)
2806{
2807
2808 PERL_ARGS_ASSERT_REG_LA_NOTHING;
2809
2810 /* false below so we do not force /x */
2811 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2812
2813 if (RExC_parse >= RExC_end)
2814 vFAIL2("Sequence (%s... not terminated", type);
2815
2816 /* Always increment as NOTHING regops are zerolen */
2817 RExC_seen_zerolen++;
2818
2819 if (*RExC_parse == ')') {
2820 regnode_offset ret= reg_node(pRExC_state, NOTHING);
2821 nextchar(pRExC_state);
2822 return ret;
2823 }
2824
2825 RExC_seen |= flags;
2826 RExC_in_lookaround++;
2827 return 0; /* keep parsing! */
2828}
2829
2830/* reg_la_OPFAIL()
2831 *
2832 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2833 * OPFAIL regop when the construct is empty.
2834 *
2835 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2836 *
2837 * Checks for unterminated constructs and throws a "not terminated" error
2838 * if necessary.
2839 *
2840 * If the construct is empty generates an OPFAIL op and returns its
2841 * regnode_offset which the caller should then return to its caller.
2842 *
2843 * If the construct is not empty increments RExC_in_lookaround, and also
2844 * increments RExC_seen_zerolen, and turns on the flags provided in
2845 * RExC_seen, and then returns 0 to signify that parsing should continue.
2846 *
2847 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2848 * any use of it would have had to be broken onto multiple lines, hence
2849 * the abbreviation.
2850 */
2851
2852STATIC regnode_offset
2853S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2854 const char *type)
2855{
2856
2857 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2858
2859 /* FALSE so we don't force to /x below */;
2860 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2861
2862 if (RExC_parse >= RExC_end)
2863 vFAIL2("Sequence (%s... not terminated", type);
2864
2865 if (*RExC_parse == ')') {
2866 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2867 nextchar(pRExC_state);
2868 return ret; /* return produced regop */
2869 }
2870
2871 /* only increment zerolen *after* we check if we produce an OPFAIL
2872 * as an OPFAIL does not match a zero length construct, as it
2873 * does not match ever. */
2874 RExC_seen_zerolen++;
2875 RExC_seen |= flags;
2876 RExC_in_lookaround++;
2877 return 0; /* keep parsing! */
2878}
2879
2880/* Below are the main parsing routines.
2881 *
2882 * S_reg() parses a whole pattern or subpattern. It itself handles things
2883 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2884 * alternation '|' in the '...' pattern.
2885 * S_regbranch() effectively implements the concatenation operator, handling
2886 * one alternative of '|', repeatedly calling S_regpiece on each
2887 * segment of the input.
2888 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2889 * and then adds any quantifier for that chunk.
2890 * S_regatom() parses the next chunk of the input, returning when it
2891 * determines it has found a complete atomic chunk. The chunk may
2892 * be a nested subpattern, in which case S_reg is called
2893 * recursively
2894 *
2895 * The functions generate regnodes as they go along, appending each to the
2896 * pattern data structure so far. They return the offset of the current final
2897 * node into that structure, or 0 on failure.
2898 *
2899 * There are three parameters common to all of them:
2900 * pRExC_state is a structure with much information about the current
2901 * state of the parse. It's easy to add new elements to
2902 * convey new information, but beware that an error return may
2903 * require clearing the element.
2904 * flagp is a pointer to bit flags set in a lower level to pass up
2905 * to higher levels information, such as the cause of a
2906 * failure, or some characteristic about the generated node
2907 * depth is roughly the recursion depth, mostly unused except for
2908 * pretty printing debugging info.
2909 *
2910 * There are ancillary functions that these may farm work out to, using the
2911 * same parameters.
2912 *
2913 * The protocol for handling flags is that each function will, before
2914 * returning, add into *flagp the flags it needs to pass up. Each function has
2915 * a second flags variable, typically named 'flags', which it sets and clears
2916 * at will. Flag bits in it are used in that function, and it calls the next
2917 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
2918 * 'flags' will contain whatever it had before the call, plus whatever that
2919 * function passed up. If it wants to pass any of these up to its caller, it
2920 * has to add them to its *flagp. This means that it takes extra steps to keep
2921 * passing a flag upwards, and otherwise the flag bit is cleared for higher
2922 * functions.
2923 */
2924
2925/* On success, returns the offset at which any next node should be placed into
2926 * the regex engine program being compiled.
2927 *
2928 * Returns 0 otherwise, with *flagp set to indicate why:
2929 * TRYAGAIN at the end of (?) that only sets flags.
2930 * RESTART_PARSE if the parse needs to be restarted, or'd with
2931 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
2932 * Otherwise would only return 0 if regbranch() returns 0, which cannot
2933 * happen. */
2934STATIC regnode_offset
2935S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
2936 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
2937 * 2 is like 1, but indicates that nextchar() has been called to advance
2938 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
2939 * this flag alerts us to the need to check for that */
2940{
2941 regnode_offset ret = 0; /* Will be the head of the group. */
2942 regnode_offset br;
2943 regnode_offset lastbr;
2944 regnode_offset ender = 0;
2945 I32 logical_parno = 0;
2946 I32 parno = 0;
2947 I32 flags;
2948 U32 oregflags = RExC_flags;
2949 bool have_branch = 0;
2950 bool is_open = 0;
2951 I32 freeze_paren = 0;
2952 I32 after_freeze = 0;
2953 I32 num; /* numeric backreferences */
2954 SV * max_open; /* Max number of unclosed parens */
2955 I32 was_in_lookaround = RExC_in_lookaround;
2956 I32 fake_eval = 0; /* matches paren */
2957
2958 /* The difference between the following variables can be seen with *
2959 * the broken pattern /(?:foo/ where segment_parse_start will point *
2960 * at the 'f', and reg_parse_start will point at the '(' */
2961
2962 /* the following is used for unmatched '(' errors */
2963 char * const reg_parse_start = RExC_parse;
2964
2965 /* the following is used to track where various segments of
2966 * the pattern that we parse out started. */
2967 char * segment_parse_start = RExC_parse;
2968
2969 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2970
2971 PERL_ARGS_ASSERT_REG;
2972 DEBUG_PARSE("reg ");
2973
2974 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
2975 assert(max_open);
2976 if (!SvIOK(max_open)) {
2977 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
2978 }
2979 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
2980 open paren */
2981 vFAIL("Too many nested open parens");
2982 }
2983
2984 *flagp = 0; /* Initialize. */
2985
2986 /* Having this true makes it feasible to have a lot fewer tests for the
2987 * parse pointer being in scope. For example, we can write
2988 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2989 * instead of
2990 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2991 */
2992 assert(*RExC_end == '\0');
2993
2994 /* Make an OPEN node, if parenthesized. */
2995 if (paren) {
2996
2997 /* Under /x, space and comments can be gobbled up between the '(' and
2998 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
2999 * intervening space, as the sequence is a token, and a token should be
3000 * indivisible */
3001 bool has_intervening_patws = (paren == 2)
3002 && *(RExC_parse - 1) != '(';
3003
3004 if (RExC_parse >= RExC_end) {
3005 vFAIL("Unmatched (");
3006 }
3007
3008 if (paren == 'r') { /* Atomic script run */
3009 paren = '>';
3010 goto parse_rest;
3011 }
3012 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3013 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3014 fake_eval = '{';
3015 goto handle_qmark;
3016 }
3017
3018 char *start_verb = RExC_parse + 1;
3019 STRLEN verb_len;
3020 char *start_arg = NULL;
3021 unsigned char op = 0;
3022 int arg_required = 0;
3023 int internal_argval = -1; /* if > -1 no argument allowed */
3024 bool has_upper = FALSE;
3025 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3026
3027 if (has_intervening_patws) {
3028 RExC_parse_inc_by(1); /* past the '*' */
3029
3030 /* For strict backwards compatibility, don't change the message
3031 * now that we also have lowercase operands */
3032 if (isUPPER(*RExC_parse)) {
3033 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3034 }
3035 else {
3036 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3037 }
3038 }
3039 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3040 if ( *RExC_parse == ':' ) {
3041 start_arg = RExC_parse + 1;
3042 break;
3043 }
3044 else if (! UTF) {
3045 if (isUPPER(*RExC_parse)) {
3046 has_upper = TRUE;
3047 }
3048 RExC_parse_inc_by(1);
3049 }
3050 else {
3051 RExC_parse_inc_utf8();
3052 }
3053 }
3054 verb_len = RExC_parse - start_verb;
3055 if ( start_arg ) {
3056 if (RExC_parse >= RExC_end) {
3057 goto unterminated_verb_pattern;
3058 }
3059
3060 RExC_parse_inc();
3061 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3062 RExC_parse_inc();
3063 }
3064 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3065 unterminated_verb_pattern:
3066 if (has_upper) {
3067 vFAIL("Unterminated verb pattern argument");
3068 }
3069 else {
3070 vFAIL("Unterminated '(*...' argument");
3071 }
3072 }
3073 } else {
3074 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3075 if (has_upper) {
3076 vFAIL("Unterminated verb pattern");
3077 }
3078 else {
3079 vFAIL("Unterminated '(*...' construct");
3080 }
3081 }
3082 }
3083
3084 /* Here, we know that RExC_parse < RExC_end */
3085
3086 switch ( *start_verb ) {
3087 case 'A': /* (*ACCEPT) */
3088 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3089 op = ACCEPT;
3090 internal_argval = RExC_nestroot;
3091 }
3092 break;
3093 case 'C': /* (*COMMIT) */
3094 if ( memEQs(start_verb, verb_len,"COMMIT") )
3095 op = COMMIT;
3096 break;
3097 case 'F': /* (*FAIL) */
3098 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3099 op = OPFAIL;
3100 }
3101 break;
3102 case ':': /* (*:NAME) */
3103 case 'M': /* (*MARK:NAME) */
3104 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3105 op = MARKPOINT;
3106 arg_required = 1;
3107 }
3108 break;
3109 case 'P': /* (*PRUNE) */
3110 if ( memEQs(start_verb, verb_len,"PRUNE") )
3111 op = PRUNE;
3112 break;
3113 case 'S': /* (*SKIP) */
3114 if ( memEQs(start_verb, verb_len,"SKIP") )
3115 op = SKIP;
3116 break;
3117 case 'T': /* (*THEN) */
3118 /* [19:06] <TimToady> :: is then */
3119 if ( memEQs(start_verb, verb_len,"THEN") ) {
3120 op = CUTGROUP;
3121 RExC_seen |= REG_CUTGROUP_SEEN;
3122 }
3123 break;
3124 case 'a':
3125 if ( memEQs(start_verb, verb_len, "asr")
3126 || memEQs(start_verb, verb_len, "atomic_script_run"))
3127 {
3128 paren = 'r'; /* Mnemonic: recursed run */
3129 goto script_run;
3130 }
3131 else if (memEQs(start_verb, verb_len, "atomic")) {
3132 paren = 't'; /* AtOMIC */
3133 goto alpha_assertions;
3134 }
3135 break;
3136 case 'p':
3137 if ( memEQs(start_verb, verb_len, "plb")
3138 || memEQs(start_verb, verb_len, "positive_lookbehind"))
3139 {
3140 paren = 'b';
3141 goto lookbehind_alpha_assertions;
3142 }
3143 else if ( memEQs(start_verb, verb_len, "pla")
3144 || memEQs(start_verb, verb_len, "positive_lookahead"))
3145 {
3146 paren = 'a';
3147 goto alpha_assertions;
3148 }
3149 break;
3150 case 'n':
3151 if ( memEQs(start_verb, verb_len, "nlb")
3152 || memEQs(start_verb, verb_len, "negative_lookbehind"))
3153 {
3154 paren = 'B';
3155 goto lookbehind_alpha_assertions;
3156 }
3157 else if ( memEQs(start_verb, verb_len, "nla")
3158 || memEQs(start_verb, verb_len, "negative_lookahead"))
3159 {
3160 paren = 'A';
3161 goto alpha_assertions;
3162 }
3163 break;
3164 case 's':
3165 if ( memEQs(start_verb, verb_len, "sr")
3166 || memEQs(start_verb, verb_len, "script_run"))
3167 {
3168 regnode_offset atomic;
3169
3170 paren = 's';
3171
3172 script_run:
3173
3174 /* This indicates Unicode rules. */
3175 REQUIRE_UNI_RULES(flagp, 0);
3176
3177 if (! start_arg) {
3178 goto no_colon;
3179 }
3180
3181 RExC_parse_set(start_arg);
3182
3183 if (RExC_in_script_run) {
3184
3185 /* Nested script runs are treated as no-ops, because
3186 * if the nested one fails, the outer one must as
3187 * well. It could fail sooner, and avoid (??{} with
3188 * side effects, but that is explicitly documented as
3189 * undefined behavior. */
3190
3191 ret = 0;
3192
3193 if (paren == 's') {
3194 paren = ':';
3195 goto parse_rest;
3196 }
3197
3198 /* But, the atomic part of a nested atomic script run
3199 * isn't a no-op, but can be treated just like a '(?>'
3200 * */
3201 paren = '>';
3202 goto parse_rest;
3203 }
3204
3205 if (paren == 's') {
3206 /* Here, we're starting a new regular script run */
3207 ret = reg_node(pRExC_state, SROPEN);
3208 RExC_in_script_run = 1;
3209 is_open = 1;
3210 goto parse_rest;
3211 }
3212
3213 /* Here, we are starting an atomic script run. This is
3214 * handled by recursing to deal with the atomic portion
3215 * separately, enclosed in SROPEN ... SRCLOSE nodes */
3216
3217 ret = reg_node(pRExC_state, SROPEN);
3218
3219 RExC_in_script_run = 1;
3220
3221 atomic = reg(pRExC_state, 'r', &flags, depth);
3222 if (flags & (RESTART_PARSE|NEED_UTF8)) {
3223 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3224 return 0;
3225 }
3226
3227 if (! REGTAIL(pRExC_state, ret, atomic)) {
3228 REQUIRE_BRANCHJ(flagp, 0);
3229 }
3230
3231 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3232 SRCLOSE)))
3233 {
3234 REQUIRE_BRANCHJ(flagp, 0);
3235 }
3236
3237 RExC_in_script_run = 0;
3238 return ret;
3239 }
3240
3241 break;
3242
3243 lookbehind_alpha_assertions:
3244 seen_flag_set = REG_LOOKBEHIND_SEEN;
3245 /*FALLTHROUGH*/
3246
3247 alpha_assertions:
3248
3249 if ( !start_arg ) {
3250 goto no_colon;
3251 }
3252
3253 if ( RExC_parse == start_arg ) {
3254 if ( paren == 'A' || paren == 'B' ) {
3255 /* An empty negative lookaround assertion is failure.
3256 * See also: S_reg_la_OPFAIL() */
3257
3258 /* Note: OPFAIL is *not* zerolen. */
3259 ret = reg1node(pRExC_state, OPFAIL, 0);
3260 nextchar(pRExC_state);
3261 return ret;
3262 }
3263 else
3264 if ( paren == 'a' || paren == 'b' ) {
3265 /* An empty positive lookaround assertion is success.
3266 * See also: S_reg_la_NOTHING() */
3267
3268 /* Note: NOTHING is zerolen, so increment here */
3269 RExC_seen_zerolen++;
3270 ret = reg_node(pRExC_state, NOTHING);
3271 nextchar(pRExC_state);
3272 return ret;
3273 }
3274 }
3275
3276 RExC_seen_zerolen++;
3277 RExC_in_lookaround++;
3278 RExC_seen |= seen_flag_set;
3279
3280 RExC_parse_set(start_arg);
3281 goto parse_rest;
3282
3283 no_colon:
3284 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3285 UTF8fARG(UTF, verb_len, start_verb));
3286 NOT_REACHED; /*NOTREACHED*/
3287
3288 } /* End of switch */
3289 if ( ! op ) {
3290 RExC_parse_inc_safe();
3291 if (has_upper || verb_len == 0) {
3292 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3293 UTF8fARG(UTF, verb_len, start_verb));
3294 }
3295 else {
3296 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3297 UTF8fARG(UTF, verb_len, start_verb));
3298 }
3299 }
3300 if ( RExC_parse == start_arg ) {
3301 start_arg = NULL;
3302 }
3303 if ( arg_required && !start_arg ) {
3304 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3305 (int) verb_len, start_verb);
3306 }
3307 if (internal_argval == -1) {
3308 ret = reg1node(pRExC_state, op, 0);
3309 } else {
3310 ret = reg2node(pRExC_state, op, 0, internal_argval);
3311 }
3312 RExC_seen |= REG_VERBARG_SEEN;
3313 if (start_arg) {
3314 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3315 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3316 STR_WITH_LEN("S"));
3317 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3318 FLAGS(REGNODE_p(ret)) = 1;
3319 } else {
3320 FLAGS(REGNODE_p(ret)) = 0;
3321 }
3322 if ( internal_argval != -1 )
3323 ARG2i_SET(REGNODE_p(ret), internal_argval);
3324 nextchar(pRExC_state);
3325 return ret;
3326 }
3327 else if (*RExC_parse == '?') { /* (?...) */
3328 handle_qmark:
3329 ; /* make sure the label has a statement associated with it*/
3330 bool is_logical = 0, is_optimistic = 0;
3331 const char * const seqstart = RExC_parse;
3332 const char * endptr;
3333 const char non_existent_group_msg[]
3334 = "Reference to nonexistent group";
3335 const char impossible_group[] = "Invalid reference to group";
3336
3337 if (has_intervening_patws) {
3338 RExC_parse_inc_by(1);
3339 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3340 }
3341
3342 RExC_parse_inc_by(1); /* past the '?' */
3343 if (!fake_eval) {
3344 paren = *RExC_parse; /* might be a trailing NUL, if not
3345 well-formed */
3346 is_optimistic = 0;
3347 } else {
3348 is_optimistic = 1;
3349 paren = fake_eval;
3350 }
3351 RExC_parse_inc();
3352 if (RExC_parse > RExC_end) {
3353 paren = '\0';
3354 }
3355 ret = 0; /* For look-ahead/behind. */
3356 switch (paren) {
3357
3358 case 'P': /* (?P...) variants for those used to PCRE/Python */
3359 paren = *RExC_parse;
3360 if ( paren == '<') { /* (?P<...>) named capture */
3361 RExC_parse_inc_by(1);
3362 if (RExC_parse >= RExC_end) {
3363 vFAIL("Sequence (?P<... not terminated");
3364 }
3365 goto named_capture;
3366 }
3367 else if (paren == '>') { /* (?P>name) named recursion */
3368 RExC_parse_inc_by(1);
3369 if (RExC_parse >= RExC_end) {
3370 vFAIL("Sequence (?P>... not terminated");
3371 }
3372 goto named_recursion;
3373 }
3374 else if (paren == '=') { /* (?P=...) named backref */
3375 RExC_parse_inc_by(1);
3376 return handle_named_backref(pRExC_state, flagp,
3377 segment_parse_start, ')');
3378 }
3379 RExC_parse_inc_if_char();
3380 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3381 vFAIL3("Sequence (%.*s...) not recognized",
3382 (int) (RExC_parse - seqstart), seqstart);
3383 NOT_REACHED; /*NOTREACHED*/
3384 case '<': /* (?<...) */
3385 /* If you want to support (?<*...), first reconcile with GH #17363 */
3386 if (*RExC_parse == '!') {
3387 paren = ','; /* negative lookbehind (?<! ... ) */
3388 RExC_parse_inc_by(1);
3389 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3390 return ret;
3391 break;
3392 }
3393 else
3394 if (*RExC_parse == '=') {
3395 /* paren = '<' - negative lookahead (?<= ... ) */
3396 RExC_parse_inc_by(1);
3397 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3398 return ret;
3399 break;
3400 }
3401 else
3402 named_capture:
3403 { /* (?<...>) */
3404 char *name_start;
3405 SV *svname;
3406 paren= '>';
3407 /* FALLTHROUGH */
3408 case '\'': /* (?'...') */
3409 name_start = RExC_parse;
3410 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3411 if ( RExC_parse == name_start
3412 || RExC_parse >= RExC_end
3413 || *RExC_parse != paren)
3414 {
3415 vFAIL2("Sequence (?%c... not terminated",
3416 paren=='>' ? '<' : (char) paren);
3417 }
3418 {
3419 HE *he_str;
3420 SV *sv_dat = NULL;
3421 if (!svname) /* shouldn't happen */
3422 Perl_croak(aTHX_
3423 "panic: reg_scan_name returned NULL");
3424 if (!RExC_paren_names) {
3425 RExC_paren_names= newHV();
3426 sv_2mortal(MUTABLE_SV(RExC_paren_names));
3427#ifdef DEBUGGING
3428 RExC_paren_name_list= newAV();
3429 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3430#endif
3431 }
3432 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3433 if ( he_str )
3434 sv_dat = HeVAL(he_str);
3435 if ( ! sv_dat ) {
3436 /* croak baby croak */
3437 Perl_croak(aTHX_
3438 "panic: paren_name hash element allocation failed");
3439 } else if ( SvPOK(sv_dat) ) {
3440 /* (?|...) can mean we have dupes so scan to check
3441 its already been stored. Maybe a flag indicating
3442 we are inside such a construct would be useful,
3443 but the arrays are likely to be quite small, so
3444 for now we punt -- dmq */
3445 IV count = SvIV(sv_dat);
3446 I32 *pv = (I32*)SvPVX(sv_dat);
3447 IV i;
3448 for ( i = 0 ; i < count ; i++ ) {
3449 if ( pv[i] == RExC_npar ) {
3450 count = 0;
3451 break;
3452 }
3453 }
3454 if ( count ) {
3455 pv = (I32*)SvGROW(sv_dat,
3456 SvCUR(sv_dat) + sizeof(I32)+1);
3457 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3458 pv[count] = RExC_npar;
3459 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3460 }
3461 } else {
3462 (void)SvUPGRADE(sv_dat, SVt_PVNV);
3463 sv_setpvn(sv_dat, (char *)&(RExC_npar),
3464 sizeof(I32));
3465 SvIOK_on(sv_dat);
3466 SvIV_set(sv_dat, 1);
3467 }
3468#ifdef DEBUGGING
3469 /* No, this does not cause a memory leak under
3470 * debugging. RExC_paren_name_list is freed later
3471 * on in the dump process. - Yves
3472 */
3473 if (!av_store(RExC_paren_name_list,
3474 RExC_npar, SvREFCNT_inc_NN(svname)))
3475 SvREFCNT_dec_NN(svname);
3476#endif
3477
3478 }
3479 nextchar(pRExC_state);
3480 paren = 1;
3481 goto capturing_parens;
3482 }
3483 NOT_REACHED; /*NOTREACHED*/
3484 case '=': /* (?=...) */
3485 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3486 return ret;
3487 break;
3488 case '!': /* (?!...) */
3489 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3490 return ret;
3491 break;
3492 case '|': /* (?|...) */
3493 /* branch reset, behave like a (?:...) except that
3494 buffers in alternations share the same numbers */
3495 paren = ':';
3496 after_freeze = freeze_paren = RExC_logical_npar;
3497
3498 /* XXX This construct currently requires an extra pass.
3499 * Investigation would be required to see if that could be
3500 * changed */
3501 REQUIRE_PARENS_PASS;
3502 break;
3503 case ':': /* (?:...) */
3504 case '>': /* (?>...) */
3505 break;
3506 case '$': /* (?$...) */
3507 case '@': /* (?@...) */
3508 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3509 break;
3510 case '0' : /* (?0) */
3511 case 'R' : /* (?R) */
3512 if (RExC_parse == RExC_end || *RExC_parse != ')')
3513 FAIL("Sequence (?R) not terminated");
3514 num = 0;
3515 RExC_seen |= REG_RECURSE_SEEN;
3516
3517 /* XXX These constructs currently require an extra pass.
3518 * It probably could be changed */
3519 REQUIRE_PARENS_PASS;
3520
3521 *flagp |= POSTPONED;
3522 goto gen_recurse_regop;
3523 /*notreached*/
3524 /* named and numeric backreferences */
3525 case '&': /* (?&NAME) */
3526 segment_parse_start = RExC_parse - 1;
3527 named_recursion:
3528 {
3529 SV *sv_dat = reg_scan_name(pRExC_state,
3530 REG_RSN_RETURN_DATA);
3531 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3532 }
3533 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3534 vFAIL("Sequence (?&... not terminated");
3535 goto gen_recurse_regop;
3536 /* NOTREACHED */
3537 case '+':
3538 if (! inRANGE(RExC_parse[0], '1', '9')) {
3539 RExC_parse_inc_by(1);
3540 vFAIL("Illegal pattern");
3541 }
3542 goto parse_recursion;
3543 /* NOTREACHED*/
3544 case '-': /* (?-1) */
3545 if (! inRANGE(RExC_parse[0], '1', '9')) {
3546 RExC_parse--; /* rewind to let it be handled later */
3547 goto parse_flags;
3548 }
3549 /* FALLTHROUGH */
3550 case '1': case '2': case '3': case '4': /* (?1) */
3551 case '5': case '6': case '7': case '8': case '9':
3552 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
3553 parse_recursion:
3554 {
3555 bool is_neg = FALSE;
3556 UV unum;
3557 segment_parse_start = RExC_parse - 1;
3558 if (*RExC_parse == '-') {
3559 RExC_parse_inc_by(1);
3560 is_neg = TRUE;
3561 }
3562 endptr = RExC_end;
3563 if (grok_atoUV(RExC_parse, &unum, &endptr)
3564 && unum <= I32_MAX
3565 ) {
3566 num = (I32)unum;
3567 RExC_parse_set((char*)endptr);
3568 }
3569 else { /* Overflow, or something like that. Position
3570 beyond all digits for the message */
3571 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
3572 RExC_parse_inc_by(1);
3573 }
3574 vFAIL(impossible_group);
3575 }
3576 if (is_neg) {
3577 /* -num is always representable on 1 and 2's complement
3578 * machines */
3579 num = -num;
3580 }
3581 }
3582 if (*RExC_parse!=')')
3583 vFAIL("Expecting close bracket");
3584
3585 if (paren == '-' || paren == '+') {
3586
3587 /* Don't overflow */
3588 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3589 RExC_parse_inc_by(1);
3590 vFAIL(impossible_group);
3591 }
3592
3593 /*
3594 Diagram of capture buffer numbering.
3595 Top line is the normal capture buffer numbers
3596 Bottom line is the negative indexing as from
3597 the X (the (?-2))
3598
3599 1 2 3 4 5 X Y 6 7
3600 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3601 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3602 - 5 4 3 2 1 X Y x x
3603
3604 Resolve to absolute group. Recall that RExC_npar is +1 of
3605 the actual parenthesis group number. For lookahead, we
3606 have to compensate for that. Using the above example, when
3607 we get to Y in the parse, num is 2 and RExC_npar is 6. We
3608 want 7 for +2, and 4 for -2.
3609 */
3610 if ( paren == '+' ) {
3611 num--;
3612 }
3613
3614 num += RExC_npar;
3615
3616 if (paren == '-' && num < 1) {
3617 RExC_parse_inc_by(1);
3618 vFAIL(non_existent_group_msg);
3619 }
3620 }
3621 else
3622 if (num && num < RExC_logical_npar) {
3623 num = RExC_logical_to_parno[num];
3624 }
3625 else
3626 if (ALL_PARENS_COUNTED) {
3627 if (num < RExC_logical_total_parens) {
3628 num = RExC_logical_to_parno[num];
3629 }
3630 else {
3631 RExC_parse_inc_by(1);
3632 vFAIL(non_existent_group_msg);
3633 }
3634 }
3635 else {
3636 REQUIRE_PARENS_PASS;
3637 }
3638
3639
3640 gen_recurse_regop:
3641 if (num >= RExC_npar) {
3642
3643 /* It might be a forward reference; we can't fail until we
3644 * know, by completing the parse to get all the groups, and
3645 * then reparsing */
3646 if (ALL_PARENS_COUNTED) {
3647 if (num >= RExC_total_parens) {
3648 RExC_parse_inc_by(1);
3649 vFAIL(non_existent_group_msg);
3650 }
3651 }
3652 else {
3653 REQUIRE_PARENS_PASS;
3654 }
3655 }
3656
3657 /* We keep track how many GOSUB items we have produced.
3658 To start off the ARG2i() of the GOSUB holds its "id",
3659 which is used later in conjunction with RExC_recurse
3660 to calculate the offset we need to jump for the GOSUB,
3661 which it will store in the final representation.
3662 We have to defer the actual calculation until much later
3663 as the regop may move.
3664 */
3665 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3666 RExC_recurse_count++;
3667 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3668 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3669 22, "| |", (int)(depth * 2 + 1), "",
3670 (UV)ARG1u(REGNODE_p(ret)),
3671 (IV)ARG2i(REGNODE_p(ret))));
3672 RExC_seen |= REG_RECURSE_SEEN;
3673
3674 *flagp |= POSTPONED;
3675 assert(*RExC_parse == ')');
3676 nextchar(pRExC_state);
3677 return ret;
3678
3679 /* NOTREACHED */
3680
3681 case '?': /* (??...) */
3682 is_logical = 1;
3683 if (*RExC_parse != '{') {
3684 RExC_parse_inc_if_char();
3685 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3686 vFAIL2utf8f(
3687 "Sequence (%" UTF8f "...) not recognized",
3688 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3689 NOT_REACHED; /*NOTREACHED*/
3690 }
3691 *flagp |= POSTPONED;
3692 paren = '{';
3693 RExC_parse_inc_by(1);
3694 /* FALLTHROUGH */
3695 case '{': /* (?{...}) */
3696 {
3697 U32 n = 0;
3698 struct reg_code_block *cb;
3699 OP * o;
3700
3701 RExC_seen_zerolen++;
3702
3703 if ( !pRExC_state->code_blocks
3704 || pRExC_state->code_index
3705 >= pRExC_state->code_blocks->count
3706 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3707 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3708 - RExC_start)
3709 ) {
3710 if (RExC_pm_flags & PMf_USE_RE_EVAL)
3711 FAIL("panic: Sequence (?{...}): no code block found\n");
3712 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3713 }
3714 /* this is a pre-compiled code block (?{...}) */
3715 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3716 RExC_parse_set(RExC_start + cb->end);
3717 o = cb->block;
3718 if (cb->src_regex) {
3719 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3720 RExC_rxi->data->data[n] =
3721 (void*)SvREFCNT_inc((SV*)cb->src_regex);
3722 RExC_rxi->data->data[n+1] = (void*)o;
3723 }
3724 else {
3725 n = reg_add_data(pRExC_state,
3726 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3727 RExC_rxi->data->data[n] = (void*)o;
3728 }
3729 pRExC_state->code_index++;
3730 nextchar(pRExC_state);
3731 if (!is_optimistic)
3732 RExC_seen |= REG_PESSIMIZE_SEEN;
3733
3734 if (is_logical) {
3735 regnode_offset eval;
3736 ret = reg_node(pRExC_state, LOGICAL);
3737 FLAGS(REGNODE_p(ret)) = 2;
3738
3739 eval = reg2node(pRExC_state, EVAL,
3740 n,
3741
3742 /* for later propagation into (??{})
3743 * return value */
3744 RExC_flags & RXf_PMf_COMPILETIME
3745 );
3746 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3747 if (! REGTAIL(pRExC_state, ret, eval)) {
3748 REQUIRE_BRANCHJ(flagp, 0);
3749 }
3750 return ret;
3751 }
3752 ret = reg2node(pRExC_state, EVAL, n, 0);
3753 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3754
3755 return ret;
3756 }
3757 case '(': /* (?(?{...})...) and (?(?=...)...) */
3758 {
3759 int is_define= 0;
3760 const int DEFINE_len = sizeof("DEFINE") - 1;
3761 if ( RExC_parse < RExC_end - 1
3762 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
3763 && ( RExC_parse[1] == '='
3764 || RExC_parse[1] == '!'
3765 || RExC_parse[1] == '<'
3766 || RExC_parse[1] == '{'))
3767 || ( RExC_parse[0] == '*' /* (?(*...)) */
3768 && ( RExC_parse[1] == '{'
3769 || ( memBEGINs(RExC_parse + 1,
3770 (Size_t) (RExC_end - (RExC_parse + 1)),
3771 "pla:")
3772 || memBEGINs(RExC_parse + 1,
3773 (Size_t) (RExC_end - (RExC_parse + 1)),
3774 "plb:")
3775 || memBEGINs(RExC_parse + 1,
3776 (Size_t) (RExC_end - (RExC_parse + 1)),
3777 "nla:")
3778 || memBEGINs(RExC_parse + 1,
3779 (Size_t) (RExC_end - (RExC_parse + 1)),
3780 "nlb:")
3781 || memBEGINs(RExC_parse + 1,
3782 (Size_t) (RExC_end - (RExC_parse + 1)),
3783 "positive_lookahead:")
3784 || memBEGINs(RExC_parse + 1,
3785 (Size_t) (RExC_end - (RExC_parse + 1)),
3786 "positive_lookbehind:")
3787 || memBEGINs(RExC_parse + 1,
3788 (Size_t) (RExC_end - (RExC_parse + 1)),
3789 "negative_lookahead:")
3790 || memBEGINs(RExC_parse + 1,
3791 (Size_t) (RExC_end - (RExC_parse + 1)),
3792 "negative_lookbehind:")))))
3793 ) { /* Lookahead or eval. */
3794 I32 flag;
3795 regnode_offset tail;
3796
3797 ret = reg_node(pRExC_state, LOGICAL);
3798 FLAGS(REGNODE_p(ret)) = 1;
3799
3800 tail = reg(pRExC_state, 1, &flag, depth+1);
3801 RETURN_FAIL_ON_RESTART(flag, flagp);
3802 if (! REGTAIL(pRExC_state, ret, tail)) {
3803 REQUIRE_BRANCHJ(flagp, 0);
3804 }
3805 goto insert_if;
3806 }
3807 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
3808 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3809 {
3810 char ch = RExC_parse[0] == '<' ? '>' : '\'';
3811 char *name_start= RExC_parse;
3812 RExC_parse_inc_by(1);
3813 U32 num = 0;
3814 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3815 if ( RExC_parse == name_start
3816 || RExC_parse >= RExC_end
3817 || *RExC_parse != ch)
3818 {
3819 vFAIL2("Sequence (?(%c... not terminated",
3820 (ch == '>' ? '<' : ch));
3821 }
3822 RExC_parse_inc_by(1);
3823 if (sv_dat) {
3824 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3825 RExC_rxi->data->data[num]=(void*)sv_dat;
3826 SvREFCNT_inc_simple_void_NN(sv_dat);
3827 }
3828 ret = reg1node(pRExC_state, GROUPPN, num);
3829 goto insert_if_check_paren;
3830 }
3831 else if (memBEGINs(RExC_parse,
3832 (STRLEN) (RExC_end - RExC_parse),
3833 "DEFINE"))
3834 {
3835 ret = reg1node(pRExC_state, DEFINEP, 0);
3836 RExC_parse_inc_by(DEFINE_len);
3837 is_define = 1;
3838 goto insert_if_check_paren;
3839 }
3840 else if (RExC_parse[0] == 'R') {
3841 RExC_parse_inc_by(1);
3842 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
3843 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3844 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3845 */
3846 parno = 0;
3847 if (RExC_parse[0] == '0') {
3848 parno = 1;
3849 RExC_parse_inc_by(1);
3850 }
3851 else if (inRANGE(RExC_parse[0], '1', '9')) {
3852 UV uv;
3853 endptr = RExC_end;
3854 if (grok_atoUV(RExC_parse, &uv, &endptr)
3855 && uv <= I32_MAX
3856 ) {
3857 parno = (I32)uv + 1;
3858 RExC_parse_set((char*)endptr);
3859 }
3860 /* else "Switch condition not recognized" below */
3861 } else if (RExC_parse[0] == '&') {
3862 SV *sv_dat;
3863 RExC_parse_inc_by(1);
3864 sv_dat = reg_scan_name(pRExC_state,
3865 REG_RSN_RETURN_DATA);
3866 if (sv_dat)
3867 parno = 1 + *((I32 *)SvPVX(sv_dat));
3868 }
3869 ret = reg1node(pRExC_state, INSUBP, parno);
3870 goto insert_if_check_paren;
3871 }
3872 else if (inRANGE(RExC_parse[0], '1', '9')) {
3873 /* (?(1)...) */
3874 char c;
3875 UV uv;
3876 endptr = RExC_end;
3877 if (grok_atoUV(RExC_parse, &uv, &endptr)
3878 && uv <= I32_MAX
3879 ) {
3880 parno = (I32)uv;
3881 RExC_parse_set((char*)endptr);
3882 }
3883 else {
3884 vFAIL("panic: grok_atoUV returned FALSE");
3885 }
3886 ret = reg1node(pRExC_state, GROUPP, parno);
3887
3888 insert_if_check_paren:
3889 if (UCHARAT(RExC_parse) != ')') {
3890 RExC_parse_inc_safe();
3891 vFAIL("Switch condition not recognized");
3892 }
3893 nextchar(pRExC_state);
3894 insert_if:
3895 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3896 IFTHEN, 0)))
3897 {
3898 REQUIRE_BRANCHJ(flagp, 0);
3899 }
3900 br = regbranch(pRExC_state, &flags, 1, depth+1);
3901 if (br == 0) {
3902 RETURN_FAIL_ON_RESTART(flags,flagp);
3903 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3904 (UV) flags);
3905 } else
3906 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
3907 LONGJMP, 0)))
3908 {
3909 REQUIRE_BRANCHJ(flagp, 0);
3910 }
3911 c = UCHARAT(RExC_parse);
3912 nextchar(pRExC_state);
3913 if (flags&HASWIDTH)
3914 *flagp |= HASWIDTH;
3915 if (c == '|') {
3916 if (is_define)
3917 vFAIL("(?(DEFINE)....) does not allow branches");
3918
3919 /* Fake one for optimizer. */
3920 lastbr = reg1node(pRExC_state, IFTHEN, 0);
3921
3922 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
3923 RETURN_FAIL_ON_RESTART(flags, flagp);
3924 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3925 (UV) flags);
3926 }
3927 if (! REGTAIL(pRExC_state, ret, lastbr)) {
3928 REQUIRE_BRANCHJ(flagp, 0);
3929 }
3930 if (flags&HASWIDTH)
3931 *flagp |= HASWIDTH;
3932 c = UCHARAT(RExC_parse);
3933 nextchar(pRExC_state);
3934 }
3935 else
3936 lastbr = 0;
3937 if (c != ')') {
3938 if (RExC_parse >= RExC_end)
3939 vFAIL("Switch (?(condition)... not terminated");
3940 else
3941 vFAIL("Switch (?(condition)... contains too many branches");
3942 }
3943 ender = reg_node(pRExC_state, TAIL);
3944 if (! REGTAIL(pRExC_state, br, ender)) {
3945 REQUIRE_BRANCHJ(flagp, 0);
3946 }
3947 if (lastbr) {
3948 if (! REGTAIL(pRExC_state, lastbr, ender)) {
3949 REQUIRE_BRANCHJ(flagp, 0);
3950 }
3951 if (! REGTAIL(pRExC_state,
3952 REGNODE_OFFSET(
3953 REGNODE_AFTER(REGNODE_p(lastbr))),
3954 ender))
3955 {
3956 REQUIRE_BRANCHJ(flagp, 0);
3957 }
3958 }
3959 else
3960 if (! REGTAIL(pRExC_state, ret, ender)) {
3961 REQUIRE_BRANCHJ(flagp, 0);
3962 }
3963#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
3964 RExC_size++; /* XXX WHY do we need this?!!
3965 For large programs it seems to be required
3966 but I can't figure out why. -- dmq*/
3967#endif
3968 return ret;
3969 }
3970 RExC_parse_inc_safe();
3971 vFAIL("Unknown switch condition (?(...))");
3972 }
3973 case '[': /* (?[ ... ]) */
3974 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
3975 case 0: /* A NUL */
3976 RExC_parse--; /* for vFAIL to print correctly */
3977 vFAIL("Sequence (? incomplete");
3978 break;
3979
3980 case ')':
3981 if (RExC_strict) { /* [perl #132851] */
3982 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
3983 }
3984 /* FALLTHROUGH */
3985 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
3986 /* FALLTHROUGH */
3987 default: /* e.g., (?i) */
3988 RExC_parse_set((char *) seqstart + 1);
3989 parse_flags:
3990 parse_lparen_question_flags(pRExC_state);
3991 if (UCHARAT(RExC_parse) != ':') {
3992 if (RExC_parse < RExC_end)
3993 nextchar(pRExC_state);
3994 *flagp = TRYAGAIN;
3995 return 0;
3996 }
3997 paren = ':';
3998 nextchar(pRExC_state);
3999 ret = 0;
4000 goto parse_rest;
4001 } /* end switch */
4002 }
4003 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
4004 capturing_parens:
4005 parno = RExC_npar;
4006 RExC_npar++;
4007 if (RExC_npar >= U16_MAX)
4008 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4009
4010 logical_parno = RExC_logical_npar;
4011 RExC_logical_npar++;
4012 if (! ALL_PARENS_COUNTED) {
4013 /* If we are in our first pass through (and maybe only pass),
4014 * we need to allocate memory for the capturing parentheses
4015 * data structures.
4016 */
4017
4018 if (!RExC_parens_buf_size) {
4019 /* first guess at number of parens we might encounter */
4020 RExC_parens_buf_size = 10;
4021
4022 /* setup RExC_open_parens, which holds the address of each
4023 * OPEN tag, and to make things simpler for the 0 index the
4024 * start of the program - this is used later for offsets */
4025 Newxz(RExC_open_parens, RExC_parens_buf_size,
4026 regnode_offset);
4027 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
4028
4029 /* setup RExC_close_parens, which holds the address of each
4030 * CLOSE tag, and to make things simpler for the 0 index
4031 * the end of the program - this is used later for offsets
4032 * */
4033 Newxz(RExC_close_parens, RExC_parens_buf_size,
4034 regnode_offset);
4035 /* we don't know where end op starts yet, so we don't need to
4036 * set RExC_close_parens[0] like we do RExC_open_parens[0]
4037 * above */
4038
4039 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4040 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4041 }
4042 else if (RExC_npar > RExC_parens_buf_size) {
4043 I32 old_size = RExC_parens_buf_size;
4044
4045 RExC_parens_buf_size *= 2;
4046
4047 Renew(RExC_open_parens, RExC_parens_buf_size,
4048 regnode_offset);
4049 Zero(RExC_open_parens + old_size,
4050 RExC_parens_buf_size - old_size, regnode_offset);
4051
4052 Renew(RExC_close_parens, RExC_parens_buf_size,
4053 regnode_offset);
4054 Zero(RExC_close_parens + old_size,
4055 RExC_parens_buf_size - old_size, regnode_offset);
4056
4057 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4058 Zero(RExC_logical_to_parno + old_size,
4059 RExC_parens_buf_size - old_size, I32);
4060
4061 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4062 Zero(RExC_parno_to_logical + old_size,
4063 RExC_parens_buf_size - old_size, I32);
4064 }
4065 }
4066
4067 ret = reg1node(pRExC_state, OPEN, parno);
4068 if (!RExC_nestroot)
4069 RExC_nestroot = parno;
4070 if (RExC_open_parens && !RExC_open_parens[parno])
4071 {
4072 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4073 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4074 22, "| |", (int)(depth * 2 + 1), "",
4075 (IV)parno, ret));
4076 RExC_open_parens[parno]= ret;
4077 }
4078 if (RExC_parno_to_logical) {
4079 RExC_parno_to_logical[parno] = logical_parno;
4080 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4081 RExC_logical_to_parno[logical_parno] = parno;
4082 }
4083 is_open = 1;
4084 } else {
4085 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4086 paren = ':';
4087 ret = 0;
4088 }
4089 }
4090 else /* ! paren */
4091 ret = 0;
4092
4093 parse_rest:
4094 /* Pick up the branches, linking them together. */
4095 segment_parse_start = RExC_parse;
4096 I32 npar_before_regbranch = RExC_npar - 1;
4097 br = regbranch(pRExC_state, &flags, 1, depth+1);
4098
4099 /* branch_len = (paren != 0); */
4100
4101 if (br == 0) {
4102 RETURN_FAIL_ON_RESTART(flags, flagp);
4103 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4104 }
4105 if (*RExC_parse == '|') {
4106 if (RExC_use_BRANCHJ) {
4107 reginsert(pRExC_state, BRANCHJ, br, depth+1);
4108 ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4109 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4110 }
4111 else {
4112 reginsert(pRExC_state, BRANCH, br, depth+1);
4113 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4114 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4115 }
4116 have_branch = 1;
4117 }
4118 else if (paren == ':') {
4119 *flagp |= flags&SIMPLE;
4120 }
4121 if (is_open) { /* Starts with OPEN. */
4122 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
4123 REQUIRE_BRANCHJ(flagp, 0);
4124 }
4125 }
4126 else if (paren != '?') /* Not Conditional */
4127 ret = br;
4128 *flagp |= flags & (HASWIDTH | POSTPONED);
4129 lastbr = br;
4130 while (*RExC_parse == '|') {
4131 if (RExC_use_BRANCHJ) {
4132 bool shut_gcc_up;
4133
4134 ender = reg1node(pRExC_state, LONGJMP, 0);
4135
4136 /* Append to the previous. */
4137 shut_gcc_up = REGTAIL(pRExC_state,
4138 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4139 ender);
4140 PERL_UNUSED_VAR(shut_gcc_up);
4141 }
4142 nextchar(pRExC_state);
4143 if (freeze_paren) {
4144 if (RExC_logical_npar > after_freeze)
4145 after_freeze = RExC_logical_npar;
4146 RExC_logical_npar = freeze_paren;
4147 }
4148 br = regbranch(pRExC_state, &flags, 0, depth+1);
4149
4150 if (br == 0) {
4151 RETURN_FAIL_ON_RESTART(flags, flagp);
4152 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4153 }
4154 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
4155 REQUIRE_BRANCHJ(flagp, 0);
4156 }
4157 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4158 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4159 if (OP(REGNODE_p(br)) == BRANCH) {
4160 if (OP(REGNODE_p(lastbr)) == BRANCH)
4161 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4162 else
4163 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4164 }
4165 else
4166 if (OP(REGNODE_p(br)) == BRANCHJ) {
4167 if (OP(REGNODE_p(lastbr)) == BRANCH)
4168 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4169 else
4170 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4171 }
4172
4173 lastbr = br;
4174 *flagp |= flags & (HASWIDTH | POSTPONED);
4175 }
4176
4177 if (have_branch || paren != ':') {
4178 regnode * br;
4179
4180 /* Make a closing node, and hook it on the end. */
4181 switch (paren) {
4182 case ':':
4183 ender = reg_node(pRExC_state, TAIL);
4184 break;
4185 case 1: case 2:
4186 ender = reg1node(pRExC_state, CLOSE, parno);
4187 if ( RExC_close_parens ) {
4188 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4189 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4190 22, "| |", (int)(depth * 2 + 1), "",
4191 (IV)parno, ender));
4192 RExC_close_parens[parno]= ender;
4193 if (RExC_nestroot == parno)
4194 RExC_nestroot = 0;
4195 }
4196 break;
4197 case 's':
4198 ender = reg_node(pRExC_state, SRCLOSE);
4199 RExC_in_script_run = 0;
4200 break;
4201 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4202 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4203 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4204 case '<': /* (?<= ... ) */
4205 case ',': /* (?<! ... ) */
4206 *flagp &= ~HASWIDTH;
4207 ender = reg_node(pRExC_state, LOOKBEHIND_END);
4208 break;
4209 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4210 case 'a':
4211 case 'A':
4212 case '=':
4213 case '!':
4214 *flagp &= ~HASWIDTH;
4215 /* FALLTHROUGH */
4216 case 't': /* aTomic */
4217 case '>':
4218 ender = reg_node(pRExC_state, SUCCEED);
4219 break;
4220 case 0:
4221 ender = reg_node(pRExC_state, END);
4222 assert(!RExC_end_op); /* there can only be one! */
4223 RExC_end_op = REGNODE_p(ender);
4224 if (RExC_close_parens) {
4225 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4226 "%*s%*s Setting close paren #0 (END) to %zu\n",
4227 22, "| |", (int)(depth * 2 + 1), "",
4228 ender));
4229
4230 RExC_close_parens[0]= ender;
4231 }
4232 break;
4233 }
4234 DEBUG_PARSE_r({
4235 DEBUG_PARSE_MSG("lsbr");
4236 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4237 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4238 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4239 SvPV_nolen_const(RExC_mysv1),
4240 (IV)lastbr,
4241 SvPV_nolen_const(RExC_mysv2),
4242 (IV)ender,
4243 (IV)(ender - lastbr)
4244 );
4245 });
4246 if (OP(REGNODE_p(lastbr)) == BRANCH) {
4247 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4248 }
4249 else
4250 if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4251 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4252 }
4253
4254 if (! REGTAIL(pRExC_state, lastbr, ender)) {
4255 REQUIRE_BRANCHJ(flagp, 0);
4256 }
4257
4258 if (have_branch) {
4259 char is_nothing= 1;
4260 if (depth==1)
4261 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4262
4263 /* Hook the tails of the branches to the closing node. */
4264 for (br = REGNODE_p(ret); br; br = regnext(br)) {
4265 const U8 op = REGNODE_TYPE(OP(br));
4266 regnode *nextoper = REGNODE_AFTER(br);
4267 if (op == BRANCH) {
4268 if (! REGTAIL_STUDY(pRExC_state,
4269 REGNODE_OFFSET(nextoper),
4270 ender))
4271 {
4272 REQUIRE_BRANCHJ(flagp, 0);
4273 }
4274 if ( OP(nextoper) != NOTHING
4275 || regnext(nextoper) != REGNODE_p(ender))
4276 is_nothing= 0;
4277 }
4278 else if (op == BRANCHJ) {
4279 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4280 REGNODE_OFFSET(nextoper),
4281 ender);
4282 PERL_UNUSED_VAR(shut_gcc_up);
4283 /* for now we always disable this optimisation * /
4284 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4285 if ( OP(nopr) != NOTHING
4286 || regnext(nopr) != REGNODE_p(ender))
4287 */
4288 is_nothing= 0;
4289 }
4290 }
4291 if (is_nothing) {
4292 regnode * ret_as_regnode = REGNODE_p(ret);
4293 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4294 ? regnext(ret_as_regnode)
4295 : ret_as_regnode;
4296 DEBUG_PARSE_r({
4297 DEBUG_PARSE_MSG("NADA");
4298 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4299 NULL, pRExC_state);
4300 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4301 NULL, pRExC_state);
4302 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4303 SvPV_nolen_const(RExC_mysv1),
4304 (IV)REG_NODE_NUM(ret_as_regnode),
4305 SvPV_nolen_const(RExC_mysv2),
4306 (IV)ender,
4307 (IV)(ender - ret)
4308 );
4309 });
4310 OP(br)= NOTHING;
4311 if (OP(REGNODE_p(ender)) == TAIL) {
4312 NEXT_OFF(br)= 0;
4313 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4314 } else {
4315 regnode *opt;
4316 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4317 OP(opt)= OPTIMIZED;
4318 NEXT_OFF(br)= REGNODE_p(ender) - br;
4319 }
4320 }
4321 }
4322 }
4323
4324 {
4325 const char *p;
4326 /* Even/odd or x=don't care: 010101x10x */
4327 static const char parens[] = "=!aA<,>Bbt";
4328 /* flag below is set to 0 up through 'A'; 1 for larger */
4329
4330 if (paren && (p = strchr(parens, paren))) {
4331 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4332 int flag = (p - parens) > 3;
4333
4334 if (paren == '>' || paren == 't') {
4335 node = SUSPEND, flag = 0;
4336 }
4337
4338 reginsert(pRExC_state, node, ret, depth+1);
4339 FLAGS(REGNODE_p(ret)) = flag;
4340 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4341 {
4342 REQUIRE_BRANCHJ(flagp, 0);
4343 }
4344 }
4345 }
4346
4347 /* Check for proper termination. */
4348 if (paren) {
4349 /* restore original flags, but keep (?p) and, if we've encountered
4350 * something in the parse that changes /d rules into /u, keep the /u */
4351 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4352 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4354 }
4355 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4356 RExC_parse_set(reg_parse_start);
4357 vFAIL("Unmatched (");
4358 }
4359 nextchar(pRExC_state);
4360 }
4361 else if (!paren && RExC_parse < RExC_end) {
4362 if (*RExC_parse == ')') {
4363 RExC_parse_inc_by(1);
4364 vFAIL("Unmatched )");
4365 }
4366 else
4367 FAIL("Junk on end of regexp"); /* "Can't happen". */
4368 NOT_REACHED; /* NOTREACHED */
4369 }
4370
4371 if (after_freeze > RExC_logical_npar)
4372 RExC_logical_npar = after_freeze;
4373
4374 RExC_in_lookaround = was_in_lookaround;
4375
4376 return(ret);
4377}
4378
4379/*
4380 - regbranch - one alternative of an | operator
4381 *
4382 * Implements the concatenation operator.
4383 *
4384 * On success, returns the offset at which any next node should be placed into
4385 * the regex engine program being compiled.
4386 *
4387 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4388 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4389 * UTF-8
4390 */
4391STATIC regnode_offset
4392S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4393{
4394 regnode_offset ret;
4395 regnode_offset chain = 0;
4396 regnode_offset latest;
4397 regnode *branch_node = NULL;
4398 I32 flags = 0, c = 0;
4399 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4400
4401 PERL_ARGS_ASSERT_REGBRANCH;
4402
4403 DEBUG_PARSE("brnc");
4404
4405 if (first)
4406 ret = 0;
4407 else {
4408 if (RExC_use_BRANCHJ) {
4409 ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4410 branch_node = REGNODE_p(ret);
4411 ARG2a_SET(branch_node, (U16)RExC_npar-1);
4412 } else {
4413 ret = reg1node(pRExC_state, BRANCH, 0);
4414 branch_node = REGNODE_p(ret);
4415 ARG1a_SET(branch_node, (U16)RExC_npar-1);
4416 }
4417 }
4418
4419 *flagp = 0; /* Initialize. */
4420
4421 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4422 FALSE /* Don't force to /x */ );
4423 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4424 flags &= ~TRYAGAIN;
4425 latest = regpiece(pRExC_state, &flags, depth+1);
4426 if (latest == 0) {
4427 if (flags & TRYAGAIN)
4428 continue;
4429 RETURN_FAIL_ON_RESTART(flags, flagp);
4430 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4431 }
4432 else if (ret == 0)
4433 ret = latest;
4434 *flagp |= flags&(HASWIDTH|POSTPONED);
4435 if (chain != 0) {
4436 /* FIXME adding one for every branch after the first is probably
4437 * excessive now we have TRIE support. (hv) */
4438 MARK_NAUGHTY(1);
4439 if (! REGTAIL(pRExC_state, chain, latest)) {
4440 /* XXX We could just redo this branch, but figuring out what
4441 * bookkeeping needs to be reset is a pain, and it's likely
4442 * that other branches that goto END will also be too large */
4443 REQUIRE_BRANCHJ(flagp, 0);
4444 }
4445 }
4446 chain = latest;
4447 c++;
4448 }
4449 if (chain == 0) { /* Loop ran zero times. */
4450 chain = reg_node(pRExC_state, NOTHING);
4451 if (ret == 0)
4452 ret = chain;
4453 }
4454 if (c == 1) {
4455 *flagp |= flags & SIMPLE;
4456 }
4457 return ret;
4458}
4459
4460#define RBRACE 0
4461#define MIN_S 1
4462#define MIN_E 2
4463#define MAX_S 3
4464#define MAX_E 4
4465
4466#ifndef PERL_IN_XSUB_RE
4467bool
4468Perl_regcurly(const char *s, const char *e, const char * result[5])
4469{
4470 /* This function matches a {m,n} quantifier. When called with a NULL final
4471 * argument, it simply parses the input from 's' up through 'e-1', and
4472 * returns a boolean as to whether or not this input is syntactically a
4473 * {m,n} quantifier.
4474 *
4475 * When called with a non-NULL final parameter, and when the function
4476 * returns TRUE, it additionally stores information into the array
4477 * specified by that parameter about what it found in the parse. The
4478 * parameter must be a pointer into a 5 element array of 'const char *'
4479 * elements. The returned information is as follows:
4480 * result[RBRACE] points to the closing brace
4481 * result[MIN_S] points to the first byte of the lower bound
4482 * result[MIN_E] points to one beyond the final byte of the lower bound
4483 * result[MAX_S] points to the first byte of the upper bound
4484 * result[MAX_E] points to one beyond the final byte of the upper bound
4485 *
4486 * If the quantifier is of the form {m,} (meaning an infinite upper
4487 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4488 * to is irrelevant, just that it's the same place
4489 *
4490 * If instead the quantifier is of the form {m} there is actually only
4491 * one bound, and both the upper and lower result[] elements are set to
4492 * point to it.
4493 *
4494 * This function checks only for syntactic validity; it leaves checking for
4495 * semantic validity and raising any diagnostics to the caller. This
4496 * function is called in multiple places to check for syntax, but only from
4497 * one for semantics. It makes it as simple as possible for the
4498 * syntax-only callers, while furnishing just enough information for the
4499 * semantic caller.
4500 */
4501
4502 const char * min_start = NULL;
4503 const char * max_start = NULL;
4504 const char * min_end = NULL;
4505 const char * max_end = NULL;
4506
4507 bool has_comma = FALSE;
4508
4509 PERL_ARGS_ASSERT_REGCURLY;
4510
4511 if (s >= e || *s++ != '{')
4512 return FALSE;
4513
4514 while (s < e && isBLANK(*s)) {
4515 s++;
4516 }
4517
4518 if isDIGIT(*s) {
4519 min_start = s;
4520 do {
4521 s++;
4522 } while (s < e && isDIGIT(*s));
4523 min_end = s;
4524 }
4525
4526 while (s < e && isBLANK(*s)) {
4527 s++;
4528 }
4529
4530 if (*s == ',') {
4531 has_comma = TRUE;
4532 s++;
4533
4534 while (s < e && isBLANK(*s)) {
4535 s++;
4536 }
4537
4538 if isDIGIT(*s) {
4539 max_start = s;
4540 do {
4541 s++;
4542 } while (s < e && isDIGIT(*s));
4543 max_end = s;
4544 }
4545 }
4546
4547 while (s < e && isBLANK(*s)) {
4548 s++;
4549 }
4550 /* Need at least one number */
4551 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4552 return FALSE;
4553 }
4554
4555 if (result) {
4556
4557 result[RBRACE] = s;
4558
4559 result[MIN_S] = min_start;
4560 result[MIN_E] = min_end;
4561 if (has_comma) {
4562 if (max_start) {
4563 result[MAX_S] = max_start;
4564 result[MAX_E] = max_end;
4565 }
4566 else {
4567 /* Having no value after the comma is signalled by setting
4568 * start and end to the same value. What that value is isn't
4569 * relevant; NULL is chosen simply because it will fail if the
4570 * caller mistakenly uses it */
4571 result[MAX_S] = result[MAX_E] = NULL;
4572 }
4573 }
4574 else { /* No comma means lower and upper bounds are the same */
4575 result[MAX_S] = min_start;
4576 result[MAX_E] = min_end;
4577 }
4578 }
4579
4580 return TRUE;
4581}
4582#endif
4583
4584U32
4585S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4586 const char * start, const char * end)
4587{
4588 /* This is a helper function for regpiece() to compute, given the
4589 * quantifier {m,n}, the value of either m or n, based on the starting
4590 * position 'start' in the string, through the byte 'end-1', returning it
4591 * if valid, and failing appropriately if not. It knows the restrictions
4592 * imposed on quantifier values */
4593
4594 UV uv;
4595 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4596
4597 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4598
4599 if (grok_atoUV(start, &uv, &end)) {
4600 if (uv < REG_INFTY) { /* A valid, small-enough number */
4601 return (U32) uv;
4602 }
4603 }
4604 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4605 leading zeros or overflow */
4606 RExC_parse_set((char * ) end);
4607
4608 /* Perhaps too generic a msg for what is only failure from having
4609 * leading zeros, but this is how it's always behaved. */
4610 vFAIL("Invalid quantifier in {,}");
4611 NOT_REACHED; /*NOTREACHED*/
4612 }
4613
4614 /* Here, found a quantifier, but was too large; either it overflowed or was
4615 * too big a legal number */
4616 RExC_parse_set((char * ) end);
4617 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4618
4619 NOT_REACHED; /*NOTREACHED*/
4620 return U32_MAX; /* Perhaps some compilers will be expecting a return */
4621}
4622
4623/*
4624 - regpiece - something followed by possible quantifier * + ? {n,m}
4625 *
4626 * Note that the branching code sequences used for ? and the general cases
4627 * of * and + are somewhat optimized: they use the same NOTHING node as
4628 * both the endmarker for their branch list and the body of the last branch.
4629 * It might seem that this node could be dispensed with entirely, but the
4630 * endmarker role is not redundant.
4631 *
4632 * On success, returns the offset at which any next node should be placed into
4633 * the regex engine program being compiled.
4634 *
4635 * Returns 0 otherwise, with *flagp set to indicate why:
4636 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
4637 * RESTART_PARSE if the parse needs to be restarted, or'd with
4638 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4639 */
4640STATIC regnode_offset
4641S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4642{
4643 regnode_offset ret;
4644 char op;
4645 I32 flags;
4646 const char * const origparse = RExC_parse;
4647 I32 min;
4648 I32 max = REG_INFTY;
4649 I32 npar_before = RExC_npar-1;
4650
4651 /* Save the original in case we change the emitted regop to a FAIL. */
4652 const regnode_offset orig_emit = RExC_emit;
4653
4654 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4655
4656 PERL_ARGS_ASSERT_REGPIECE;
4657
4658 DEBUG_PARSE("piec");
4659
4660 ret = regatom(pRExC_state, &flags, depth+1);
4661 if (ret == 0) {
4662 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4663 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4664 }
4665 I32 npar_after = RExC_npar-1;
4666
4667 op = *RExC_parse;
4668 switch (op) {
4669 const char * regcurly_return[5];
4670
4671 case '*':
4672 nextchar(pRExC_state);
4673 min = 0;
4674 break;
4675
4676 case '+':
4677 nextchar(pRExC_state);
4678 min = 1;
4679 break;
4680
4681 case '?':
4682 nextchar(pRExC_state);
4683 min = 0; max = 1;
4684 break;
4685
4686 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
4687 to determine which */
4688 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4689 const char * min_start = regcurly_return[MIN_S];
4690 const char * min_end = regcurly_return[MIN_E];
4691 const char * max_start = regcurly_return[MAX_S];
4692 const char * max_end = regcurly_return[MAX_E];
4693
4694 if (min_start) {
4695 min = get_quantifier_value(pRExC_state, min_start, min_end);
4696 }
4697 else {
4698 min = 0;
4699 }
4700
4701 if (max_start == max_end) { /* Was of the form {m,} */
4702 max = REG_INFTY;
4703 }
4704 else if (max_start == min_start) { /* Was of the form {m} */
4705 max = min;
4706 }
4707 else { /* Was of the form {m,n} */
4708 assert(max_end >= max_start);
4709
4710 max = get_quantifier_value(pRExC_state, max_start, max_end);
4711 }
4712
4713 RExC_parse_set((char *) regcurly_return[RBRACE]);
4714 nextchar(pRExC_state);
4715
4716 if (max < min) { /* If can't match, warn and optimize to fail
4717 unconditionally */
4718 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4719 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4720 NEXT_OFF(REGNODE_p(orig_emit)) =
4721 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4722 return ret;
4723 }
4724 else if (min == max && *RExC_parse == '?') {
4725 ckWARN2reg(RExC_parse + 1,
4726 "Useless use of greediness modifier '%c'",
4727 *RExC_parse);
4728 }
4729
4730 break;
4731 } /* End of is {m,n} */
4732
4733 /* Here was a '{', but what followed it didn't form a quantifier. */
4734 /* FALLTHROUGH */
4735
4736 default:
4737 *flagp = flags;
4738 return(ret);
4739 NOT_REACHED; /*NOTREACHED*/
4740 }
4741
4742 /* Here we have a quantifier, and have calculated 'min' and 'max'.
4743 *
4744 * Check and possibly adjust a zero width operand */
4745 if (! (flags & (HASWIDTH|POSTPONED))) {
4746 if (max > REG_INFTY/3) {
4747 ckWARN2reg(RExC_parse,
4748 "%" UTF8f " matches null string many times",
4749 UTF8fARG(UTF, (RExC_parse >= origparse
4750 ? RExC_parse - origparse
4751 : 0),
4752 origparse));
4753 }
4754
4755 /* There's no point in trying to match something 0 length more than
4756 * once except for extra side effects, which we don't have here since
4757 * not POSTPONED */
4758 if (max > 1) {
4759 max = 1;
4760 if (min > max) {
4761 min = max;
4762 }
4763 }
4764 }
4765
4766 /* If this is a code block pass it up */
4767 *flagp |= (flags & POSTPONED);
4768
4769 if (max > 0) {
4770 *flagp |= (flags & HASWIDTH);
4771 if (max == REG_INFTY)
4772 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4773 }
4774
4775 /* 'SIMPLE' operands don't require full generality */
4776 if ((flags&SIMPLE)) {
4777 if (max == REG_INFTY) {
4778 if (min == 0) {
4779 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4780 goto min0_maxINF_wildcard_forbidden;
4781 }
4782
4783 reginsert(pRExC_state, STAR, ret, depth+1);
4784 MARK_NAUGHTY(4);
4785 goto done_main_op;
4786 }
4787 else if (min == 1) {
4788 reginsert(pRExC_state, PLUS, ret, depth+1);
4789 MARK_NAUGHTY(3);
4790 goto done_main_op;
4791 }
4792 }
4793
4794 /* Here, SIMPLE, but not the '*' and '+' special cases */
4795
4796 MARK_NAUGHTY_EXP(2, 2);
4797 reginsert(pRExC_state, CURLY, ret, depth+1);
4798 }
4799 else { /* not SIMPLE */
4800 const regnode_offset w = reg_node(pRExC_state, WHILEM);
4801
4802 FLAGS(REGNODE_p(w)) = 0;
4803 if (! REGTAIL(pRExC_state, ret, w)) {
4804 REQUIRE_BRANCHJ(flagp, 0);
4805 }
4806 if (RExC_use_BRANCHJ) {
4807 reginsert(pRExC_state, LONGJMP, ret, depth+1);
4808 reginsert(pRExC_state, NOTHING, ret, depth+1);
4809 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4810 }
4811 reginsert(pRExC_state, CURLYX, ret, depth+1);
4812 if (RExC_use_BRANCHJ)
4813 /* Go over NOTHING to LONGJMP. */
4814 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4815
4816 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4817 NOTHING)))
4818 {
4819 REQUIRE_BRANCHJ(flagp, 0);
4820 }
4821 RExC_whilem_seen++;
4822 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
4823 }
4824
4825 /* Finish up the CURLY/CURLYX case */
4826 FLAGS(REGNODE_p(ret)) = 0;
4827
4828 ARG1i_SET(REGNODE_p(ret), min);
4829 ARG2i_SET(REGNODE_p(ret), max);
4830
4831 /* if we had a npar_after then we need to increment npar_before,
4832 * we want to track the range of parens we need to reset each iteration
4833 */
4834 if (npar_after!=npar_before) {
4835 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4836 ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4837 } else {
4838 ARG3a_SET(REGNODE_p(ret), 0);
4839 ARG3b_SET(REGNODE_p(ret), 0);
4840 }
4841
4842 done_main_op:
4843
4844 /* Process any greediness modifiers */
4845 if (*RExC_parse == '?') {
4846 nextchar(pRExC_state);
4847 reginsert(pRExC_state, MINMOD, ret, depth+1);
4848 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4849 REQUIRE_BRANCHJ(flagp, 0);
4850 }
4851 }
4852 else if (*RExC_parse == '+') {
4853 regnode_offset ender;
4854 nextchar(pRExC_state);
4855 ender = reg_node(pRExC_state, SUCCEED);
4856 if (! REGTAIL(pRExC_state, ret, ender)) {
4857 REQUIRE_BRANCHJ(flagp, 0);
4858 }
4859 reginsert(pRExC_state, SUSPEND, ret, depth+1);
4860 ender = reg_node(pRExC_state, TAIL);
4861 if (! REGTAIL(pRExC_state, ret, ender)) {
4862 REQUIRE_BRANCHJ(flagp, 0);
4863 }
4864 }
4865
4866 /* Forbid extra quantifiers */
4867 if (isQUANTIFIER(RExC_parse, RExC_end)) {
4868 RExC_parse_inc_by(1);
4869 vFAIL("Nested quantifiers");
4870 }
4871
4872 return(ret);
4873
4874 min0_maxINF_wildcard_forbidden:
4875
4876 /* Here we are in a wildcard match, and the minimum match length is 0, and
4877 * the max could be infinity. This is currently forbidden. The only
4878 * reason is to make it harder to write patterns that take a long long time
4879 * to halt, and because the use of this construct isn't necessary in
4880 * matching Unicode property values */
4881 RExC_parse_inc_by(1);
4882 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4883 subpatterns in regex; marked by <-- HERE in m/%s/
4884 */
4885 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4886 " subpatterns");
4887
4888 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4889 * legal at all in wildcards, so can't get this far */
4890
4891 NOT_REACHED; /*NOTREACHED*/
4892}
4893
4894STATIC bool
4895S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4896 regnode_offset * node_p,
4897 UV * code_point_p,
4898 int * cp_count,
4899 I32 * flagp,
4900 const bool strict,
4901 const U32 depth
4902 )
4903{
4904 /* This routine teases apart the various meanings of \N and returns
4905 * accordingly. The input parameters constrain which meaning(s) is/are valid
4906 * in the current context.
4907 *
4908 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
4909 *
4910 * If <code_point_p> is not NULL, the context is expecting the result to be a
4911 * single code point. If this \N instance turns out to a single code point,
4912 * the function returns TRUE and sets *code_point_p to that code point.
4913 *
4914 * If <node_p> is not NULL, the context is expecting the result to be one of
4915 * the things representable by a regnode. If this \N instance turns out to be
4916 * one such, the function generates the regnode, returns TRUE and sets *node_p
4917 * to point to the offset of that regnode into the regex engine program being
4918 * compiled.
4919 *
4920 * If this instance of \N isn't legal in any context, this function will
4921 * generate a fatal error and not return.
4922 *
4923 * On input, RExC_parse should point to the first char following the \N at the
4924 * time of the call. On successful return, RExC_parse will have been updated
4925 * to point to just after the sequence identified by this routine. Also
4926 * *flagp has been updated as needed.
4927 *
4928 * When there is some problem with the current context and this \N instance,
4929 * the function returns FALSE, without advancing RExC_parse, nor setting
4930 * *node_p, nor *code_point_p, nor *flagp.
4931 *
4932 * If <cp_count> is not NULL, the caller wants to know the length (in code
4933 * points) that this \N sequence matches. This is set, and the input is
4934 * parsed for errors, even if the function returns FALSE, as detailed below.
4935 *
4936 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
4937 *
4938 * Probably the most common case is for the \N to specify a single code point.
4939 * *cp_count will be set to 1, and *code_point_p will be set to that code
4940 * point.
4941 *
4942 * Another possibility is for the input to be an empty \N{}. This is no
4943 * longer accepted, and will generate a fatal error.
4944 *
4945 * Another possibility is for a custom charnames handler to be in effect which
4946 * translates the input name to an empty string. *cp_count will be set to 0.
4947 * *node_p will be set to a generated NOTHING node.
4948 *
4949 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
4950 * set to 0. *node_p will be set to a generated REG_ANY node.
4951 *
4952 * The fifth possibility is that \N resolves to a sequence of more than one
4953 * code points. *cp_count will be set to the number of code points in the
4954 * sequence. *node_p will be set to a generated node returned by this
4955 * function calling S_reg().
4956 *
4957 * The sixth and final possibility is that it is premature to be calling this
4958 * function; the parse needs to be restarted. This can happen when this
4959 * changes from /d to /u rules, or when the pattern needs to be upgraded to
4960 * UTF-8. The latter occurs only when the fifth possibility would otherwise
4961 * be in effect, and is because one of those code points requires the pattern
4962 * to be recompiled as UTF-8. The function returns FALSE, and sets the
4963 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
4964 * happens, the caller needs to desist from continuing parsing, and return
4965 * this information to its caller. This is not set for when there is only one
4966 * code point, as this can be called as part of an ANYOF node, and they can
4967 * store above-Latin1 code points without the pattern having to be in UTF-8.
4968 *
4969 * For non-single-quoted regexes, the tokenizer has resolved character and
4970 * sequence names inside \N{...} into their Unicode values, normalizing the
4971 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
4972 * hex-represented code points in the sequence. This is done there because
4973 * the names can vary based on what charnames pragma is in scope at the time,
4974 * so we need a way to take a snapshot of what they resolve to at the time of
4975 * the original parse. [perl #56444].
4976 *
4977 * That parsing is skipped for single-quoted regexes, so here we may get
4978 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
4979 * like '\N{U+41}', that code point is Unicode, and has to be translated into
4980 * the native character set for non-ASCII platforms. The other possibilities
4981 * are already native, so no translation is done. */
4982
4983 char * endbrace; /* points to '}' following the name */
4984 char * e; /* points to final non-blank before endbrace */
4985 char* p = RExC_parse; /* Temporary */
4986
4987 SV * substitute_parse = NULL;
4988 char *orig_end;
4989 char *save_start;
4990 I32 flags;
4991
4992 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4993
4994 PERL_ARGS_ASSERT_GROK_BSLASH_N;
4995
4996 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
4997 assert(! (node_p && cp_count)); /* At most 1 should be set */
4998
4999 if (cp_count) { /* Initialize return for the most common case */
5000 *cp_count = 1;
5001 }
5002
5003 /* The [^\n] meaning of \N ignores spaces and comments under the /x
5004 * modifier. The other meanings do not (except blanks adjacent to and
5005 * within the braces), so use a temporary until we find out which we are
5006 * being called with */
5007 skip_to_be_ignored_text(pRExC_state, &p,
5008 FALSE /* Don't force to /x */ );
5009
5010 /* Disambiguate between \N meaning a named character versus \N meaning
5011 * [^\n]. The latter is assumed when the {...} following the \N is a legal
5012 * quantifier, or if there is no '{' at all */
5013 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5014 RExC_parse_set(p);
5015 if (cp_count) {
5016 *cp_count = -1;
5017 }
5018
5019 if (! node_p) {
5020 return FALSE;
5021 }
5022
5023 *node_p = reg_node(pRExC_state, REG_ANY);
5024 *flagp |= HASWIDTH|SIMPLE;
5025 MARK_NAUGHTY(1);
5026 return TRUE;
5027 }
5028
5029 /* The test above made sure that the next real character is a '{', but
5030 * under the /x modifier, it could be separated by space (or a comment and
5031 * \n) and this is not allowed (for consistency with \x{...} and the
5032 * tokenizer handling of \N{NAME}). */
5033 if (*RExC_parse != '{') {
5034 vFAIL("Missing braces on \\N{}");
5035 }
5036
5037 RExC_parse_inc_by(1); /* Skip past the '{' */
5038
5039 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5040 if (! endbrace) { /* no trailing brace */
5041 vFAIL2("Missing right brace on \\%c{}", 'N');
5042 }
5043
5044 /* Here, we have decided it should be a named character or sequence. These
5045 * imply Unicode semantics */
5046 REQUIRE_UNI_RULES(flagp, FALSE);
5047
5048 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5049 * nothing at all (not allowed under strict) */
5050 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5051 RExC_parse_set(endbrace);
5052 if (strict) {
5053 RExC_parse_inc_by(1); /* Position after the "}" */
5054 vFAIL("Zero length \\N{}");
5055 }
5056
5057 if (cp_count) {
5058 *cp_count = 0;
5059 }
5060 nextchar(pRExC_state);
5061 if (! node_p) {
5062 return FALSE;
5063 }
5064
5065 *node_p = reg_node(pRExC_state, NOTHING);
5066 return TRUE;
5067 }
5068
5069 while (isBLANK(*RExC_parse)) {
5070 RExC_parse_inc_by(1);
5071 }
5072
5073 e = endbrace;
5074 while (RExC_parse < e && isBLANK(*(e-1))) {
5075 e--;
5076 }
5077
5078 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5079
5080 /* Here, the name isn't of the form U+.... This can happen if the
5081 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
5082 * is the time to find out what the name means */
5083
5084 const STRLEN name_len = e - RExC_parse;
5085 SV * value_sv; /* What does this name evaluate to */
5086 SV ** value_svp;
5087 const U8 * value; /* string of name's value */
5088 STRLEN value_len; /* and its length */
5089
5090 /* RExC_unlexed_names is a hash of names that weren't evaluated by
5091 * toke.c, and their values. Make sure is initialized */
5092 if (! RExC_unlexed_names) {
5093 RExC_unlexed_names = newHV();
5094 }
5095
5096 /* If we have already seen this name in this pattern, use that. This
5097 * allows us to only call the charnames handler once per name per
5098 * pattern. A broken or malicious handler could return something
5099 * different each time, which could cause the results to vary depending
5100 * on if something gets added or subtracted from the pattern that
5101 * causes the number of passes to change, for example */
5102 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5103 name_len, 0)))
5104 {
5105 value_sv = *value_svp;
5106 }
5107 else { /* Otherwise we have to go out and get the name */
5108 const char * error_msg = NULL;
5109 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5110 UTF,
5111 &error_msg);
5112 if (error_msg) {
5113 RExC_parse_set(endbrace);
5114 vFAIL(error_msg);
5115 }
5116
5117 /* If no error message, should have gotten a valid return */
5118 assert (value_sv);
5119
5120 /* Save the name's meaning for later use */
5121 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5122 value_sv, 0))
5123 {
5124 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5125 }
5126 }
5127
5128 /* Here, we have the value the name evaluates to in 'value_sv' */
5129 value = (U8 *) SvPV(value_sv, value_len);
5130
5131 /* See if the result is one code point vs 0 or multiple */
5132 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5133 ? UTF8SKIP(value)
5134 : 1)))
5135 {
5136 /* Here, exactly one code point. If that isn't what is wanted,
5137 * fail */
5138 if (! code_point_p) {
5139 RExC_parse_set(p);
5140 return FALSE;
5141 }
5142
5143 /* Convert from string to numeric code point */
5144 *code_point_p = (SvUTF8(value_sv))
5145 ? valid_utf8_to_uvchr(value, NULL)
5146 : *value;
5147
5148 /* Have parsed this entire single code point \N{...}. *cp_count
5149 * has already been set to 1, so don't do it again. */
5150 RExC_parse_set(endbrace);
5151 nextchar(pRExC_state);
5152 return TRUE;
5153 } /* End of is a single code point */
5154
5155 /* Count the code points, if caller desires. The API says to do this
5156 * even if we will later return FALSE */
5157 if (cp_count) {
5158 *cp_count = 0;
5159
5160 *cp_count = (SvUTF8(value_sv))
5161 ? utf8_length(value, value + value_len)
5162 : value_len;
5163 }
5164
5165 /* Fail if caller doesn't want to handle a multi-code-point sequence.
5166 * But don't back the pointer up if the caller wants to know how many
5167 * code points there are (they need to handle it themselves in this
5168 * case). */
5169 if (! node_p) {
5170 if (! cp_count) {
5171 RExC_parse_set(p);
5172 }
5173 return FALSE;
5174 }
5175
5176 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5177 * reg recursively to parse it. That way, it retains its atomicness,
5178 * while not having to worry about any special handling that some code
5179 * points may have. */
5180
5181 substitute_parse = newSVpvs("?:");
5182 sv_catsv(substitute_parse, value_sv);
5183 sv_catpv(substitute_parse, ")");
5184
5185 /* The value should already be native, so no need to convert on EBCDIC
5186 * platforms.*/
5187 assert(! RExC_recode_x_to_native);
5188
5189 }
5190 else { /* \N{U+...} */
5191 Size_t count = 0; /* code point count kept internally */
5192
5193 /* We can get to here when the input is \N{U+...} or when toke.c has
5194 * converted a name to the \N{U+...} form. This include changing a
5195 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5196
5197 RExC_parse_inc_by(2); /* Skip past the 'U+' */
5198
5199 /* Code points are separated by dots. The '}' terminates the whole
5200 * thing. */
5201
5202 do { /* Loop until the ending brace */
5203 I32 flags = PERL_SCAN_SILENT_OVERFLOW
5204 | PERL_SCAN_SILENT_ILLDIGIT
5205 | PERL_SCAN_NOTIFY_ILLDIGIT
5206 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5207 | PERL_SCAN_DISALLOW_PREFIX;
5208 STRLEN len = e - RExC_parse;
5209 NV overflow_value;
5210 char * start_digit = RExC_parse;
5211 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5212
5213 if (len == 0) {
5214 RExC_parse_inc_by(1);
5215 bad_NU:
5216 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5217 }
5218
5219 RExC_parse_inc_by(len);
5220
5221 if (cp > MAX_LEGAL_CP) {
5222 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5223 }
5224
5225 if (RExC_parse >= e) { /* Got to the closing '}' */
5226 if (count) {
5227 goto do_concat;
5228 }
5229
5230 /* Here, is a single code point; fail if doesn't want that */
5231 if (! code_point_p) {
5232 RExC_parse_set(p);
5233 return FALSE;
5234 }
5235
5236 /* A single code point is easy to handle; just return it */
5237 *code_point_p = UNI_TO_NATIVE(cp);
5238 RExC_parse_set(endbrace);
5239 nextchar(pRExC_state);
5240 return TRUE;
5241 }
5242
5243 /* Here, the parse stopped bfore the ending brace. This is legal
5244 * only if that character is a dot separating code points, like a
5245 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5246 * So the next character must be a dot (and the one after that
5247 * can't be the ending brace, or we'd have something like
5248 * \N{U+100.} )
5249 * */
5250 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5251 /*point to after 1st invalid */
5252 RExC_parse_incf(RExC_orig_utf8);
5253 /*Guard against malformed utf8*/
5254 RExC_parse_set(MIN(e, RExC_parse));
5255 goto bad_NU;
5256 }
5257
5258 /* Here, looks like its really a multiple character sequence. Fail
5259 * if that's not what the caller wants. But continue with counting
5260 * and error checking if they still want a count */
5261 if (! node_p && ! cp_count) {
5262 return FALSE;
5263 }
5264
5265 /* What is done here is to convert this to a sub-pattern of the
5266 * form \x{char1}\x{char2}... and then call reg recursively to
5267 * parse it (enclosing in "(?: ... )" ). That way, it retains its
5268 * atomicness, while not having to worry about special handling
5269 * that some code points may have. We don't create a subpattern,
5270 * but go through the motions of code point counting and error
5271 * checking, if the caller doesn't want a node returned. */
5272
5273 if (node_p && ! substitute_parse) {
5274 substitute_parse = newSVpvs("?:");
5275 }
5276
5277 do_concat:
5278
5279 if (node_p) {
5280 /* Convert to notation the rest of the code understands */
5281 sv_catpvs(substitute_parse, "\\x{");
5282 sv_catpvn(substitute_parse, start_digit,
5283 RExC_parse - start_digit);
5284 sv_catpvs(substitute_parse, "}");
5285 }
5286
5287 /* Move to after the dot (or ending brace the final time through.)
5288 * */
5289 RExC_parse_inc_by(1);
5290 count++;
5291
5292 } while (RExC_parse < e);
5293
5294 if (! node_p) { /* Doesn't want the node */
5295 assert (cp_count);
5296
5297 *cp_count = count;
5298 return FALSE;
5299 }
5300
5301 sv_catpvs(substitute_parse, ")");
5302
5303 /* The values are Unicode, and therefore have to be converted to native
5304 * on a non-Unicode (meaning non-ASCII) platform. */
5305 SET_recode_x_to_native(1);
5306 }
5307
5308 /* Here, we have the string the name evaluates to, ready to be parsed,
5309 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5310 * constructs. This can be called from within a substitute parse already.
5311 * The error reporting mechanism doesn't work for 2 levels of this, but the
5312 * code above has validated this new construct, so there should be no
5313 * errors generated by the below. And this isn't an exact copy, so the
5314 * mechanism to seamlessly deal with this won't work, so turn off warnings
5315 * during it */
5316 save_start = RExC_start;
5317 orig_end = RExC_end;
5318
5319 RExC_start = SvPVX(substitute_parse);
5320 RExC_parse_set(RExC_start);
5321 RExC_end = RExC_parse + SvCUR(substitute_parse);
5322 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5323
5324 *node_p = reg(pRExC_state, 1, &flags, depth+1);
5325
5326 /* Restore the saved values */
5327 RESTORE_WARNINGS;
5328 RExC_start = save_start;
5329 RExC_parse_set(endbrace);
5330 RExC_end = orig_end;
5331 SET_recode_x_to_native(0);
5332
5333 SvREFCNT_dec_NN(substitute_parse);
5334
5335 if (! *node_p) {
5336 RETURN_FAIL_ON_RESTART(flags, flagp);
5337 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5338 (UV) flags);
5339 }
5340 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5341
5342 nextchar(pRExC_state);
5343
5344 return TRUE;
5345}
5346
5347
5348STATIC U8
5349S_compute_EXACTish(RExC_state_t *pRExC_state)
5350{
5351 U8 op;
5352
5353 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5354
5355 if (! FOLD) {
5356 return (LOC)
5357 ? EXACTL
5358 : EXACT;
5359 }
5360
5361 op = get_regex_charset(RExC_flags);
5362 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5363 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5364 been, so there is no hole */
5365 }
5366
5367 return op + EXACTF;
5368}
5369
5370/* Parse backref decimal value, unless it's too big to sensibly be a backref,
5371 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5372
5373static I32
5374S_backref_value(char *p, char *e)
5375{
5376 const char* endptr = e;
5377 UV val;
5378 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5379 return (I32)val;
5380 return I32_MAX;
5381}
5382
5383
5384/*
5385 - regatom - the lowest level
5386
5387 Try to identify anything special at the start of the current parse position.
5388 If there is, then handle it as required. This may involve generating a
5389 single regop, such as for an assertion; or it may involve recursing, such as
5390 to handle a () structure.
5391
5392 If the string doesn't start with something special then we gobble up
5393 as much literal text as we can. If we encounter a quantifier, we have to
5394 back off the final literal character, as that quantifier applies to just it
5395 and not to the whole string of literals.
5396
5397 Once we have been able to handle whatever type of thing started the
5398 sequence, we return the offset into the regex engine program being compiled
5399 at which any next regnode should be placed.
5400
5401 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5402 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5403 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5404 Otherwise does not return 0.
5405
5406 Note: we have to be careful with escapes, as they can be both literal
5407 and special, and in the case of \10 and friends, context determines which.
5408
5409 A summary of the code structure is:
5410
5411 switch (first_byte) {
5412 cases for each special:
5413 handle this special;
5414 break;
5415 case '\\':
5416 switch (2nd byte) {
5417 cases for each unambiguous special:
5418 handle this special;
5419 break;
5420 cases for each ambiguous special/literal:
5421 disambiguate;
5422 if (special) handle here
5423 else goto defchar;
5424 default: // unambiguously literal:
5425 goto defchar;
5426 }
5427 default: // is a literal char
5428 // FALL THROUGH
5429 defchar:
5430 create EXACTish node for literal;
5431 while (more input and node isn't full) {
5432 switch (input_byte) {
5433 cases for each special;
5434 make sure parse pointer is set so that the next call to
5435 regatom will see this special first
5436 goto loopdone; // EXACTish node terminated by prev. char
5437 default:
5438 append char to EXACTISH node;
5439 }
5440 get next input byte;
5441 }
5442 loopdone:
5443 }
5444 return the generated node;
5445
5446 Specifically there are two separate switches for handling
5447 escape sequences, with the one for handling literal escapes requiring
5448 a dummy entry for all of the special escapes that are actually handled
5449 by the other.
5450
5451*/
5452
5453STATIC regnode_offset
5454S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5455{
5456 regnode_offset ret = 0;
5457 I32 flags = 0;
5458 char *atom_parse_start;
5459 U8 op;
5460 int invert = 0;
5461
5462 DECLARE_AND_GET_RE_DEBUG_FLAGS;
5463
5464 *flagp = 0; /* Initialize. */
5465
5466 DEBUG_PARSE("atom");
5467
5468 PERL_ARGS_ASSERT_REGATOM;
5469
5470 tryagain:
5471 atom_parse_start = RExC_parse;
5472 assert(RExC_parse < RExC_end);
5473 switch ((U8)*RExC_parse) {
5474 case '^':
5475 RExC_seen_zerolen++;
5476 nextchar(pRExC_state);
5477 if (RExC_flags & RXf_PMf_MULTILINE)
5478 ret = reg_node(pRExC_state, MBOL);
5479 else
5480 ret = reg_node(pRExC_state, SBOL);
5481 break;
5482 case '$':
5483 nextchar(pRExC_state);
5484 if (*RExC_parse)
5485 RExC_seen_zerolen++;
5486 if (RExC_flags & RXf_PMf_MULTILINE)
5487 ret = reg_node(pRExC_state, MEOL);
5488 else
5489 ret = reg_node(pRExC_state, SEOL);
5490 break;
5491 case '.':
5492 nextchar(pRExC_state);
5493 if (RExC_flags & RXf_PMf_SINGLELINE)
5494 ret = reg_node(pRExC_state, SANY);
5495 else
5496 ret = reg_node(pRExC_state, REG_ANY);
5497 *flagp |= HASWIDTH|SIMPLE;
5498 MARK_NAUGHTY(1);
5499 break;
5500 case '[':
5501 {
5502 char * const cc_parse_start = ++RExC_parse;
5503 ret = regclass(pRExC_state, flagp, depth+1,
5504 FALSE, /* means parse the whole char class */
5505 TRUE, /* allow multi-char folds */
5506 FALSE, /* don't silence non-portable warnings. */
5507 (bool) RExC_strict,
5508 TRUE, /* Allow an optimized regnode result */
5509 NULL);
5510 if (ret == 0) {
5511 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5512 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5513 (UV) *flagp);
5514 }
5515 if (*RExC_parse != ']') {
5516 RExC_parse_set(cc_parse_start);
5517 vFAIL("Unmatched [");
5518 }
5519 nextchar(pRExC_state);
5520 break;
5521 }
5522 case '(':
5523 nextchar(pRExC_state);
5524 ret = reg(pRExC_state, 2, &flags, depth+1);
5525 if (ret == 0) {
5526 if (flags & TRYAGAIN) {
5527 if (RExC_parse >= RExC_end) {
5528 /* Make parent create an empty node if needed. */
5529 *flagp |= TRYAGAIN;
5530 return(0);
5531 }
5532 goto tryagain;
5533 }
5534 RETURN_FAIL_ON_RESTART(flags, flagp);
5535 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5536 (UV) flags);
5537 }
5538 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5539 break;
5540 case '|':
5541 case ')':
5542 if (flags & TRYAGAIN) {
5543 *flagp |= TRYAGAIN;
5544 return 0;
5545 }
5546 vFAIL("Internal urp");
5547 /* Supposed to be caught earlier. */
5548 break;
5549 case '?':
5550 case '+':
5551 case '*':
5552 RExC_parse_inc_by(1);
5553 vFAIL("Quantifier follows nothing");
5554 break;
5555 case '\\':
5556 /* Special Escapes
5557
5558 This switch handles escape sequences that resolve to some kind
5559 of special regop and not to literal text. Escape sequences that
5560 resolve to literal text are handled below in the switch marked
5561 "Literal Escapes".
5562
5563 Every entry in this switch *must* have a corresponding entry
5564 in the literal escape switch. However, the opposite is not
5565 required, as the default for this switch is to jump to the
5566 literal text handling code.
5567 */
5568 RExC_parse_inc_by(1);
5569 switch ((U8)*RExC_parse) {
5570 /* Special Escapes */
5571 case 'A':
5572 RExC_seen_zerolen++;
5573 /* Under wildcards, this is changed to match \n; should be
5574 * invisible to the user, as they have to compile under /m */
5575 if (RExC_pm_flags & PMf_WILDCARD) {
5576 ret = reg_node(pRExC_state, MBOL);
5577 }
5578 else {
5579 ret = reg_node(pRExC_state, SBOL);
5580 /* SBOL is shared with /^/ so we set the flags so we can tell
5581 * /\A/ from /^/ in split. */
5582 FLAGS(REGNODE_p(ret)) = 1;
5583 }
5584 goto finish_meta_pat;
5585 case 'G':
5586 if (RExC_pm_flags & PMf_WILDCARD) {
5587 RExC_parse_inc_by(1);
5588 /* diag_listed_as: Use of %s is not allowed in Unicode property
5589 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5590 */
5591 vFAIL("Use of '\\G' is not allowed in Unicode property"
5592 " wildcard subpatterns");
5593 }
5594 ret = reg_node(pRExC_state, GPOS);
5595 RExC_seen |= REG_GPOS_SEEN;
5596 goto finish_meta_pat;
5597 case 'K':
5598 if (!RExC_in_lookaround) {
5599 RExC_seen_zerolen++;
5600 ret = reg_node(pRExC_state, KEEPS);
5601 /* XXX:dmq : disabling in-place substitution seems to
5602 * be necessary here to avoid cases of memory corruption, as
5603 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5604 */
5605 RExC_seen |= REG_LOOKBEHIND_SEEN;
5606 goto finish_meta_pat;
5607 }
5608 else {
5609 ++RExC_parse; /* advance past the 'K' */
5610 vFAIL("\\K not permitted in lookahead/lookbehind");
5611 }
5612 case 'Z':
5613 if (RExC_pm_flags & PMf_WILDCARD) {
5614 /* See comment under \A above */
5615 ret = reg_node(pRExC_state, MEOL);
5616 }
5617 else {
5618 ret = reg_node(pRExC_state, SEOL);
5619 }
5620 RExC_seen_zerolen++; /* Do not optimize RE away */
5621 goto finish_meta_pat;
5622 case 'z':
5623 if (RExC_pm_flags & PMf_WILDCARD) {
5624 /* See comment under \A above */
5625 ret = reg_node(pRExC_state, MEOL);
5626 }
5627 else {
5628 ret = reg_node(pRExC_state, EOS);
5629 }
5630 RExC_seen_zerolen++; /* Do not optimize RE away */
5631 goto finish_meta_pat;
5632 case 'C':
5633 vFAIL("\\C no longer supported");
5634 case 'X':
5635 ret = reg_node(pRExC_state, CLUMP);
5636 *flagp |= HASWIDTH;
5637 goto finish_meta_pat;
5638
5639 case 'B':
5640 invert = 1;
5641 /* FALLTHROUGH */
5642 case 'b':
5643 {
5644 U8 flags = 0;
5645 regex_charset charset = get_regex_charset(RExC_flags);
5646
5647 RExC_seen_zerolen++;
5648 RExC_seen |= REG_LOOKBEHIND_SEEN;
5649 op = BOUND + charset;
5650
5651 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5652 flags = TRADITIONAL_BOUND;
5653 if (op > BOUNDA) { /* /aa is same as /a */
5654 op = BOUNDA;
5655 }
5656 }
5657 else {
5658 STRLEN length;
5659 char name = *RExC_parse;
5660 char * endbrace = (char *) memchr(RExC_parse, '}',
5661 RExC_end - RExC_parse);
5662 char * e = endbrace;
5663
5664 RExC_parse_inc_by(2);
5665
5666 if (! endbrace) {
5667 vFAIL2("Missing right brace on \\%c{}", name);
5668 }
5669
5670 while (isBLANK(*RExC_parse)) {
5671 RExC_parse_inc_by(1);
5672 }
5673
5674 while (RExC_parse < e && isBLANK(*(e - 1))) {
5675 e--;
5676 }
5677
5678 if (e == RExC_parse) {
5679 RExC_parse_set(endbrace + 1); /* After the '}' */
5680 vFAIL2("Empty \\%c{}", name);
5681 }
5682
5683 length = e - RExC_parse;
5684
5685 switch (*RExC_parse) {
5686 case 'g':
5687 if ( length != 1
5688 && (memNEs(RExC_parse + 1, length - 1, "cb")))
5689 {
5690 goto bad_bound_type;
5691 }
5692 flags = GCB_BOUND;
5693 break;
5694 case 'l':
5695 if (length != 2 || *(RExC_parse + 1) != 'b') {
5696 goto bad_bound_type;
5697 }
5698 flags = LB_BOUND;
5699 break;
5700 case 's':
5701 if (length != 2 || *(RExC_parse + 1) != 'b') {
5702 goto bad_bound_type;
5703 }
5704 flags = SB_BOUND;
5705 break;
5706 case 'w':
5707 if (length != 2 || *(RExC_parse + 1) != 'b') {
5708 goto bad_bound_type;
5709 }
5710 flags = WB_BOUND;
5711 break;
5712 default:
5713 bad_bound_type:
5714 RExC_parse_set(e);
5715 vFAIL2utf8f(
5716 "'%" UTF8f "' is an unknown bound type",
5717 UTF8fARG(UTF, length, e - length));
5718 NOT_REACHED; /*NOTREACHED*/
5719 }
5720 RExC_parse_set(endbrace);
5721 REQUIRE_UNI_RULES(flagp, 0);
5722
5723 if (op == BOUND) {
5724 op = BOUNDU;
5725 }
5726 else if (op >= BOUNDA) { /* /aa is same as /a */
5727 op = BOUNDU;
5728 length += 4;
5729
5730 /* Don't have to worry about UTF-8, in this message because
5731 * to get here the contents of the \b must be ASCII */
5732 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
5733 "Using /u for '%.*s' instead of /%s",
5734 (unsigned) length,
5735 endbrace - length + 1,
5736 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5737 ? ASCII_RESTRICT_PAT_MODS
5738 : ASCII_MORE_RESTRICT_PAT_MODS);
5739 }
5740 }
5741
5742 if (op == BOUND) {
5743 RExC_seen_d_op = TRUE;
5744 }
5745 else if (op == BOUNDL) {
5746 RExC_contains_locale = 1;
5747 }
5748
5749 if (invert) {
5750 op += NBOUND - BOUND;
5751 }
5752
5753 ret = reg_node(pRExC_state, op);
5754 FLAGS(REGNODE_p(ret)) = flags;
5755
5756 goto finish_meta_pat;
5757 }
5758
5759 case 'R':
5760 ret = reg_node(pRExC_state, LNBREAK);
5761 *flagp |= HASWIDTH|SIMPLE;
5762 goto finish_meta_pat;
5763
5764 case 'd':
5765 case 'D':
5766 case 'h':
5767 case 'H':
5768 case 'p':
5769 case 'P':
5770 case 's':
5771 case 'S':
5772 case 'v':
5773 case 'V':
5774 case 'w':
5775 case 'W':
5776 /* These all have the same meaning inside [brackets], and it knows
5777 * how to do the best optimizations for them. So, pretend we found
5778 * these within brackets, and let it do the work */
5779 RExC_parse--;
5780
5781 ret = regclass(pRExC_state, flagp, depth+1,
5782 TRUE, /* means just parse this element */
5783 FALSE, /* don't allow multi-char folds */
5784 FALSE, /* don't silence non-portable warnings. It
5785 would be a bug if these returned
5786 non-portables */
5787 (bool) RExC_strict,
5788 TRUE, /* Allow an optimized regnode result */
5789 NULL);
5790 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5791 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5792 * multi-char folds are allowed. */
5793 if (!ret)
5794 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5795 (UV) *flagp);
5796
5797 RExC_parse--; /* regclass() leaves this one too far ahead */
5798
5799 finish_meta_pat:
5800 /* The escapes above that don't take a parameter can't be
5801 * followed by a '{'. But 'pX', 'p{foo}' and
5802 * correspondingly 'P' can be */
5803 if ( RExC_parse - atom_parse_start == 1
5804 && UCHARAT(RExC_parse + 1) == '{'
5805 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5806 {
5807 RExC_parse_inc_by(2);
5808 vFAIL("Unescaped left brace in regex is illegal here");
5809 }
5810 nextchar(pRExC_state);
5811 break;
5812 case 'N':
5813 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5814 * \N{...} evaluates to a sequence of more than one code points).
5815 * The function call below returns a regnode, which is our result.
5816 * The parameters cause it to fail if the \N{} evaluates to a
5817 * single code point; we handle those like any other literal. The
5818 * reason that the multicharacter case is handled here and not as
5819 * part of the EXACtish code is because of quantifiers. In
5820 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5821 * this way makes that Just Happen. dmq.
5822 * join_exact() will join this up with adjacent EXACTish nodes
5823 * later on, if appropriate. */
5824 ++RExC_parse;
5825 if (grok_bslash_N(pRExC_state,
5826 &ret, /* Want a regnode returned */
5827 NULL, /* Fail if evaluates to a single code
5828 point */
5829 NULL, /* Don't need a count of how many code
5830 points */
5831 flagp,
5832 RExC_strict,
5833 depth)
5834 ) {
5835 break;
5836 }
5837
5838 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5839
5840 /* Here, evaluates to a single code point. Go get that */
5841 RExC_parse_set(atom_parse_start);
5842 goto defchar;
5843
5844 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5845 parse_named_seq: /* Also handle non-numeric \g{...} */
5846 {
5847 char ch;
5848 if ( RExC_parse >= RExC_end - 1
5849 || (( ch = RExC_parse[1]) != '<'
5850 && ch != '\''
5851 && ch != '{'))
5852 {
5853 RExC_parse_inc_by(1);
5854 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5855 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5856 } else {
5857 RExC_parse_inc_by(2);
5858 if (ch == '{') {
5859 while (isBLANK(*RExC_parse)) {
5860 RExC_parse_inc_by(1);
5861 }
5862 }
5863 ret = handle_named_backref(pRExC_state,
5864 flagp,
5865 atom_parse_start,
5866 (ch == '<')
5867 ? '>'
5868 : (ch == '{')
5869 ? '}'
5870 : '\'');
5871 }
5872 break;
5873 }
5874 case 'g':
5875 case '1': case '2': case '3': case '4':
5876 case '5': case '6': case '7': case '8': case '9':
5877 {
5878 I32 num;
5879 char * endbrace = NULL;
5880 char * s = RExC_parse;
5881 char * e = RExC_end;
5882
5883 if (*s == 'g') {
5884 bool isrel = 0;
5885
5886 s++;
5887 if (*s == '{') {
5888 endbrace = (char *) memchr(s, '}', RExC_end - s);
5889 if (! endbrace ) {
5890
5891 /* Missing '}'. Position after the number to give
5892 * a better indication to the user of where the
5893 * problem is. */
5894 s++;
5895 if (*s == '-') {
5896 s++;
5897 }
5898
5899 /* If it looks to be a name and not a number, go
5900 * handle it there */
5901 if (! isDIGIT(*s)) {
5902 goto parse_named_seq;
5903 }
5904
5905 do {
5906 s++;
5907 } while isDIGIT(*s);
5908
5909 RExC_parse_set(s);
5910 vFAIL("Unterminated \\g{...} pattern");
5911 }
5912
5913 s++; /* Past the '{' */
5914
5915 while (isBLANK(*s)) {
5916 s++;
5917 }
5918
5919 /* Ignore trailing blanks */
5920 e = endbrace;
5921 while (s < e && isBLANK(*(e - 1))) {
5922 e--;
5923 }
5924 }
5925
5926 /* Here, have isolated the meat of the construct from any
5927 * surrounding braces */
5928
5929 if (*s == '-') {
5930 isrel = 1;
5931 s++;
5932 }
5933
5934 if (endbrace && !isDIGIT(*s)) {
5935 goto parse_named_seq;
5936 }
5937
5938 RExC_parse_set(s);
5939 num = S_backref_value(RExC_parse, RExC_end);
5940 if (num == 0)
5941 vFAIL("Reference to invalid group 0");
5942 else if (num == I32_MAX) {
5943 if (isDIGIT(*RExC_parse))
5944 vFAIL("Reference to nonexistent group");
5945 else
5946 vFAIL("Unterminated \\g... pattern");
5947 }
5948
5949 if (isrel) {
5950 num = RExC_npar - num;
5951 if (num < 1)
5952 vFAIL("Reference to nonexistent or unclosed group");
5953 }
5954 else
5955 if (num < RExC_logical_npar) {
5956 num = RExC_logical_to_parno[num];
5957 }
5958 else
5959 if (ALL_PARENS_COUNTED) {
5960 if (num < RExC_logical_total_parens)
5961 num = RExC_logical_to_parno[num];
5962 else {
5963 num = -1;
5964 }
5965 }
5966 else{
5967 REQUIRE_PARENS_PASS;
5968 }
5969 }
5970 else {
5971 num = S_backref_value(RExC_parse, RExC_end);
5972 /* bare \NNN might be backref or octal - if it is larger
5973 * than or equal RExC_npar then it is assumed to be an
5974 * octal escape. Note RExC_npar is +1 from the actual
5975 * number of parens. */
5976 /* Note we do NOT check if num == I32_MAX here, as that is
5977 * handled by the RExC_npar check */
5978
5979 if ( /* any numeric escape < 10 is always a backref */
5980 num > 9
5981 /* any numeric escape < RExC_npar is a backref */
5982 && num >= RExC_logical_npar
5983 /* cannot be an octal escape if it starts with [89]
5984 * */
5985 && ! inRANGE(*RExC_parse, '8', '9')
5986 ) {
5987 /* Probably not meant to be a backref, instead likely
5988 * to be an octal character escape, e.g. \35 or \777.
5989 * The above logic should make it obvious why using
5990 * octal escapes in patterns is problematic. - Yves */
5991 RExC_parse_set(atom_parse_start);
5992 goto defchar;
5993 }
5994 if (num < RExC_logical_npar) {
5995 num = RExC_logical_to_parno[num];
5996 }
5997 else
5998 if (ALL_PARENS_COUNTED) {
5999 if (num < RExC_logical_total_parens) {
6000 num = RExC_logical_to_parno[num];
6001 } else {
6002 num = -1;
6003 }
6004 } else {
6005 REQUIRE_PARENS_PASS;
6006 }
6007 }
6008
6009 /* At this point RExC_parse points at a numeric escape like
6010 * \12 or \88 or the digits in \g{34} or \g34 or something
6011 * similar, which we should NOT treat as an octal escape. It
6012 * may or may not be a valid backref escape. For instance
6013 * \88888888 is unlikely to be a valid backref.
6014 *
6015 * We've already figured out what value the digits represent.
6016 * Now, move the parse to beyond them. */
6017 if (endbrace) {
6018 RExC_parse_set(endbrace + 1);
6019 }
6020 else while (isDIGIT(*RExC_parse)) {
6021 RExC_parse_inc_by(1);
6022 }
6023 if (num < 0)
6024 vFAIL("Reference to nonexistent group");
6025
6026 if (num >= (I32)RExC_npar) {
6027 /* It might be a forward reference; we can't fail until we
6028 * know, by completing the parse to get all the groups, and
6029 * then reparsing */
6030 if (ALL_PARENS_COUNTED) {
6031 if (num >= RExC_total_parens) {
6032 vFAIL("Reference to nonexistent group");
6033 }
6034 }
6035 else {
6036 REQUIRE_PARENS_PASS;
6037 }
6038 }
6039 RExC_sawback = 1;
6040 ret = reg2node(pRExC_state,
6041 ((! FOLD)
6042 ? REF
6043 : (ASCII_FOLD_RESTRICTED)
6044 ? REFFA
6045 : (AT_LEAST_UNI_SEMANTICS)
6046 ? REFFU
6047 : (LOC)
6048 ? REFFL
6049 : REFF),
6050 num, RExC_nestroot);
6051 if (RExC_nestroot && num >= RExC_nestroot)
6052 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6053 if (OP(REGNODE_p(ret)) == REFF) {
6054 RExC_seen_d_op = TRUE;
6055 }
6056 *flagp |= HASWIDTH;
6057
6058 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6059 FALSE /* Don't force to /x */ );
6060 }
6061 break;
6062 case '\0':
6063 if (RExC_parse >= RExC_end)
6064 FAIL("Trailing \\");
6065 /* FALLTHROUGH */
6066 default:
6067 /* Do not generate "unrecognized" warnings here, we fall
6068 back into the quick-grab loop below */
6069 RExC_parse_set(atom_parse_start);
6070 goto defchar;
6071 } /* end of switch on a \foo sequence */
6072 break;
6073
6074 case '#':
6075
6076 /* '#' comments should have been spaced over before this function was
6077 * called */
6078 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6079 /*
6080 if (RExC_flags & RXf_PMf_EXTENDED) {
6081 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6082 if (RExC_parse < RExC_end)
6083 goto tryagain;
6084 }
6085 */
6086
6087 /* FALLTHROUGH */
6088
6089 default:
6090 defchar: {
6091
6092 /* Here, we have determined that the next thing is probably a
6093 * literal character. RExC_parse points to the first byte of its
6094 * definition. (It still may be an escape sequence that evaluates
6095 * to a single character) */
6096
6097 STRLEN len = 0;
6098 UV ender = 0;
6099 char *p;
6100 char *s, *old_s = NULL, *old_old_s = NULL;
6101 char *s0;
6102 U32 max_string_len = 255;
6103
6104 /* We may have to reparse the node, artificially stopping filling
6105 * it early, based on info gleaned in the first parse. This
6106 * variable gives where we stop. Make it above the normal stopping
6107 * place first time through; otherwise it would stop too early */
6108 U32 upper_fill = max_string_len + 1;
6109
6110 /* We start out as an EXACT node, even if under /i, until we find a
6111 * character which is in a fold. The algorithm now segregates into
6112 * separate nodes, characters that fold from those that don't under
6113 * /i. (This hopefully will create nodes that are fixed strings
6114 * even under /i, giving the optimizer something to grab on to.)
6115 * So, if a node has something in it and the next character is in
6116 * the opposite category, that node is closed up, and the function
6117 * returns. Then regatom is called again, and a new node is
6118 * created for the new category. */
6119 U8 node_type = EXACT;
6120
6121 /* Assume the node will be fully used; the excess is given back at
6122 * the end. Under /i, we may need to temporarily add the fold of
6123 * an extra character or two at the end to check for splitting
6124 * multi-char folds, so allocate extra space for that. We can't
6125 * make any other length assumptions, as a byte input sequence
6126 * could shrink down. */
6127 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6128 + ((! FOLD)
6129 ? 0
6130 : 2 * ((UTF)
6131 ? UTF8_MAXBYTES_CASE
6132 /* Max non-UTF-8 expansion is 2 */ : 2)));
6133
6134 bool next_is_quantifier;
6135 char * oldp = NULL;
6136
6137 /* We can convert EXACTF nodes to EXACTFU if they contain only
6138 * characters that match identically regardless of the target
6139 * string's UTF8ness. The reason to do this is that EXACTF is not
6140 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6141 * runtime.
6142 *
6143 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6144 * contain only above-Latin1 characters (hence must be in UTF8),
6145 * which don't participate in folds with Latin1-range characters,
6146 * as the latter's folds aren't known until runtime. */
6147 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6148
6149 /* Single-character EXACTish nodes are almost always SIMPLE. This
6150 * allows us to override this as encountered */
6151 U8 maybe_SIMPLE = SIMPLE;
6152
6153 /* Does this node contain something that can't match unless the
6154 * target string is (also) in UTF-8 */
6155 bool requires_utf8_target = FALSE;
6156
6157 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6158 bool has_ss = FALSE;
6159
6160 /* So is the MICRO SIGN */
6161 bool has_micro_sign = FALSE;
6162
6163 /* Set when we fill up the current node and there is still more
6164 * text to process */
6165 bool overflowed;
6166
6167 /* Allocate an EXACT node. The node_type may change below to
6168 * another EXACTish node, but since the size of the node doesn't
6169 * change, it works */
6170 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6171 FILL_NODE(ret, node_type);
6172 RExC_emit += NODE_STEP_REGNODE;
6173
6174 s = STRING(REGNODE_p(ret));
6175
6176 s0 = s;
6177
6178 reparse:
6179
6180 p = RExC_parse;
6181 len = 0;
6182 s = s0;
6183 node_type = EXACT;
6184 oldp = NULL;
6185 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6186 maybe_SIMPLE = SIMPLE;
6187 requires_utf8_target = FALSE;
6188 has_ss = FALSE;
6189 has_micro_sign = FALSE;
6190
6191 continue_parse:
6192
6193 /* This breaks under rare circumstances. If folding, we do not
6194 * want to split a node at a character that is a non-final in a
6195 * multi-char fold, as an input string could just happen to want to
6196 * match across the node boundary. The code at the end of the loop
6197 * looks for this, and backs off until it finds not such a
6198 * character, but it is possible (though extremely, extremely
6199 * unlikely) for all characters in the node to be non-final fold
6200 * ones, in which case we just leave the node fully filled, and
6201 * hope that it doesn't match the string in just the wrong place */
6202
6203 assert( ! UTF /* Is at the beginning of a character */
6204 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6205 || UTF8_IS_START(UCHARAT(RExC_parse)));
6206
6207 overflowed = FALSE;
6208
6209 /* Here, we have a literal character. Find the maximal string of
6210 * them in the input that we can fit into a single EXACTish node.
6211 * We quit at the first non-literal or when the node gets full, or
6212 * under /i the categorization of folding/non-folding character
6213 * changes */
6214 while (p < RExC_end && len < upper_fill) {
6215
6216 /* In most cases each iteration adds one byte to the output.
6217 * The exceptions override this */
6218 Size_t added_len = 1;
6219
6220 oldp = p;
6221 old_old_s = old_s;
6222 old_s = s;
6223
6224 /* White space has already been ignored */
6225 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
6226 || ! is_PATWS_safe((p), RExC_end, UTF));
6227
6228 switch ((U8)*p) {
6229 const char* message;
6230 U32 packed_warn;
6231 U8 grok_c_char;
6232
6233 case '^':
6234 case '$':
6235 case '.':
6236 case '[':
6237 case '(':
6238 case ')':
6239 case '|':
6240 goto loopdone;
6241 case '\\':
6242 /* Literal Escapes Switch
6243
6244 This switch is meant to handle escape sequences that
6245 resolve to a literal character.
6246
6247 Every escape sequence that represents something
6248 else, like an assertion or a char class, is handled
6249 in the switch marked 'Special Escapes' above in this
6250 routine, but also has an entry here as anything that
6251 isn't explicitly mentioned here will be treated as
6252 an unescaped equivalent literal.
6253 */
6254
6255 switch ((U8)*++p) {
6256
6257 /* These are all the special escapes. */
6258 case 'A': /* Start assertion */
6259 case 'b': case 'B': /* Word-boundary assertion*/
6260 case 'C': /* Single char !DANGEROUS! */
6261 case 'd': case 'D': /* digit class */
6262 case 'g': case 'G': /* generic-backref, pos assertion */
6263 case 'h': case 'H': /* HORIZWS */
6264 case 'k': case 'K': /* named backref, keep marker */
6265 case 'p': case 'P': /* Unicode property */
6266 case 'R': /* LNBREAK */
6267 case 's': case 'S': /* space class */
6268 case 'v': case 'V': /* VERTWS */
6269 case 'w': case 'W': /* word class */
6270 case 'X': /* eXtended Unicode "combining
6271 character sequence" */
6272 case 'z': case 'Z': /* End of line/string assertion */
6273 --p;
6274 goto loopdone;
6275
6276 /* Anything after here is an escape that resolves to a
6277 literal. (Except digits, which may or may not)
6278 */
6279 case 'n':
6280 ender = '\n';
6281 p++;
6282 break;
6283 case 'N': /* Handle a single-code point named character. */
6284 RExC_parse_set( p + 1 );
6285 if (! grok_bslash_N(pRExC_state,
6286 NULL, /* Fail if evaluates to
6287 anything other than a
6288 single code point */
6289 &ender, /* The returned single code
6290 point */
6291 NULL, /* Don't need a count of
6292 how many code points */
6293 flagp,
6294 RExC_strict,
6295 depth)
6296 ) {
6297 if (*flagp & NEED_UTF8)
6298 FAIL("panic: grok_bslash_N set NEED_UTF8");
6299 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6300
6301 /* Here, it wasn't a single code point. Go close
6302 * up this EXACTish node. The switch() prior to
6303 * this switch handles the other cases */
6304 p = oldp;
6305 RExC_parse_set(p);
6306 goto loopdone;
6307 }
6308 p = RExC_parse;
6309 RExC_parse_set(atom_parse_start);
6310
6311 /* The \N{} means the pattern, if previously /d,
6312 * becomes /u. That means it can't be an EXACTF node,
6313 * but an EXACTFU */
6314 if (node_type == EXACTF) {
6315 node_type = EXACTFU;
6316
6317 /* If the node already contains something that
6318 * differs between EXACTF and EXACTFU, reparse it
6319 * as EXACTFU */
6320 if (! maybe_exactfu) {
6321 len = 0;
6322 s = s0;
6323 goto reparse;
6324 }
6325 }
6326
6327 break;
6328 case 'r':
6329 ender = '\r';
6330 p++;
6331 break;
6332 case 't':
6333 ender = '\t';
6334 p++;
6335 break;
6336 case 'f':
6337 ender = '\f';
6338 p++;
6339 break;
6340 case 'e':
6341 ender = ESC_NATIVE;
6342 p++;
6343 break;
6344 case 'a':
6345 ender = '\a';
6346 p++;
6347 break;
6348 case 'o':
6349 if (! grok_bslash_o(&p,
6350 RExC_end,
6351 &ender,
6352 &message,
6353 &packed_warn,
6354 (bool) RExC_strict,
6355 FALSE, /* No illegal cp's */
6356 UTF))
6357 {
6358 RExC_parse_set(p); /* going to die anyway; point to
6359 exact spot of failure */
6360 vFAIL(message);
6361 }
6362
6363 if (message && TO_OUTPUT_WARNINGS(p)) {
6364 warn_non_literal_string(p, packed_warn, message);
6365 }
6366 break;
6367 case 'x':
6368 if (! grok_bslash_x(&p,
6369 RExC_end,
6370 &ender,
6371 &message,
6372 &packed_warn,
6373 (bool) RExC_strict,
6374 FALSE, /* No illegal cp's */
6375 UTF))
6376 {
6377 RExC_parse_set(p); /* going to die anyway; point
6378 to exact spot of failure */
6379 vFAIL(message);
6380 }
6381
6382 if (message && TO_OUTPUT_WARNINGS(p)) {
6383 warn_non_literal_string(p, packed_warn, message);
6384 }
6385
6386#ifdef EBCDIC
6387 if (ender < 0x100) {
6388 if (RExC_recode_x_to_native) {
6389 ender = LATIN1_TO_NATIVE(ender);
6390 }
6391 }
6392#endif
6393 break;
6394 case 'c':
6395 p++;
6396 if (! grok_bslash_c(*p, &grok_c_char,
6397 &message, &packed_warn))
6398 {
6399 /* going to die anyway; point to exact spot of
6400 * failure */
6401 char *new_p= p + ((UTF)
6402 ? UTF8_SAFE_SKIP(p, RExC_end)
6403 : 1);
6404 RExC_parse_set(new_p);
6405 vFAIL(message);
6406 }
6407
6408 ender = grok_c_char;
6409 p++;
6410 if (message && TO_OUTPUT_WARNINGS(p)) {
6411 warn_non_literal_string(p, packed_warn, message);
6412 }
6413
6414 break;
6415 case '8': case '9': /* must be a backreference */
6416 --p;
6417 /* we have an escape like \8 which cannot be an octal escape
6418 * so we exit the loop, and let the outer loop handle this
6419 * escape which may or may not be a legitimate backref. */
6420 goto loopdone;
6421 case '1': case '2': case '3':case '4':
6422 case '5': case '6': case '7':
6423
6424 /* When we parse backslash escapes there is ambiguity
6425 * between backreferences and octal escapes. Any escape
6426 * from \1 - \9 is a backreference, any multi-digit
6427 * escape which does not start with 0 and which when
6428 * evaluated as decimal could refer to an already
6429 * parsed capture buffer is a back reference. Anything
6430 * else is octal.
6431 *
6432 * Note this implies that \118 could be interpreted as
6433 * 118 OR as "\11" . "8" depending on whether there
6434 * were 118 capture buffers defined already in the
6435 * pattern. */
6436
6437 /* NOTE, RExC_npar is 1 more than the actual number of
6438 * parens we have seen so far, hence the "<" as opposed
6439 * to "<=" */
6440 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6441 { /* Not to be treated as an octal constant, go
6442 find backref */
6443 p = oldp;
6444 goto loopdone;
6445 }
6446 /* FALLTHROUGH */
6447 case '0':
6448 {
6449 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6450 | PERL_SCAN_NOTIFY_ILLDIGIT;
6451 STRLEN numlen = 3;
6452 ender = grok_oct(p, &numlen, &flags, NULL);
6453 p += numlen;
6454 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6455 && isDIGIT(*p) /* like \08, \178 */
6456 && ckWARN(WARN_REGEXP))
6457 {
6458 reg_warn_non_literal_string(
6459 p + 1,
6460 form_alien_digit_msg(8, numlen, p,
6461 RExC_end, UTF, FALSE));
6462 }
6463 }
6464 break;
6465 case '\0':
6466 if (p >= RExC_end)
6467 FAIL("Trailing \\");
6468 /* FALLTHROUGH */
6469 default:
6470 if (isALPHANUMERIC(*p)) {
6471 /* An alpha followed by '{' is going to fail next
6472 * iteration, so don't output this warning in that
6473 * case */
6474 if (! isALPHA(*p) || *(p + 1) != '{') {
6475 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6476 " passed through", p);
6477 }
6478 }
6479 goto normal_default;
6480 } /* End of switch on '\' */
6481 break;
6482 case '{':
6483 /* Trying to gain new uses for '{' without breaking too
6484 * much existing code is hard. The solution currently
6485 * adopted is:
6486 * 1) If there is no ambiguity that a '{' should always
6487 * be taken literally, at the start of a construct, we
6488 * just do so.
6489 * 2) If the literal '{' conflicts with our desired use
6490 * of it as a metacharacter, we die. The deprecation
6491 * cycles for this have come and gone.
6492 * 3) If there is ambiguity, we raise a simple warning.
6493 * This could happen, for example, if the user
6494 * intended it to introduce a quantifier, but slightly
6495 * misspelled the quantifier. Without this warning,
6496 * the quantifier would silently be taken as a literal
6497 * string of characters instead of a meta construct */
6498 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6499 if ( RExC_strict
6500 || ( p > atom_parse_start + 1
6501 && isALPHA_A(*(p - 1))
6502 && *(p - 2) == '\\'))
6503 {
6504 RExC_parse_set(p + 1);
6505 vFAIL("Unescaped left brace in regex is "
6506 "illegal here");
6507 }
6508 ckWARNreg(p + 1, "Unescaped left brace in regex is"
6509 " passed through");
6510 }
6511 goto normal_default;
6512 case '}':
6513 case ']':
6514 if (p > RExC_parse && RExC_strict) {
6515 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6516 }
6517 /*FALLTHROUGH*/
6518 default: /* A literal character */
6519 normal_default:
6520 if (! UTF8_IS_INVARIANT(*p) && UTF) {
6521 STRLEN numlen;
6522 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6523 &numlen, UTF8_ALLOW_DEFAULT);
6524 p += numlen;
6525 }
6526 else
6527 ender = (U8) *p++;
6528 break;
6529 } /* End of switch on the literal */
6530
6531 /* Here, have looked at the literal character, and <ender>
6532 * contains its ordinal; <p> points to the character after it.
6533 * */
6534
6535 if (ender > 255) {
6536 REQUIRE_UTF8(flagp);
6537 if ( UNICODE_IS_PERL_EXTENDED(ender)
6538 && TO_OUTPUT_WARNINGS(p))
6539 {
6540 ckWARN2_non_literal_string(p,
6541 packWARN(WARN_PORTABLE),
6542 PL_extended_cp_format,
6543 ender);
6544 }
6545 }
6546
6547 /* We need to check if the next non-ignored thing is a
6548 * quantifier. Move <p> to after anything that should be
6549 * ignored, which, as a side effect, positions <p> for the next
6550 * loop iteration */
6551 skip_to_be_ignored_text(pRExC_state, &p,
6552 FALSE /* Don't force to /x */ );
6553
6554 /* If the next thing is a quantifier, it applies to this
6555 * character only, which means that this character has to be in
6556 * its own node and can't just be appended to the string in an
6557 * existing node, so if there are already other characters in
6558 * the node, close the node with just them, and set up to do
6559 * this character again next time through, when it will be the
6560 * only thing in its new node */
6561
6562 next_is_quantifier = LIKELY(p < RExC_end)
6563 && UNLIKELY(isQUANTIFIER(p, RExC_end));
6564
6565 if (next_is_quantifier && LIKELY(len)) {
6566 p = oldp;
6567 goto loopdone;
6568 }
6569
6570 /* Ready to add 'ender' to the node */
6571
6572 if (! FOLD) { /* The simple case, just append the literal */
6573 not_fold_common:
6574
6575 /* Don't output if it would overflow */
6576 if (UNLIKELY(len > max_string_len - ((UTF)
6577 ? UVCHR_SKIP(ender)
6578 : 1)))
6579 {
6580 overflowed = TRUE;
6581 break;
6582 }
6583
6584 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6585 *(s++) = (char) ender;
6586 }
6587 else {
6588 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6589 added_len = (char *) new_s - s;
6590 s = (char *) new_s;
6591
6592 if (ender > 255) {
6593 requires_utf8_target = TRUE;
6594 }
6595 }
6596 }
6597 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6598
6599 /* Here are folding under /l, and the code point is
6600 * problematic. If this is the first character in the
6601 * node, change the node type to folding. Otherwise, if
6602 * this is the first problematic character, close up the
6603 * existing node, so can start a new node with this one */
6604 if (! len) {
6605 node_type = EXACTFL;
6606 RExC_contains_locale = 1;
6607 }
6608 else if (node_type == EXACT) {
6609 p = oldp;
6610 goto loopdone;
6611 }
6612
6613 /* This problematic code point means we can't simplify
6614 * things */
6615 maybe_exactfu = FALSE;
6616
6617 /* Although these two characters have folds that are
6618 * locale-problematic, they also have folds to above Latin1
6619 * that aren't a problem. Doing these now helps at
6620 * runtime. */
6621 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
6622 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6623 {
6624 goto fold_anyway;
6625 }
6626
6627 /* Here, we are adding a problematic fold character.
6628 * "Problematic" in this context means that its fold isn't
6629 * known until runtime. (The non-problematic code points
6630 * are the above-Latin1 ones that fold to also all
6631 * above-Latin1. Their folds don't vary no matter what the
6632 * locale is.) But here we have characters whose fold
6633 * depends on the locale. We just add in the unfolded
6634 * character, and wait until runtime to fold it */
6635 goto not_fold_common;
6636 }
6637 else /* regular fold; see if actually is in a fold */
6638 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6639 || (ender > 255
6640 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6641 {
6642 /* Here, folding, but the character isn't in a fold.
6643 *
6644 * Start a new node if previous characters in the node were
6645 * folded */
6646 if (len && node_type != EXACT) {
6647 p = oldp;
6648 goto loopdone;
6649 }
6650
6651 /* Here, continuing a node with non-folded characters. Add
6652 * this one */
6653 goto not_fold_common;
6654 }
6655 else { /* Here, does participate in some fold */
6656
6657 /* If this is the first character in the node, change its
6658 * type to folding. Otherwise, if this is the first
6659 * folding character in the node, close up the existing
6660 * node, so can start a new node with this one. */
6661 if (! len) {
6662 node_type = compute_EXACTish(pRExC_state);
6663 }
6664 else if (node_type == EXACT) {
6665 p = oldp;
6666 goto loopdone;
6667 }
6668
6669 if (UTF) { /* Alway use the folded value for UTF-8
6670 patterns */
6671 if (UVCHR_IS_INVARIANT(ender)) {
6672 if (UNLIKELY(len + 1 > max_string_len)) {
6673 overflowed = TRUE;
6674 break;
6675 }
6676
6677 *(s)++ = (U8) toFOLD(ender);
6678 }
6679 else {
6680 UV folded;
6681
6682 fold_anyway:
6683 folded = _to_uni_fold_flags(
6684 ender,
6685 (U8 *) s, /* We have allocated extra space
6686 in 's' so can't run off the
6687 end */
6688 &added_len,
6689 FOLD_FLAGS_FULL
6690 | (( ASCII_FOLD_RESTRICTED
6691 || node_type == EXACTFL)
6692 ? FOLD_FLAGS_NOMIX_ASCII
6693 : 0));
6694 if (UNLIKELY(len + added_len > max_string_len)) {
6695 overflowed = TRUE;
6696 break;
6697 }
6698
6699 s += added_len;
6700
6701 if ( folded > 255
6702 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6703 {
6704 /* U+B5 folds to the MU, so its possible for a
6705 * non-UTF-8 target to match it */
6706 requires_utf8_target = TRUE;
6707 }
6708 }
6709 }
6710 else { /* Here is non-UTF8. */
6711
6712 /* The fold will be one or (rarely) two characters.
6713 * Check that there's room for at least a single one
6714 * before setting any flags, etc. Because otherwise an
6715 * overflowing character could cause a flag to be set
6716 * even though it doesn't end up in this node. (For
6717 * the two character fold, we check again, before
6718 * setting any flags) */
6719 if (UNLIKELY(len + 1 > max_string_len)) {
6720 overflowed = TRUE;
6721 break;
6722 }
6723
6724#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
6725 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
6726 || UNICODE_DOT_DOT_VERSION > 0)
6727
6728 /* On non-ancient Unicodes, check for the only possible
6729 * multi-char fold */
6730 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6731
6732 /* This potential multi-char fold means the node
6733 * can't be simple (because it could match more
6734 * than a single char). And in some cases it will
6735 * match 'ss', so set that flag */
6736 maybe_SIMPLE = 0;
6737 has_ss = TRUE;
6738
6739 /* It can't change to be an EXACTFU (unless already
6740 * is one). We fold it iff under /u rules. */
6741 if (node_type != EXACTFU) {
6742 maybe_exactfu = FALSE;
6743 }
6744 else {
6745 if (UNLIKELY(len + 2 > max_string_len)) {
6746 overflowed = TRUE;
6747 break;
6748 }
6749
6750 *(s++) = 's';
6751 *(s++) = 's';
6752 added_len = 2;
6753
6754 goto done_with_this_char;
6755 }
6756 }
6757 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6758 && LIKELY(len > 0)
6759 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6760 {
6761 /* Also, the sequence 'ss' is special when not
6762 * under /u. If the target string is UTF-8, it
6763 * should match SHARP S; otherwise it won't. So,
6764 * here we have to exclude the possibility of this
6765 * node moving to /u.*/
6766 has_ss = TRUE;
6767 maybe_exactfu = FALSE;
6768 }
6769#endif
6770 /* Here, the fold will be a single character */
6771
6772 if (UNLIKELY(ender == MICRO_SIGN)) {
6773 has_micro_sign = TRUE;
6774 }
6775 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6776
6777 /* If the character's fold differs between /d and
6778 * /u, this can't change to be an EXACTFU node */
6779 maybe_exactfu = FALSE;
6780 }
6781
6782 *(s++) = (DEPENDS_SEMANTICS)
6783 ? (char) toFOLD(ender)
6784
6785 /* Under /u, the fold of any character in
6786 * the 0-255 range happens to be its
6787 * lowercase equivalent, except for LATIN
6788 * SMALL LETTER SHARP S, which was handled
6789 * above, and the MICRO SIGN, whose fold
6790 * requires UTF-8 to represent. */
6791 : (char) toLOWER_L1(ender);
6792 }
6793 } /* End of adding current character to the node */
6794
6795 done_with_this_char:
6796
6797 len += added_len;
6798
6799 if (next_is_quantifier) {
6800
6801 /* Here, the next input is a quantifier, and to get here,
6802 * the current character is the only one in the node. */
6803 goto loopdone;
6804 }
6805
6806 } /* End of loop through literal characters */
6807
6808 /* Here we have either exhausted the input or run out of room in
6809 * the node. If the former, we are done. (If we encountered a
6810 * character that can't be in the node, transfer is made directly
6811 * to <loopdone>, and so we wouldn't have fallen off the end of the
6812 * loop.) */
6813 if (LIKELY(! overflowed)) {
6814 goto loopdone;
6815 }
6816
6817 /* Here we have run out of room. We can grow plain EXACT and
6818 * LEXACT nodes. If the pattern is gigantic enough, though,
6819 * eventually we'll have to artificially chunk the pattern into
6820 * multiple nodes. */
6821 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6822 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6823 Size_t overhead_expansion = 0;
6824 char temp[256];
6825 Size_t max_nodes_for_string;
6826 Size_t achievable;
6827 SSize_t delta;
6828
6829 /* Here we couldn't fit the final character in the current
6830 * node, so it will have to be reparsed, no matter what else we
6831 * do */
6832 p = oldp;
6833
6834 /* If would have overflowed a regular EXACT node, switch
6835 * instead to an LEXACT. The code below is structured so that
6836 * the actual growing code is common to changing from an EXACT
6837 * or just increasing the LEXACT size. This means that we have
6838 * to save the string in the EXACT case before growing, and
6839 * then copy it afterwards to its new location */
6840 if (node_type == EXACT) {
6841 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6842 RExC_emit += overhead_expansion;
6843 Copy(s0, temp, len, char);
6844 }
6845
6846 /* Ready to grow. If it was a plain EXACT, the string was
6847 * saved, and the first few bytes of it overwritten by adding
6848 * an argument field. We assume, as we do elsewhere in this
6849 * file, that one byte of remaining input will translate into
6850 * one byte of output, and if that's too small, we grow again,
6851 * if too large the excess memory is freed at the end */
6852
6853 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6854 achievable = MIN(max_nodes_for_string,
6855 current_string_nodes + STR_SZ(RExC_end - p));
6856 delta = achievable - current_string_nodes;
6857
6858 /* If there is just no more room, go finish up this chunk of
6859 * the pattern. */
6860 if (delta <= 0) {
6861 goto loopdone;
6862 }
6863
6864 change_engine_size(pRExC_state, delta + overhead_expansion);
6865 current_string_nodes += delta;
6866 max_string_len
6867 = sizeof(struct regnode) * current_string_nodes;
6868 upper_fill = max_string_len + 1;
6869
6870 /* If the length was small, we know this was originally an
6871 * EXACT node now converted to LEXACT, and the string has to be
6872 * restored. Otherwise the string was untouched. 260 is just
6873 * a number safely above 255 so don't have to worry about
6874 * getting it precise */
6875 if (len < 260) {
6876 node_type = LEXACT;
6877 FILL_NODE(ret, node_type);
6878 s0 = STRING(REGNODE_p(ret));
6879 Copy(temp, s0, len, char);
6880 s = s0 + len;
6881 }
6882
6883 goto continue_parse;
6884 }
6885 else if (FOLD) {
6886 bool splittable = FALSE;
6887 bool backed_up = FALSE;
6888 char * e; /* should this be U8? */
6889 char * s_start; /* should this be U8? */
6890
6891 /* Here is /i. Running out of room creates a problem if we are
6892 * folding, and the split happens in the middle of a
6893 * multi-character fold, as a match that should have occurred,
6894 * won't, due to the way nodes are matched, and our artificial
6895 * boundary. So back off until we aren't splitting such a
6896 * fold. If there is no such place to back off to, we end up
6897 * taking the entire node as-is. This can happen if the node
6898 * consists entirely of 'f' or entirely of 's' characters (or
6899 * things that fold to them) as 'ff' and 'ss' are
6900 * multi-character folds.
6901 *
6902 * The Unicode standard says that multi character folds consist
6903 * of either two or three characters. That means we would be
6904 * splitting one if the final character in the node is at the
6905 * beginning of either type, or is the second of a three
6906 * character fold.
6907 *
6908 * At this point:
6909 * ender is the code point of the character that won't fit
6910 * in the node
6911 * s points to just beyond the final byte in the node.
6912 * It's where we would place ender if there were
6913 * room, and where in fact we do place ender's fold
6914 * in the code below, as we've over-allocated space
6915 * for s0 (hence s) to allow for this
6916 * e starts at 's' and advances as we append things.
6917 * old_s is the same as 's'. (If ender had fit, 's' would
6918 * have been advanced to beyond it).
6919 * old_old_s points to the beginning byte of the final
6920 * character in the node
6921 * p points to the beginning byte in the input of the
6922 * character beyond 'ender'.
6923 * oldp points to the beginning byte in the input of
6924 * 'ender'.
6925 *
6926 * In the case of /il, we haven't folded anything that could be
6927 * affected by the locale. That means only above-Latin1
6928 * characters that fold to other above-latin1 characters get
6929 * folded at compile time. To check where a good place to
6930 * split nodes is, everything in it will have to be folded.
6931 * The boolean 'maybe_exactfu' keeps track in /il if there are
6932 * any unfolded characters in the node. */
6933 bool need_to_fold_loc = LOC && ! maybe_exactfu;
6934
6935 /* If we do need to fold the node, we need a place to store the
6936 * folded copy, and a way to map back to the unfolded original
6937 * */
6938 char * locfold_buf = NULL;
6939 Size_t * loc_correspondence = NULL;
6940
6941 if (! need_to_fold_loc) { /* The normal case. Just
6942 initialize to the actual node */
6943 e = s;
6944 s_start = s0;
6945 s = old_old_s; /* Point to the beginning of the final char
6946 that fits in the node */
6947 }
6948 else {
6949
6950 /* Here, we have filled a /il node, and there are unfolded
6951 * characters in it. If the runtime locale turns out to be
6952 * UTF-8, there are possible multi-character folds, just
6953 * like when not under /l. The node hence can't terminate
6954 * in the middle of such a fold. To determine this, we
6955 * have to create a folded copy of this node. That means
6956 * reparsing the node, folding everything assuming a UTF-8
6957 * locale. (If at runtime it isn't such a locale, the
6958 * actions here wouldn't have been necessary, but we have
6959 * to assume the worst case.) If we find we need to back
6960 * off the folded string, we do so, and then map that
6961 * position back to the original unfolded node, which then
6962 * gets output, truncated at that spot */
6963
6964 char * redo_p = RExC_parse;
6965 char * redo_e;
6966 char * old_redo_e;
6967
6968 /* Allow enough space assuming a single byte input folds to
6969 * a single byte output, plus assume that the two unparsed
6970 * characters (that we may need) fold to the largest number
6971 * of bytes possible, plus extra for one more worst case
6972 * scenario. In the loop below, if we start eating into
6973 * that final spare space, we enlarge this initial space */
6974 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
6975
6976 Newxz(locfold_buf, size, char);
6977 Newxz(loc_correspondence, size, Size_t);
6978
6979 /* Redo this node's parse, folding into 'locfold_buf' */
6980 redo_p = RExC_parse;
6981 old_redo_e = redo_e = locfold_buf;
6982 while (redo_p <= oldp) {
6983
6984 old_redo_e = redo_e;
6985 loc_correspondence[redo_e - locfold_buf]
6986 = redo_p - RExC_parse;
6987
6988 if (UTF) {
6989 Size_t added_len;
6990
6991 (void) _to_utf8_fold_flags((U8 *) redo_p,
6992 (U8 *) RExC_end,
6993 (U8 *) redo_e,
6994 &added_len,
6995 FOLD_FLAGS_FULL);
6996 redo_e += added_len;
6997 redo_p += UTF8SKIP(redo_p);
6998 }
6999 else {
7000
7001 /* Note that if this code is run on some ancient
7002 * Unicode versions, SHARP S doesn't fold to 'ss',
7003 * but rather than clutter the code with #ifdef's,
7004 * as is done above, we ignore that possibility.
7005 * This is ok because this code doesn't affect what
7006 * gets matched, but merely where the node gets
7007 * split */
7008 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7009 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7010 }
7011 else {
7012 *redo_e++ = 's';
7013 *redo_e++ = 's';
7014 }
7015 redo_p++;
7016 }
7017
7018
7019 /* If we're getting so close to the end that a
7020 * worst-case fold in the next character would cause us
7021 * to overflow, increase, assuming one byte output byte
7022 * per one byte input one, plus room for another worst
7023 * case fold */
7024 if ( redo_p <= oldp
7025 && redo_e > locfold_buf + size
7026 - (UTF8_MAXBYTES_CASE + 1))
7027 {
7028 Size_t new_size = size
7029 + (oldp - redo_p)
7030 + UTF8_MAXBYTES_CASE + 1;
7031 Ptrdiff_t e_offset = redo_e - locfold_buf;
7032
7033 Renew(locfold_buf, new_size, char);
7034 Renew(loc_correspondence, new_size, Size_t);
7035 size = new_size;
7036
7037 redo_e = locfold_buf + e_offset;
7038 }
7039 }
7040
7041 /* Set so that things are in terms of the folded, temporary
7042 * string */
7043 s = old_redo_e;
7044 s_start = locfold_buf;
7045 e = redo_e;
7046
7047 }
7048
7049 /* Here, we have 's', 's_start' and 'e' set up to point to the
7050 * input that goes into the node, folded.
7051 *
7052 * If the final character of the node and the fold of ender
7053 * form the first two characters of a three character fold, we
7054 * need to peek ahead at the next (unparsed) character in the
7055 * input to determine if the three actually do form such a
7056 * fold. Just looking at that character is not generally
7057 * sufficient, as it could be, for example, an escape sequence
7058 * that evaluates to something else, and it needs to be folded.
7059 *
7060 * khw originally thought to just go through the parse loop one
7061 * extra time, but that doesn't work easily as that iteration
7062 * could cause things to think that the parse is over and to
7063 * goto loopdone. The character could be a '$' for example, or
7064 * the character beyond could be a quantifier, and other
7065 * glitches as well.
7066 *
7067 * The solution used here for peeking ahead is to look at that
7068 * next character. If it isn't ASCII punctuation, then it will
7069 * be something that would continue on in an EXACTish node if
7070 * there were space. We append the fold of it to s, having
7071 * reserved enough room in s0 for the purpose. If we can't
7072 * reasonably peek ahead, we instead assume the worst case:
7073 * that it is something that would form the completion of a
7074 * multi-char fold.
7075 *
7076 * If we can't split between s and ender, we work backwards
7077 * character-by-character down to s0. At each current point
7078 * see if we are at the beginning of a multi-char fold. If so,
7079 * that means we would be splitting the fold across nodes, and
7080 * so we back up one and try again.
7081 *
7082 * If we're not at the beginning, we still could be at the
7083 * final two characters of a (rare) three character fold. We
7084 * check if the sequence starting at the character before the
7085 * current position (and including the current and next
7086 * characters) is a three character fold. If not, the node can
7087 * be split here. If it is, we have to backup two characters
7088 * and try again.
7089 *
7090 * Otherwise, the node can be split at the current position.
7091 *
7092 * The same logic is used for UTF-8 patterns and not */
7093 if (UTF) {
7094 Size_t added_len;
7095
7096 /* Append the fold of ender */
7097 (void) _to_uni_fold_flags(
7098 ender,
7099 (U8 *) e,
7100 &added_len,
7101 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7102 ? FOLD_FLAGS_NOMIX_ASCII
7103 : 0));
7104 e += added_len;
7105
7106 /* 's' and the character folded to by ender may be the
7107 * first two of a three-character fold, in which case the
7108 * node should not be split here. That may mean examining
7109 * the so-far unparsed character starting at 'p'. But if
7110 * ender folded to more than one character, we already have
7111 * three characters to look at. Also, we first check if
7112 * the sequence consisting of s and the next character form
7113 * the first two of some three character fold. If not,
7114 * there's no need to peek ahead. */
7115 if ( added_len <= UTF8SKIP(e - added_len)
7116 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7117 {
7118 /* Here, the two do form the beginning of a potential
7119 * three character fold. The unexamined character may
7120 * or may not complete it. Peek at it. It might be
7121 * something that ends the node or an escape sequence,
7122 * in which case we don't know without a lot of work
7123 * what it evaluates to, so we have to assume the worst
7124 * case: that it does complete the fold, and so we
7125 * can't split here. All such instances will have
7126 * that character be an ASCII punctuation character,
7127 * like a backslash. So, for that case, backup one and
7128 * drop down to try at that position */
7129 if (isPUNCT(*p)) {
7130 s = (char *) utf8_hop_back((U8 *) s, -1,
7131 (U8 *) s_start);
7132 backed_up = TRUE;
7133 }
7134 else {
7135 /* Here, since it's not punctuation, it must be a
7136 * real character, and we can append its fold to
7137 * 'e' (having deliberately reserved enough space
7138 * for this eventuality) and drop down to check if
7139 * the three actually do form a folded sequence */
7140 (void) _to_utf8_fold_flags(
7141 (U8 *) p, (U8 *) RExC_end,
7142 (U8 *) e,
7143 &added_len,
7144 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7145 ? FOLD_FLAGS_NOMIX_ASCII
7146 : 0));
7147 e += added_len;
7148 }
7149 }
7150
7151 /* Here, we either have three characters available in
7152 * sequence starting at 's', or we have two characters and
7153 * know that the following one can't possibly be part of a
7154 * three character fold. We go through the node backwards
7155 * until we find a place where we can split it without
7156 * breaking apart a multi-character fold. At any given
7157 * point we have to worry about if such a fold begins at
7158 * the current 's', and also if a three-character fold
7159 * begins at s-1, (containing s and s+1). Splitting in
7160 * either case would break apart a fold */
7161 do {
7162 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7163 (U8 *) s_start);
7164
7165 /* If is a multi-char fold, can't split here. Backup
7166 * one char and try again */
7167 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7168 s = prev_s;
7169 backed_up = TRUE;
7170 continue;
7171 }
7172
7173 /* If the two characters beginning at 's' are part of a
7174 * three character fold starting at the character
7175 * before s, we can't split either before or after s.
7176 * Backup two chars and try again */
7177 if ( LIKELY(s > s_start)
7178 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7179 {
7180 s = prev_s;
7181 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7182 backed_up = TRUE;
7183 continue;
7184 }
7185
7186 /* Here there's no multi-char fold between s and the
7187 * next character following it. We can split */
7188 splittable = TRUE;
7189 break;
7190
7191 } while (s > s_start); /* End of loops backing up through the node */
7192
7193 /* Here we either couldn't find a place to split the node,
7194 * or else we broke out of the loop setting 'splittable' to
7195 * true. In the latter case, the place to split is between
7196 * the first and second characters in the sequence starting
7197 * at 's' */
7198 if (splittable) {
7199 s += UTF8SKIP(s);
7200 }
7201 }
7202 else { /* Pattern not UTF-8 */
7203 if ( ender != LATIN_SMALL_LETTER_SHARP_S
7204 || ASCII_FOLD_RESTRICTED)
7205 {
7206 assert( toLOWER_L1(ender) < 256 );
7207 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7208 }
7209 else {
7210 *e++ = 's';
7211 *e++ = 's';
7212 }
7213
7214 if ( e - s <= 1
7215 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7216 {
7217 if (isPUNCT(*p)) {
7218 s--;
7219 backed_up = TRUE;
7220 }
7221 else {
7222 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7223 || ASCII_FOLD_RESTRICTED)
7224 {
7225 assert( toLOWER_L1(ender) < 256 );
7226 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7227 }
7228 else {
7229 *e++ = 's';
7230 *e++ = 's';
7231 }
7232 }
7233 }
7234
7235 do {
7236 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7237 s--;
7238 backed_up = TRUE;
7239 continue;
7240 }
7241
7242 if ( LIKELY(s > s_start)
7243 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7244 {
7245 s -= 2;
7246 backed_up = TRUE;
7247 continue;
7248 }
7249
7250 splittable = TRUE;
7251 break;
7252
7253 } while (s > s_start);
7254
7255 if (splittable) {
7256 s++;
7257 }
7258 }
7259
7260 /* Here, we are done backing up. If we didn't backup at all
7261 * (the likely case), just proceed */
7262 if (backed_up) {
7263
7264 /* If we did find a place to split, reparse the entire node
7265 * stopping where we have calculated. */
7266 if (splittable) {
7267
7268 /* If we created a temporary folded string under /l, we
7269 * have to map that back to the original */
7270 if (need_to_fold_loc) {
7271 upper_fill = loc_correspondence[s - s_start];
7272 if (upper_fill == 0) {
7273 FAIL2("panic: loc_correspondence[%d] is 0",
7274 (int) (s - s_start));
7275 }
7276 Safefree(locfold_buf);
7277 Safefree(loc_correspondence);
7278 }
7279 else {
7280 upper_fill = s - s0;
7281 }
7282 goto reparse;
7283 }
7284
7285 /* Here the node consists entirely of non-final multi-char
7286 * folds. (Likely it is all 'f's or all 's's.) There's no
7287 * decent place to split it, so give up and just take the
7288 * whole thing */
7289 len = old_s - s0;
7290 }
7291
7292 if (need_to_fold_loc) {
7293 Safefree(locfold_buf);
7294 Safefree(loc_correspondence);
7295 }
7296 } /* End of verifying node ends with an appropriate char */
7297
7298 /* We need to start the next node at the character that didn't fit
7299 * in this one */
7300 p = oldp;
7301
7302 loopdone: /* Jumped to when encounters something that shouldn't be
7303 in the node */
7304
7305 /* Free up any over-allocated space; cast is to silence bogus
7306 * warning in MS VC */
7307 change_engine_size(pRExC_state,
7308 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7309
7310 /* I (khw) don't know if you can get here with zero length, but the
7311 * old code handled this situation by creating a zero-length EXACT
7312 * node. Might as well be NOTHING instead */
7313 if (len == 0) {
7314 OP(REGNODE_p(ret)) = NOTHING;
7315 }
7316 else {
7317
7318 /* If the node type is EXACT here, check to see if it
7319 * should be EXACTL, or EXACT_REQ8. */
7320 if (node_type == EXACT) {
7321 if (LOC) {
7322 node_type = EXACTL;
7323 }
7324 else if (requires_utf8_target) {
7325 node_type = EXACT_REQ8;
7326 }
7327 }
7328 else if (node_type == LEXACT) {
7329 if (requires_utf8_target) {
7330 node_type = LEXACT_REQ8;
7331 }
7332 }
7333 else if (FOLD) {
7334 if ( UNLIKELY(has_micro_sign || has_ss)
7335 && (node_type == EXACTFU || ( node_type == EXACTF
7336 && maybe_exactfu)))
7337 { /* These two conditions are problematic in non-UTF-8
7338 EXACTFU nodes. */
7339 assert(! UTF);
7340 node_type = EXACTFUP;
7341 }
7342 else if (node_type == EXACTFL) {
7343
7344 /* 'maybe_exactfu' is deliberately set above to
7345 * indicate this node type, where all code points in it
7346 * are above 255 */
7347 if (maybe_exactfu) {
7348 node_type = EXACTFLU8;
7349 }
7350 else if (UNLIKELY(
7351 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7352 {
7353 /* A character that folds to more than one will
7354 * match multiple characters, so can't be SIMPLE.
7355 * We don't have to worry about this with EXACTFLU8
7356 * nodes just above, as they have already been
7357 * folded (since the fold doesn't vary at run
7358 * time). Here, if the final character in the node
7359 * folds to multiple, it can't be simple. (This
7360 * only has an effect if the node has only a single
7361 * character, hence the final one, as elsewhere we
7362 * turn off simple for nodes whose length > 1 */
7363 maybe_SIMPLE = 0;
7364 }
7365 }
7366 else if (node_type == EXACTF) { /* Means is /di */
7367
7368 /* This intermediate variable is needed solely because
7369 * the asserts in the macro where used exceed Win32's
7370 * literal string capacity */
7371 char first_char = * STRING(REGNODE_p(ret));
7372
7373 /* If 'maybe_exactfu' is clear, then we need to stay
7374 * /di. If it is set, it means there are no code
7375 * points that match differently depending on UTF8ness
7376 * of the target string, so it can become an EXACTFU
7377 * node */
7378 if (! maybe_exactfu) {
7379 RExC_seen_d_op = TRUE;
7380 }
7381 else if ( isALPHA_FOLD_EQ(first_char, 's')
7382 || isALPHA_FOLD_EQ(ender, 's'))
7383 {
7384 /* But, if the node begins or ends in an 's' we
7385 * have to defer changing it into an EXACTFU, as
7386 * the node could later get joined with another one
7387 * that ends or begins with 's' creating an 'ss'
7388 * sequence which would then wrongly match the
7389 * sharp s without the target being UTF-8. We
7390 * create a special node that we resolve later when
7391 * we join nodes together */
7392
7393 node_type = EXACTFU_S_EDGE;
7394 }
7395 else {
7396 node_type = EXACTFU;
7397 }
7398 }
7399
7400 if (requires_utf8_target && node_type == EXACTFU) {
7401 node_type = EXACTFU_REQ8;
7402 }
7403 }
7404
7405 OP(REGNODE_p(ret)) = node_type;
7406 setSTR_LEN(REGNODE_p(ret), len);
7407 RExC_emit += STR_SZ(len);
7408
7409 /* If the node isn't a single character, it can't be SIMPLE */
7410 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7411 maybe_SIMPLE = 0;
7412 }
7413
7414 *flagp |= HASWIDTH | maybe_SIMPLE;
7415 }
7416
7417 RExC_parse_set(p);
7418
7419 {
7420 /* len is STRLEN which is unsigned, need to copy to signed */
7421 IV iv = len;
7422 if (iv < 0)
7423 vFAIL("Internal disaster");
7424 }
7425
7426 } /* End of label 'defchar:' */
7427 break;
7428 } /* End of giant switch on input character */
7429
7430 /* Position parse to next real character */
7431 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7432 FALSE /* Don't force to /x */ );
7433 if ( *RExC_parse == '{'
7434 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7435 {
7436 if (RExC_strict) {
7437 RExC_parse_inc_by(1);
7438 vFAIL("Unescaped left brace in regex is illegal here");
7439 }
7440 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7441 " passed through");
7442 }
7443
7444 return(ret);
7445}
7446
7447
7448#ifdef PERL_RE_BUILD_AUX
7449void
7450Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7451{
7452 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
7453 * sets up the bitmap and any flags, removing those code points from the
7454 * inversion list, setting it to NULL should it become completely empty */
7455
7456
7457 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7458
7459 /* There is no bitmap for this node type */
7460 if (REGNODE_TYPE(OP(node)) != ANYOF) {
7461 return;
7462 }
7463
7464 ANYOF_BITMAP_ZERO(node);
7465 if (*invlist_ptr) {
7466
7467 /* This gets set if we actually need to modify things */
7468 bool change_invlist = FALSE;
7469
7470 UV start, end;
7471
7472 /* Start looking through *invlist_ptr */
7473 invlist_iterinit(*invlist_ptr);
7474 while (invlist_iternext(*invlist_ptr, &start, &end)) {
7475 UV high;
7476 int i;
7477
7478 /* Quit if are above what we should change */
7479 if (start >= NUM_ANYOF_CODE_POINTS) {
7480 break;
7481 }
7482
7483 change_invlist = TRUE;
7484
7485 /* Set all the bits in the range, up to the max that we are doing */
7486 high = (end < NUM_ANYOF_CODE_POINTS - 1)
7487 ? end
7488 : NUM_ANYOF_CODE_POINTS - 1;
7489 for (i = start; i <= (int) high; i++) {
7490 ANYOF_BITMAP_SET(node, i);
7491 }
7492 }
7493 invlist_iterfinish(*invlist_ptr);
7494
7495 /* Done with loop; remove any code points that are in the bitmap from
7496 * *invlist_ptr */
7497 if (change_invlist) {
7498 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7499 }
7500
7501 /* If have completely emptied it, remove it completely */
7502 if (_invlist_len(*invlist_ptr) == 0) {
7503 SvREFCNT_dec_NN(*invlist_ptr);
7504 *invlist_ptr = NULL;
7505 }
7506 }
7507}
7508#endif /* PERL_RE_BUILD_AUX */
7509
7510/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7511 Character classes ([:foo:]) can also be negated ([:^foo:]).
7512 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7513 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7514 but trigger failures because they are currently unimplemented. */
7515
7516#define POSIXCC_DONE(c) ((c) == ':')
7517#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7518#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7519#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7520
7521#define WARNING_PREFIX "Assuming NOT a POSIX class since "
7522#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
7523#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
7524
7525#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7526
7527/* 'posix_warnings' and 'warn_text' are names of variables in the following
7528 * routine. q.v. */
7529#define ADD_POSIX_WARNING(p, text) STMT_START { \
7530 if (posix_warnings) { \
7531 if (! RExC_warn_text ) RExC_warn_text = \
7532 (AV *) sv_2mortal((SV *) newAV()); \
7533 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \
7534 WARNING_PREFIX \
7535 text \
7536 REPORT_LOCATION, \
7537 REPORT_LOCATION_ARGS(p))); \
7538 } \
7539 } STMT_END
7540#define CLEAR_POSIX_WARNINGS() \
7541 STMT_START { \
7542 if (posix_warnings && RExC_warn_text) \
7543 av_clear(RExC_warn_text); \
7544 } STMT_END
7545
7546#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
7547 STMT_START { \
7548 CLEAR_POSIX_WARNINGS(); \
7549 return ret; \
7550 } STMT_END
7551
7552STATIC int
7553S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7554
7555 const char * const s, /* Where the putative posix class begins.
7556 Normally, this is one past the '['. This
7557 parameter exists so it can be somewhere
7558 besides RExC_parse. */
7559 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7560 NULL */
7561 AV ** posix_warnings, /* Where to place any generated warnings, or
7562 NULL */
7563 const bool check_only /* Don't die if error */
7564)
7565{
7566 /* This parses what the caller thinks may be one of the three POSIX
7567 * constructs:
7568 * 1) a character class, like [:blank:]
7569 * 2) a collating symbol, like [. .]
7570 * 3) an equivalence class, like [= =]
7571 * In the latter two cases, it croaks if it finds a syntactically legal
7572 * one, as these are not handled by Perl.
7573 *
7574 * The main purpose is to look for a POSIX character class. It returns:
7575 * a) the class number
7576 * if it is a completely syntactically and semantically legal class.
7577 * 'updated_parse_ptr', if not NULL, is set to point to just after the
7578 * closing ']' of the class
7579 * b) OOB_NAMEDCLASS
7580 * if it appears that one of the three POSIX constructs was meant, but
7581 * its specification was somehow defective. 'updated_parse_ptr', if
7582 * not NULL, is set to point to the character just after the end
7583 * character of the class. See below for handling of warnings.
7584 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7585 * if it doesn't appear that a POSIX construct was intended.
7586 * 'updated_parse_ptr' is not changed. No warnings nor errors are
7587 * raised.
7588 *
7589 * In b) there may be errors or warnings generated. If 'check_only' is
7590 * TRUE, then any errors are discarded. Warnings are returned to the
7591 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
7592 * instead it is NULL, warnings are suppressed.
7593 *
7594 * The reason for this function, and its complexity is that a bracketed
7595 * character class can contain just about anything. But it's easy to
7596 * mistype the very specific posix class syntax but yielding a valid
7597 * regular bracketed class, so it silently gets compiled into something
7598 * quite unintended.
7599 *
7600 * The solution adopted here maintains backward compatibility except that
7601 * it adds a warning if it looks like a posix class was intended but
7602 * improperly specified. The warning is not raised unless what is input
7603 * very closely resembles one of the 14 legal posix classes. To do this,
7604 * it uses fuzzy parsing. It calculates how many single-character edits it
7605 * would take to transform what was input into a legal posix class. Only
7606 * if that number is quite small does it think that the intention was a
7607 * posix class. Obviously these are heuristics, and there will be cases
7608 * where it errs on one side or another, and they can be tweaked as
7609 * experience informs.
7610 *
7611 * The syntax for a legal posix class is:
7612 *
7613 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7614 *
7615 * What this routine considers syntactically to be an intended posix class
7616 * is this (the comments indicate some restrictions that the pattern
7617 * doesn't show):
7618 *
7619 * qr/(?x: \[? # The left bracket, possibly
7620 * # omitted
7621 * \h* # possibly followed by blanks
7622 * (?: \^ \h* )? # possibly a misplaced caret
7623 * [:;]? # The opening class character,
7624 * # possibly omitted. A typo
7625 * # semi-colon can also be used.
7626 * \h*
7627 * \^? # possibly a correctly placed
7628 * # caret, but not if there was also
7629 * # a misplaced one
7630 * \h*
7631 * .{3,15} # The class name. If there are
7632 * # deviations from the legal syntax,
7633 * # its edit distance must be close
7634 * # to a real class name in order
7635 * # for it to be considered to be
7636 * # an intended posix class.
7637 * \h*
7638 * [[:punct:]]? # The closing class character,
7639 * # possibly omitted. If not a colon
7640 * # nor semi colon, the class name
7641 * # must be even closer to a valid
7642 * # one
7643 * \h*
7644 * \]? # The right bracket, possibly
7645 * # omitted.
7646 * )/
7647 *
7648 * In the above, \h must be ASCII-only.
7649 *
7650 * These are heuristics, and can be tweaked as field experience dictates.
7651 * There will be cases when someone didn't intend to specify a posix class
7652 * that this warns as being so. The goal is to minimize these, while
7653 * maximizing the catching of things intended to be a posix class that
7654 * aren't parsed as such.
7655 */
7656
7657 const char* p = s;
7658 const char * const e = RExC_end;
7659 unsigned complement = 0; /* If to complement the class */
7660 bool found_problem = FALSE; /* Assume OK until proven otherwise */
7661 bool has_opening_bracket = FALSE;
7662 bool has_opening_colon = FALSE;
7663 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
7664 valid class */
7665 const char * possible_end = NULL; /* used for a 2nd parse pass */
7666 const char* name_start; /* ptr to class name first char */
7667
7668 /* If the number of single-character typos the input name is away from a
7669 * legal name is no more than this number, it is considered to have meant
7670 * the legal name */
7671 int max_distance = 2;
7672
7673 /* to store the name. The size determines the maximum length before we
7674 * decide that no posix class was intended. Should be at least
7675 * sizeof("alphanumeric") */
7676 UV input_text[15];
7677 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7678
7679 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7680
7681 CLEAR_POSIX_WARNINGS();
7682
7683 if (p >= e) {
7684 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7685 }
7686
7687 if (*(p - 1) != '[') {
7688 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7689 found_problem = TRUE;
7690 }
7691 else {
7692 has_opening_bracket = TRUE;
7693 }
7694
7695 /* They could be confused and think you can put spaces between the
7696 * components */
7697 if (isBLANK(*p)) {
7698 found_problem = TRUE;
7699
7700 do {
7701 p++;
7702 } while (p < e && isBLANK(*p));
7703
7704 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7705 }
7706
7707 /* For [. .] and [= =]. These are quite different internally from [: :],
7708 * so they are handled separately. */
7709 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7710 and 1 for at least one char in it
7711 */
7712 {
7713 const char open_char = *p;
7714 const char * temp_ptr = p + 1;
7715
7716 /* These two constructs are not handled by perl, and if we find a
7717 * syntactically valid one, we croak. khw, who wrote this code, finds
7718 * this explanation of them very unclear:
7719 * https://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7720 * And searching the rest of the internet wasn't very helpful either.
7721 * It looks like just about any byte can be in these constructs,
7722 * depending on the locale. But unless the pattern is being compiled
7723 * under /l, which is very rare, Perl runs under the C or POSIX locale.
7724 * In that case, it looks like [= =] isn't allowed at all, and that
7725 * [. .] could be any single code point, but for longer strings the
7726 * constituent characters would have to be the ASCII alphabetics plus
7727 * the minus-hyphen. Any sensible locale definition would limit itself
7728 * to these. And any portable one definitely should. Trying to parse
7729 * the general case is a nightmare (see [perl #127604]). So, this code
7730 * looks only for interiors of these constructs that match:
7731 * qr/.|[-\w]{2,}/
7732 * Using \w relaxes the apparent rules a little, without adding much
7733 * danger of mistaking something else for one of these constructs.
7734 *
7735 * [. .] in some implementations described on the internet is usable to
7736 * escape a character that otherwise is special in bracketed character
7737 * classes. For example [.].] means a literal right bracket instead of
7738 * the ending of the class
7739 *
7740 * [= =] can legitimately contain a [. .] construct, but we don't
7741 * handle this case, as that [. .] construct will later get parsed
7742 * itself and croak then. And [= =] is checked for even when not under
7743 * /l, as Perl has long done so.
7744 *
7745 * The code below relies on there being a trailing NUL, so it doesn't
7746 * have to keep checking if the parse ptr < e.
7747 */
7748 if (temp_ptr[1] == open_char) {
7749 temp_ptr++;
7750 }
7751 else while ( temp_ptr < e
7752 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7753 {
7754 temp_ptr++;
7755 }
7756
7757 if (*temp_ptr == open_char) {
7758 temp_ptr++;
7759 if (*temp_ptr == ']') {
7760 temp_ptr++;
7761 if (! found_problem && ! check_only) {
7762 RExC_parse_set((char *) temp_ptr);
7763 vFAIL3("POSIX syntax [%c %c] is reserved for future "
7764 "extensions", open_char, open_char);
7765 }
7766
7767 /* Here, the syntax wasn't completely valid, or else the call
7768 * is to check-only */
7769 if (updated_parse_ptr) {
7770 *updated_parse_ptr = (char *) temp_ptr;
7771 }
7772
7773 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7774 }
7775 }
7776
7777 /* If we find something that started out to look like one of these
7778 * constructs, but isn't, we continue below so that it can be checked
7779 * for being a class name with a typo of '.' or '=' instead of a colon.
7780 * */
7781 }
7782
7783 /* Here, we think there is a possibility that a [: :] class was meant, and
7784 * we have the first real character. It could be they think the '^' comes
7785 * first */
7786 if (*p == '^') {
7787 found_problem = TRUE;
7788 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7789 complement = 1;
7790 p++;
7791
7792 if (isBLANK(*p)) {
7793 found_problem = TRUE;
7794
7795 do {
7796 p++;
7797 } while (p < e && isBLANK(*p));
7798
7799 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7800 }
7801 }
7802
7803 /* But the first character should be a colon, which they could have easily
7804 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7805 * distinguish from a colon, so treat that as a colon). */
7806 if (*p == ':') {
7807 p++;
7808 has_opening_colon = TRUE;
7809 }
7810 else if (*p == ';') {
7811 found_problem = TRUE;
7812 p++;
7813 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7814 has_opening_colon = TRUE;
7815 }
7816 else {
7817 found_problem = TRUE;
7818 ADD_POSIX_WARNING(p, "there must be a starting ':'");
7819
7820 /* Consider an initial punctuation (not one of the recognized ones) to
7821 * be a left terminator */
7822 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7823 p++;
7824 }
7825 }
7826
7827 /* They may think that you can put spaces between the components */
7828 if (isBLANK(*p)) {
7829 found_problem = TRUE;
7830
7831 do {
7832 p++;
7833 } while (p < e && isBLANK(*p));
7834
7835 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7836 }
7837
7838 if (*p == '^') {
7839
7840 /* We consider something like [^:^alnum:]] to not have been intended to
7841 * be a posix class, but XXX maybe we should */
7842 if (complement) {
7843 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7844 }
7845
7846 complement = 1;
7847 p++;
7848 }
7849
7850 /* Again, they may think that you can put spaces between the components */
7851 if (isBLANK(*p)) {
7852 found_problem = TRUE;
7853
7854 do {
7855 p++;
7856 } while (p < e && isBLANK(*p));
7857
7858 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7859 }
7860
7861 if (*p == ']') {
7862
7863 /* XXX This ']' may be a typo, and something else was meant. But
7864 * treating it as such creates enough complications, that that
7865 * possibility isn't currently considered here. So we assume that the
7866 * ']' is what is intended, and if we've already found an initial '[',
7867 * this leaves this construct looking like [:] or [:^], which almost
7868 * certainly weren't intended to be posix classes */
7869 if (has_opening_bracket) {
7870 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7871 }
7872
7873 /* But this function can be called when we parse the colon for
7874 * something like qr/[alpha:]]/, so we back up to look for the
7875 * beginning */
7876 p--;
7877
7878 if (*p == ';') {
7879 found_problem = TRUE;
7880 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7881 }
7882 else if (*p != ':') {
7883
7884 /* XXX We are currently very restrictive here, so this code doesn't
7885 * consider the possibility that, say, /[alpha.]]/ was intended to
7886 * be a posix class. */
7887 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7888 }
7889
7890 /* Here we have something like 'foo:]'. There was no initial colon,
7891 * and we back up over 'foo. XXX Unlike the going forward case, we
7892 * don't handle typos of non-word chars in the middle */
7893 has_opening_colon = FALSE;
7894 p--;
7895
7896 while (p > RExC_start && isWORDCHAR(*p)) {
7897 p--;
7898 }
7899 p++;
7900
7901 /* Here, we have positioned ourselves to where we think the first
7902 * character in the potential class is */
7903 }
7904
7905 /* Now the interior really starts. There are certain key characters that
7906 * can end the interior, or these could just be typos. To catch both
7907 * cases, we may have to do two passes. In the first pass, we keep on
7908 * going unless we come to a sequence that matches
7909 * qr/ [[:punct:]] [[:blank:]]* \] /xa
7910 * This means it takes a sequence to end the pass, so two typos in a row if
7911 * that wasn't what was intended. If the class is perfectly formed, just
7912 * this one pass is needed. We also stop if there are too many characters
7913 * being accumulated, but this number is deliberately set higher than any
7914 * real class. It is set high enough so that someone who thinks that
7915 * 'alphanumeric' is a correct name would get warned that it wasn't.
7916 * While doing the pass, we keep track of where the key characters were in
7917 * it. If we don't find an end to the class, and one of the key characters
7918 * was found, we redo the pass, but stop when we get to that character.
7919 * Thus the key character was considered a typo in the first pass, but a
7920 * terminator in the second. If two key characters are found, we stop at
7921 * the second one in the first pass. Again this can miss two typos, but
7922 * catches a single one
7923 *
7924 * In the first pass, 'possible_end' starts as NULL, and then gets set to
7925 * point to the first key character. For the second pass, it starts as -1.
7926 * */
7927
7928 name_start = p;
7929 parse_name:
7930 {
7931 bool has_blank = FALSE;
7932 bool has_upper = FALSE;
7933 bool has_terminating_colon = FALSE;
7934 bool has_terminating_bracket = FALSE;
7935 bool has_semi_colon = FALSE;
7936 unsigned int name_len = 0;
7937 int punct_count = 0;
7938
7939 while (p < e) {
7940
7941 /* Squeeze out blanks when looking up the class name below */
7942 if (isBLANK(*p) ) {
7943 has_blank = TRUE;
7944 found_problem = TRUE;
7945 p++;
7946 continue;
7947 }
7948
7949 /* The name will end with a punctuation */
7950 if (isPUNCT(*p)) {
7951 const char * peek = p + 1;
7952
7953 /* Treat any non-']' punctuation followed by a ']' (possibly
7954 * with intervening blanks) as trying to terminate the class.
7955 * ']]' is very likely to mean a class was intended (but
7956 * missing the colon), but the warning message that gets
7957 * generated shows the error position better if we exit the
7958 * loop at the bottom (eventually), so skip it here. */
7959 if (*p != ']') {
7960 if (peek < e && isBLANK(*peek)) {
7961 has_blank = TRUE;
7962 found_problem = TRUE;
7963 do {
7964 peek++;
7965 } while (peek < e && isBLANK(*peek));
7966 }
7967
7968 if (peek < e && *peek == ']') {
7969 has_terminating_bracket = TRUE;
7970 if (*p == ':') {
7971 has_terminating_colon = TRUE;
7972 }
7973 else if (*p == ';') {
7974 has_semi_colon = TRUE;
7975 has_terminating_colon = TRUE;
7976 }
7977 else {
7978 found_problem = TRUE;
7979 }
7980 p = peek + 1;
7981 goto try_posix;
7982 }
7983 }
7984
7985 /* Here we have punctuation we thought didn't end the class.
7986 * Keep track of the position of the key characters that are
7987 * more likely to have been class-enders */
7988 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
7989
7990 /* Allow just one such possible class-ender not actually
7991 * ending the class. */
7992 if (possible_end) {
7993 break;
7994 }
7995 possible_end = p;
7996 }
7997
7998 /* If we have too many punctuation characters, no use in
7999 * keeping going */
8000 if (++punct_count > max_distance) {
8001 break;
8002 }
8003
8004 /* Treat the punctuation as a typo. */
8005 input_text[name_len++] = *p;
8006 p++;
8007 }
8008 else if (isUPPER(*p)) { /* Use lowercase for lookup */
8009 input_text[name_len++] = toLOWER(*p);
8010 has_upper = TRUE;
8011 found_problem = TRUE;
8012 p++;
8013 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8014 input_text[name_len++] = *p;
8015 p++;
8016 }
8017 else {
8018 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8019 p+= UTF8SKIP(p);
8020 }
8021
8022 /* The declaration of 'input_text' is how long we allow a potential
8023 * class name to be, before saying they didn't mean a class name at
8024 * all */
8025 if (name_len >= C_ARRAY_LENGTH(input_text)) {
8026 break;
8027 }
8028 }
8029
8030 /* We get to here when the possible class name hasn't been properly
8031 * terminated before:
8032 * 1) we ran off the end of the pattern; or
8033 * 2) found two characters, each of which might have been intended to
8034 * be the name's terminator
8035 * 3) found so many punctuation characters in the purported name,
8036 * that the edit distance to a valid one is exceeded
8037 * 4) we decided it was more characters than anyone could have
8038 * intended to be one. */
8039
8040 found_problem = TRUE;
8041
8042 /* In the final two cases, we know that looking up what we've
8043 * accumulated won't lead to a match, even a fuzzy one. */
8044 if ( name_len >= C_ARRAY_LENGTH(input_text)
8045 || punct_count > max_distance)
8046 {
8047 /* If there was an intermediate key character that could have been
8048 * an intended end, redo the parse, but stop there */
8049 if (possible_end && possible_end != (char *) -1) {
8050 possible_end = (char *) -1; /* Special signal value to say
8051 we've done a first pass */
8052 p = name_start;
8053 goto parse_name;
8054 }
8055
8056 /* Otherwise, it can't have meant to have been a class */
8057 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8058 }
8059
8060 /* If we ran off the end, and the final character was a punctuation
8061 * one, back up one, to look at that final one just below. Later, we
8062 * will restore the parse pointer if appropriate */
8063 if (name_len && p == e && isPUNCT(*(p-1))) {
8064 p--;
8065 name_len--;
8066 }
8067
8068 if (p < e && isPUNCT(*p)) {
8069 if (*p == ']') {
8070 has_terminating_bracket = TRUE;
8071
8072 /* If this is a 2nd ']', and the first one is just below this
8073 * one, consider that to be the real terminator. This gives a
8074 * uniform and better positioning for the warning message */
8075 if ( possible_end
8076 && possible_end != (char *) -1
8077 && *possible_end == ']'
8078 && name_len && input_text[name_len - 1] == ']')
8079 {
8080 name_len--;
8081 p = possible_end;
8082
8083 /* And this is actually equivalent to having done the 2nd
8084 * pass now, so set it to not try again */
8085 possible_end = (char *) -1;
8086 }
8087 }
8088 else {
8089 if (*p == ':') {
8090 has_terminating_colon = TRUE;
8091 }
8092 else if (*p == ';') {
8093 has_semi_colon = TRUE;
8094 has_terminating_colon = TRUE;
8095 }
8096 p++;
8097 }
8098 }
8099
8100 try_posix:
8101
8102 /* Here, we have a class name to look up. We can short circuit the
8103 * stuff below for short names that can't possibly be meant to be a
8104 * class name. (We can do this on the first pass, as any second pass
8105 * will yield an even shorter name) */
8106 if (name_len < 3) {
8107 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8108 }
8109
8110 /* Find which class it is. Initially switch on the length of the name.
8111 * */
8112 switch (name_len) {
8113 case 4:
8114 if (memEQs(name_start, 4, "word")) {
8115 /* this is not POSIX, this is the Perl \w */
8116 class_number = ANYOF_WORDCHAR;
8117 }
8118 break;
8119 case 5:
8120 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8121 * graph lower print punct space upper
8122 * Offset 4 gives the best switch position. */
8123 switch (name_start[4]) {
8124 case 'a':
8125 if (memBEGINs(name_start, 5, "alph")) /* alpha */
8126 class_number = ANYOF_ALPHA;
8127 break;
8128 case 'e':
8129 if (memBEGINs(name_start, 5, "spac")) /* space */
8130 class_number = ANYOF_SPACE;
8131 break;
8132 case 'h':
8133 if (memBEGINs(name_start, 5, "grap")) /* graph */
8134 class_number = ANYOF_GRAPH;
8135 break;
8136 case 'i':
8137 if (memBEGINs(name_start, 5, "asci")) /* ascii */
8138 class_number = ANYOF_ASCII;
8139 break;
8140 case 'k':
8141 if (memBEGINs(name_start, 5, "blan")) /* blank */
8142 class_number = ANYOF_BLANK;
8143 break;
8144 case 'l':
8145 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8146 class_number = ANYOF_CNTRL;
8147 break;
8148 case 'm':
8149 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8150 class_number = ANYOF_ALPHANUMERIC;
8151 break;
8152 case 'r':
8153 if (memBEGINs(name_start, 5, "lowe")) /* lower */
8154 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8155 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8156 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8157 break;
8158 case 't':
8159 if (memBEGINs(name_start, 5, "digi")) /* digit */
8160 class_number = ANYOF_DIGIT;
8161 else if (memBEGINs(name_start, 5, "prin")) /* print */
8162 class_number = ANYOF_PRINT;
8163 else if (memBEGINs(name_start, 5, "punc")) /* punct */
8164 class_number = ANYOF_PUNCT;
8165 break;
8166 }
8167 break;
8168 case 6:
8169 if (memEQs(name_start, 6, "xdigit"))
8170 class_number = ANYOF_XDIGIT;
8171 break;
8172 }
8173
8174 /* If the name exactly matches a posix class name the class number will
8175 * here be set to it, and the input almost certainly was meant to be a
8176 * posix class, so we can skip further checking. If instead the syntax
8177 * is exactly correct, but the name isn't one of the legal ones, we
8178 * will return that as an error below. But if neither of these apply,
8179 * it could be that no posix class was intended at all, or that one
8180 * was, but there was a typo. We tease these apart by doing fuzzy
8181 * matching on the name */
8182 if (class_number == OOB_NAMEDCLASS && found_problem) {
8183 const UV posix_names[][6] = {
8184 { 'a', 'l', 'n', 'u', 'm' },
8185 { 'a', 'l', 'p', 'h', 'a' },
8186 { 'a', 's', 'c', 'i', 'i' },
8187 { 'b', 'l', 'a', 'n', 'k' },
8188 { 'c', 'n', 't', 'r', 'l' },
8189 { 'd', 'i', 'g', 'i', 't' },
8190 { 'g', 'r', 'a', 'p', 'h' },
8191 { 'l', 'o', 'w', 'e', 'r' },
8192 { 'p', 'r', 'i', 'n', 't' },
8193 { 'p', 'u', 'n', 'c', 't' },
8194 { 's', 'p', 'a', 'c', 'e' },
8195 { 'u', 'p', 'p', 'e', 'r' },
8196 { 'w', 'o', 'r', 'd' },
8197 { 'x', 'd', 'i', 'g', 'i', 't' }
8198 };
8199 /* The names of the above all have added NULs to make them the same
8200 * size, so we need to also have the real lengths */
8201 const UV posix_name_lengths[] = {
8202 sizeof("alnum") - 1,
8203 sizeof("alpha") - 1,
8204 sizeof("ascii") - 1,
8205 sizeof("blank") - 1,
8206 sizeof("cntrl") - 1,
8207 sizeof("digit") - 1,
8208 sizeof("graph") - 1,
8209 sizeof("lower") - 1,
8210 sizeof("print") - 1,
8211 sizeof("punct") - 1,
8212 sizeof("space") - 1,
8213 sizeof("upper") - 1,
8214 sizeof("word") - 1,
8215 sizeof("xdigit")- 1
8216 };
8217 unsigned int i;
8218 int temp_max = max_distance; /* Use a temporary, so if we
8219 reparse, we haven't changed the
8220 outer one */
8221
8222 /* Use a smaller max edit distance if we are missing one of the
8223 * delimiters */
8224 if ( has_opening_bracket + has_opening_colon < 2
8225 || has_terminating_bracket + has_terminating_colon < 2)
8226 {
8227 temp_max--;
8228 }
8229
8230 /* See if the input name is close to a legal one */
8231 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8232
8233 /* Short circuit call if the lengths are too far apart to be
8234 * able to match */
8235 if (abs( (int) (name_len - posix_name_lengths[i]))
8236 > temp_max)
8237 {
8238 continue;
8239 }
8240
8241 if (edit_distance(input_text,
8242 posix_names[i],
8243 name_len,
8244 posix_name_lengths[i],
8245 temp_max
8246 )
8247 > -1)
8248 { /* If it is close, it probably was intended to be a class */
8249 goto probably_meant_to_be;
8250 }
8251 }
8252
8253 /* Here the input name is not close enough to a valid class name
8254 * for us to consider it to be intended to be a posix class. If
8255 * we haven't already done so, and the parse found a character that
8256 * could have been terminators for the name, but which we absorbed
8257 * as typos during the first pass, repeat the parse, signalling it
8258 * to stop at that character */
8259 if (possible_end && possible_end != (char *) -1) {
8260 possible_end = (char *) -1;
8261 p = name_start;
8262 goto parse_name;
8263 }
8264
8265 /* Here neither pass found a close-enough class name */
8266 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8267 }
8268
8269 probably_meant_to_be:
8270
8271 /* Here we think that a posix specification was intended. Update any
8272 * parse pointer */
8273 if (updated_parse_ptr) {
8274 *updated_parse_ptr = (char *) p;
8275 }
8276
8277 /* If a posix class name was intended but incorrectly specified, we
8278 * output or return the warnings */
8279 if (found_problem) {
8280
8281 /* We set flags for these issues in the parse loop above instead of
8282 * adding them to the list of warnings, because we can parse it
8283 * twice, and we only want one warning instance */
8284 if (has_upper) {
8285 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8286 }
8287 if (has_blank) {
8288 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8289 }
8290 if (has_semi_colon) {
8291 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8292 }
8293 else if (! has_terminating_colon) {
8294 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8295 }
8296 if (! has_terminating_bracket) {
8297 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8298 }
8299
8300 if ( posix_warnings
8301 && RExC_warn_text
8302 && av_count(RExC_warn_text) > 0)
8303 {
8304 *posix_warnings = RExC_warn_text;
8305 }
8306 }
8307 else if (class_number != OOB_NAMEDCLASS) {
8308 /* If it is a known class, return the class. The class number
8309 * #defines are structured so each complement is +1 to the normal
8310 * one */
8311 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8312 }
8313 else if (! check_only) {
8314
8315 /* Here, it is an unrecognized class. This is an error (unless the
8316 * call is to check only, which we've already handled above) */
8317 const char * const complement_string = (complement)
8318 ? "^"
8319 : "";
8320 RExC_parse_set((char *) p);
8321 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8322 complement_string,
8323 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8324 }
8325 }
8326
8327 return OOB_NAMEDCLASS;
8328}
8329#undef ADD_POSIX_WARNING
8330
8331STATIC unsigned int
8332S_regex_set_precedence(const U8 my_operator) {
8333
8334 /* Returns the precedence in the (?[...]) construct of the input operator,
8335 * specified by its character representation. The precedence follows
8336 * general Perl rules, but it extends this so that ')' and ']' have (low)
8337 * precedence even though they aren't really operators */
8338
8339 switch (my_operator) {
8340 case '!':
8341 return 5;
8342 case '&':
8343 return 4;
8344 case '^':
8345 case '|':
8346 case '+':
8347 case '-':
8348 return 3;
8349 case ')':
8350 return 2;
8351 case ']':
8352 return 1;
8353 }
8354
8355 NOT_REACHED; /* NOTREACHED */
8356 return 0; /* Silence compiler warning */
8357}
8358
8359STATIC regnode_offset
8360S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8361 I32 *flagp, U32 depth)
8362{
8363 /* Handle the (?[...]) construct to do set operations */
8364
8365 U8 curchar; /* Current character being parsed */
8366 UV start, end; /* End points of code point ranges */
8367 SV* final = NULL; /* The end result inversion list */
8368 SV* result_string; /* 'final' stringified */
8369 AV* stack; /* stack of operators and operands not yet
8370 resolved */
8371 AV* fence_stack = NULL; /* A stack containing the positions in
8372 'stack' of where the undealt-with left
8373 parens would be if they were actually
8374 put there */
8375 /* The 'volatile' is a workaround for an optimiser bug
8376 * in Solaris Studio 12.3. See RT #127455 */
8377 volatile IV fence = 0; /* Position of where most recent undealt-
8378 with left paren in stack is; -1 if none.
8379 */
8380 STRLEN len; /* Temporary */
8381 regnode_offset node; /* Temporary, and final regnode returned by
8382 this function */
8383 const bool save_fold = FOLD; /* Temporary */
8384 char *save_end, *save_parse; /* Temporaries */
8385 const bool in_locale = LOC; /* we turn off /l during processing */
8386
8387 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8388
8389 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8390
8391 DEBUG_PARSE("xcls");
8392
8393 if (in_locale) {
8394 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8395 }
8396
8397 /* The use of this operator implies /u. This is required so that the
8398 * compile time values are valid in all runtime cases */
8399 REQUIRE_UNI_RULES(flagp, 0);
8400
8401 /* Everything in this construct is a metacharacter. Operands begin with
8402 * either a '\' (for an escape sequence), or a '[' for a bracketed
8403 * character class. Any other character should be an operator, or
8404 * parenthesis for grouping. Both types of operands are handled by calling
8405 * regclass() to parse them. It is called with a parameter to indicate to
8406 * return the computed inversion list. The parsing here is implemented via
8407 * a stack. Each entry on the stack is a single character representing one
8408 * of the operators; or else a pointer to an operand inversion list. */
8409
8410#define IS_OPERATOR(a) SvIOK(a)
8411#define IS_OPERAND(a) (! IS_OPERATOR(a))
8412
8413 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
8414 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8415 * with pronouncing it called it Reverse Polish instead, but now that YOU
8416 * know how to pronounce it you can use the correct term, thus giving due
8417 * credit to the person who invented it, and impressing your geek friends.
8418 * Wikipedia says that the pronunciation of "Ł" has been changing so that
8419 * it is now more like an English initial W (as in wonk) than an L.)
8420 *
8421 * This means that, for example, 'a | b & c' is stored on the stack as
8422 *
8423 * c [4]
8424 * b [3]
8425 * & [2]
8426 * a [1]
8427 * | [0]
8428 *
8429 * where the numbers in brackets give the stack [array] element number.
8430 * In this implementation, parentheses are not stored on the stack.
8431 * Instead a '(' creates a "fence" so that the part of the stack below the
8432 * fence is invisible except to the corresponding ')' (this allows us to
8433 * replace testing for parens, by using instead subtraction of the fence
8434 * position). As new operands are processed they are pushed onto the stack
8435 * (except as noted in the next paragraph). New operators of higher
8436 * precedence than the current final one are inserted on the stack before
8437 * the lhs operand (so that when the rhs is pushed next, everything will be
8438 * in the correct positions shown above. When an operator of equal or
8439 * lower precedence is encountered in parsing, all the stacked operations
8440 * of equal or higher precedence are evaluated, leaving the result as the
8441 * top entry on the stack. This makes higher precedence operations
8442 * evaluate before lower precedence ones, and causes operations of equal
8443 * precedence to left associate.
8444 *
8445 * The only unary operator '!' is immediately pushed onto the stack when
8446 * encountered. When an operand is encountered, if the top of the stack is
8447 * a '!", the complement is immediately performed, and the '!' popped. The
8448 * resulting value is treated as a new operand, and the logic in the
8449 * previous paragraph is executed. Thus in the expression
8450 * [a] + ! [b]
8451 * the stack looks like
8452 *
8453 * !
8454 * a
8455 * +
8456 *
8457 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8458 * becomes
8459 *
8460 * !b
8461 * a
8462 * +
8463 *
8464 * A ')' is treated as an operator with lower precedence than all the
8465 * aforementioned ones, which causes all operations on the stack above the
8466 * corresponding '(' to be evaluated down to a single resultant operand.
8467 * Then the fence for the '(' is removed, and the operand goes through the
8468 * algorithm above, without the fence.
8469 *
8470 * A separate stack is kept of the fence positions, so that the position of
8471 * the latest so-far unbalanced '(' is at the top of it.
8472 *
8473 * The ']' ending the construct is treated as the lowest operator of all,
8474 * so that everything gets evaluated down to a single operand, which is the
8475 * result */
8476
8477 stack = (AV*)newSV_type_mortal(SVt_PVAV);
8478 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8479
8480 while (RExC_parse < RExC_end) {
8481 I32 top_index; /* Index of top-most element in 'stack' */
8482 SV** top_ptr; /* Pointer to top 'stack' element */
8483 SV* current = NULL; /* To contain the current inversion list
8484 operand */
8485 SV* only_to_avoid_leaks;
8486
8487 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8488 TRUE /* Force /x */ );
8489 if (RExC_parse >= RExC_end) { /* Fail */
8490 break;
8491 }
8492
8493 curchar = UCHARAT(RExC_parse);
8494
8495redo_curchar:
8496
8497#ifdef ENABLE_REGEX_SETS_DEBUGGING
8498 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8499 DEBUG_U(dump_regex_sets_structures(pRExC_state,
8500 stack, fence, fence_stack));
8501#endif
8502
8503 top_index = av_tindex_skip_len_mg(stack);
8504
8505 switch (curchar) {
8506 SV** stacked_ptr; /* Ptr to something already on 'stack' */
8507 char stacked_operator; /* The topmost operator on the 'stack'. */
8508 SV* lhs; /* Operand to the left of the operator */
8509 SV* rhs; /* Operand to the right of the operator */
8510 SV* fence_ptr; /* Pointer to top element of the fence
8511 stack */
8512 case '(':
8513
8514 if ( RExC_parse < RExC_end - 2
8515 && UCHARAT(RExC_parse + 1) == '?'
8516 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8517 {
8518 const regnode_offset orig_emit = RExC_emit;
8519 SV * resultant_invlist;
8520
8521 /* Here it could be an embedded '(?flags:(?[...])'.
8522 * This happens when we have some thing like
8523 *
8524 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8525 * ...
8526 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
8527 *
8528 * Here we would be handling the interpolated
8529 * '$thai_or_lao'. We handle this by a recursive call to
8530 * reg which returns the inversion list the
8531 * interpolated expression evaluates to. Actually, the
8532 * return is a special regnode containing a pointer to that
8533 * inversion list. If the return isn't that regnode alone,
8534 * we know that this wasn't such an interpolation, which is
8535 * an error: we need to get a single inversion list back
8536 * from the recursion */
8537
8538 RExC_parse_inc_by(1);
8539 RExC_sets_depth++;
8540
8541 node = reg(pRExC_state, 2, flagp, depth+1);
8542 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8543
8544 if ( OP(REGNODE_p(node)) != REGEX_SET
8545 /* If more than a single node returned, the nested
8546 * parens evaluated to more than just a (?[...]),
8547 * which isn't legal */
8548 || RExC_emit != orig_emit
8549 + NODE_STEP_REGNODE
8550 + REGNODE_ARG_LEN(REGEX_SET))
8551 {
8552 vFAIL("Expecting interpolated extended charclass");
8553 }
8554 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8555 current = invlist_clone(resultant_invlist, NULL);
8556 SvREFCNT_dec(resultant_invlist);
8557
8558 RExC_sets_depth--;
8559 RExC_emit = orig_emit;
8560 goto handle_operand;
8561 }
8562
8563 /* A regular '('. Look behind for illegal syntax */
8564 if (top_index - fence >= 0) {
8565 /* If the top entry on the stack is an operator, it had
8566 * better be a '!', otherwise the entry below the top
8567 * operand should be an operator */
8568 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
8569 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8570 || ( IS_OPERAND(*top_ptr)
8571 && ( top_index - fence < 1
8572 || ! (stacked_ptr = av_fetch(stack,
8573 top_index - 1,
8574 FALSE))
8575 || ! IS_OPERATOR(*stacked_ptr))))
8576 {
8577 RExC_parse_inc_by(1);
8578 vFAIL("Unexpected '(' with no preceding operator");
8579 }
8580 }
8581
8582 /* Stack the position of this undealt-with left paren */
8583 av_push_simple(fence_stack, newSViv(fence));
8584 fence = top_index + 1;
8585 break;
8586
8587 case '\\':
8588 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8589 * multi-char folds are allowed. */
8590 if (!regclass(pRExC_state, flagp, depth+1,
8591 TRUE, /* means parse just the next thing */
8592 FALSE, /* don't allow multi-char folds */
8593 FALSE, /* don't silence non-portable warnings. */
8594 TRUE, /* strict */
8595 FALSE, /* Require return to be an ANYOF */
8596 &current))
8597 {
8598 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8599 goto regclass_failed;
8600 }
8601
8602 assert(current);
8603
8604 /* regclass() will return with parsing just the \ sequence,
8605 * leaving the parse pointer at the next thing to parse */
8606 RExC_parse--;
8607 goto handle_operand;
8608
8609 case '[': /* Is a bracketed character class */
8610 {
8611 /* See if this is a [:posix:] class. */
8612 bool is_posix_class = (OOB_NAMEDCLASS
8613 < handle_possible_posix(pRExC_state,
8614 RExC_parse + 1,
8615 NULL,
8616 NULL,
8617 TRUE /* checking only */));
8618 /* If it is a posix class, leave the parse pointer at the '['
8619 * to fool regclass() into thinking it is part of a
8620 * '[[:posix:]]'. */
8621 if (! is_posix_class) {
8622 RExC_parse_inc_by(1);
8623 }
8624
8625 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8626 * multi-char folds are allowed. */
8627 if (!regclass(pRExC_state, flagp, depth+1,
8628 is_posix_class, /* parse the whole char
8629 class only if not a
8630 posix class */
8631 FALSE, /* don't allow multi-char folds */
8632 TRUE, /* silence non-portable warnings. */
8633 TRUE, /* strict */
8634 FALSE, /* Require return to be an ANYOF */
8635 &current))
8636 {
8637 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8638 goto regclass_failed;
8639 }
8640
8641 assert(current);
8642
8643 /* function call leaves parse pointing to the ']', except if we
8644 * faked it */
8645 if (is_posix_class) {
8646 RExC_parse--;
8647 }
8648
8649 goto handle_operand;
8650 }
8651
8652 case ']':
8653 if (top_index >= 1) {
8654 goto join_operators;
8655 }
8656
8657 /* Only a single operand on the stack: are done */
8658 goto done;
8659
8660 case ')':
8661 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8662 if (UCHARAT(RExC_parse - 1) == ']') {
8663 break;
8664 }
8665 RExC_parse_inc_by(1);
8666 vFAIL("Unexpected ')'");
8667 }
8668
8669 /* If nothing after the fence, is missing an operand */
8670 if (top_index - fence < 0) {
8671 RExC_parse_inc_by(1);
8672 goto bad_syntax;
8673 }
8674 /* If at least two things on the stack, treat this as an
8675 * operator */
8676 if (top_index - fence >= 1) {
8677 goto join_operators;
8678 }
8679
8680 /* Here only a single thing on the fenced stack, and there is a
8681 * fence. Get rid of it */
8682 fence_ptr = av_pop(fence_stack);
8683 assert(fence_ptr);
8684 fence = SvIV(fence_ptr);
8685 SvREFCNT_dec_NN(fence_ptr);
8686 fence_ptr = NULL;
8687
8688 if (fence < 0) {
8689 fence = 0;
8690 }
8691
8692 /* Having gotten rid of the fence, we pop the operand at the
8693 * stack top and process it as a newly encountered operand */
8694 current = av_pop(stack);
8695 if (IS_OPERAND(current)) {
8696 goto handle_operand;
8697 }
8698
8699 RExC_parse_inc_by(1);
8700 goto bad_syntax;
8701
8702 case '&':
8703 case '|':
8704 case '+':
8705 case '-':
8706 case '^':
8707
8708 /* These binary operators should have a left operand already
8709 * parsed */
8710 if ( top_index - fence < 0
8711 || top_index - fence == 1
8712 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8713 || ! IS_OPERAND(*top_ptr))
8714 {
8715 goto unexpected_binary;
8716 }
8717
8718 /* If only the one operand is on the part of the stack visible
8719 * to us, we just place this operator in the proper position */
8720 if (top_index - fence < 2) {
8721
8722 /* Place the operator before the operand */
8723
8724 SV* lhs = av_pop(stack);
8725 av_push_simple(stack, newSVuv(curchar));
8726 av_push_simple(stack, lhs);
8727 break;
8728 }
8729
8730 /* But if there is something else on the stack, we need to
8731 * process it before this new operator if and only if the
8732 * stacked operation has equal or higher precedence than the
8733 * new one */
8734
8735 join_operators:
8736
8737 /* The operator on the stack is supposed to be below both its
8738 * operands */
8739 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8740 || IS_OPERAND(*stacked_ptr))
8741 {
8742 /* But if not, it's legal and indicates we are completely
8743 * done if and only if we're currently processing a ']',
8744 * which should be the final thing in the expression */
8745 if (curchar == ']') {
8746 goto done;
8747 }
8748
8749 unexpected_binary:
8750 RExC_parse_inc_by(1);
8751 vFAIL2("Unexpected binary operator '%c' with no "
8752 "preceding operand", curchar);
8753 }
8754 stacked_operator = (char) SvUV(*stacked_ptr);
8755
8756 if (regex_set_precedence(curchar)
8757 > regex_set_precedence(stacked_operator))
8758 {
8759 /* Here, the new operator has higher precedence than the
8760 * stacked one. This means we need to add the new one to
8761 * the stack to await its rhs operand (and maybe more
8762 * stuff). We put it before the lhs operand, leaving
8763 * untouched the stacked operator and everything below it
8764 * */
8765 lhs = av_pop(stack);
8766 assert(IS_OPERAND(lhs));
8767 av_push_simple(stack, newSVuv(curchar));
8768 av_push_simple(stack, lhs);
8769 break;
8770 }
8771
8772 /* Here, the new operator has equal or lower precedence than
8773 * what's already there. This means the operation already
8774 * there should be performed now, before the new one. */
8775
8776 rhs = av_pop(stack);
8777 if (! IS_OPERAND(rhs)) {
8778
8779 /* This can happen when a ! is not followed by an operand,
8780 * like in /(?[\t &!])/ */
8781 goto bad_syntax;
8782 }
8783
8784 lhs = av_pop(stack);
8785
8786 if (! IS_OPERAND(lhs)) {
8787
8788 /* This can happen when there is an empty (), like in
8789 * /(?[[0]+()+])/ */
8790 goto bad_syntax;
8791 }
8792
8793 switch (stacked_operator) {
8794 case '&':
8795 _invlist_intersection(lhs, rhs, &rhs);
8796 break;
8797
8798 case '|':
8799 case '+':
8800 _invlist_union(lhs, rhs, &rhs);
8801 break;
8802
8803 case '-':
8804 _invlist_subtract(lhs, rhs, &rhs);
8805 break;
8806
8807 case '^': /* The union minus the intersection */
8808 {
8809 SV* i = NULL;
8810 SV* u = NULL;
8811
8812 _invlist_union(lhs, rhs, &u);
8813 _invlist_intersection(lhs, rhs, &i);
8814 _invlist_subtract(u, i, &rhs);
8815 SvREFCNT_dec_NN(i);
8816 SvREFCNT_dec_NN(u);
8817 break;
8818 }
8819 }
8820 SvREFCNT_dec(lhs);
8821
8822 /* Here, the higher precedence operation has been done, and the
8823 * result is in 'rhs'. We overwrite the stacked operator with
8824 * the result. Then we redo this code to either push the new
8825 * operator onto the stack or perform any higher precedence
8826 * stacked operation */
8827 only_to_avoid_leaks = av_pop(stack);
8828 SvREFCNT_dec(only_to_avoid_leaks);
8829 av_push_simple(stack, rhs);
8830 goto redo_curchar;
8831
8832 case '!': /* Highest priority, right associative */
8833
8834 /* If what's already at the top of the stack is another '!",
8835 * they just cancel each other out */
8836 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
8837 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8838 {
8839 only_to_avoid_leaks = av_pop(stack);
8840 SvREFCNT_dec(only_to_avoid_leaks);
8841 }
8842 else { /* Otherwise, since it's right associative, just push
8843 onto the stack */
8844 av_push_simple(stack, newSVuv(curchar));
8845 }
8846 break;
8847
8848 default:
8849 RExC_parse_inc();
8850 if (RExC_parse >= RExC_end) {
8851 break;
8852 }
8853 vFAIL("Unexpected character");
8854
8855 handle_operand:
8856
8857 /* Here 'current' is the operand. If something is already on the
8858 * stack, we have to check if it is a !. But first, the code above
8859 * may have altered the stack in the time since we earlier set
8860 * 'top_index'. */
8861
8862 top_index = av_tindex_skip_len_mg(stack);
8863 if (top_index - fence >= 0) {
8864 /* If the top entry on the stack is an operator, it had better
8865 * be a '!', otherwise the entry below the top operand should
8866 * be an operator */
8867 top_ptr = av_fetch(stack, top_index, FALSE);
8868 assert(top_ptr);
8869 if (IS_OPERATOR(*top_ptr)) {
8870
8871 /* The only permissible operator at the top of the stack is
8872 * '!', which is applied immediately to this operand. */
8873 curchar = (char) SvUV(*top_ptr);
8874 if (curchar != '!') {
8875 SvREFCNT_dec(current);
8876 vFAIL2("Unexpected binary operator '%c' with no "
8877 "preceding operand", curchar);
8878 }
8879
8880 _invlist_invert(current);
8881
8882 only_to_avoid_leaks = av_pop(stack);
8883 SvREFCNT_dec(only_to_avoid_leaks);
8884
8885 /* And we redo with the inverted operand. This allows
8886 * handling multiple ! in a row */
8887 goto handle_operand;
8888 }
8889 /* Single operand is ok only for the non-binary ')'
8890 * operator */
8891 else if ((top_index - fence == 0 && curchar != ')')
8892 || (top_index - fence > 0
8893 && (! (stacked_ptr = av_fetch(stack,
8894 top_index - 1,
8895 FALSE))
8896 || IS_OPERAND(*stacked_ptr))))
8897 {
8898 SvREFCNT_dec(current);
8899 vFAIL("Operand with no preceding operator");
8900 }
8901 }
8902
8903 /* Here there was nothing on the stack or the top element was
8904 * another operand. Just add this new one */
8905 av_push_simple(stack, current);
8906
8907 } /* End of switch on next parse token */
8908
8909 RExC_parse_inc();
8910 } /* End of loop parsing through the construct */
8911
8912 vFAIL("Syntax error in (?[...])");
8913
8914 done:
8915
8916 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
8917 if (RExC_parse < RExC_end) {
8918 RExC_parse_inc_by(1);
8919 }
8920
8921 vFAIL("Unexpected ']' with no following ')' in (?[...");
8922 }
8923
8924 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
8925 vFAIL("Unmatched (");
8926 }
8927
8928 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
8929 || ((final = av_pop(stack)) == NULL)
8930 || ! IS_OPERAND(final)
8931 || ! is_invlist(final)
8932 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
8933 {
8934 bad_syntax:
8935 SvREFCNT_dec(final);
8936 vFAIL("Incomplete expression within '(?[ ])'");
8937 }
8938
8939 /* Here, 'final' is the resultant inversion list from evaluating the
8940 * expression. Return it if so requested */
8941 if (return_invlist) {
8942 *return_invlist = final;
8943 return END;
8944 }
8945
8946 if (RExC_sets_depth) { /* If within a recursive call, return in a special
8947 regnode */
8948 RExC_parse_inc_by(1);
8949 node = regpnode(pRExC_state, REGEX_SET, final);
8950 }
8951 else {
8952
8953 /* Otherwise generate a resultant node, based on 'final'. regclass()
8954 * is expecting a string of ranges and individual code points */
8955 invlist_iterinit(final);
8956 result_string = newSVpvs("");
8957 while (invlist_iternext(final, &start, &end)) {
8958 if (start == end) {
8959 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
8960 }
8961 else {
8962 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
8963 UVXf "}", start, end);
8964 }
8965 }
8966
8967 /* About to generate an ANYOF (or similar) node from the inversion list
8968 * we have calculated */
8969 save_parse = RExC_parse;
8970 RExC_parse_set(SvPV(result_string, len));
8971 save_end = RExC_end;
8972 RExC_end = RExC_parse + len;
8973 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
8974
8975 /* We turn off folding around the call, as the class we have
8976 * constructed already has all folding taken into consideration, and we
8977 * don't want regclass() to add to that */
8978 RExC_flags &= ~RXf_PMf_FOLD;
8979 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
8980 * folds are allowed. */
8981 node = regclass(pRExC_state, flagp, depth+1,
8982 FALSE, /* means parse the whole char class */
8983 FALSE, /* don't allow multi-char folds */
8984 TRUE, /* silence non-portable warnings. The above may
8985 very well have generated non-portable code
8986 points, but they're valid on this machine */
8987 FALSE, /* similarly, no need for strict */
8988
8989 /* We can optimize into something besides an ANYOF,
8990 * except under /l, which needs to be ANYOF because of
8991 * runtime checks for locale sanity, etc */
8992 ! in_locale,
8993 NULL
8994 );
8995
8996 RESTORE_WARNINGS;
8997 RExC_parse_set(save_parse + 1);
8998 RExC_end = save_end;
8999 SvREFCNT_dec_NN(final);
9000 SvREFCNT_dec_NN(result_string);
9001
9002 if (save_fold) {
9003 RExC_flags |= RXf_PMf_FOLD;
9004 }
9005
9006 if (!node) {
9007 RETURN_FAIL_ON_RESTART(*flagp, flagp);
9008 goto regclass_failed;
9009 }
9010
9011 /* Fix up the node type if we are in locale. (We have pretended we are
9012 * under /u for the purposes of regclass(), as this construct will only
9013 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
9014 * (so as to cause any warnings about bad locales to be output in
9015 * regexec.c), and add the flag that indicates to check if not in a
9016 * UTF-8 locale. The reason we above forbid optimization into
9017 * something other than an ANYOF node is simply to minimize the number
9018 * of code changes in regexec.c. Otherwise we would have to create new
9019 * EXACTish node types and deal with them. This decision could be
9020 * revisited should this construct become popular.
9021 *
9022 * (One might think we could look at the resulting ANYOF node and
9023 * suppress the flag if everything is above 255, as those would be
9024 * UTF-8 only, but this isn't true, as the components that led to that
9025 * result could have been locale-affected, and just happen to cancel
9026 * each other out under UTF-8 locales.) */
9027 if (in_locale) {
9028 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9029
9030 assert(OP(REGNODE_p(node)) == ANYOF);
9031
9032 OP(REGNODE_p(node)) = ANYOFL;
9033 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9034 }
9035 }
9036
9037 nextchar(pRExC_state);
9038 return node;
9039
9040 regclass_failed:
9041 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9042 (UV) *flagp);
9043}
9044
9045#ifdef ENABLE_REGEX_SETS_DEBUGGING
9046
9047STATIC void
9048S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9049 AV * stack, const IV fence, AV * fence_stack)
9050{ /* Dumps the stacks in handle_regex_sets() */
9051
9052 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9053 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9054 SSize_t i;
9055
9056 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9057
9058 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9059
9060 if (stack_top < 0) {
9061 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9062 }
9063 else {
9064 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9065 for (i = stack_top; i >= 0; i--) {
9066 SV ** element_ptr = av_fetch(stack, i, FALSE);
9067 if (! element_ptr) {
9068 }
9069
9070 if (IS_OPERATOR(*element_ptr)) {
9071 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9072 (int) i, (int) SvIV(*element_ptr));
9073 }
9074 else {
9075 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9076 sv_dump(*element_ptr);
9077 }
9078 }
9079 }
9080
9081 if (fence_stack_top < 0) {
9082 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9083 }
9084 else {
9085 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9086 for (i = fence_stack_top; i >= 0; i--) {
9087 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9088 if (! element_ptr) {
9089 }
9090
9091 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9092 (int) i, (int) SvIV(*element_ptr));
9093 }
9094 }
9095}
9096
9097#endif
9098
9099#undef IS_OPERATOR
9100#undef IS_OPERAND
9101
9102#ifdef PERL_RE_BUILD_AUX
9103void
9104Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9105{
9106 /* This adds the Latin1/above-Latin1 folding rules.
9107 *
9108 * This should be called only for a Latin1-range code points, cp, which is
9109 * known to be involved in a simple fold with other code points above
9110 * Latin1. It would give false results if /aa has been specified.
9111 * Multi-char folds are outside the scope of this, and must be handled
9112 * specially. */
9113
9114 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9115
9116 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9117
9118 /* The rules that are valid for all Unicode versions are hard-coded in */
9119 switch (cp) {
9120 case 'k':
9121 case 'K':
9122 *invlist =
9123 add_cp_to_invlist(*invlist, KELVIN_SIGN);
9124 break;
9125 case 's':
9126 case 'S':
9127 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9128 break;
9129 case MICRO_SIGN:
9130 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9131 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9132 break;
9133 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9134 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9135 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9136 break;
9137 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9138 *invlist = add_cp_to_invlist(*invlist,
9139 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9140 break;
9141
9142 default: /* Other code points are checked against the data for the
9143 current Unicode version */
9144 {
9145 Size_t folds_count;
9146 U32 first_fold;
9147 const U32 * remaining_folds;
9148 UV folded_cp;
9149
9150 if (isASCII(cp)) {
9151 folded_cp = toFOLD(cp);
9152 }
9153 else {
9154 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9155 Size_t dummy_len;
9156 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9157 }
9158
9159 if (folded_cp > 255) {
9160 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9161 }
9162
9163 folds_count = _inverse_folds(folded_cp, &first_fold,
9164 &remaining_folds);
9165 if (folds_count == 0) {
9166
9167 /* Use deprecated warning to increase the chances of this being
9168 * output */
9169 ckWARN2reg_d(RExC_parse,
9170 "Perl folding rules are not up-to-date for 0x%02X;"
9171 " please use the perlbug utility to report;", cp);
9172 }
9173 else {
9174 unsigned int i;
9175
9176 if (first_fold > 255) {
9177 *invlist = add_cp_to_invlist(*invlist, first_fold);
9178 }
9179 for (i = 0; i < folds_count - 1; i++) {
9180 if (remaining_folds[i] > 255) {
9181 *invlist = add_cp_to_invlist(*invlist,
9182 remaining_folds[i]);
9183 }
9184 }
9185 }
9186 break;
9187 }
9188 }
9189}
9190#endif /* PERL_RE_BUILD_AUX */
9191
9192
9193STATIC void
9194S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9195{
9196 /* Output the elements of the array given by '*posix_warnings' as REGEXP
9197 * warnings. */
9198
9199 SV * msg;
9200 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9201
9202 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9203
9204 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9205 CLEAR_POSIX_WARNINGS();
9206 return;
9207 }
9208
9209 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9210 if (first_is_fatal) { /* Avoid leaking this */
9211 av_undef(posix_warnings); /* This isn't necessary if the
9212 array is mortal, but is a
9213 fail-safe */
9214 (void) sv_2mortal(msg);
9215 PREPARE_TO_DIE;
9216 }
9217 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9218 SvREFCNT_dec_NN(msg);
9219 }
9220
9221 UPDATE_WARNINGS_LOC(RExC_parse);
9222}
9223
9224PERL_STATIC_INLINE Size_t
9225S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9226{
9227 const U8 * const start = s1;
9228 const U8 * const send = start + max;
9229
9230 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9231
9232 while (s1 < send && *s1 == *s2) {
9233 s1++; s2++;
9234 }
9235
9236 return s1 - start;
9237}
9238
9239STATIC AV *
9240S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9241{
9242 /* This adds the string scalar <multi_string> to the array
9243 * <multi_char_matches>. <multi_string> is known to have exactly
9244 * <cp_count> code points in it. This is used when constructing a
9245 * bracketed character class and we find something that needs to match more
9246 * than a single character.
9247 *
9248 * <multi_char_matches> is actually an array of arrays. Each top-level
9249 * element is an array that contains all the strings known so far that are
9250 * the same length. And that length (in number of code points) is the same
9251 * as the index of the top-level array. Hence, the [2] element is an
9252 * array, each element thereof is a string containing TWO code points;
9253 * while element [3] is for strings of THREE characters, and so on. Since
9254 * this is for multi-char strings there can never be a [0] nor [1] element.
9255 *
9256 * When we rewrite the character class below, we will do so such that the
9257 * longest strings are written first, so that it prefers the longest
9258 * matching strings first. This is done even if it turns out that any
9259 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
9260 * Christiansen has agreed that this is ok. This makes the test for the
9261 * ligature 'ffi' come before the test for 'ff', for example */
9262
9263 AV* this_array;
9264 AV** this_array_ptr;
9265
9266 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9267
9268 if (! multi_char_matches) {
9269 multi_char_matches = newAV();
9270 }
9271
9272 if (av_exists(multi_char_matches, cp_count)) {
9273 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9274 this_array = *this_array_ptr;
9275 }
9276 else {
9277 this_array = newAV();
9278 av_store_simple(multi_char_matches, cp_count,
9279 (SV*) this_array);
9280 }
9281 av_push_simple(this_array, multi_string);
9282
9283 return multi_char_matches;
9284}
9285
9286/* The names of properties whose definitions are not known at compile time are
9287 * stored in this SV, after a constant heading. So if the length has been
9288 * changed since initialization, then there is a run-time definition. */
9289#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
9290 (SvCUR(listsv) != initial_listsv_len)
9291
9292/* There is a restricted set of white space characters that are legal when
9293 * ignoring white space in a bracketed character class. This generates the
9294 * code to skip them.
9295 *
9296 * There is a line below that uses the same white space criteria but is outside
9297 * this macro. Both here and there must use the same definition */
9298#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
9299 STMT_START { \
9300 if (do_skip) { \
9301 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
9302 { \
9303 p++; \
9304 } \
9305 } \
9306 } STMT_END
9307
9308STATIC regnode_offset
9309S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9310 const bool stop_at_1, /* Just parse the next thing, don't
9311 look for a full character class */
9312 bool allow_mutiple_chars,
9313 const bool silence_non_portable, /* Don't output warnings
9314 about too large
9315 characters */
9316 const bool strict,
9317 bool optimizable, /* ? Allow a non-ANYOF return
9318 node */
9319 SV** ret_invlist /* Return an inversion list, not a node */
9320 )
9321{
9322 /* parse a bracketed class specification. Most of these will produce an
9323 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9324 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
9325 * under /i with multi-character folds: it will be rewritten following the
9326 * paradigm of this example, where the <multi-fold>s are characters which
9327 * fold to multiple character sequences:
9328 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9329 * gets effectively rewritten as:
9330 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9331 * reg() gets called (recursively) on the rewritten version, and this
9332 * function will return what it constructs. (Actually the <multi-fold>s
9333 * aren't physically removed from the [abcdefghi], it's just that they are
9334 * ignored in the recursion by means of a flag:
9335 * <RExC_in_multi_char_class>.)
9336 *
9337 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9338 * characters, with the corresponding bit set if that character is in the
9339 * list. For characters above this, an inversion list is used. There
9340 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9341 * determinable at compile time
9342 *
9343 * On success, returns the offset at which any next node should be placed
9344 * into the regex engine program being compiled.
9345 *
9346 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9347 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9348 * UTF-8
9349 */
9350
9351 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9352 IV range = 0;
9353 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9354 regnode_offset ret = -1; /* Initialized to an illegal value */
9355 STRLEN numlen;
9356 int namedclass = OOB_NAMEDCLASS;
9357 char *rangebegin = NULL;
9358 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
9359 aren't available at the time this was called */
9360 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9361 than just initialized. */
9362 SV* properties = NULL; /* Code points that match \p{} \P{} */
9363 SV* posixes = NULL; /* Code points that match classes like [:word:],
9364 extended beyond the Latin1 range. These have to
9365 be kept separate from other code points for much
9366 of this function because their handling is
9367 different under /i, and for most classes under
9368 /d as well */
9369 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
9370 separate for a while from the non-complemented
9371 versions because of complications with /d
9372 matching */
9373 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9374 treated more simply than the general case,
9375 leading to less compilation and execution
9376 work */
9377 UV element_count = 0; /* Number of distinct elements in the class.
9378 Optimizations may be possible if this is tiny */
9379 AV * multi_char_matches = NULL; /* Code points that fold to more than one
9380 character; used under /i */
9381 UV n;
9382 char * stop_ptr = RExC_end; /* where to stop parsing */
9383
9384 /* ignore unescaped whitespace? */
9385 const bool skip_white = cBOOL( ret_invlist
9386 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9387
9388 /* inversion list of code points this node matches only when the target
9389 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
9390 * /d) */
9391 SV* upper_latin1_only_utf8_matches = NULL;
9392
9393 /* Inversion list of code points this node matches regardless of things
9394 * like locale, folding, utf8ness of the target string */
9395 SV* cp_list = NULL;
9396
9397 /* Like cp_list, but code points on this list need to be checked for things
9398 * that fold to/from them under /i */
9399 SV* cp_foldable_list = NULL;
9400
9401 /* Like cp_list, but code points on this list are valid only when the
9402 * runtime locale is UTF-8 */
9403 SV* only_utf8_locale_list = NULL;
9404
9405 /* In a range, if one of the endpoints is non-character-set portable,
9406 * meaning that it hard-codes a code point that may mean a different
9407 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9408 * mnemonic '\t' which each mean the same character no matter which
9409 * character set the platform is on. */
9410 unsigned int non_portable_endpoint = 0;
9411
9412 /* Is the range unicode? which means on a platform that isn't 1-1 native
9413 * to Unicode (i.e. non-ASCII), each code point in it should be considered
9414 * to be a Unicode value. */
9415 bool unicode_range = FALSE;
9416 bool invert = FALSE; /* Is this class to be complemented */
9417
9418 bool warn_super = ALWAYS_WARN_SUPER;
9419
9420 const char * orig_parse = RExC_parse;
9421
9422 /* This variable is used to mark where the end in the input is of something
9423 * that looks like a POSIX construct but isn't. During the parse, when
9424 * something looks like it could be such a construct is encountered, it is
9425 * checked for being one, but not if we've already checked this area of the
9426 * input. Only after this position is reached do we check again */
9427 char *not_posix_region_end = RExC_parse - 1;
9428
9429 AV* posix_warnings = NULL;
9430 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9431 U8 op = ANYOF; /* The returned node-type, initialized to the expected
9432 type. */
9433 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
9434 U32 posixl = 0; /* bit field of posix classes matched under /l */
9435
9436
9437/* Flags as to what things aren't knowable until runtime. (Note that these are
9438 * mutually exclusive.) */
9439#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
9440 haven't been defined as of yet */
9441#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
9442 UTF-8 or not */
9443#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
9444 what gets folded */
9445 U32 has_runtime_dependency = 0; /* OR of the above flags */
9446
9447 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9448
9449 PERL_ARGS_ASSERT_REGCLASS;
9450#ifndef DEBUGGING
9451 PERL_UNUSED_ARG(depth);
9452#endif
9453
9454 assert(! (ret_invlist && allow_mutiple_chars));
9455
9456 /* If wants an inversion list returned, we can't optimize to something
9457 * else. */
9458 if (ret_invlist) {
9459 optimizable = FALSE;
9460 }
9461
9462 DEBUG_PARSE("clas");
9463
9464#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
9465 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
9466 && UNICODE_DOT_DOT_VERSION == 0)
9467 allow_mutiple_chars = FALSE;
9468#endif
9469
9470 /* We include the /i status at the beginning of this so that we can
9471 * know it at runtime */
9472 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9473 initial_listsv_len = SvCUR(listsv);
9474 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
9475
9476 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9477
9478 assert(RExC_parse <= RExC_end);
9479
9480 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
9481 RExC_parse_inc_by(1);
9482 invert = TRUE;
9483 allow_mutiple_chars = FALSE;
9484 MARK_NAUGHTY(1);
9485 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9486 }
9487
9488 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9489 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9490 int maybe_class = handle_possible_posix(pRExC_state,
9491 RExC_parse,
9492 &not_posix_region_end,
9493 NULL,
9494 TRUE /* checking only */);
9495 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9496 ckWARN4reg(not_posix_region_end,
9497 "POSIX syntax [%c %c] belongs inside character classes%s",
9498 *RExC_parse, *RExC_parse,
9499 (maybe_class == OOB_NAMEDCLASS)
9500 ? ((POSIXCC_NOTYET(*RExC_parse))
9501 ? " (but this one isn't implemented)"
9502 : " (but this one isn't fully valid)")
9503 : ""
9504 );
9505 }
9506 }
9507
9508 /* If the caller wants us to just parse a single element, accomplish this
9509 * by faking the loop ending condition */
9510 if (stop_at_1 && RExC_end > RExC_parse) {
9511 stop_ptr = RExC_parse + 1;
9512 }
9513
9514 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9515 if (UCHARAT(RExC_parse) == ']')
9516 goto charclassloop;
9517
9518 while (1) {
9519
9520 if ( posix_warnings
9521 && av_tindex_skip_len_mg(posix_warnings) >= 0
9522 && RExC_parse > not_posix_region_end)
9523 {
9524 /* Warnings about posix class issues are considered tentative until
9525 * we are far enough along in the parse that we can no longer
9526 * change our mind, at which point we output them. This is done
9527 * each time through the loop so that a later class won't zap them
9528 * before they have been dealt with. */
9529 output_posix_warnings(pRExC_state, posix_warnings);
9530 }
9531
9532 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9533
9534 if (RExC_parse >= stop_ptr) {
9535 break;
9536 }
9537
9538 if (UCHARAT(RExC_parse) == ']') {
9539 break;
9540 }
9541
9542 charclassloop:
9543
9544 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9545 save_value = value;
9546 save_prevvalue = prevvalue;
9547
9548 if (!range) {
9549 rangebegin = RExC_parse;
9550 element_count++;
9551 non_portable_endpoint = 0;
9552 }
9553 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9554 value = utf8n_to_uvchr((U8*)RExC_parse,
9555 RExC_end - RExC_parse,
9556 &numlen, UTF8_ALLOW_DEFAULT);
9557 RExC_parse_inc_by(numlen);
9558 }
9559 else {
9560 value = UCHARAT(RExC_parse);
9561 RExC_parse_inc_by(1);
9562 }
9563
9564 if (value == '[') {
9565 char * posix_class_end;
9566 namedclass = handle_possible_posix(pRExC_state,
9567 RExC_parse,
9568 &posix_class_end,
9569 do_posix_warnings ? &posix_warnings : NULL,
9570 FALSE /* die if error */);
9571 if (namedclass > OOB_NAMEDCLASS) {
9572
9573 /* If there was an earlier attempt to parse this particular
9574 * posix class, and it failed, it was a false alarm, as this
9575 * successful one proves */
9576 if ( posix_warnings
9577 && av_tindex_skip_len_mg(posix_warnings) >= 0
9578 && not_posix_region_end >= RExC_parse
9579 && not_posix_region_end <= posix_class_end)
9580 {
9581 av_undef(posix_warnings);
9582 }
9583
9584 RExC_parse_set(posix_class_end);
9585 }
9586 else if (namedclass == OOB_NAMEDCLASS) {
9587 not_posix_region_end = posix_class_end;
9588 }
9589 else {
9590 namedclass = OOB_NAMEDCLASS;
9591 }
9592 }
9593 else if ( RExC_parse - 1 > not_posix_region_end
9594 && MAYBE_POSIXCC(value))
9595 {
9596 (void) handle_possible_posix(
9597 pRExC_state,
9598 RExC_parse - 1, /* -1 because parse has already been
9599 advanced */
9600 &not_posix_region_end,
9601 do_posix_warnings ? &posix_warnings : NULL,
9602 TRUE /* checking only */);
9603 }
9604 else if ( strict && ! skip_white
9605 && ( generic_isCC_(value, CC_VERTSPACE_)
9606 || is_VERTWS_cp_high(value)))
9607 {
9608 vFAIL("Literal vertical space in [] is illegal except under /x");
9609 }
9610 else if (value == '\\') {
9611 /* Is a backslash; get the code point of the char after it */
9612
9613 if (RExC_parse >= RExC_end) {
9614 vFAIL("Unmatched [");
9615 }
9616
9617 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9618 value = utf8n_to_uvchr((U8*)RExC_parse,
9619 RExC_end - RExC_parse,
9620 &numlen, UTF8_ALLOW_DEFAULT);
9621 RExC_parse_inc_by(numlen);
9622 }
9623 else {
9624 value = UCHARAT(RExC_parse);
9625 RExC_parse_inc_by(1);
9626 }
9627
9628 /* Some compilers cannot handle switching on 64-bit integer
9629 * values, therefore value cannot be an UV. Yes, this will
9630 * be a problem later if we want switch on Unicode.
9631 * A similar issue a little bit later when switching on
9632 * namedclass. --jhi */
9633
9634 /* If the \ is escaping white space when white space is being
9635 * skipped, it means that that white space is wanted literally, and
9636 * is already in 'value'. Otherwise, need to translate the escape
9637 * into what it signifies. */
9638 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9639 const char * message;
9640 U32 packed_warn;
9641 U8 grok_c_char;
9642
9643 case 'w': namedclass = ANYOF_WORDCHAR; break;
9644 case 'W': namedclass = ANYOF_NWORDCHAR; break;
9645 case 's': namedclass = ANYOF_SPACE; break;
9646 case 'S': namedclass = ANYOF_NSPACE; break;
9647 case 'd': namedclass = ANYOF_DIGIT; break;
9648 case 'D': namedclass = ANYOF_NDIGIT; break;
9649 case 'v': namedclass = ANYOF_VERTWS; break;
9650 case 'V': namedclass = ANYOF_NVERTWS; break;
9651 case 'h': namedclass = ANYOF_HORIZWS; break;
9652 case 'H': namedclass = ANYOF_NHORIZWS; break;
9653 case 'N': /* Handle \N{NAME} in class */
9654 {
9655 const char * const backslash_N_beg = RExC_parse - 2;
9656 int cp_count;
9657
9658 if (! grok_bslash_N(pRExC_state,
9659 NULL, /* No regnode */
9660 &value, /* Yes single value */
9661 &cp_count, /* Multiple code pt count */
9662 flagp,
9663 strict,
9664 depth)
9665 ) {
9666
9667 if (*flagp & NEED_UTF8)
9668 FAIL("panic: grok_bslash_N set NEED_UTF8");
9669
9670 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9671
9672 if (cp_count < 0) {
9673 vFAIL("\\N in a character class must be a named character: \\N{...}");
9674 }
9675 else if (cp_count == 0) {
9676 ckWARNreg(RExC_parse,
9677 "Ignoring zero length \\N{} in character class");
9678 }
9679 else { /* cp_count > 1 */
9680 assert(cp_count > 1);
9681 if (! RExC_in_multi_char_class) {
9682 if ( ! allow_mutiple_chars
9683 || invert
9684 || range
9685 || *RExC_parse == '-')
9686 {
9687 if (strict) {
9688 RExC_parse--;
9689 vFAIL("\\N{} here is restricted to one character");
9690 }
9691 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9692 break; /* <value> contains the first code
9693 point. Drop out of the switch to
9694 process it */
9695 }
9696 else {
9697 SV * multi_char_N = newSVpvn(backslash_N_beg,
9698 RExC_parse - backslash_N_beg);
9699 multi_char_matches
9700 = add_multi_match(multi_char_matches,
9701 multi_char_N,
9702 cp_count);
9703 }
9704 }
9705 } /* End of cp_count != 1 */
9706
9707 /* This element should not be processed further in this
9708 * class */
9709 element_count--;
9710 value = save_value;
9711 prevvalue = save_prevvalue;
9712 continue; /* Back to top of loop to get next char */
9713 }
9714
9715 /* Here, is a single code point, and <value> contains it */
9716 unicode_range = TRUE; /* \N{} are Unicode */
9717 }
9718 break;
9719 case 'p':
9720 case 'P':
9721 {
9722 char *e;
9723
9724 if (RExC_pm_flags & PMf_WILDCARD) {
9725 RExC_parse_inc_by(1);
9726 /* diag_listed_as: Use of %s is not allowed in Unicode
9727 property wildcard subpatterns in regex; marked by <--
9728 HERE in m/%s/ */
9729 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9730 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9731 }
9732
9733 /* \p means they want Unicode semantics */
9734 REQUIRE_UNI_RULES(flagp, 0);
9735
9736 if (RExC_parse >= RExC_end)
9737 vFAIL2("Empty \\%c", (U8)value);
9738 if (*RExC_parse == '{') {
9739 const U8 c = (U8)value;
9740 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9741 if (!e) {
9742 RExC_parse_inc_by(1);
9743 vFAIL2("Missing right brace on \\%c{}", c);
9744 }
9745
9746 RExC_parse_inc_by(1);
9747
9748 /* White space is allowed adjacent to the braces and after
9749 * any '^', even when not under /x */
9750 while (isSPACE(*RExC_parse)) {
9751 RExC_parse_inc_by(1);
9752 }
9753
9754 if (UCHARAT(RExC_parse) == '^') {
9755
9756 /* toggle. (The rhs xor gets the single bit that
9757 * differs between P and p; the other xor inverts just
9758 * that bit) */
9759 value ^= 'P' ^ 'p';
9760
9761 RExC_parse_inc_by(1);
9762 while (isSPACE(*RExC_parse)) {
9763 RExC_parse_inc_by(1);
9764 }
9765 }
9766
9767 if (e == RExC_parse)
9768 vFAIL2("Empty \\%c{}", c);
9769
9770 n = e - RExC_parse;
9771 while (isSPACE(*(RExC_parse + n - 1)))
9772 n--;
9773
9774 } /* The \p isn't immediately followed by a '{' */
9775 else if (! isALPHA(*RExC_parse)) {
9776 RExC_parse_inc_safe();
9777 vFAIL2("Character following \\%c must be '{' or a "
9778 "single-character Unicode property name",
9779 (U8) value);
9780 }
9781 else {
9782 e = RExC_parse;
9783 n = 1;
9784 }
9785 {
9786 char* name = RExC_parse;
9787
9788 /* Any message returned about expanding the definition */
9789 SV* msg = newSVpvs_flags("", SVs_TEMP);
9790
9791 /* If set TRUE, the property is user-defined as opposed to
9792 * official Unicode */
9793 bool user_defined = FALSE;
9794 AV * strings = NULL;
9795
9796 SV * prop_definition = parse_uniprop_string(
9797 name, n, UTF, FOLD,
9798 FALSE, /* This is compile-time */
9799
9800 /* We can't defer this defn when
9801 * the full result is required in
9802 * this call */
9803 ! cBOOL(ret_invlist),
9804
9805 &strings,
9806 &user_defined,
9807 msg,
9808 0 /* Base level */
9809 );
9810 if (SvCUR(msg)) { /* Assumes any error causes a msg */
9811 assert(prop_definition == NULL);
9812 RExC_parse_set(e + 1);
9813 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
9814 thing so, or else the display is
9815 mojibake */
9816 RExC_utf8 = TRUE;
9817 }
9818 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9819 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9820 SvCUR(msg), SvPVX(msg)));
9821 }
9822
9823 assert(prop_definition || strings);
9824
9825 if (strings) {
9826 if (ret_invlist) {
9827 if (! prop_definition) {
9828 RExC_parse_set(e + 1);
9829 vFAIL("Unicode string properties are not implemented in (?[...])");
9830 }
9831 else {
9832 ckWARNreg(e + 1,
9833 "Using just the single character results"
9834 " returned by \\p{} in (?[...])");
9835 }
9836 }
9837 else if (! RExC_in_multi_char_class) {
9838 if (invert ^ (value == 'P')) {
9839 RExC_parse_set(e + 1);
9840 vFAIL("Inverting a character class which contains"
9841 " a multi-character sequence is illegal");
9842 }
9843
9844 /* For each multi-character string ... */
9845 while (av_count(strings) > 0) {
9846 /* ... Each entry is itself an array of code
9847 * points. */
9848 AV * this_string = (AV *) av_shift( strings);
9849 STRLEN cp_count = av_count(this_string);
9850 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9851 SvPVCLEAR_FRESH(final);
9852
9853 /* Create another string of sequences of \x{...} */
9854 while (av_count(this_string) > 0) {
9855 SV * character = av_shift(this_string);
9856 UV cp = SvUV(character);
9857
9858 if (cp > 255) {
9859 REQUIRE_UTF8(flagp);
9860 }
9861 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9862 cp);
9863 SvREFCNT_dec_NN(character);
9864 }
9865 SvREFCNT_dec_NN(this_string);
9866
9867 /* And add that to the list of such things */
9868 multi_char_matches
9869 = add_multi_match(multi_char_matches,
9870 final,
9871 cp_count);
9872 }
9873 }
9874 SvREFCNT_dec_NN(strings);
9875 }
9876
9877 if (! prop_definition) { /* If we got only a string,
9878 this iteration didn't really
9879 find a character */
9880 element_count--;
9881 }
9882 else if (! is_invlist(prop_definition)) {
9883
9884 /* Here, the definition isn't known, so we have gotten
9885 * returned a string that will be evaluated if and when
9886 * encountered at runtime. We add it to the list of
9887 * such properties, along with whether it should be
9888 * complemented or not */
9889 if (value == 'P') {
9890 sv_catpvs(listsv, "!");
9891 }
9892 else {
9893 sv_catpvs(listsv, "+");
9894 }
9895 sv_catsv(listsv, prop_definition);
9896
9897 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9898
9899 /* We don't know yet what this matches, so have to flag
9900 * it */
9901 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9902 }
9903 else {
9904 assert (prop_definition && is_invlist(prop_definition));
9905
9906 /* Here we do have the complete property definition
9907 *
9908 * Temporary workaround for [GH #16520]. For this
9909 * precise input that is in the .t that is failing,
9910 * load utf8.pm, which is what the test wants, so that
9911 * that .t passes */
9912 if ( memEQs(RExC_start, e + 1 - RExC_start,
9913 "foo\\p{Alnum}")
9914 && ! hv_common(GvHVn(PL_incgv),
9915 NULL,
9916 "utf8.pm", sizeof("utf8.pm") - 1,
9917 0, HV_FETCH_ISEXISTS, NULL, 0))
9918 {
9919 require_pv("utf8.pm");
9920 }
9921
9922 if (! user_defined &&
9923 /* We warn on matching an above-Unicode code point
9924 * if the match would return true, except don't
9925 * warn for \p{All}, which has exactly one element
9926 * = 0 */
9927 (_invlist_contains_cp(prop_definition, 0x110000)
9928 && (! (_invlist_len(prop_definition) == 1
9929 && *invlist_array(prop_definition) == 0))))
9930 {
9931 warn_super = TRUE;
9932 }
9933
9934 /* Invert if asking for the complement */
9935 if (value == 'P') {
9936 _invlist_union_complement_2nd(properties,
9937 prop_definition,
9938 &properties);
9939 }
9940 else {
9941 _invlist_union(properties, prop_definition, &properties);
9942 }
9943 }
9944 }
9945
9946 RExC_parse_set(e + 1);
9947 namedclass = ANYOF_UNIPROP; /* no official name, but it's
9948 named */
9949 }
9950 break;
9951 case 'n': value = '\n'; break;
9952 case 'r': value = '\r'; break;
9953 case 't': value = '\t'; break;
9954 case 'f': value = '\f'; break;
9955 case 'b': value = '\b'; break;
9956 case 'e': value = ESC_NATIVE; break;
9957 case 'a': value = '\a'; break;
9958 case 'o':
9959 RExC_parse--; /* function expects to be pointed at the 'o' */
9960 if (! grok_bslash_o(&RExC_parse,
9961 RExC_end,
9962 &value,
9963 &message,
9964 &packed_warn,
9965 strict,
9966 cBOOL(range), /* MAX_UV allowed for range
9967 upper limit */
9968 UTF))
9969 {
9970 vFAIL(message);
9971 }
9972 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9973 warn_non_literal_string(RExC_parse, packed_warn, message);
9974 }
9975
9976 if (value < 256) {
9977 non_portable_endpoint++;
9978 }
9979 break;
9980 case 'x':
9981 RExC_parse--; /* function expects to be pointed at the 'x' */
9982 if (! grok_bslash_x(&RExC_parse,
9983 RExC_end,
9984 &value,
9985 &message,
9986 &packed_warn,
9987 strict,
9988 cBOOL(range), /* MAX_UV allowed for range
9989 upper limit */
9990 UTF))
9991 {
9992 vFAIL(message);
9993 }
9994 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9995 warn_non_literal_string(RExC_parse, packed_warn, message);
9996 }
9997
9998 if (value < 256) {
9999 non_portable_endpoint++;
10000 }
10001 break;
10002 case 'c':
10003 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
10004 &packed_warn))
10005 {
10006 /* going to die anyway; point to exact spot of
10007 * failure */
10008 RExC_parse_inc_safe();
10009 vFAIL(message);
10010 }
10011
10012 value = grok_c_char;
10013 RExC_parse_inc_by(1);
10014 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10015 warn_non_literal_string(RExC_parse, packed_warn, message);
10016 }
10017
10018 non_portable_endpoint++;
10019 break;
10020 case '0': case '1': case '2': case '3': case '4':
10021 case '5': case '6': case '7':
10022 {
10023 /* Take 1-3 octal digits */
10024 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10025 | PERL_SCAN_NOTIFY_ILLDIGIT;
10026 numlen = (strict) ? 4 : 3;
10027 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10028 RExC_parse_inc_by(numlen);
10029 if (numlen != 3) {
10030 if (strict) {
10031 RExC_parse_inc_safe();
10032 vFAIL("Need exactly 3 octal digits");
10033 }
10034 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10035 && RExC_parse < RExC_end
10036 && isDIGIT(*RExC_parse)
10037 && ckWARN(WARN_REGEXP))
10038 {
10039 reg_warn_non_literal_string(
10040 RExC_parse + 1,
10041 form_alien_digit_msg(8, numlen, RExC_parse,
10042 RExC_end, UTF, FALSE));
10043 }
10044 }
10045 if (value < 256) {
10046 non_portable_endpoint++;
10047 }
10048 break;
10049 }
10050 default:
10051 /* Allow \_ to not give an error */
10052 if (isWORDCHAR(value) && value != '_') {
10053 if (strict) {
10054 vFAIL2("Unrecognized escape \\%c in character class",
10055 (int)value);
10056 }
10057 else {
10058 ckWARN2reg(RExC_parse,
10059 "Unrecognized escape \\%c in character class passed through",
10060 (int)value);
10061 }
10062 }
10063 break;
10064 } /* End of switch on char following backslash */
10065 } /* end of handling backslash escape sequences */
10066
10067 /* Here, we have the current token in 'value' */
10068
10069 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10070 U8 classnum;
10071
10072 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10073 * literal, as is the character that began the false range, i.e.
10074 * the 'a' in the examples */
10075 if (range) {
10076 const int w = (RExC_parse >= rangebegin)
10077 ? RExC_parse - rangebegin
10078 : 0;
10079 if (strict) {
10080 vFAIL2utf8f(
10081 "False [] range \"%" UTF8f "\"",
10082 UTF8fARG(UTF, w, rangebegin));
10083 }
10084 else {
10085 ckWARN2reg(RExC_parse,
10086 "False [] range \"%" UTF8f "\"",
10087 UTF8fARG(UTF, w, rangebegin));
10088 cp_list = add_cp_to_invlist(cp_list, '-');
10089 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10090 prevvalue);
10091 }
10092
10093 range = 0; /* this was not a true range */
10094 element_count += 2; /* So counts for three values */
10095 }
10096
10097 classnum = namedclass_to_classnum(namedclass);
10098
10099 if (LOC && namedclass < ANYOF_POSIXL_MAX
10100#ifndef HAS_ISASCII
10101 && classnum != CC_ASCII_
10102#endif
10103 ) {
10104 SV* scratch_list = NULL;
10105
10106 /* What the Posix classes (like \w, [:space:]) match isn't
10107 * generally knowable under locale until actual match time. A
10108 * special node is used for these which has extra space for a
10109 * bitmap, with a bit reserved for each named class that is to
10110 * be matched against. (This isn't needed for \p{} and
10111 * pseudo-classes, as they are not affected by locale, and
10112 * hence are dealt with separately.) However, if a named class
10113 * and its complement are both present, then it matches
10114 * everything, and there is no runtime dependency. Odd numbers
10115 * are the complements of the next lower number, so xor works.
10116 * (Note that something like [\w\D] should match everything,
10117 * because \d should be a proper subset of \w. But rather than
10118 * trust that the locale is well behaved, we leave this to
10119 * runtime to sort out) */
10120 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10121 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10122 POSIXL_ZERO(posixl);
10123 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10124 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10125 continue; /* We could ignore the rest of the class, but
10126 best to parse it for any errors */
10127 }
10128 else { /* Here, isn't the complement of any already parsed
10129 class */
10130 POSIXL_SET(posixl, namedclass);
10131 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10132 anyof_flags |= ANYOF_MATCHES_POSIXL;
10133
10134 /* The above-Latin1 characters are not subject to locale
10135 * rules. Just add them to the unconditionally-matched
10136 * list */
10137
10138 /* Get the list of the above-Latin1 code points this
10139 * matches */
10140 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10141 PL_XPosix_ptrs[classnum],
10142
10143 /* Odd numbers are complements,
10144 * like NDIGIT, NASCII, ... */
10145 namedclass % 2 != 0,
10146 &scratch_list);
10147 /* Checking if 'cp_list' is NULL first saves an extra
10148 * clone. Its reference count will be decremented at the
10149 * next union, etc, or if this is the only instance, at the
10150 * end of the routine */
10151 if (! cp_list) {
10152 cp_list = scratch_list;
10153 }
10154 else {
10155 _invlist_union(cp_list, scratch_list, &cp_list);
10156 SvREFCNT_dec_NN(scratch_list);
10157 }
10158 continue; /* Go get next character */
10159 }
10160 }
10161 else {
10162
10163 /* Here, is not /l, or is a POSIX class for which /l doesn't
10164 * matter (or is a Unicode property, which is skipped here). */
10165 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
10166 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10167
10168 /* Here, should be \h, \H, \v, or \V. None of /d, /i
10169 * nor /l make a difference in what these match,
10170 * therefore we just add what they match to cp_list. */
10171 if (classnum != CC_VERTSPACE_) {
10172 assert( namedclass == ANYOF_HORIZWS
10173 || namedclass == ANYOF_NHORIZWS);
10174
10175 /* It turns out that \h is just a synonym for
10176 * XPosixBlank */
10177 classnum = CC_BLANK_;
10178 }
10179
10180 _invlist_union_maybe_complement_2nd(
10181 cp_list,
10182 PL_XPosix_ptrs[classnum],
10183 namedclass % 2 != 0, /* Complement if odd
10184 (NHORIZWS, NVERTWS)
10185 */
10186 &cp_list);
10187 }
10188 }
10189 else if ( AT_LEAST_UNI_SEMANTICS
10190 || classnum == CC_ASCII_
10191 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
10192 || classnum == CC_XDIGIT_)))
10193 {
10194 /* We usually have to worry about /d affecting what POSIX
10195 * classes match, with special code needed because we won't
10196 * know until runtime what all matches. But there is no
10197 * extra work needed under /u and /a; and [:ascii:] is
10198 * unaffected by /d; and :digit: and :xdigit: don't have
10199 * runtime differences under /d. So we can special case
10200 * these, and avoid some extra work below, and at runtime.
10201 * */
10202 _invlist_union_maybe_complement_2nd(
10203 simple_posixes,
10204 ((AT_LEAST_ASCII_RESTRICTED)
10205 ? PL_Posix_ptrs[classnum]
10206 : PL_XPosix_ptrs[classnum]),
10207 namedclass % 2 != 0,
10208 &simple_posixes);
10209 }
10210 else { /* Garden variety class. If is NUPPER, NALPHA, ...
10211 complement and use nposixes */
10212 SV** posixes_ptr = namedclass % 2 == 0
10213 ? &posixes
10214 : &nposixes;
10215 _invlist_union_maybe_complement_2nd(
10216 *posixes_ptr,
10217 PL_XPosix_ptrs[classnum],
10218 namedclass % 2 != 0,
10219 posixes_ptr);
10220 }
10221 }
10222 } /* end of namedclass \blah */
10223
10224 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10225
10226 /* If 'range' is set, 'value' is the ending of a range--check its
10227 * validity. (If value isn't a single code point in the case of a
10228 * range, we should have figured that out above in the code that
10229 * catches false ranges). Later, we will handle each individual code
10230 * point in the range. If 'range' isn't set, this could be the
10231 * beginning of a range, so check for that by looking ahead to see if
10232 * the next real character to be processed is the range indicator--the
10233 * minus sign */
10234
10235 if (range) {
10236#ifdef EBCDIC
10237 /* For unicode ranges, we have to test that the Unicode as opposed
10238 * to the native values are not decreasing. (Above 255, there is
10239 * no difference between native and Unicode) */
10240 if (unicode_range && prevvalue < 255 && value < 255) {
10241 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10242 goto backwards_range;
10243 }
10244 }
10245 else
10246#endif
10247 if (prevvalue > value) /* b-a */ {
10248 int w;
10249#ifdef EBCDIC
10250 backwards_range:
10251#endif
10252 w = RExC_parse - rangebegin;
10253 vFAIL2utf8f(
10254 "Invalid [] range \"%" UTF8f "\"",
10255 UTF8fARG(UTF, w, rangebegin));
10256 NOT_REACHED; /* NOTREACHED */
10257 }
10258 }
10259 else {
10260 prevvalue = value; /* save the beginning of the potential range */
10261 if (! stop_at_1 /* Can't be a range if parsing just one thing */
10262 && *RExC_parse == '-')
10263 {
10264 char* next_char_ptr = RExC_parse + 1;
10265
10266 /* Get the next real char after the '-' */
10267 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10268
10269 /* If the '-' is at the end of the class (just before the ']',
10270 * it is a literal minus; otherwise it is a range */
10271 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10272 RExC_parse_set(next_char_ptr);
10273
10274 /* a bad range like \w-, [:word:]- ? */
10275 if (namedclass > OOB_NAMEDCLASS) {
10276 if (strict || ckWARN(WARN_REGEXP)) {
10277 const int w = RExC_parse >= rangebegin
10278 ? RExC_parse - rangebegin
10279 : 0;
10280 if (strict) {
10281 vFAIL4("False [] range \"%*.*s\"",
10282 w, w, rangebegin);
10283 }
10284 else {
10285 vWARN4(RExC_parse,
10286 "False [] range \"%*.*s\"",
10287 w, w, rangebegin);
10288 }
10289 }
10290 cp_list = add_cp_to_invlist(cp_list, '-');
10291 element_count++;
10292 } else
10293 range = 1; /* yeah, it's a range! */
10294 continue; /* but do it the next time */
10295 }
10296 }
10297 }
10298
10299 if (namedclass > OOB_NAMEDCLASS) {
10300 continue;
10301 }
10302
10303 /* Here, we have a single value this time through the loop, and
10304 * <prevvalue> is the beginning of the range, if any; or <value> if
10305 * not. */
10306
10307 /* non-Latin1 code point implies unicode semantics. */
10308 if (value > 255) {
10309 if (value > MAX_LEGAL_CP && ( value != UV_MAX
10310 || prevvalue > MAX_LEGAL_CP))
10311 {
10312 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10313 }
10314 REQUIRE_UNI_RULES(flagp, 0);
10315 if ( ! silence_non_portable
10316 && UNICODE_IS_PERL_EXTENDED(value)
10317 && TO_OUTPUT_WARNINGS(RExC_parse))
10318 {
10319 ckWARN2_non_literal_string(RExC_parse,
10320 packWARN(WARN_PORTABLE),
10321 PL_extended_cp_format,
10322 value);
10323 }
10324 }
10325
10326 /* Ready to process either the single value, or the completed range.
10327 * For single-valued non-inverted ranges, we consider the possibility
10328 * of multi-char folds. (We made a conscious decision to not do this
10329 * for the other cases because it can often lead to non-intuitive
10330 * results. For example, you have the peculiar case that:
10331 * "s s" =~ /^[^\xDF]+$/i => Y
10332 * "ss" =~ /^[^\xDF]+$/i => N
10333 *
10334 * See [perl #89750] */
10335 if (FOLD && allow_mutiple_chars && value == prevvalue) {
10336 if ( value == LATIN_SMALL_LETTER_SHARP_S
10337 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10338 value)))
10339 {
10340 /* Here <value> is indeed a multi-char fold. Get what it is */
10341
10342 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10343 STRLEN foldlen;
10344
10345 UV folded = _to_uni_fold_flags(
10346 value,
10347 foldbuf,
10348 &foldlen,
10349 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10350 ? FOLD_FLAGS_NOMIX_ASCII
10351 : 0)
10352 );
10353
10354 /* Here, <folded> should be the first character of the
10355 * multi-char fold of <value>, with <foldbuf> containing the
10356 * whole thing. But, if this fold is not allowed (because of
10357 * the flags), <fold> will be the same as <value>, and should
10358 * be processed like any other character, so skip the special
10359 * handling */
10360 if (folded != value) {
10361
10362 /* Skip if we are recursed, currently parsing the class
10363 * again. Otherwise add this character to the list of
10364 * multi-char folds. */
10365 if (! RExC_in_multi_char_class) {
10366 STRLEN cp_count = utf8_length(foldbuf,
10367 foldbuf + foldlen);
10368 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10369
10370 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10371
10372 multi_char_matches
10373 = add_multi_match(multi_char_matches,
10374 multi_fold,
10375 cp_count);
10376
10377 }
10378
10379 /* This element should not be processed further in this
10380 * class */
10381 element_count--;
10382 value = save_value;
10383 prevvalue = save_prevvalue;
10384 continue;
10385 }
10386 }
10387 }
10388
10389 if (strict && ckWARN(WARN_REGEXP)) {
10390 if (range) {
10391
10392 /* If the range starts above 255, everything is portable and
10393 * likely to be so for any forseeable character set, so don't
10394 * warn. */
10395 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10396 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10397 }
10398 else if (prevvalue != value) {
10399
10400 /* Under strict, ranges that stop and/or end in an ASCII
10401 * printable should have each end point be a portable value
10402 * for it (preferably like 'A', but we don't warn if it is
10403 * a (portable) Unicode name or code point), and the range
10404 * must be all digits or all letters of the same case.
10405 * Otherwise, the range is non-portable and unclear as to
10406 * what it contains */
10407 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
10408 && ( non_portable_endpoint
10409 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10410 || (isLOWER_A(prevvalue) && isLOWER_A(value))
10411 || (isUPPER_A(prevvalue) && isUPPER_A(value))
10412 ))) {
10413 vWARN(RExC_parse, "Ranges of ASCII printables should"
10414 " be some subset of \"0-9\","
10415 " \"A-Z\", or \"a-z\"");
10416 }
10417 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10418 SSize_t index_start;
10419 SSize_t index_final;
10420
10421 /* But the nature of Unicode and languages mean we
10422 * can't do the same checks for above-ASCII ranges,
10423 * except in the case of digit ones. These should
10424 * contain only digits from the same group of 10. The
10425 * ASCII case is handled just above. Hence here, the
10426 * range could be a range of digits. First some
10427 * unlikely special cases. Grandfather in that a range
10428 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10429 * if its starting value is one of the 10 digits prior
10430 * to it. This is because it is an alternate way of
10431 * writing 19D1, and some people may expect it to be in
10432 * that group. But it is bad, because it won't give
10433 * the expected results. In Unicode 5.2 it was
10434 * considered to be in that group (of 11, hence), but
10435 * this was fixed in the next version */
10436
10437 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10438 goto warn_bad_digit_range;
10439 }
10440 else if (UNLIKELY( prevvalue >= 0x1D7CE
10441 && value <= 0x1D7FF))
10442 {
10443 /* This is the only other case currently in Unicode
10444 * where the algorithm below fails. The code
10445 * points just above are the end points of a single
10446 * range containing only decimal digits. It is 5
10447 * different series of 0-9. All other ranges of
10448 * digits currently in Unicode are just a single
10449 * series. (And mktables will notify us if a later
10450 * Unicode version breaks this.)
10451 *
10452 * If the range being checked is at most 9 long,
10453 * and the digit values represented are in
10454 * numerical order, they are from the same series.
10455 * */
10456 if ( value - prevvalue > 9
10457 || ((( value - 0x1D7CE) % 10)
10458 <= (prevvalue - 0x1D7CE) % 10))
10459 {
10460 goto warn_bad_digit_range;
10461 }
10462 }
10463 else {
10464
10465 /* For all other ranges of digits in Unicode, the
10466 * algorithm is just to check if both end points
10467 * are in the same series, which is the same range.
10468 * */
10469 index_start = _invlist_search(
10470 PL_XPosix_ptrs[CC_DIGIT_],
10471 prevvalue);
10472
10473 /* Warn if the range starts and ends with a digit,
10474 * and they are not in the same group of 10. */
10475 if ( index_start >= 0
10476 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10477 && (index_final =
10478 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10479 value)) != index_start
10480 && index_final >= 0
10481 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10482 {
10483 warn_bad_digit_range:
10484 vWARN(RExC_parse, "Ranges of digits should be"
10485 " from the same group of"
10486 " 10");
10487 }
10488 }
10489 }
10490 }
10491 }
10492 if ((! range || prevvalue == value) && non_portable_endpoint) {
10493 if (isPRINT_A(value)) {
10494 char literal[3];
10495 unsigned d = 0;
10496 if (isBACKSLASHED_PUNCT(value)) {
10497 literal[d++] = '\\';
10498 }
10499 literal[d++] = (char) value;
10500 literal[d++] = '\0';
10501
10502 vWARN4(RExC_parse,
10503 "\"%.*s\" is more clearly written simply as \"%s\"",
10504 (int) (RExC_parse - rangebegin),
10505 rangebegin,
10506 literal
10507 );
10508 }
10509 else if (isMNEMONIC_CNTRL(value)) {
10510 vWARN4(RExC_parse,
10511 "\"%.*s\" is more clearly written simply as \"%s\"",
10512 (int) (RExC_parse - rangebegin),
10513 rangebegin,
10514 cntrl_to_mnemonic((U8) value)
10515 );
10516 }
10517 }
10518 }
10519
10520 /* Deal with this element of the class */
10521
10522#ifndef EBCDIC
10523 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10524 prevvalue, value);
10525#else
10526 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10527 * that don't require special handling, we can just add the range like
10528 * we do for ASCII platforms */
10529 if ((UNLIKELY(prevvalue == 0) && value >= 255)
10530 || ! (prevvalue < 256
10531 && (unicode_range
10532 || (! non_portable_endpoint
10533 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10534 || (isUPPER_A(prevvalue)
10535 && isUPPER_A(value)))))))
10536 {
10537 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10538 prevvalue, value);
10539 }
10540 else {
10541 /* Here, requires special handling. This can be because it is a
10542 * range whose code points are considered to be Unicode, and so
10543 * must be individually translated into native, or because its a
10544 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10545 * EBCDIC, but we have defined them to include only the "expected"
10546 * upper or lower case ASCII alphabetics. Subranges above 255 are
10547 * the same in native and Unicode, so can be added as a range */
10548 U8 start = NATIVE_TO_LATIN1(prevvalue);
10549 unsigned j;
10550 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10551 for (j = start; j <= end; j++) {
10552 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10553 }
10554 if (value > 255) {
10555 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10556 256, value);
10557 }
10558 }
10559#endif
10560
10561 range = 0; /* this range (if it was one) is done now */
10562 } /* End of loop through all the text within the brackets */
10563
10564 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10565 output_posix_warnings(pRExC_state, posix_warnings);
10566 }
10567
10568 /* If anything in the class expands to more than one character, we have to
10569 * deal with them by building up a substitute parse string, and recursively
10570 * calling reg() on it, instead of proceeding */
10571 if (multi_char_matches) {
10572 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10573 I32 cp_count;
10574 STRLEN len;
10575 char *save_end = RExC_end;
10576 char *save_parse = RExC_parse;
10577 char *save_start = RExC_start;
10578 Size_t constructed_prefix_len = 0; /* This gives the length of the
10579 constructed portion of the
10580 substitute parse. */
10581 bool first_time = TRUE; /* First multi-char occurrence doesn't get
10582 a "|" */
10583 I32 reg_flags;
10584
10585 assert(! invert);
10586 /* Only one level of recursion allowed */
10587 assert(RExC_copy_start_in_constructed == RExC_precomp);
10588
10589#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
10590 because too confusing */
10591 if (invert) {
10592 sv_catpvs(substitute_parse, "(?:");
10593 }
10594#endif
10595
10596 /* Look at the longest strings first */
10597 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10598 cp_count > 0;
10599 cp_count--)
10600 {
10601
10602 if (av_exists(multi_char_matches, cp_count)) {
10603 AV** this_array_ptr;
10604 SV* this_sequence;
10605
10606 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10607 cp_count, FALSE);
10608 while ((this_sequence = av_pop(*this_array_ptr)) !=
10609 &PL_sv_undef)
10610 {
10611 if (! first_time) {
10612 sv_catpvs(substitute_parse, "|");
10613 }
10614 first_time = FALSE;
10615
10616 sv_catpv(substitute_parse, SvPVX(this_sequence));
10617 }
10618 }
10619 }
10620
10621 /* If the character class contains anything else besides these
10622 * multi-character strings, have to include it in recursive parsing */
10623 if (element_count) {
10624 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10625
10626 sv_catpvs(substitute_parse, "|");
10627 if (has_l_bracket) { /* Add an [ if the original had one */
10628 sv_catpvs(substitute_parse, "[");
10629 }
10630 constructed_prefix_len = SvCUR(substitute_parse);
10631 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10632
10633 /* Put in a closing ']' to match any opening one, but not if going
10634 * off the end, as otherwise we are adding something that really
10635 * isn't there */
10636 if (has_l_bracket && RExC_parse < RExC_end) {
10637 sv_catpvs(substitute_parse, "]");
10638 }
10639 }
10640
10641 sv_catpvs(substitute_parse, ")");
10642#if 0
10643 if (invert) {
10644 /* This is a way to get the parse to skip forward a whole named
10645 * sequence instead of matching the 2nd character when it fails the
10646 * first */
10647 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10648 }
10649#endif
10650
10651 /* Set up the data structure so that any errors will be properly
10652 * reported. See the comments at the definition of
10653 * REPORT_LOCATION_ARGS for details */
10654 RExC_copy_start_in_input = (char *) orig_parse;
10655 RExC_start = SvPV(substitute_parse, len);
10656 RExC_parse_set( RExC_start );
10657 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10658 RExC_end = RExC_parse + len;
10659 RExC_in_multi_char_class = 1;
10660
10661 ret = reg(pRExC_state, 1, &reg_flags, depth+1);
10662
10663 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10664
10665 /* And restore so can parse the rest of the pattern */
10666 RExC_parse_set(save_parse);
10667 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10668 RExC_end = save_end;
10669 RExC_in_multi_char_class = 0;
10670 SvREFCNT_dec_NN(multi_char_matches);
10671 SvREFCNT_dec(properties);
10672 SvREFCNT_dec(cp_list);
10673 SvREFCNT_dec(simple_posixes);
10674 SvREFCNT_dec(posixes);
10675 SvREFCNT_dec(nposixes);
10676 SvREFCNT_dec(cp_foldable_list);
10677 return ret;
10678 }
10679
10680 /* If folding, we calculate all characters that could fold to or from the
10681 * ones already on the list */
10682 if (cp_foldable_list) {
10683 if (FOLD) {
10684 UV start, end; /* End points of code point ranges */
10685
10686 SV* fold_intersection = NULL;
10687 SV** use_list;
10688
10689 /* Our calculated list will be for Unicode rules. For locale
10690 * matching, we have to keep a separate list that is consulted at
10691 * runtime only when the locale indicates Unicode rules (and we
10692 * don't include potential matches in the ASCII/Latin1 range, as
10693 * any code point could fold to any other, based on the run-time
10694 * locale). For non-locale, we just use the general list */
10695 if (LOC) {
10696 use_list = &only_utf8_locale_list;
10697 }
10698 else {
10699 use_list = &cp_list;
10700 }
10701
10702 /* Only the characters in this class that participate in folds need
10703 * be checked. Get the intersection of this class and all the
10704 * possible characters that are foldable. This can quickly narrow
10705 * down a large class */
10706 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10707 &fold_intersection);
10708
10709 /* Now look at the foldable characters in this class individually */
10710 invlist_iterinit(fold_intersection);
10711 while (invlist_iternext(fold_intersection, &start, &end)) {
10712 UV j;
10713 UV folded;
10714
10715 /* Look at every character in the range */
10716 for (j = start; j <= end; j++) {
10717 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10718 STRLEN foldlen;
10719 unsigned int k;
10720 Size_t folds_count;
10721 U32 first_fold;
10722 const U32 * remaining_folds;
10723
10724 if (j < 256) {
10725
10726 /* Under /l, we don't know what code points below 256
10727 * fold to, except we do know the MICRO SIGN folds to
10728 * an above-255 character if the locale is UTF-8, so we
10729 * add it to the special list (in *use_list) Otherwise
10730 * we know now what things can match, though some folds
10731 * are valid under /d only if the target is UTF-8.
10732 * Those go in a separate list */
10733 if ( IS_IN_SOME_FOLD_L1(j)
10734 && ! (LOC && j != MICRO_SIGN))
10735 {
10736
10737 /* ASCII is always matched; non-ASCII is matched
10738 * only under Unicode rules (which could happen
10739 * under /l if the locale is a UTF-8 one */
10740 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10741 *use_list = add_cp_to_invlist(*use_list,
10742 PL_fold_latin1[j]);
10743 }
10744 else if (j != PL_fold_latin1[j]) {
10745 upper_latin1_only_utf8_matches
10746 = add_cp_to_invlist(
10747 upper_latin1_only_utf8_matches,
10748 PL_fold_latin1[j]);
10749 }
10750 }
10751
10752 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10753 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10754 {
10755 add_above_Latin1_folds(pRExC_state,
10756 (U8) j,
10757 use_list);
10758 }
10759 continue;
10760 }
10761
10762 /* Here is an above Latin1 character. We don't have the
10763 * rules hard-coded for it. First, get its fold. This is
10764 * the simple fold, as the multi-character folds have been
10765 * handled earlier and separated out */
10766 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10767 (ASCII_FOLD_RESTRICTED)
10768 ? FOLD_FLAGS_NOMIX_ASCII
10769 : 0);
10770
10771 /* Single character fold of above Latin1. Add everything
10772 * in its fold closure to the list that this node should
10773 * match. */
10774 folds_count = _inverse_folds(folded, &first_fold,
10775 &remaining_folds);
10776 for (k = 0; k <= folds_count; k++) {
10777 UV c = (k == 0) /* First time through use itself */
10778 ? folded
10779 : (k == 1) /* 2nd time use, the first fold */
10780 ? first_fold
10781
10782 /* Then the remaining ones */
10783 : remaining_folds[k-2];
10784
10785 /* /aa doesn't allow folds between ASCII and non- */
10786 if (( ASCII_FOLD_RESTRICTED
10787 && (isASCII(c) != isASCII(j))))
10788 {
10789 continue;
10790 }
10791
10792 /* Folds under /l which cross the 255/256 boundary are
10793 * added to a separate list. (These are valid only
10794 * when the locale is UTF-8.) */
10795 if (c < 256 && LOC) {
10796 *use_list = add_cp_to_invlist(*use_list, c);
10797 continue;
10798 }
10799
10800 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10801 {
10802 cp_list = add_cp_to_invlist(cp_list, c);
10803 }
10804 else {
10805 /* Similarly folds involving non-ascii Latin1
10806 * characters under /d are added to their list */
10807 upper_latin1_only_utf8_matches
10808 = add_cp_to_invlist(
10809 upper_latin1_only_utf8_matches,
10810 c);
10811 }
10812 }
10813 }
10814 }
10815 SvREFCNT_dec_NN(fold_intersection);
10816 }
10817
10818 /* Now that we have finished adding all the folds, there is no reason
10819 * to keep the foldable list separate */
10820 _invlist_union(cp_list, cp_foldable_list, &cp_list);
10821 SvREFCNT_dec_NN(cp_foldable_list);
10822 }
10823
10824 /* And combine the result (if any) with any inversion lists from posix
10825 * classes. The lists are kept separate up to now because we don't want to
10826 * fold the classes */
10827 if (simple_posixes) { /* These are the classes known to be unaffected by
10828 /a, /aa, and /d */
10829 if (cp_list) {
10830 _invlist_union(cp_list, simple_posixes, &cp_list);
10831 SvREFCNT_dec_NN(simple_posixes);
10832 }
10833 else {
10834 cp_list = simple_posixes;
10835 }
10836 }
10837 if (posixes || nposixes) {
10838 if (! DEPENDS_SEMANTICS) {
10839
10840 /* For everything but /d, we can just add the current 'posixes' and
10841 * 'nposixes' to the main list */
10842 if (posixes) {
10843 if (cp_list) {
10844 _invlist_union(cp_list, posixes, &cp_list);
10845 SvREFCNT_dec_NN(posixes);
10846 }
10847 else {
10848 cp_list = posixes;
10849 }
10850 }
10851 if (nposixes) {
10852 if (cp_list) {
10853 _invlist_union(cp_list, nposixes, &cp_list);
10854 SvREFCNT_dec_NN(nposixes);
10855 }
10856 else {
10857 cp_list = nposixes;
10858 }
10859 }
10860 }
10861 else {
10862 /* Under /d, things like \w match upper Latin1 characters only if
10863 * the target string is in UTF-8. But things like \W match all the
10864 * upper Latin1 characters if the target string is not in UTF-8.
10865 *
10866 * Handle the case with something like \W separately */
10867 if (nposixes) {
10868 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10869
10870 /* A complemented posix class matches all upper Latin1
10871 * characters if not in UTF-8. And it matches just certain
10872 * ones when in UTF-8. That means those certain ones are
10873 * matched regardless, so can just be added to the
10874 * unconditional list */
10875 if (cp_list) {
10876 _invlist_union(cp_list, nposixes, &cp_list);
10877 SvREFCNT_dec_NN(nposixes);
10878 nposixes = NULL;
10879 }
10880 else {
10881 cp_list = nposixes;
10882 }
10883
10884 /* Likewise for 'posixes' */
10885 _invlist_union(posixes, cp_list, &cp_list);
10886 SvREFCNT_dec(posixes);
10887
10888 /* Likewise for anything else in the range that matched only
10889 * under UTF-8 */
10890 if (upper_latin1_only_utf8_matches) {
10891 _invlist_union(cp_list,
10892 upper_latin1_only_utf8_matches,
10893 &cp_list);
10894 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10895 upper_latin1_only_utf8_matches = NULL;
10896 }
10897
10898 /* If we don't match all the upper Latin1 characters regardless
10899 * of UTF-8ness, we have to set a flag to match the rest when
10900 * not in UTF-8 */
10901 _invlist_subtract(only_non_utf8_list, cp_list,
10902 &only_non_utf8_list);
10903 if (_invlist_len(only_non_utf8_list) != 0) {
10904 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10905 }
10906 SvREFCNT_dec_NN(only_non_utf8_list);
10907 }
10908 else {
10909 /* Here there were no complemented posix classes. That means
10910 * the upper Latin1 characters in 'posixes' match only when the
10911 * target string is in UTF-8. So we have to add them to the
10912 * list of those types of code points, while adding the
10913 * remainder to the unconditional list.
10914 *
10915 * First calculate what they are */
10916 SV* nonascii_but_latin1_properties = NULL;
10917 _invlist_intersection(posixes, PL_UpperLatin1,
10918 &nonascii_but_latin1_properties);
10919
10920 /* And add them to the final list of such characters. */
10921 _invlist_union(upper_latin1_only_utf8_matches,
10922 nonascii_but_latin1_properties,
10923 &upper_latin1_only_utf8_matches);
10924
10925 /* Remove them from what now becomes the unconditional list */
10926 _invlist_subtract(posixes, nonascii_but_latin1_properties,
10927 &posixes);
10928
10929 /* And add those unconditional ones to the final list */
10930 if (cp_list) {
10931 _invlist_union(cp_list, posixes, &cp_list);
10932 SvREFCNT_dec_NN(posixes);
10933 posixes = NULL;
10934 }
10935 else {
10936 cp_list = posixes;
10937 }
10938
10939 SvREFCNT_dec(nonascii_but_latin1_properties);
10940
10941 /* Get rid of any characters from the conditional list that we
10942 * now know are matched unconditionally, which may make that
10943 * list empty */
10944 _invlist_subtract(upper_latin1_only_utf8_matches,
10945 cp_list,
10946 &upper_latin1_only_utf8_matches);
10947 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
10948 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10949 upper_latin1_only_utf8_matches = NULL;
10950 }
10951 }
10952 }
10953 }
10954
10955 /* And combine the result (if any) with any inversion list from properties.
10956 * The lists are kept separate up to now so that we can distinguish the two
10957 * in regards to matching above-Unicode. A run-time warning is generated
10958 * if a Unicode property is matched against a non-Unicode code point. But,
10959 * we allow user-defined properties to match anything, without any warning,
10960 * and we also suppress the warning if there is a portion of the character
10961 * class that isn't a Unicode property, and which matches above Unicode, \W
10962 * or [\x{110000}] for example.
10963 * (Note that in this case, unlike the Posix one above, there is no
10964 * <upper_latin1_only_utf8_matches>, because having a Unicode property
10965 * forces Unicode semantics */
10966 if (properties) {
10967 if (cp_list) {
10968
10969 /* If it matters to the final outcome, see if a non-property
10970 * component of the class matches above Unicode. If so, the
10971 * warning gets suppressed. This is true even if just a single
10972 * such code point is specified, as, though not strictly correct if
10973 * another such code point is matched against, the fact that they
10974 * are using above-Unicode code points indicates they should know
10975 * the issues involved */
10976 if (warn_super) {
10977 warn_super = ! (invert
10978 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
10979 }
10980
10981 _invlist_union(properties, cp_list, &cp_list);
10982 SvREFCNT_dec_NN(properties);
10983 }
10984 else {
10985 cp_list = properties;
10986 }
10987
10988 if (warn_super) {
10989 anyof_flags |= ANYOF_WARN_SUPER__shared;
10990
10991 /* Because an ANYOF node is the only one that warns, this node
10992 * can't be optimized into something else */
10993 optimizable = FALSE;
10994 }
10995 }
10996
10997 /* Here, we have calculated what code points should be in the character
10998 * class.
10999 *
11000 * Now we can see about various optimizations. Fold calculation (which we
11001 * did above) needs to take place before inversion. Otherwise /[^k]/i
11002 * would invert to include K, which under /i would match k, which it
11003 * shouldn't. Therefore we can't invert folded locale now, as it won't be
11004 * folded until runtime */
11005
11006 /* If we didn't do folding, it's because some information isn't available
11007 * until runtime; set the run-time fold flag for these We know to set the
11008 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11009 * at least one 0-255 range code point */
11010 if (LOC && FOLD) {
11011
11012 /* Some things on the list might be unconditionally included because of
11013 * other components. Remove them, and clean up the list if it goes to
11014 * 0 elements */
11015 if (only_utf8_locale_list && cp_list) {
11016 _invlist_subtract(only_utf8_locale_list, cp_list,
11017 &only_utf8_locale_list);
11018
11019 if (_invlist_len(only_utf8_locale_list) == 0) {
11020 SvREFCNT_dec_NN(only_utf8_locale_list);
11021 only_utf8_locale_list = NULL;
11022 }
11023 }
11024 if ( only_utf8_locale_list
11025 || ( cp_list
11026 && ( _invlist_contains_cp(cp_list,
11027 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11028 || _invlist_contains_cp(cp_list,
11029 LATIN_SMALL_LETTER_DOTLESS_I))))
11030 {
11031 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11032 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11033 }
11034 else if (cp_list && invlist_lowest(cp_list) < 256) {
11035 /* If nothing is below 256, has no locale dependency; otherwise it
11036 * does */
11037 anyof_flags |= ANYOFL_FOLD;
11038 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11039
11040 /* In a Turkish locale these could match, notify the run-time code
11041 * to check for that */
11042 if ( _invlist_contains_cp(cp_list, 'I')
11043 || _invlist_contains_cp(cp_list, 'i'))
11044 {
11045 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11046 }
11047 }
11048 }
11049 else if ( DEPENDS_SEMANTICS
11050 && ( upper_latin1_only_utf8_matches
11051 || ( anyof_flags
11052 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11053 {
11054 RExC_seen_d_op = TRUE;
11055 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11056 }
11057
11058 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11059 * compile time. */
11060 if ( cp_list
11061 && invert
11062 && ! has_runtime_dependency)
11063 {
11064 _invlist_invert(cp_list);
11065
11066 /* Clear the invert flag since have just done it here */
11067 invert = FALSE;
11068 }
11069
11070 /* All possible optimizations below still have these characteristics.
11071 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11072 * routine) */
11073 *flagp |= HASWIDTH|SIMPLE;
11074
11075 if (ret_invlist) {
11076 *ret_invlist = cp_list;
11077
11078 return (cp_list) ? RExC_emit : 0;
11079 }
11080
11081 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11082 RExC_contains_locale = 1;
11083 }
11084
11085 if (optimizable) {
11086
11087 /* Some character classes are equivalent to other nodes. Such nodes
11088 * take up less room, and some nodes require fewer operations to
11089 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
11090 * adjacent nodes to improve efficiency. */
11091 op = optimize_regclass(pRExC_state, cp_list,
11092 only_utf8_locale_list,
11093 upper_latin1_only_utf8_matches,
11094 has_runtime_dependency,
11095 posixl,
11096 &anyof_flags, &invert, &ret, flagp);
11097 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11098
11099 /* If optimized to something else and emitted, clean up and return */
11100 if (ret >= 0) {
11101 SvREFCNT_dec(cp_list);;
11102 SvREFCNT_dec(only_utf8_locale_list);
11103 SvREFCNT_dec(upper_latin1_only_utf8_matches);
11104 return ret;
11105 }
11106
11107 /* If no optimization was found, an END was returned and we will now
11108 * emit an ANYOF */
11109 if (op == END) {
11110 op = ANYOF;
11111 }
11112 }
11113
11114 /* Here are going to emit an ANYOF; set the particular type */
11115 if (op == ANYOF) {
11116 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11117 op = ANYOFD;
11118 }
11119 else if (posixl) {
11120 op = ANYOFPOSIXL;
11121 }
11122 else if (LOC) {
11123 op = ANYOFL;
11124 }
11125 }
11126
11127 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11128 FILL_NODE(ret, op); /* We set the argument later */
11129 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11130 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11131
11132 /* Here, <cp_list> contains all the code points we can determine at
11133 * compile time that match under all conditions. Go through it, and
11134 * for things that belong in the bitmap, put them there, and delete from
11135 * <cp_list>. While we are at it, see if everything above 255 is in the
11136 * list, and if so, set a flag to speed up execution */
11137
11138 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11139
11140 if (posixl) {
11141 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11142 }
11143
11144 if (invert) {
11145 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11146 }
11147
11148 /* Here, the bitmap has been populated with all the Latin1 code points that
11149 * always match. Can now add to the overall list those that match only
11150 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11151 * */
11152 if (upper_latin1_only_utf8_matches) {
11153 if (cp_list) {
11154 _invlist_union(cp_list,
11155 upper_latin1_only_utf8_matches,
11156 &cp_list);
11157 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11158 }
11159 else {
11160 cp_list = upper_latin1_only_utf8_matches;
11161 }
11162 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11163 }
11164
11165 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11166 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11167 ? listsv
11168 : NULL,
11169 only_utf8_locale_list);
11170
11171 SvREFCNT_dec(cp_list);;
11172 SvREFCNT_dec(only_utf8_locale_list);
11173 return ret;
11174}
11175
11176STATIC U8
11177S_optimize_regclass(pTHX_
11178 RExC_state_t *pRExC_state,
11179 SV * cp_list,
11180 SV* only_utf8_locale_list,
11181 SV* upper_latin1_only_utf8_matches,
11182 const U32 has_runtime_dependency,
11183 const U32 posixl,
11184 U8 * anyof_flags,
11185 bool * invert,
11186 regnode_offset * ret,
11187 I32 *flagp
11188 )
11189{
11190 /* This function exists just to make S_regclass() smaller. It extracts out
11191 * the code that looks for potential optimizations away from a full generic
11192 * ANYOF node. The parameter names are the same as the corresponding
11193 * variables in S_regclass.
11194 *
11195 * It returns the new op (the impossible END one if no optimization found)
11196 * and sets *ret to any created regnode. If the new op is sufficiently
11197 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11198 *
11199 * Certain of the parameters may be updated as a result of the changes
11200 * herein */
11201
11202 U8 op = END; /* The returned node-type, initialized to an impossible
11203 one. */
11204 UV value = 0;
11205 PERL_UINT_FAST8_T i;
11206 UV partial_cp_count = 0;
11207 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11208 UV end[MAX_FOLD_FROMS+1] = { 0 };
11209 bool single_range = FALSE;
11210 UV lowest_cp = 0, highest_cp = 0;
11211
11212 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11213
11214 if (cp_list) { /* Count the code points in enough ranges that we would see
11215 all the ones possible in any fold in this version of
11216 Unicode */
11217
11218 invlist_iterinit(cp_list);
11219 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11220 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11221 break;
11222 }
11223 partial_cp_count += end[i] - start[i] + 1;
11224 }
11225
11226 if (i == 1) {
11227 single_range = TRUE;
11228 }
11229 invlist_iterfinish(cp_list);
11230
11231 /* If we know at compile time that this matches every possible code
11232 * point, any run-time dependencies don't matter */
11233 if (start[0] == 0 && end[0] == UV_MAX) {
11234 if (*invert) {
11235 goto return_OPFAIL;
11236 }
11237 else {
11238 goto return_SANY;
11239 }
11240 }
11241
11242 /* Use a clearer mnemonic for below */
11243 lowest_cp = start[0];
11244
11245 highest_cp = invlist_highest(cp_list);
11246 }
11247
11248 /* Similarly, for /l posix classes, if both a class and its complement
11249 * match, any run-time dependencies don't matter */
11250 if (posixl) {
11251 int namedclass;
11252 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11253 if ( POSIXL_TEST(posixl, namedclass) /* class */
11254 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11255 {
11256 if (*invert) {
11257 goto return_OPFAIL;
11258 }
11259 goto return_SANY;
11260 }
11261 }
11262
11263 /* For well-behaved locales, some classes are subsets of others, so
11264 * complementing the subset and including the non-complemented superset
11265 * should match everything, like [\D[:alnum:]], and
11266 * [[:^alpha:][:alnum:]], but some implementations of locales are
11267 * buggy, and khw thinks its a bad idea to have optimization change
11268 * behavior, even if it avoids an OS bug in a given case */
11269
11270#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11271
11272 /* If is a single posix /l class, can optimize to just that op. Such a
11273 * node will not match anything in the Latin1 range, as that is not
11274 * determinable until runtime, but will match whatever the class does
11275 * outside that range. (Note that some classes won't match anything
11276 * outside the range, like [:ascii:]) */
11277 if ( isSINGLE_BIT_SET(posixl)
11278 && (partial_cp_count == 0 || lowest_cp > 255))
11279 {
11280 U8 classnum;
11281 SV * class_above_latin1 = NULL;
11282 bool already_inverted;
11283 bool are_equivalent;
11284
11285
11286 namedclass = single_1bit_pos32(posixl);
11287 classnum = namedclass_to_classnum(namedclass);
11288
11289 /* The named classes are such that the inverted number is one
11290 * larger than the non-inverted one */
11291 already_inverted = namedclass - classnum_to_namedclass(classnum);
11292
11293 /* Create an inversion list of the official property, inverted if
11294 * the constructed node list is inverted, and restricted to only
11295 * the above latin1 code points, which are the only ones known at
11296 * compile time */
11297 _invlist_intersection_maybe_complement_2nd(
11298 PL_AboveLatin1,
11299 PL_XPosix_ptrs[classnum],
11300 already_inverted,
11301 &class_above_latin1);
11302 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11303 SvREFCNT_dec_NN(class_above_latin1);
11304
11305 if (are_equivalent) {
11306
11307 /* Resolve the run-time inversion flag with this possibly
11308 * inverted class */
11309 *invert = *invert ^ already_inverted;
11310
11311 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11312 *ret = reg_node(pRExC_state, op);
11313 FLAGS(REGNODE_p(*ret)) = classnum;
11314 return op;
11315 }
11316 }
11317 }
11318
11319 /* khw can't think of any other possible transformation involving these. */
11320 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11321 return END;
11322 }
11323
11324 if (! has_runtime_dependency) {
11325
11326 /* If the list is empty, nothing matches. This happens, for example,
11327 * when a Unicode property that doesn't match anything is the only
11328 * element in the character class (perluniprops.pod notes such
11329 * properties). */
11330 if (partial_cp_count == 0) {
11331 if (*invert) {
11332 goto return_SANY;
11333 }
11334 else {
11335 goto return_OPFAIL;
11336 }
11337 }
11338
11339 /* If matches everything but \n */
11340 if ( start[0] == 0 && end[0] == '\n' - 1
11341 && start[1] == '\n' + 1 && end[1] == UV_MAX)
11342 {
11343 assert (! *invert);
11344 op = REG_ANY;
11345 *ret = reg_node(pRExC_state, op);
11346 MARK_NAUGHTY(1);
11347 return op;
11348 }
11349 }
11350
11351 /* Next see if can optimize classes that contain just a few code points
11352 * into an EXACTish node. The reason to do this is to let the optimizer
11353 * join this node with adjacent EXACTish ones, and ANYOF nodes require
11354 * runtime conversion to code point from UTF-8, which we'd like to avoid.
11355 *
11356 * An EXACTFish node can be generated even if not under /i, and vice versa.
11357 * But care must be taken. An EXACTFish node has to be such that it only
11358 * matches precisely the code points in the class, but we want to generate
11359 * the least restrictive one that does that, to increase the odds of being
11360 * able to join with an adjacent node. For example, if the class contains
11361 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11362 * from matching. Whether we are under /i or not is irrelevant in this
11363 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
11364 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11365 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
11366 * is no simple fold that includes \X{02BC}, there is a multi-char fold
11367 * that does, and so the node generated for it must be an EXACTFish one.
11368 * On the other hand qr/:/i should generate a plain EXACT node since the
11369 * colon participates in no fold whatsoever, and having it be EXACT tells
11370 * the optimizer the target string cannot match unless it has a colon in
11371 * it. */
11372 if ( ! posixl
11373 && ! *invert
11374
11375 /* Only try if there are no more code points in the class than in
11376 * the max possible fold */
11377 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11378 {
11379 /* We can always make a single code point class into an EXACTish node.
11380 * */
11381 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11382 if (LOC) {
11383
11384 /* Here is /l: Use EXACTL, except if there is a fold not known
11385 * until runtime so shows as only a single code point here.
11386 * For code points above 255, we know which can cause problems
11387 * by having a potential fold to the Latin1 range. */
11388 if ( ! FOLD
11389 || ( lowest_cp > 255
11390 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11391 {
11392 op = EXACTL;
11393 }
11394 else {
11395 op = EXACTFL;
11396 }
11397 }
11398 else if (! FOLD) { /* Not /l and not /i */
11399 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11400 }
11401 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11402 small */
11403
11404 /* Under /i, it gets a little tricky. A code point that
11405 * doesn't participate in a fold should be an EXACT node. We
11406 * know this one isn't the result of a simple fold, or there'd
11407 * be more than one code point in the list, but it could be
11408 * part of a multi-character fold. In that case we better not
11409 * create an EXACT node, as we would wrongly be telling the
11410 * optimizer that this code point must be in the target string,
11411 * and that is wrong. This is because if the sequence around
11412 * this code point forms a multi-char fold, what needs to be in
11413 * the string could be the code point that folds to the
11414 * sequence.
11415 *
11416 * This handles the case of below-255 code points, as we have
11417 * an easy look up for those. The next clause handles the
11418 * above-256 one */
11419 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11420 ? EXACTFU
11421 : EXACT;
11422 }
11423 else { /* /i, larger code point. Since we are under /i, and have
11424 just this code point, we know that it can't fold to
11425 something else, so PL_InMultiCharFold applies to it */
11426 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11427 ? EXACTFU_REQ8
11428 : EXACT_REQ8;
11429 }
11430
11431 value = lowest_cp;
11432 }
11433 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11434 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11435 {
11436 /* Here, the only runtime dependency, if any, is from /d, and the
11437 * class matches more than one code point, and the lowest code
11438 * point participates in some fold. It might be that the other
11439 * code points are /i equivalent to this one, and hence they would
11440 * be representable by an EXACTFish node. Above, we eliminated
11441 * classes that contain too many code points to be EXACTFish, with
11442 * the test for MAX_FOLD_FROMS
11443 *
11444 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
11445 * do this because we have EXACTFAA at our disposal for the ASCII
11446 * range */
11447 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11448
11449 /* The only ASCII characters that participate in folds are
11450 * alphabetics */
11451 assert(isALPHA(lowest_cp));
11452 if ( end[0] == start[0] /* First range is a single
11453 character, so 2nd exists */
11454 && isALPHA_FOLD_EQ(start[0], start[1]))
11455 {
11456 /* Here, is part of an ASCII fold pair */
11457
11458 if ( ASCII_FOLD_RESTRICTED
11459 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11460 {
11461 /* If the second clause just above was true, it means
11462 * we can't be under /i, or else the list would have
11463 * included more than this fold pair. Therefore we
11464 * have to exclude the possibility of whatever else it
11465 * is that folds to these, by using EXACTFAA */
11466 op = EXACTFAA;
11467 }
11468 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11469
11470 /* Here, there's no simple fold that lowest_cp is part
11471 * of, but there is a multi-character one. If we are
11472 * not under /i, we want to exclude that possibility;
11473 * if under /i, we want to include it */
11474 op = (FOLD) ? EXACTFU : EXACTFAA;
11475 }
11476 else {
11477
11478 /* Here, the only possible fold lowest_cp participates in
11479 * is with start[1]. /i or not isn't relevant */
11480 op = EXACTFU;
11481 }
11482
11483 value = toFOLD(lowest_cp);
11484 }
11485 }
11486 else if ( ! upper_latin1_only_utf8_matches
11487 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
11488 && PL_fold_latin1[
11489 invlist_highest(upper_latin1_only_utf8_matches)]
11490 == lowest_cp))
11491 {
11492 /* Here, the smallest character is non-ascii or there are more
11493 * than 2 code points matched by this node. Also, we either
11494 * don't have /d UTF-8 dependent matches, or if we do, they
11495 * look like they could be a single character that is the fold
11496 * of the lowest one is in the always-match list. This test
11497 * quickly excludes most of the false positives when there are
11498 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
11499 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11500 * iff the target string is UTF-8. (We don't have to worry
11501 * above about exceeding the array bounds of PL_fold_latin1[]
11502 * because any code point in 'upper_latin1_only_utf8_matches'
11503 * is below 256.)
11504 *
11505 * EXACTFAA would apply only to pairs (hence exactly 2 code
11506 * points) in the ASCII range, so we can't use it here to
11507 * artificially restrict the fold domain, so we check if the
11508 * class does or does not match some EXACTFish node. Further,
11509 * if we aren't under /i, and and the folded-to character is
11510 * part of a multi-character fold, we can't do this
11511 * optimization, as the sequence around it could be that
11512 * multi-character fold, and we don't here know the context, so
11513 * we have to assume it is that multi-char fold, to prevent
11514 * potential bugs.
11515 *
11516 * To do the general case, we first find the fold of the lowest
11517 * code point (which may be higher than that lowest unfolded
11518 * one), then find everything that folds to it. (The data
11519 * structure we have only maps from the folded code points, so
11520 * we have to do the earlier step.) */
11521
11522 Size_t foldlen;
11523 U8 foldbuf[UTF8_MAXBYTES_CASE];
11524 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11525 U32 first_fold;
11526 const U32 * remaining_folds;
11527 Size_t folds_to_this_cp_count = _inverse_folds(
11528 folded,
11529 &first_fold,
11530 &remaining_folds);
11531 Size_t folds_count = folds_to_this_cp_count + 1;
11532 SV * fold_list = _new_invlist(folds_count);
11533 unsigned int i;
11534
11535 /* If there are UTF-8 dependent matches, create a temporary
11536 * list of what this node matches, including them. */
11537 SV * all_cp_list = NULL;
11538 SV ** use_this_list = &cp_list;
11539
11540 if (upper_latin1_only_utf8_matches) {
11541 all_cp_list = _new_invlist(0);
11542 use_this_list = &all_cp_list;
11543 _invlist_union(cp_list,
11544 upper_latin1_only_utf8_matches,
11545 use_this_list);
11546 }
11547
11548 /* Having gotten everything that participates in the fold
11549 * containing the lowest code point, we turn that into an
11550 * inversion list, making sure everything is included. */
11551 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11552 fold_list = add_cp_to_invlist(fold_list, folded);
11553 if (folds_to_this_cp_count > 0) {
11554 fold_list = add_cp_to_invlist(fold_list, first_fold);
11555 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11556 fold_list = add_cp_to_invlist(fold_list,
11557 remaining_folds[i]);
11558 }
11559 }
11560
11561 /* If the fold list is identical to what's in this ANYOF node,
11562 * the node can be represented by an EXACTFish one instead */
11563 if (_invlistEQ(*use_this_list, fold_list,
11564 0 /* Don't complement */ )
11565 ) {
11566
11567 /* But, we have to be careful, as mentioned above. Just
11568 * the right sequence of characters could match this if it
11569 * is part of a multi-character fold. That IS what we want
11570 * if we are under /i. But it ISN'T what we want if not
11571 * under /i, as it could match when it shouldn't. So, when
11572 * we aren't under /i and this character participates in a
11573 * multi-char fold, we don't optimize into an EXACTFish
11574 * node. So, for each case below we have to check if we
11575 * are folding, and if not, if it is not part of a
11576 * multi-char fold. */
11577 if (lowest_cp > 255) { /* Highish code point */
11578 if (FOLD || ! _invlist_contains_cp(
11579 PL_InMultiCharFold, folded))
11580 {
11581 op = (LOC)
11582 ? EXACTFLU8
11583 : (ASCII_FOLD_RESTRICTED)
11584 ? EXACTFAA
11585 : EXACTFU_REQ8;
11586 value = folded;
11587 }
11588 } /* Below, the lowest code point < 256 */
11589 else if ( FOLD
11590 && folded == 's'
11591 && DEPENDS_SEMANTICS)
11592 { /* An EXACTF node containing a single character 's',
11593 can be an EXACTFU if it doesn't get joined with an
11594 adjacent 's' */
11595 op = EXACTFU_S_EDGE;
11596 value = folded;
11597 }
11598 else if ( FOLD
11599 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11600 {
11601 if (upper_latin1_only_utf8_matches) {
11602 op = EXACTF;
11603
11604 /* We can't use the fold, as that only matches
11605 * under UTF-8 */
11606 value = lowest_cp;
11607 }
11608 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
11609 && ! UTF)
11610 { /* EXACTFUP is a special node for this character */
11611 op = (ASCII_FOLD_RESTRICTED)
11612 ? EXACTFAA
11613 : EXACTFUP;
11614 value = MICRO_SIGN;
11615 }
11616 else if ( ASCII_FOLD_RESTRICTED
11617 && ! isASCII(lowest_cp))
11618 { /* For ASCII under /iaa, we can use EXACTFU below
11619 */
11620 op = EXACTFAA;
11621 value = folded;
11622 }
11623 else {
11624 op = EXACTFU;
11625 value = folded;
11626 }
11627 }
11628 }
11629
11630 SvREFCNT_dec_NN(fold_list);
11631 SvREFCNT_dec(all_cp_list);
11632 }
11633 }
11634
11635 if (op != END) {
11636 U8 len;
11637
11638 /* Here, we have calculated what EXACTish node to use. Have to
11639 * convert to UTF-8 if not already there */
11640 if (value > 255) {
11641 if (! UTF) {
11642 SvREFCNT_dec(cp_list);;
11643 REQUIRE_UTF8(flagp);
11644 }
11645
11646 /* This is a kludge to the special casing issues with this
11647 * ligature under /aa. FB05 should fold to FB06, but the call
11648 * above to _to_uni_fold_flags() didn't find this, as it didn't
11649 * use the /aa restriction in order to not miss other folds
11650 * that would be affected. This is the only instance likely to
11651 * ever be a problem in all of Unicode. So special case it. */
11652 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
11653 && ASCII_FOLD_RESTRICTED)
11654 {
11655 value = LATIN_SMALL_LIGATURE_ST;
11656 }
11657 }
11658
11659 len = (UTF) ? UVCHR_SKIP(value) : 1;
11660
11661 *ret = REGNODE_GUTS(pRExC_state, op, len);
11662 FILL_NODE(*ret, op);
11663 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11664 setSTR_LEN(REGNODE_p(*ret), len);
11665 if (len == 1) {
11666 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11667 }
11668 else {
11669 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11670 }
11671
11672 return op;
11673 }
11674 }
11675
11676 if (! has_runtime_dependency) {
11677
11678 /* See if this can be turned into an ANYOFM node. Think about the bit
11679 * patterns in two different bytes. In some positions, the bits in
11680 * each will be 1; and in other positions both will be 0; and in some
11681 * positions the bit will be 1 in one byte, and 0 in the other. Let
11682 * 'n' be the number of positions where the bits differ. We create a
11683 * mask which has exactly 'n' 0 bits, each in a position where the two
11684 * bytes differ. Now take the set of all bytes that when ANDed with
11685 * the mask yield the same result. That set has 2**n elements, and is
11686 * representable by just two 8 bit numbers: the result and the mask.
11687 * Importantly, matching the set can be vectorized by creating a word
11688 * full of the result bytes, and a word full of the mask bytes,
11689 * yielding a significant speed up. Here, see if this node matches
11690 * such a set. As a concrete example consider [01], and the byte
11691 * representing '0' which is 0x30 on ASCII machines. It has the bits
11692 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
11693 * that mask we get 0x30. Any other bytes ANDed yield something else.
11694 * So [01], which is a common usage, is optimizable into ANYOFM, and
11695 * can benefit from the speed up. We can only do this on UTF-8
11696 * invariant bytes, because they have the same bit patterns under UTF-8
11697 * as not. */
11698 PERL_UINT_FAST8_T inverted = 0;
11699
11700 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11701 * EBCDIC */
11702 const PERL_UINT_FAST8_T max_permissible
11703 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11704
11705 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
11706 * that works we will instead later generate an NANYOFM, and invert
11707 * back when through */
11708 if (highest_cp > max_permissible) {
11709 _invlist_invert(cp_list);
11710 inverted = 1;
11711 }
11712
11713 if (invlist_highest(cp_list) <= max_permissible) {
11714 UV this_start, this_end;
11715 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
11716 U8 bits_differing = 0;
11717 Size_t full_cp_count = 0;
11718 bool first_time = TRUE;
11719
11720 /* Go through the bytes and find the bit positions that differ */
11721 invlist_iterinit(cp_list);
11722 while (invlist_iternext(cp_list, &this_start, &this_end)) {
11723 unsigned int i = this_start;
11724
11725 if (first_time) {
11726 if (! UVCHR_IS_INVARIANT(i)) {
11727 goto done_anyofm;
11728 }
11729
11730 first_time = FALSE;
11731 lowest_cp = this_start;
11732
11733 /* We have set up the code point to compare with. Don't
11734 * compare it with itself */
11735 i++;
11736 }
11737
11738 /* Find the bit positions that differ from the lowest code
11739 * point in the node. Keep track of all such positions by
11740 * OR'ing */
11741 for (; i <= this_end; i++) {
11742 if (! UVCHR_IS_INVARIANT(i)) {
11743 goto done_anyofm;
11744 }
11745
11746 bits_differing |= i ^ lowest_cp;
11747 }
11748
11749 full_cp_count += this_end - this_start + 1;
11750 }
11751
11752 /* At the end of the loop, we count how many bits differ from the
11753 * bits in lowest code point, call the count 'd'. If the set we
11754 * found contains 2**d elements, it is the closure of all code
11755 * points that differ only in those bit positions. To convince
11756 * yourself of that, first note that the number in the closure must
11757 * be a power of 2, which we test for. The only way we could have
11758 * that count and it be some differing set, is if we got some code
11759 * points that don't differ from the lowest code point in any
11760 * position, but do differ from each other in some other position.
11761 * That means one code point has a 1 in that position, and another
11762 * has a 0. But that would mean that one of them differs from the
11763 * lowest code point in that position, which possibility we've
11764 * already excluded. */
11765 if ( (inverted || full_cp_count > 1)
11766 && full_cp_count == 1U << PL_bitcount[bits_differing])
11767 {
11768 U8 ANYOFM_mask;
11769
11770 op = ANYOFM + inverted;;
11771
11772 /* We need to make the bits that differ be 0's */
11773 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11774
11775 /* The argument is the lowest code point */
11776 *ret = reg1node(pRExC_state, op, lowest_cp);
11777 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11778 }
11779
11780 done_anyofm:
11781 invlist_iterfinish(cp_list);
11782 }
11783
11784 if (inverted) {
11785 _invlist_invert(cp_list);
11786 }
11787
11788 if (op != END) {
11789 return op;
11790 }
11791
11792 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11793 * were invariants, it wasn't inverted, and there is a single range.
11794 * This would be faster than some of the posix nodes we create below
11795 * like /\d/a, but would be twice the size. Without having actually
11796 * measured the gain, khw doesn't think the tradeoff is really worth it
11797 * */
11798 }
11799
11800 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11801 PERL_UINT_FAST8_T type;
11802 SV * intersection = NULL;
11803 SV* d_invlist = NULL;
11804
11805 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
11806 * ones are about the same speed as ANYOF ops, but take less room; the
11807 * ones that have above-Latin1 code point matches are somewhat faster
11808 * than ANYOF. */
11809
11810 for (type = POSIXA; type >= POSIXD; type--) {
11811 int posix_class;
11812
11813 if (type == POSIXL) { /* But not /l posix classes */
11814 continue;
11815 }
11816
11817 for (posix_class = 0;
11818 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11819 posix_class++)
11820 {
11821 SV** our_code_points = &cp_list;
11822 SV** official_code_points;
11823 int try_inverted;
11824
11825 if (type == POSIXA) {
11826 official_code_points = &PL_Posix_ptrs[posix_class];
11827 }
11828 else {
11829 official_code_points = &PL_XPosix_ptrs[posix_class];
11830 }
11831
11832 /* Skip non-existent classes of this type. e.g. \v only has an
11833 * entry in PL_XPosix_ptrs */
11834 if (! *official_code_points) {
11835 continue;
11836 }
11837
11838 /* Try both the regular class, and its inversion */
11839 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11840 bool this_inverted = *invert ^ try_inverted;
11841
11842 if (type != POSIXD) {
11843
11844 /* This class that isn't /d can't match if we have /d
11845 * dependencies */
11846 if (has_runtime_dependency
11847 & HAS_D_RUNTIME_DEPENDENCY)
11848 {
11849 continue;
11850 }
11851 }
11852 else /* is /d */ if (! this_inverted) {
11853
11854 /* /d classes don't match anything non-ASCII below 256
11855 * unconditionally (which cp_list contains) */
11856 _invlist_intersection(cp_list, PL_UpperLatin1,
11857 &intersection);
11858 if (_invlist_len(intersection) != 0) {
11859 continue;
11860 }
11861
11862 SvREFCNT_dec(d_invlist);
11863 d_invlist = invlist_clone(cp_list, NULL);
11864
11865 /* But under UTF-8 it turns into using /u rules. Add
11866 * the things it matches under these conditions so that
11867 * we check below that these are identical to what the
11868 * tested class should match */
11869 if (upper_latin1_only_utf8_matches) {
11870 _invlist_union(
11871 d_invlist,
11872 upper_latin1_only_utf8_matches,
11873 &d_invlist);
11874 }
11875 our_code_points = &d_invlist;
11876 }
11877 else { /* POSIXD, inverted. If this doesn't have this
11878 flag set, it isn't /d. */
11879 if (! ( *anyof_flags
11880 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11881 {
11882 continue;
11883 }
11884
11885 our_code_points = &cp_list;
11886 }
11887
11888 /* Here, have weeded out some things. We want to see if
11889 * the list of characters this node contains
11890 * ('*our_code_points') precisely matches those of the
11891 * class we are currently checking against
11892 * ('*official_code_points'). */
11893 if (_invlistEQ(*our_code_points,
11894 *official_code_points,
11895 try_inverted))
11896 {
11897 /* Here, they precisely match. Optimize this ANYOF
11898 * node into its equivalent POSIX one of the correct
11899 * type, possibly inverted.
11900 *
11901 * Some of these nodes match a single range of
11902 * characters (or [:alpha:] matches two parallel ranges
11903 * on ASCII platforms). The array lookup at execution
11904 * time could be replaced by a range check for such
11905 * nodes. But regnodes are a finite resource, and the
11906 * possible performance boost isn't large, so this
11907 * hasn't been done. An attempt to use just one node
11908 * (and its inverse) to encompass all such cases was
11909 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
11910 * But the shifting/masking it used ended up being
11911 * slower than the array look up, so it was reverted */
11912 op = (try_inverted)
11913 ? type + NPOSIXA - POSIXA
11914 : type;
11915 *ret = reg_node(pRExC_state, op);
11916 FLAGS(REGNODE_p(*ret)) = posix_class;
11917 SvREFCNT_dec(d_invlist);
11918 SvREFCNT_dec(intersection);
11919 return op;
11920 }
11921 }
11922 }
11923 }
11924 SvREFCNT_dec(d_invlist);
11925 SvREFCNT_dec(intersection);
11926 }
11927
11928 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
11929 * in size and speed. Currently, a 20 bit range base (smallest code point
11930 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
11931 * This allows for using it on all of the Unicode code points except for
11932 * the highest plane, which is only for private use code points. khw
11933 * doubts that a bigger delta is likely in real world applications */
11934 if ( single_range
11935 && ! has_runtime_dependency
11936 && *anyof_flags == 0
11937 && start[0] < (1 << ANYOFR_BASE_BITS)
11938 && end[0] - start[0]
11939 < ((1U << (sizeof(ARG1u_LOC(NULL))
11940 * CHARBITS - ANYOFR_BASE_BITS))))
11941
11942 {
11943 U8 low_utf8[UTF8_MAXBYTES+1];
11944 U8 high_utf8[UTF8_MAXBYTES+1];
11945
11946 op = ANYOFR;
11947 *ret = reg1node(pRExC_state, op,
11948 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
11949
11950 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
11951 * efficient ruling out at run time of many possible inputs. */
11952 (void) uvchr_to_utf8(low_utf8, start[0]);
11953 (void) uvchr_to_utf8(high_utf8, end[0]);
11954
11955 /* If all code points share the same first byte, this can be an
11956 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
11957 * quickly rule out many inputs at run-time without having to compute
11958 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
11959 * transformation would not rule out nearly so many things */
11960 if (low_utf8[0] == high_utf8[0]) {
11961 op = ANYOFRb;
11962 OP(REGNODE_p(*ret)) = op;
11963 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
11964 }
11965 else {
11966 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
11967 }
11968
11969 return op;
11970 }
11971
11972 /* If didn't find an optimization and there is no need for a bitmap,
11973 * of the lowest code points, optimize to indicate that */
11974 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
11975 && ! LOC
11976 && ! upper_latin1_only_utf8_matches
11977 && *anyof_flags == 0)
11978 {
11979 U8 low_utf8[UTF8_MAXBYTES+1];
11980 UV highest_cp = invlist_highest(cp_list);
11981
11982 /* Currently the maximum allowed code point by the system is IV_MAX.
11983 * Higher ones are reserved for future internal use. This particular
11984 * regnode can be used for higher ones, but we can't calculate the code
11985 * point of those. IV_MAX suffices though, as it will be a large first
11986 * byte */
11987 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
11988 - low_utf8;
11989
11990 /* We store the lowest possible first byte of the UTF-8 representation,
11991 * using the flags field. This allows for quick ruling out of some
11992 * inputs without having to convert from UTF-8 to code point. For
11993 * EBCDIC, we use I8, as not doing that transformation would not rule
11994 * out nearly so many things */
11995 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
11996
11997 op = ANYOFH;
11998
11999 /* If the first UTF-8 start byte for the highest code point in the
12000 * range is suitably small, we may be able to get an upper bound as
12001 * well */
12002 if (highest_cp <= IV_MAX) {
12003 U8 high_utf8[UTF8_MAXBYTES+1];
12004 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
12005
12006 /* If the lowest and highest are the same, we can get an exact
12007 * first byte instead of a just minimum or even a sequence of exact
12008 * leading bytes. We signal these with different regnodes */
12009 if (low_utf8[0] == high_utf8[0]) {
12010 Size_t len = find_first_differing_byte_pos(low_utf8,
12011 high_utf8,
12012 MIN(low_len, high_len));
12013 if (len == 1) {
12014
12015 /* No need to convert to I8 for EBCDIC as this is an exact
12016 * match */
12017 *anyof_flags = low_utf8[0];
12018
12019 if (high_len == 2) {
12020 /* If the elements matched all have a 2-byte UTF-8
12021 * representation, with the first byte being the same,
12022 * we can use a compact, fast regnode. capable of
12023 * matching any combination of continuation byte
12024 * patterns.
12025 *
12026 * (A similar regnode could be created for the Latin1
12027 * range; the complication being that it could match
12028 * non-UTF8 targets. The internal bitmap would serve
12029 * both cases; with some extra code in regexec.c) */
12030 op = ANYOFHbbm;
12031 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12032 FILL_NODE(*ret, op);
12033 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12034
12035 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12036 * directly based on the continuation byte, without
12037 * needing to convert to code point */
12038 populate_bitmap_from_invlist(
12039 cp_list,
12040
12041 /* The base code point is from the start byte */
12042 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12043 UTF_CONTINUATION_MARK | 0),
12044
12045 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12046 REGNODE_BBM_BITMAP_LEN);
12047 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12048 return op;
12049 }
12050 else {
12051 op = ANYOFHb;
12052 }
12053 }
12054 else {
12055 op = ANYOFHs;
12056 *ret = REGNODE_GUTS(pRExC_state, op,
12057 REGNODE_ARG_LEN(op) + STR_SZ(len));
12058 FILL_NODE(*ret, op);
12059 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12060 = len;
12061 Copy(low_utf8, /* Add the common bytes */
12062 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12063 len, U8);
12064 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12065 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12066 NULL, only_utf8_locale_list);
12067 return op;
12068 }
12069 }
12070 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12071
12072 /* Here, the high byte is not the same as the low, but is small
12073 * enough that its reasonable to have a loose upper bound,
12074 * which is packed in with the strict lower bound. See
12075 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
12076 * platforms, I8 is used. On ASCII platforms I8 is the same
12077 * thing as UTF-8 */
12078
12079 U8 bits = 0;
12080 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12081 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12082 - *anyof_flags;
12083
12084 if (range_diff <= max_range_diff / 8) {
12085 bits = 3;
12086 }
12087 else if (range_diff <= max_range_diff / 4) {
12088 bits = 2;
12089 }
12090 else if (range_diff <= max_range_diff / 2) {
12091 bits = 1;
12092 }
12093 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12094 op = ANYOFHr;
12095 }
12096 }
12097 }
12098
12099 return op;
12100
12101 return_OPFAIL:
12102 op = OPFAIL;
12103 *ret = reg1node(pRExC_state, op, 0);
12104 return op;
12105
12106 return_SANY:
12107 op = SANY;
12108 *ret = reg_node(pRExC_state, op);
12109 MARK_NAUGHTY(1);
12110 return op;
12111}
12112
12113#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12114
12115#ifdef PERL_RE_BUILD_AUX
12116void
12117Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12118 regnode* const node,
12119 SV* const cp_list,
12120 SV* const runtime_defns,
12121 SV* const only_utf8_locale_list)
12122{
12123 /* Sets the arg field of an ANYOF-type node 'node', using information about
12124 * the node passed-in. If only the bitmap is needed to determine what
12125 * matches, the arg is set appropriately to either
12126 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12127 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12128 *
12129 * Otherwise, it sets the argument to the count returned by reg_add_data(),
12130 * having allocated and stored an array, av, as follows:
12131 * av[0] stores the inversion list defining this class as far as known at
12132 * this time, or PL_sv_undef if nothing definite is now known.
12133 * av[1] stores the inversion list of code points that match only if the
12134 * current locale is UTF-8, or if none, PL_sv_undef if there is an
12135 * av[2], or no entry otherwise.
12136 * av[2] stores the list of user-defined properties whose subroutine
12137 * definitions aren't known at this time, or no entry if none. */
12138
12139 UV n;
12140
12141 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12142
12143 /* If this is set, the final disposition won't be known until runtime, so
12144 * we can't do any of the compile time optimizations */
12145 if (! runtime_defns) {
12146
12147 /* On plain ANYOF nodes without the possibility of a runtime locale
12148 * making a difference, maybe there's no information to be gleaned
12149 * except for what's in the bitmap */
12150 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12151
12152 /* There are two such cases:
12153 * 1) there is no list of code points matched outside the bitmap
12154 */
12155 if (! cp_list) {
12156 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12157 return;
12158 }
12159
12160 /* 2) the list indicates everything outside the bitmap matches */
12161 if ( invlist_highest(cp_list) == UV_MAX
12162 && invlist_highest_range_start(cp_list)
12163 <= NUM_ANYOF_CODE_POINTS)
12164 {
12165 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12166 return;
12167 }
12168
12169 /* In all other cases there are things outside the bitmap that we
12170 * may need to check at runtime. */
12171 }
12172
12173 /* Here, we have resolved all the possible run-time matches, and they
12174 * are stored in one or both of two possible lists. (While some match
12175 * only under certain runtime circumstances, we know all the possible
12176 * ones for each such circumstance.)
12177 *
12178 * It may very well be that the pattern being compiled contains an
12179 * identical class, already encountered. Reusing that class here saves
12180 * space. Look through all classes so far encountered. */
12181 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12182 for (unsigned int i = 0; i < existing_items; i++) {
12183
12184 /* Only look at auxiliary data of this type */
12185 if (RExC_rxi->data->what[i] != 's') {
12186 continue;
12187 }
12188
12189 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12190 AV * const av = MUTABLE_AV(SvRV(rv));
12191
12192 /* If the already encountered class has data that won't be known
12193 * until runtime (stored in the final element of the array), we
12194 * can't share */
12195 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12196 continue;
12197 }
12198
12199 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12200 false /* no lvalue */);
12201
12202 /* The new and the existing one both have to have or both not
12203 * have this element, for this one to duplicate that one */
12204 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12205 continue;
12206 }
12207
12208 /* If the inversion lists aren't equivalent, can't share */
12209 if (cp_list && ! _invlistEQ(cp_list,
12210 *stored_cp_list_ptr,
12211 FALSE /* don't complement */))
12212 {
12213 continue;
12214 }
12215
12216 /* Similarly for the other list */
12217 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12218 av,
12219 ONLY_LOCALE_MATCHES_INDEX,
12220 false /* no lvalue */);
12221 if ( cBOOL(only_utf8_locale_list)
12222 != cBOOL(stored_only_utf8_locale_list_ptr))
12223 {
12224 continue;
12225 }
12226
12227 if (only_utf8_locale_list && ! _invlistEQ(
12228 only_utf8_locale_list,
12229 *stored_only_utf8_locale_list_ptr,
12230 FALSE /* don't complement */))
12231 {
12232 continue;
12233 }
12234
12235 /* Here, the existence and contents of both compile-time lists
12236 * are identical between the new and existing data. Re-use the
12237 * existing one */
12238 ARG1u_SET(node, i);
12239 return;
12240 } /* end of loop through existing classes */
12241 }
12242
12243 /* Here, we need to create a new auxiliary data element; either because
12244 * this doesn't duplicate an existing one, or we can't tell at this time if
12245 * it eventually will */
12246
12247 AV * const av = newAV();
12248 SV *rv;
12249
12250 if (cp_list) {
12251 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12252 }
12253
12254 /* (Note that if any of this changes, the size calculations in
12255 * S_optimize_regclass() might need to be updated.) */
12256
12257 if (only_utf8_locale_list) {
12258 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12259 SvREFCNT_inc_NN(only_utf8_locale_list));
12260 }
12261
12262 if (runtime_defns) {
12263 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12264 SvREFCNT_inc_NN(runtime_defns));
12265 }
12266
12267 rv = newRV_noinc(MUTABLE_SV(av));
12268 n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12269 RExC_rxi->data->data[n] = (void*)rv;
12270 ARG1u_SET(node, n);
12271}
12272#endif /* PERL_RE_BUILD_AUX */
12273
12274SV *
12275
12276#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12277Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12278#else
12279Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12280#endif
12281
12282{
12283 /* For internal core use only.
12284 * Returns the inversion list for the input 'node' in the regex 'prog'.
12285 * If <doinit> is 'true', will attempt to create the inversion list if not
12286 * already done. If it is created, it will add to the normal inversion
12287 * list any that comes from user-defined properties. It croaks if this
12288 * is called before such a list is ready to be generated, that is when a
12289 * user-defined property has been declared, buyt still not yet defined.
12290 * If <listsvp> is non-null, will return the printable contents of the
12291 * property definition. This can be used to get debugging information
12292 * even before the inversion list exists, by calling this function with
12293 * 'doinit' set to false, in which case the components that will be used
12294 * to eventually create the inversion list are returned (in a printable
12295 * form).
12296 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12297 * store an inversion list of code points that should match only if the
12298 * execution-time locale is a UTF-8 one.
12299 * If <output_invlist> is not NULL, it is where this routine is to store an
12300 * inversion list of the code points that would be instead returned in
12301 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
12302 * when this parameter is used, is just the non-code point data that
12303 * will go into creating the inversion list. This currently should be just
12304 * user-defined properties whose definitions were not known at compile
12305 * time. Using this parameter allows for easier manipulation of the
12306 * inversion list's data by the caller. It is illegal to call this
12307 * function with this parameter set, but not <listsvp>
12308 *
12309 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
12310 * that, in spite of this function's name, the inversion list it returns
12311 * may include the bitmap data as well */
12312
12313 SV *si = NULL; /* Input initialization string */
12314 SV* invlist = NULL;
12315
12316 RXi_GET_DECL_NULL(prog, progi);
12317 const struct reg_data * const data = prog ? progi->data : NULL;
12318
12319#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12320 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12321#else
12322 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12323#endif
12324 assert(! output_invlist || listsvp);
12325
12326 if (data && data->count) {
12327 const U32 n = ARG1u(node);
12328
12329 if (data->what[n] == 's') {
12330 SV * const rv = MUTABLE_SV(data->data[n]);
12331 AV * const av = MUTABLE_AV(SvRV(rv));
12332 SV **const ary = AvARRAY(av);
12333
12334 invlist = ary[INVLIST_INDEX];
12335
12336 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12337 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12338 }
12339
12340 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12341 si = ary[DEFERRED_USER_DEFINED_INDEX];
12342 }
12343
12344 if (doinit && (si || invlist)) {
12345 if (si) {
12346 bool user_defined;
12347 SV * msg = newSVpvs_flags("", SVs_TEMP);
12348
12349 SV * prop_definition = handle_user_defined_property(
12350 "", 0, FALSE, /* There is no \p{}, \P{} */
12351 SvPVX_const(si)[1] - '0', /* /i or not has been
12352 stored here for just
12353 this occasion */
12354 TRUE, /* run time */
12355 FALSE, /* This call must find the defn */
12356 si, /* The property definition */
12357 &user_defined,
12358 msg,
12359 0 /* base level call */
12360 );
12361
12362 if (SvCUR(msg)) {
12363 assert(prop_definition == NULL);
12364
12365 Perl_croak(aTHX_ "%" UTF8f,
12366 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12367 }
12368
12369 if (invlist) {
12370 _invlist_union(invlist, prop_definition, &invlist);
12371 SvREFCNT_dec_NN(prop_definition);
12372 }
12373 else {
12374 invlist = prop_definition;
12375 }
12376
12377 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12378 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12379
12380 ary[INVLIST_INDEX] = invlist;
12381 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12382 ? ONLY_LOCALE_MATCHES_INDEX
12383 : INVLIST_INDEX);
12384 si = NULL;
12385 }
12386 }
12387 }
12388 }
12389
12390 /* If requested, return a printable version of what this ANYOF node matches
12391 * */
12392 if (listsvp) {
12393 SV* matches_string = NULL;
12394
12395 /* This function can be called at compile-time, before everything gets
12396 * resolved, in which case we return the currently best available
12397 * information, which is the string that will eventually be used to do
12398 * that resolving, 'si' */
12399 if (si) {
12400 /* Here, we only have 'si' (and possibly some passed-in data in
12401 * 'invlist', which is handled below) If the caller only wants
12402 * 'si', use that. */
12403 if (! output_invlist) {
12404 matches_string = newSVsv(si);
12405 }
12406 else {
12407 /* But if the caller wants an inversion list of the node, we
12408 * need to parse 'si' and place as much as possible in the
12409 * desired output inversion list, making 'matches_string' only
12410 * contain the currently unresolvable things */
12411 const char *si_string = SvPVX(si);
12412 STRLEN remaining = SvCUR(si);
12413 UV prev_cp = 0;
12414 U8 count = 0;
12415
12416 /* Ignore everything before and including the first new-line */
12417 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12418 assert (si_string != NULL);
12419 si_string++;
12420 remaining = SvPVX(si) + SvCUR(si) - si_string;
12421
12422 while (remaining > 0) {
12423
12424 /* The data consists of just strings defining user-defined
12425 * property names, but in prior incarnations, and perhaps
12426 * somehow from pluggable regex engines, it could still
12427 * hold hex code point definitions, all of which should be
12428 * legal (or it wouldn't have gotten this far). Each
12429 * component of a range would be separated by a tab, and
12430 * each range by a new-line. If these are found, instead
12431 * add them to the inversion list */
12432 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
12433 |PERL_SCAN_SILENT_NON_PORTABLE;
12434 STRLEN len = remaining;
12435 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12436
12437 /* If the hex decode routine found something, it should go
12438 * up to the next \n */
12439 if ( *(si_string + len) == '\n') {
12440 if (count) { /* 2nd code point on line */
12441 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12442 }
12443 else {
12444 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12445 }
12446 count = 0;
12447 goto prepare_for_next_iteration;
12448 }
12449
12450 /* If the hex decode was instead for the lower range limit,
12451 * save it, and go parse the upper range limit */
12452 if (*(si_string + len) == '\t') {
12453 assert(count == 0);
12454
12455 prev_cp = cp;
12456 count = 1;
12457 prepare_for_next_iteration:
12458 si_string += len + 1;
12459 remaining -= len + 1;
12460 continue;
12461 }
12462
12463 /* Here, didn't find a legal hex number. Just add the text
12464 * from here up to the next \n, omitting any trailing
12465 * markers. */
12466
12467 remaining -= len;
12468 len = strcspn(si_string,
12469 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12470 remaining -= len;
12471 if (matches_string) {
12472 sv_catpvn(matches_string, si_string, len);
12473 }
12474 else {
12475 matches_string = newSVpvn(si_string, len);
12476 }
12477 sv_catpvs(matches_string, " ");
12478
12479 si_string += len;
12480 if ( remaining
12481 && UCHARAT(si_string)
12482 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12483 {
12484 si_string++;
12485 remaining--;
12486 }
12487 if (remaining && UCHARAT(si_string) == '\n') {
12488 si_string++;
12489 remaining--;
12490 }
12491 } /* end of loop through the text */
12492
12493 assert(matches_string);
12494 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
12495 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12496 }
12497 } /* end of has an 'si' */
12498 }
12499
12500 /* Add the stuff that's already known */
12501 if (invlist) {
12502
12503 /* Again, if the caller doesn't want the output inversion list, put
12504 * everything in 'matches-string' */
12505 if (! output_invlist) {
12506 if ( ! matches_string) {
12507 matches_string = newSVpvs("\n");
12508 }
12509 sv_catsv(matches_string, invlist_contents(invlist,
12510 TRUE /* traditional style */
12511 ));
12512 }
12513 else if (! *output_invlist) {
12514 *output_invlist = invlist_clone(invlist, NULL);
12515 }
12516 else {
12517 _invlist_union(*output_invlist, invlist, output_invlist);
12518 }
12519 }
12520
12521 *listsvp = matches_string;
12522 }
12523
12524 return invlist;
12525}
12526
12527/* reg_skipcomment()
12528
12529 Absorbs an /x style # comment from the input stream,
12530 returning a pointer to the first character beyond the comment, or if the
12531 comment terminates the pattern without anything following it, this returns
12532 one past the final character of the pattern (in other words, RExC_end) and
12533 sets the REG_RUN_ON_COMMENT_SEEN flag.
12534
12535 Note it's the callers responsibility to ensure that we are
12536 actually in /x mode
12537
12538*/
12539
12540PERL_STATIC_INLINE char*
12541S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12542{
12543 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12544
12545 assert(*p == '#');
12546
12547 while (p < RExC_end) {
12548 if (*(++p) == '\n') {
12549 return p+1;
12550 }
12551 }
12552
12553 /* we ran off the end of the pattern without ending the comment, so we have
12554 * to add an \n when wrapping */
12555 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12556 return p;
12557}
12558
12559STATIC void
12560S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12561 char ** p,
12562 const bool force_to_xmod
12563 )
12564{
12565 /* If the text at the current parse position '*p' is a '(?#...)' comment,
12566 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12567 * is /x whitespace, advance '*p' so that on exit it points to the first
12568 * byte past all such white space and comments */
12569
12570 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12571
12572 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12573
12574 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12575
12576 for (;;) {
12577 if (RExC_end - (*p) >= 3
12578 && *(*p) == '('
12579 && *(*p + 1) == '?'
12580 && *(*p + 2) == '#')
12581 {
12582 while (*(*p) != ')') {
12583 if ((*p) == RExC_end)
12584 FAIL("Sequence (?#... not terminated");
12585 (*p)++;
12586 }
12587 (*p)++;
12588 continue;
12589 }
12590
12591 if (use_xmod) {
12592 const char * save_p = *p;
12593 while ((*p) < RExC_end) {
12594 STRLEN len;
12595 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12596 (*p) += len;
12597 }
12598 else if (*(*p) == '#') {
12599 (*p) = reg_skipcomment(pRExC_state, (*p));
12600 }
12601 else {
12602 break;
12603 }
12604 }
12605 if (*p != save_p) {
12606 continue;
12607 }
12608 }
12609
12610 break;
12611 }
12612
12613 return;
12614}
12615
12616/* nextchar()
12617
12618 Advances the parse position by one byte, unless that byte is the beginning
12619 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
12620 those two cases, the parse position is advanced beyond all such comments and
12621 white space.
12622
12623 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12624*/
12625
12626STATIC void
12627S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12628{
12629 PERL_ARGS_ASSERT_NEXTCHAR;
12630
12631 if (RExC_parse < RExC_end) {
12632 assert( ! UTF
12633 || UTF8_IS_INVARIANT(*RExC_parse)
12634 || UTF8_IS_START(*RExC_parse));
12635
12636 RExC_parse_inc_safe();
12637
12638 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12639 FALSE /* Don't force /x */ );
12640 }
12641}
12642
12643STATIC void
12644S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12645{
12646 /* 'size' is the delta number of smallest regnode equivalents to add or
12647 * subtract from the current memory allocated to the regex engine being
12648 * constructed. */
12649
12650 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12651
12652 RExC_size += size;
12653
12654 Renewc(RExC_rxi,
12655 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12656 /* +1 for REG_MAGIC */
12657 char,
12658 regexp_internal);
12659 if ( RExC_rxi == NULL )
12660 FAIL("Regexp out of space");
12661 RXi_SET(RExC_rx, RExC_rxi);
12662
12663 RExC_emit_start = RExC_rxi->program;
12664 if (size > 0) {
12665 Zero(REGNODE_p(RExC_emit), size, regnode);
12666 }
12667}
12668
12669STATIC regnode_offset
12670S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12671{
12672 /* Allocate a regnode that is (1 + extra_size) times as big as the
12673 * smallest regnode worth of space, and also aligns and increments
12674 * RExC_size appropriately.
12675 *
12676 * It returns the regnode's offset into the regex engine program */
12677
12678 const regnode_offset ret = RExC_emit;
12679
12680 PERL_ARGS_ASSERT_REGNODE_GUTS;
12681
12682 SIZE_ALIGN(RExC_size);
12683 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12684 NODE_ALIGN_FILL(REGNODE_p(ret));
12685 return(ret);
12686}
12687
12688#ifdef DEBUGGING
12689
12690STATIC regnode_offset
12691S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12692 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12693 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12694 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12695}
12696
12697#endif
12698
12699
12700
12701/*
12702- reg_node - emit a node
12703*/
12704STATIC regnode_offset /* Location. */
12705S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12706{
12707 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12708 regnode_offset ptr = ret;
12709
12710 PERL_ARGS_ASSERT_REG_NODE;
12711
12712 assert(REGNODE_ARG_LEN(op) == 0);
12713
12714 FILL_ADVANCE_NODE(ptr, op);
12715 RExC_emit = ptr;
12716 return(ret);
12717}
12718
12719/*
12720- reg1node - emit a node with an argument
12721*/
12722STATIC regnode_offset /* Location. */
12723S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12724{
12725 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12726 regnode_offset ptr = ret;
12727
12728 PERL_ARGS_ASSERT_REG1NODE;
12729
12730 /* ANYOF are special cased to allow non-length 1 args */
12731 assert(REGNODE_ARG_LEN(op) == 1);
12732
12733 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12734 RExC_emit = ptr;
12735 return(ret);
12736}
12737
12738/*
12739- regpnode - emit a temporary node with a SV* argument
12740*/
12741STATIC regnode_offset /* Location. */
12742S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12743{
12744 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12745 regnode_offset ptr = ret;
12746
12747 PERL_ARGS_ASSERT_REGPNODE;
12748
12749 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12750 RExC_emit = ptr;
12751 return(ret);
12752}
12753
12754STATIC regnode_offset
12755S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12756{
12757 /* emit a node with U32 and I32 arguments */
12758
12759 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12760 regnode_offset ptr = ret;
12761
12762 PERL_ARGS_ASSERT_REG2NODE;
12763
12764 assert(REGNODE_ARG_LEN(op) == 2);
12765
12766 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12767 RExC_emit = ptr;
12768 return(ret);
12769}
12770
12771/*
12772- reginsert - insert an operator in front of already-emitted operand
12773*
12774* That means that on exit 'operand' is the offset of the newly inserted
12775* operator, and the original operand has been relocated.
12776*
12777* IMPORTANT NOTE - it is the *callers* responsibility to correctly
12778* set up NEXT_OFF() of the inserted node if needed. Something like this:
12779*
12780* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12781* NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12782*
12783* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12784*/
12785STATIC void
12786S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12787 const regnode_offset operand, const U32 depth)
12788{
12789 regnode *src;
12790 regnode *dst;
12791 regnode *place;
12792 const int offset = REGNODE_ARG_LEN((U8)op);
12793 const int size = NODE_STEP_REGNODE + offset;
12794 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12795
12796 PERL_ARGS_ASSERT_REGINSERT;
12797 PERL_UNUSED_CONTEXT;
12798 PERL_UNUSED_ARG(depth);
12799 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12800 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12801 studying. If this is wrong then we need to adjust RExC_recurse
12802 below like we do with RExC_open_parens/RExC_close_parens. */
12803 change_engine_size(pRExC_state, (Ptrdiff_t) size);
12804 src = REGNODE_p(RExC_emit);
12805 RExC_emit += size;
12806 dst = REGNODE_p(RExC_emit);
12807
12808 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12809 * and [perl #133871] shows this can lead to problems, so skip this
12810 * realignment of parens until a later pass when they are reliable */
12811 if (! IN_PARENS_PASS && RExC_open_parens) {
12812 int paren;
12813 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12814 /* remember that RExC_npar is rex->nparens + 1,
12815 * iow it is 1 more than the number of parens seen in
12816 * the pattern so far. */
12817 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12818 /* note, RExC_open_parens[0] is the start of the
12819 * regex, it can't move. RExC_close_parens[0] is the end
12820 * of the regex, it *can* move. */
12821 if ( paren && RExC_open_parens[paren] >= operand ) {
12822 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12823 RExC_open_parens[paren] += size;
12824 } else {
12825 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12826 }
12827 if ( RExC_close_parens[paren] >= operand ) {
12828 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12829 RExC_close_parens[paren] += size;
12830 } else {
12831 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12832 }
12833 }
12834 }
12835 if (RExC_end_op)
12836 RExC_end_op += size;
12837
12838 while (src > REGNODE_p(operand)) {
12839 StructCopy(--src, --dst, regnode);
12840 }
12841
12842 place = REGNODE_p(operand); /* Op node, where operand used to be. */
12843 src = place + 1; /* NOT REGNODE_AFTER! */
12844 FLAGS(place) = 0;
12845 FILL_NODE(operand, op);
12846
12847 /* Zero out any arguments in the new node */
12848 Zero(src, offset, regnode);
12849}
12850
12851/*
12852- regtail - set the next-pointer at the end of a node chain of p to val. If
12853 that value won't fit in the space available, instead returns FALSE.
12854 (Except asserts if we can't fit in the largest space the regex
12855 engine is designed for.)
12856- SEE ALSO: regtail_study
12857*/
12858STATIC bool
12859S_regtail(pTHX_ RExC_state_t * pRExC_state,
12860 const regnode_offset p,
12861 const regnode_offset val,
12862 const U32 depth)
12863{
12864 regnode_offset scan;
12865 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12866
12867 PERL_ARGS_ASSERT_REGTAIL;
12868#ifndef DEBUGGING
12869 PERL_UNUSED_ARG(depth);
12870#endif
12871
12872 /* The final node in the chain is the first one with a nonzero next pointer
12873 * */
12874 scan = (regnode_offset) p;
12875 for (;;) {
12876 regnode * const temp = regnext(REGNODE_p(scan));
12877 DEBUG_PARSE_r({
12878 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12879 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12880 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
12881 SvPV_nolen_const(RExC_mysv), scan,
12882 (temp == NULL ? "->" : ""),
12883 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12884 );
12885 });
12886 if (temp == NULL)
12887 break;
12888 scan = REGNODE_OFFSET(temp);
12889 }
12890
12891 /* Populate this node's next pointer */
12892 assert(val >= scan);
12893 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12894 assert((UV) (val - scan) <= U32_MAX);
12895 ARG1u_SET(REGNODE_p(scan), val - scan);
12896 }
12897 else {
12898 if (val - scan > U16_MAX) {
12899 /* Populate this with something that won't loop and will likely
12900 * lead to a crash if the caller ignores the failure return, and
12901 * execution continues */
12902 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12903 return FALSE;
12904 }
12905 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12906 }
12907
12908 return TRUE;
12909}
12910
12911#ifdef DEBUGGING
12912/*
12913- regtail_study - set the next-pointer at the end of a node chain of p to val.
12914- Look for optimizable sequences at the same time.
12915- currently only looks for EXACT chains.
12916
12917This is experimental code. The idea is to use this routine to perform
12918in place optimizations on branches and groups as they are constructed,
12919with the long term intention of removing optimization from study_chunk so
12920that it is purely analytical.
12921
12922Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12923to control which is which.
12924
12925This used to return a value that was ignored. It was a problem that it is
12926#ifdef'd to be another function that didn't return a value. khw has changed it
12927so both currently return a pass/fail return.
12928
12929*/
12930/* TODO: All four parms should be const */
12931
12932STATIC bool
12933S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
12934 const regnode_offset val, U32 depth)
12935{
12936 regnode_offset scan;
12937 U8 exact = PSEUDO;
12938#ifdef EXPERIMENTAL_INPLACESCAN
12939 I32 min = 0;
12940#endif
12941 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12942
12943 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12944
12945
12946 /* Find last node. */
12947
12948 scan = p;
12949 for (;;) {
12950 regnode * const temp = regnext(REGNODE_p(scan));
12951#ifdef EXPERIMENTAL_INPLACESCAN
12952 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12953 bool unfolded_multi_char; /* Unexamined in this routine */
12954 if (join_exact(pRExC_state, scan, &min,
12955 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
12956 return TRUE; /* Was return EXACT */
12957 }
12958#endif
12959 if ( exact ) {
12960 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12961 if (exact == PSEUDO )
12962 exact= OP(REGNODE_p(scan));
12963 else if (exact != OP(REGNODE_p(scan)) )
12964 exact= 0;
12965 }
12966 else if (OP(REGNODE_p(scan)) != NOTHING) {
12967 exact= 0;
12968 }
12969 }
12970 DEBUG_PARSE_r({
12971 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12972 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12973 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
12974 SvPV_nolen_const(RExC_mysv),
12975 scan,
12976 REGNODE_NAME(exact));
12977 });
12978 if (temp == NULL)
12979 break;
12980 scan = REGNODE_OFFSET(temp);
12981 }
12982 DEBUG_PARSE_r({
12983 DEBUG_PARSE_MSG("");
12984 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
12985 Perl_re_printf( aTHX_
12986 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
12987 SvPV_nolen_const(RExC_mysv),
12988 (IV)val,
12989 (IV)(val - scan)
12990 );
12991 });
12992 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12993 assert((UV) (val - scan) <= U32_MAX);
12994 ARG1u_SET(REGNODE_p(scan), val - scan);
12995 }
12996 else {
12997 if (val - scan > U16_MAX) {
12998 /* Populate this with something that won't loop and will likely
12999 * lead to a crash if the caller ignores the failure return, and
13000 * execution continues */
13001 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
13002 return FALSE;
13003 }
13004 NEXT_OFF(REGNODE_p(scan)) = val - scan;
13005 }
13006
13007 return TRUE; /* Was 'return exact' */
13008}
13009#endif
13010
13011
13012#ifdef PERL_RE_BUILD_AUX
13013SV*
13014Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13015
13016 /* Returns an inversion list of all the code points matched by the
13017 * ANYOFM/NANYOFM node 'n' */
13018
13019 SV * cp_list = _new_invlist(-1);
13020 const U8 lowest = (U8) ARG1u(n);
13021 unsigned int i;
13022 U8 count = 0;
13023 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13024
13025 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13026
13027 /* Starting with the lowest code point, any code point that ANDed with the
13028 * mask yields the lowest code point is in the set */
13029 for (i = lowest; i <= 0xFF; i++) {
13030 if ((i & FLAGS(n)) == ARG1u(n)) {
13031 cp_list = add_cp_to_invlist(cp_list, i);
13032 count++;
13033
13034 /* We know how many code points (a power of two) that are in the
13035 * set. No use looking once we've got that number */
13036 if (count >= needed) break;
13037 }
13038 }
13039
13040 if (OP(n) == NANYOFM) {
13041 _invlist_invert(cp_list);
13042 }
13043 return cp_list;
13044}
13045
13046SV *
13047Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13048 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13049
13050 SV * cp_list = NULL;
13051 populate_invlist_from_bitmap(
13052 ((struct regnode_bbm *) n)->bitmap,
13053 REGNODE_BBM_BITMAP_LEN * CHARBITS,
13054 &cp_list,
13055
13056 /* The base cp is from the start byte plus a zero continuation */
13057 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13058 UTF_CONTINUATION_MARK | 0));
13059 return cp_list;
13060}
13061#endif /* PERL_RE_BUILD_AUX */
13062
13063
13064SV *
13065Perl_re_intuit_string(pTHX_ REGEXP * const r)
13066{ /* Assume that RE_INTUIT is set */
13067 /* Returns an SV containing a string that must appear in the target for it
13068 * to match, or NULL if nothing is known that must match.
13069 *
13070 * CAUTION: the SV can be freed during execution of the regex engine */
13071
13072 struct regexp *const prog = ReANY(r);
13073 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13074
13075 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13076 PERL_UNUSED_CONTEXT;
13077
13078 DEBUG_COMPILE_r(
13079 {
13080 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13081 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13082 ? prog->check_utf8 : prog->check_substr);
13083
13084 if (!PL_colorset) reginitcolors();
13085 Perl_re_printf( aTHX_
13086 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13087 PL_colors[4],
13088 RX_UTF8(r) ? "utf8 " : "",
13089 PL_colors[5], PL_colors[0],
13090 s,
13091 PL_colors[1],
13092 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13093 }
13094 } );
13095
13096 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13097 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13098}
13099
13100/*
13101 pregfree()
13102
13103 handles refcounting and freeing the perl core regexp structure. When
13104 it is necessary to actually free the structure the first thing it
13105 does is call the 'free' method of the regexp_engine associated to
13106 the regexp, allowing the handling of the void *pprivate; member
13107 first. (This routine is not overridable by extensions, which is why
13108 the extensions free is called first.)
13109
13110 See regdupe and regdupe_internal if you change anything here.
13111*/
13112#ifndef PERL_IN_XSUB_RE
13113void
13114Perl_pregfree(pTHX_ REGEXP *r)
13115{
13116 SvREFCNT_dec(r);
13117}
13118
13119void
13120Perl_pregfree2(pTHX_ REGEXP *rx)
13121{
13122 struct regexp *const r = ReANY(rx);
13123 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13124
13125 PERL_ARGS_ASSERT_PREGFREE2;
13126
13127 if (! r)
13128 return;
13129
13130 if (r->mother_re) {
13131 ReREFCNT_dec(r->mother_re);
13132 } else {
13133 CALLREGFREE_PVT(rx); /* free the private data */
13134 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13135 }
13136 if (r->substrs) {
13137 int i;
13138 for (i = 0; i < 2; i++) {
13139 SvREFCNT_dec(r->substrs->data[i].substr);
13140 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13141 }
13142 Safefree(r->substrs);
13143 }
13144 RX_MATCH_COPY_FREE(rx);
13145#ifdef PERL_ANY_COW
13146 SvREFCNT_dec(r->saved_copy);
13147#endif
13148 Safefree(RXp_OFFSp(r));
13149 if (r->logical_to_parno) {
13150 Safefree(r->logical_to_parno);
13151 Safefree(r->parno_to_logical);
13152 Safefree(r->parno_to_logical_next);
13153 }
13154
13155 SvREFCNT_dec(r->qr_anoncv);
13156 if (r->recurse_locinput)
13157 Safefree(r->recurse_locinput);
13158}
13159
13160
13161/* reg_temp_copy()
13162
13163 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13164 except that dsv will be created if NULL.
13165
13166 This function is used in two main ways. First to implement
13167 $r = qr/....; $s = $$r;
13168
13169 Secondly, it is used as a hacky workaround to the structural issue of
13170 match results
13171 being stored in the regexp structure which is in turn stored in
13172 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13173 could be PL_curpm in multiple contexts, and could require multiple
13174 result sets being associated with the pattern simultaneously, such
13175 as when doing a recursive match with (??{$qr})
13176
13177 The solution is to make a lightweight copy of the regexp structure
13178 when a qr// is returned from the code executed by (??{$qr}) this
13179 lightweight copy doesn't actually own any of its data except for
13180 the starp/end and the actual regexp structure itself.
13181
13182*/
13183
13184
13185REGEXP *
13186Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13187{
13188 struct regexp *drx;
13189 struct regexp *const srx = ReANY(ssv);
13190 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13191
13192 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13193
13194 if (!dsv)
13195 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13196 else {
13197 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13198
13199 /* our only valid caller, sv_setsv_flags(), should have done
13200 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13201 assert(!SvOOK(dsv));
13202 assert(!SvIsCOW(dsv));
13203 assert(!SvROK(dsv));
13204
13205 if (SvPVX_const(dsv)) {
13206 if (SvLEN(dsv))
13207 Safefree(SvPVX(dsv));
13208 SvPVX(dsv) = NULL;
13209 }
13210 SvLEN_set(dsv, 0);
13211 SvCUR_set(dsv, 0);
13212 SvOK_off((SV *)dsv);
13213
13214 if (islv) {
13215 /* For PVLVs, the head (sv_any) points to an XPVLV, while
13216 * the LV's xpvlenu_rx will point to a regexp body, which
13217 * we allocate here */
13218 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13219 assert(!SvPVX(dsv));
13220 /* We "steal" the body from the newly allocated SV temp, changing
13221 * the pointer in its HEAD to NULL. We then change its type to
13222 * SVt_NULL so that when we immediately release its only reference,
13223 * no memory deallocation happens.
13224 *
13225 * The body will eventually be freed (from the PVLV) either in
13226 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13227 * the regexp body needs to be removed)
13228 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13229 * the PVLV itself is deallocated). */
13230 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13231 temp->sv_any = NULL;
13232 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13233 SvREFCNT_dec_NN(temp);
13234 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13235 ing below will not set it. */
13236 SvCUR_set(dsv, SvCUR(ssv));
13237 }
13238 }
13239 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13240 sv_force_normal(sv) is called. */
13241 SvFAKE_on(dsv);
13242 drx = ReANY(dsv);
13243
13244 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13245 SvPV_set(dsv, RX_WRAPPED(ssv));
13246 /* We share the same string buffer as the original regexp, on which we
13247 hold a reference count, incremented when mother_re is set below.
13248 The string pointer is copied here, being part of the regexp struct.
13249 */
13250 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13251 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13252
13253 if (!islv)
13254 SvLEN_set(dsv, 0);
13255 if (RXp_OFFSp(srx)) {
13256 const I32 npar = srx->nparens+1;
13257 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13258 }
13259 if (srx->substrs) {
13260 int i;
13261 Newx(drx->substrs, 1, struct reg_substr_data);
13262 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13263
13264 for (i = 0; i < 2; i++) {
13265 SvREFCNT_inc_void(drx->substrs->data[i].substr);
13266 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13267 }
13268
13269 /* check_substr and check_utf8, if non-NULL, point to either their
13270 anchored or float namesakes, and don't hold a second reference. */
13271 }
13272 if (srx->logical_to_parno) {
13273 NewCopy(srx->logical_to_parno,
13274 drx->logical_to_parno,
13275 srx->nparens+1, I32);
13276 NewCopy(srx->parno_to_logical,
13277 drx->parno_to_logical,
13278 srx->nparens+1, I32);
13279 NewCopy(srx->parno_to_logical_next,
13280 drx->parno_to_logical_next,
13281 srx->nparens+1, I32);
13282 } else {
13283 drx->logical_to_parno = NULL;
13284 drx->parno_to_logical = NULL;
13285 drx->parno_to_logical_next = NULL;
13286 }
13287 drx->logical_nparens = srx->logical_nparens;
13288
13289 RX_MATCH_COPIED_off(dsv);
13290#ifdef PERL_ANY_COW
13291 RXp_SAVED_COPY(drx) = NULL;
13292#endif
13293 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13294 SvREFCNT_inc_void(drx->qr_anoncv);
13295 if (srx->recurse_locinput)
13296 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13297
13298 return dsv;
13299}
13300#endif
13301
13302
13303/* regfree_internal()
13304
13305 Free the private data in a regexp. This is overloadable by
13306 extensions. Perl takes care of the regexp structure in pregfree(),
13307 this covers the *pprivate pointer which technically perl doesn't
13308 know about, however of course we have to handle the
13309 regexp_internal structure when no extension is in use.
13310
13311 Note this is called before freeing anything in the regexp
13312 structure.
13313 */
13314
13315void
13316Perl_regfree_internal(pTHX_ REGEXP * const rx)
13317{
13318 struct regexp *const r = ReANY(rx);
13319 RXi_GET_DECL(r, ri);
13320 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13321
13322 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13323
13324 if (! ri) {
13325 return;
13326 }
13327
13328 DEBUG_COMPILE_r({
13329 if (!PL_colorset)
13330 reginitcolors();
13331 {
13332 SV *dsv= sv_newmortal();
13333 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13334 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13335 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13336 PL_colors[4], PL_colors[5], s);
13337 }
13338 });
13339
13340 if (ri->code_blocks)
13341 S_free_codeblocks(aTHX_ ri->code_blocks);
13342
13343 if (ri->data) {
13344 int n = ri->data->count;
13345
13346 while (--n >= 0) {
13347 /* If you add a ->what type here, update the comment in regcomp.h */
13348 switch (ri->data->what[n]) {
13349 case 'a':
13350 case 'r':
13351 case 's':
13352 case 'S':
13353 case 'u':
13354 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13355 break;
13356 case 'f':
13357 Safefree(ri->data->data[n]);
13358 break;
13359 case 'l':
13360 case 'L':
13361 break;
13362 case 'T':
13363 { /* Aho Corasick add-on structure for a trie node.
13364 Used in stclass optimization only */
13365 U32 refcount;
13366 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13367 OP_REFCNT_LOCK;
13368 refcount = --aho->refcount;
13369 OP_REFCNT_UNLOCK;
13370 if ( !refcount ) {
13371 PerlMemShared_free(aho->states);
13372 PerlMemShared_free(aho->fail);
13373 /* do this last!!!! */
13374 PerlMemShared_free(ri->data->data[n]);
13375 /* we should only ever get called once, so
13376 * assert as much, and also guard the free
13377 * which /might/ happen twice. At the least
13378 * it will make code anlyzers happy and it
13379 * doesn't cost much. - Yves */
13380 assert(ri->regstclass);
13381 if (ri->regstclass) {
13382 PerlMemShared_free(ri->regstclass);
13383 ri->regstclass = 0;
13384 }
13385 }
13386 }
13387 break;
13388 case 't':
13389 {
13390 /* trie structure. */
13391 U32 refcount;
13392 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13393 OP_REFCNT_LOCK;
13394 refcount = --trie->refcount;
13395 OP_REFCNT_UNLOCK;
13396 if ( !refcount ) {
13397 PerlMemShared_free(trie->charmap);
13398 PerlMemShared_free(trie->states);
13399 PerlMemShared_free(trie->trans);
13400 if (trie->bitmap)
13401 PerlMemShared_free(trie->bitmap);
13402 if (trie->jump)
13403 PerlMemShared_free(trie->jump);
13404 if (trie->j_before_paren)
13405 PerlMemShared_free(trie->j_before_paren);
13406 if (trie->j_after_paren)
13407 PerlMemShared_free(trie->j_after_paren);
13408 PerlMemShared_free(trie->wordinfo);
13409 /* do this last!!!! */
13410 PerlMemShared_free(ri->data->data[n]);
13411 }
13412 }
13413 break;
13414 case '%':
13415 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13416 * always returns non-zero, this should only ever happen in the
13417 * 0 index */
13418 assert(n==0);
13419 break;
13420 default:
13421 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13422 ri->data->what[n]);
13423 }
13424 }
13425 Safefree(ri->data->what);
13426 Safefree(ri->data);
13427 }
13428
13429 Safefree(ri);
13430}
13431
13432#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
13433
13434/*
13435=for apidoc re_dup_guts
13436Duplicate a regexp.
13437
13438This routine is expected to clone a given regexp structure. It is only
13439compiled under USE_ITHREADS.
13440
13441After all of the core data stored in struct regexp is duplicated
13442the C<regexp_engine.dupe> method is used to copy any private data
13443stored in the *pprivate pointer. This allows extensions to handle
13444any duplication they need to do.
13445
13446=cut
13447
13448 See pregfree() and regfree_internal() if you change anything here.
13449*/
13450#if defined(USE_ITHREADS)
13451#ifndef PERL_IN_XSUB_RE
13452void
13453Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13454{
13455 I32 npar;
13456 const struct regexp *r = ReANY(sstr);
13457 struct regexp *ret = ReANY(dstr);
13458
13459 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13460
13461 npar = r->nparens+1;
13462 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13463
13464 if (ret->substrs) {
13465 /* Do it this way to avoid reading from *r after the StructCopy().
13466 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13467 cache, it doesn't matter. */
13468 int i;
13469 const bool anchored = r->check_substr
13470 ? r->check_substr == r->substrs->data[0].substr
13471 : r->check_utf8 == r->substrs->data[0].utf8_substr;
13472 Newx(ret->substrs, 1, struct reg_substr_data);
13473 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13474
13475 for (i = 0; i < 2; i++) {
13476 ret->substrs->data[i].substr =
13477 sv_dup_inc(ret->substrs->data[i].substr, param);
13478 ret->substrs->data[i].utf8_substr =
13479 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13480 }
13481
13482 /* check_substr and check_utf8, if non-NULL, point to either their
13483 anchored or float namesakes, and don't hold a second reference. */
13484
13485 if (ret->check_substr) {
13486 if (anchored) {
13487 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13488
13489 ret->check_substr = ret->substrs->data[0].substr;
13490 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13491 } else {
13492 assert(r->check_substr == r->substrs->data[1].substr);
13493 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
13494
13495 ret->check_substr = ret->substrs->data[1].substr;
13496 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13497 }
13498 } else if (ret->check_utf8) {
13499 if (anchored) {
13500 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13501 } else {
13502 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13503 }
13504 }
13505 }
13506
13507 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13508 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13509 if (r->recurse_locinput)
13510 Newx(ret->recurse_locinput, r->nparens + 1, char *);
13511
13512 if (ret->pprivate)
13513 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13514
13515 if (RX_MATCH_COPIED(dstr))
13516 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13517 else
13518 RXp_SUBBEG(ret) = NULL;
13519#ifdef PERL_ANY_COW
13520 RXp_SAVED_COPY(ret) = NULL;
13521#endif
13522
13523 if (r->logical_to_parno) {
13524 /* we use total_parens for all three just for symmetry */
13525 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13526 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13527 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13528 } else {
13529 ret->logical_to_parno = NULL;
13530 ret->parno_to_logical = NULL;
13531 ret->parno_to_logical_next = NULL;
13532 }
13533
13534 ret->logical_nparens = r->logical_nparens;
13535
13536 /* Whether mother_re be set or no, we need to copy the string. We
13537 cannot refrain from copying it when the storage points directly to
13538 our mother regexp, because that's
13539 1: a buffer in a different thread
13540 2: something we no longer hold a reference on
13541 so we need to copy it locally. */
13542 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13543 /* set malloced length to a non-zero value so it will be freed
13544 * (otherwise in combination with SVf_FAKE it looks like an alien
13545 * buffer). It doesn't have to be the actual malloced size, since it
13546 * should never be grown */
13547 SvLEN_set(dstr, SvCUR(sstr)+1);
13548 ret->mother_re = NULL;
13549}
13550#endif /* PERL_IN_XSUB_RE */
13551
13552/*
13553 regdupe_internal()
13554
13555 This is the internal complement to regdupe() which is used to copy
13556 the structure pointed to by the *pprivate pointer in the regexp.
13557 This is the core version of the extension overridable cloning hook.
13558 The regexp structure being duplicated will be copied by perl prior
13559 to this and will be provided as the regexp *r argument, however
13560 with the /old/ structures pprivate pointer value. Thus this routine
13561 may override any copying normally done by perl.
13562
13563 It returns a pointer to the new regexp_internal structure.
13564*/
13565
13566void *
13567Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13568{
13569 struct regexp *const r = ReANY(rx);
13570 regexp_internal *reti;
13571 int len;
13572 RXi_GET_DECL(r, ri);
13573
13574 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13575
13576 len = ProgLen(ri);
13577
13578 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13579 char, regexp_internal);
13580 Copy(ri->program, reti->program, len+1, regnode);
13581
13582
13583 if (ri->code_blocks) {
13584 int n;
13585 Newx(reti->code_blocks, 1, struct reg_code_blocks);
13586 Newx(reti->code_blocks->cb, ri->code_blocks->count,
13587 struct reg_code_block);
13588 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13589 ri->code_blocks->count, struct reg_code_block);
13590 for (n = 0; n < ri->code_blocks->count; n++)
13591 reti->code_blocks->cb[n].src_regex = (REGEXP*)
13592 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13593 reti->code_blocks->count = ri->code_blocks->count;
13594 reti->code_blocks->refcnt = 1;
13595 }
13596 else
13597 reti->code_blocks = NULL;
13598
13599 reti->regstclass = NULL;
13600
13601 if (ri->data) {
13602 struct reg_data *d;
13603 const int count = ri->data->count;
13604 int i;
13605
13606 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13607 char, struct reg_data);
13608 Newx(d->what, count, U8);
13609
13610 d->count = count;
13611 for (i = 0; i < count; i++) {
13612 d->what[i] = ri->data->what[i];
13613 switch (d->what[i]) {
13614 /* see also regcomp.h and regfree_internal() */
13615 case 'a': /* actually an AV, but the dup function is identical.
13616 values seem to be "plain sv's" generally. */
13617 case 'r': /* a compiled regex (but still just another SV) */
13618 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13619 this use case should go away, the code could have used
13620 'a' instead - see S_set_ANYOF_arg() for array contents. */
13621 case 'S': /* actually an SV, but the dup function is identical. */
13622 case 'u': /* actually an HV, but the dup function is identical.
13623 values are "plain sv's" */
13624 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13625 break;
13626 case 'f':
13627 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13628 * patterns which could start with several different things. Pre-TRIE
13629 * this was more important than it is now, however this still helps
13630 * in some places, for instance /x?a+/ might produce a SSC equivalent
13631 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13632 * in regexec.c
13633 */
13634 /* This is cheating. */
13635 Newx(d->data[i], 1, regnode_ssc);
13636 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13637 reti->regstclass = (regnode*)d->data[i];
13638 break;
13639 case 'T':
13640 /* AHO-CORASICK fail table */
13641 /* Trie stclasses are readonly and can thus be shared
13642 * without duplication. We free the stclass in pregfree
13643 * when the corresponding reg_ac_data struct is freed.
13644 */
13645 reti->regstclass= ri->regstclass;
13646 /* FALLTHROUGH */
13647 case 't':
13648 /* TRIE transition table */
13649 OP_REFCNT_LOCK;
13650 ((reg_trie_data*)ri->data->data[i])->refcount++;
13651 OP_REFCNT_UNLOCK;
13652 /* FALLTHROUGH */
13653 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13654 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13655 is not from another regexp */
13656 d->data[i] = ri->data->data[i];
13657 break;
13658 case '%':
13659 /* this is a placeholder type, it exists purely so that
13660 * reg_add_data always returns a non-zero value, this type of
13661 * entry should ONLY be present in the 0 slot of the array */
13662 assert(i == 0);
13663 d->data[i]= ri->data->data[i];
13664 break;
13665 default:
13666 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13667 ri->data->what[i]);
13668 }
13669 }
13670
13671 reti->data = d;
13672 }
13673 else
13674 reti->data = NULL;
13675
13676 if (ri->regstclass && !reti->regstclass) {
13677 /* Assume that the regstclass is a regnode which is inside of the
13678 * program which we have to copy over */
13679 regnode *node= ri->regstclass;
13680 assert(node >= ri->program && (node - ri->program) < len);
13681 reti->regstclass = reti->program + (node - ri->program);
13682 }
13683
13684
13685 reti->name_list_idx = ri->name_list_idx;
13686
13687 SetProgLen(reti, len);
13688
13689 return (void*)reti;
13690}
13691
13692#endif /* USE_ITHREADS */
13693
13694STATIC void
13695S_re_croak(pTHX_ bool utf8, const char* pat,...)
13696{
13697 va_list args;
13698 STRLEN len = strlen(pat);
13699 char buf[512];
13700 SV *msv;
13701 const char *message;
13702
13703 PERL_ARGS_ASSERT_RE_CROAK;
13704
13705 if (len > 510)
13706 len = 510;
13707 Copy(pat, buf, len , char);
13708 buf[len] = '\n';
13709 buf[len + 1] = '\0';
13710 va_start(args, pat);
13711 msv = vmess(buf, &args);
13712 va_end(args);
13713 message = SvPV_const(msv, len);
13714 if (len > 512)
13715 len = 512;
13716 Copy(message, buf, len , char);
13717 /* len-1 to avoid \n */
13718 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13719}
13720
13721/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13722
13723#ifndef PERL_IN_XSUB_RE
13724void
13725Perl_save_re_context(pTHX)
13726{
13727 I32 nparens = -1;
13728 I32 i;
13729
13730 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13731
13732 if (PL_curpm) {
13733 const REGEXP * const rx = PM_GETRE(PL_curpm);
13734 if (rx)
13735 nparens = RX_NPARENS(rx);
13736 }
13737
13738 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13739 * that PL_curpm will be null, but that utf8.pm and the modules it
13740 * loads will only use $1..$3.
13741 * The t/porting/re_context.t test file checks this assumption.
13742 */
13743 if (nparens == -1)
13744 nparens = 3;
13745
13746 for (i = 1; i <= nparens; i++) {
13747 char digits[TYPE_CHARS(long)];
13748 const STRLEN len = my_snprintf(digits, sizeof(digits),
13749 "%lu", (long)i);
13750 GV *const *const gvp
13751 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13752
13753 if (gvp) {
13754 GV * const gv = *gvp;
13755 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13756 save_scalar(gv);
13757 }
13758 }
13759}
13760#endif
13761
13762#ifndef PERL_IN_XSUB_RE
13763
13764# include "uni_keywords.h"
13765
13766void
13767Perl_init_uniprops(pTHX)
13768{
13769
13770# ifdef DEBUGGING
13771 char * dump_len_string;
13772
13773 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13774 if ( ! dump_len_string
13775 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13776 {
13777 PL_dump_re_max_len = 60; /* A reasonable default */
13778 }
13779# endif
13780
13781 PL_user_def_props = newHV();
13782
13783# ifdef USE_ITHREADS
13784
13785 HvSHAREKEYS_off(PL_user_def_props);
13786 PL_user_def_props_aTHX = aTHX;
13787
13788# endif
13789
13790 /* Set up the inversion list interpreter-level variables */
13791
13792 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13793 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13794 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13795 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13796 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13797 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13798 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13799 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13800 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13801 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13802 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13803 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13804 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13805 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13806 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13807 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13808
13809 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13810 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13811 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13812 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13813 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13814 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13815 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13816 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13817 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13818 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13819 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13820 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13821 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13822 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13823 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13824 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13825
13826 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13827 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13828 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13829 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13830 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13831
13832 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13833 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13834 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13835 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13836
13837 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13838
13839 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13840 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13841
13842 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13843 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13844
13845 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13846 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13847 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13848 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13849 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13850 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13851 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13852 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13853 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13854 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13855 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13856 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13857 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13858 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13859
13860# ifdef UNI_XIDC
13861 /* The below are used only by deprecated functions. They could be removed */
13862 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13863 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13864 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13865# endif
13866}
13867
13868/* These four functions are compiled only in regcomp.c, where they have access
13869 * to the data they return. They are a way for re_comp.c to get access to that
13870 * data without having to compile the whole data structures. */
13871
13872I16
13873Perl_do_uniprop_match(const char * const key, const U16 key_len)
13874{
13875 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13876
13877 return match_uniprop((U8 *) key, key_len);
13878}
13879
13880SV *
13881Perl_get_prop_definition(pTHX_ const int table_index)
13882{
13883 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13884
13885 /* Create and return the inversion list */
13886 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13887}
13888
13889const char * const *
13890Perl_get_prop_values(const int table_index)
13891{
13892 PERL_ARGS_ASSERT_GET_PROP_VALUES;
13893
13894 return UNI_prop_value_ptrs[table_index];
13895}
13896
13897const char *
13898Perl_get_deprecated_property_msg(const Size_t warning_offset)
13899{
13900 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13901
13902 return deprecated_property_msgs[warning_offset];
13903}
13904
13905# if 0
13906
13907This code was mainly added for backcompat to give a warning for non-portable
13908code points in user-defined properties. But experiments showed that the
13909warning in earlier perls were only omitted on overflow, which should be an
13910error, so there really isnt a backcompat issue, and actually adding the
13911warning when none was present before might cause breakage, for little gain. So
13912khw left this code in, but not enabled. Tests were never added.
13913
13914embed.fnc entry:
13915Ei |const char *|get_extended_utf8_msg|const UV cp
13916
13917PERL_STATIC_INLINE const char *
13918S_get_extended_utf8_msg(pTHX_ const UV cp)
13919{
13920 U8 dummy[UTF8_MAXBYTES + 1];
13921 HV *msgs;
13922 SV **msg;
13923
13924 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
13925 &msgs);
13926
13927 msg = hv_fetchs(msgs, "text", 0);
13928 assert(msg);
13929
13930 (void) sv_2mortal((SV *) msgs);
13931
13932 return SvPVX(*msg);
13933}
13934
13935# endif
13936#endif /* end of ! PERL_IN_XSUB_RE */
13937
13938STATIC REGEXP *
13939S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
13940 const bool ignore_case)
13941{
13942 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
13943 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
13944 * because nothing outside of ASCII will match. Use /m because the input
13945 * string may be a bunch of lines strung together.
13946 *
13947 * Also sets up the debugging info */
13948
13949 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
13950 U32 rx_flags;
13951 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
13952 REGEXP * subpattern_re;
13953 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13954
13955 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
13956
13957 if (ignore_case) {
13958 flags |= PMf_FOLD;
13959 }
13960 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
13961
13962 /* Like in op.c, we copy the compile time pm flags to the rx ones */
13963 rx_flags = flags & RXf_PMf_COMPILETIME;
13964
13965#ifndef PERL_IN_XSUB_RE
13966 /* Use the core engine if this file is regcomp.c. That means no
13967 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
13968 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13969 &PL_core_reg_engine,
13970 NULL, NULL,
13971 rx_flags, flags);
13972#else
13973 if (isDEBUG_WILDCARD) {
13974 /* Use the special debugging engine if this file is re_comp.c and wants
13975 * to output the wildcard matching. This uses whatever
13976 * 'use re "Debug ..." is in effect */
13977 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13978 &my_reg_engine,
13979 NULL, NULL,
13980 rx_flags, flags);
13981 }
13982 else {
13983 /* Use the special wildcard engine if this file is re_comp.c and
13984 * doesn't want to output the wildcard matching. This uses whatever
13985 * 'use re "Debug ..." is in effect for compilation, but this engine
13986 * structure has been set up so that it uses the core engine for
13987 * execution, so no execution debugging as a result of re.pm will be
13988 * displayed. */
13989 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13990 &wild_reg_engine,
13991 NULL, NULL,
13992 rx_flags, flags);
13993 /* XXX The above has the effect that any user-supplied regex engine
13994 * won't be called for matching wildcards. That might be good, or bad.
13995 * It could be changed in several ways. The reason it is done the
13996 * current way is to avoid having to save and restore
13997 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
13998 * could be used. Another suggestion is to keep the authoritative
13999 * value of the debug flags in a thread-local variable and add set/get
14000 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
14001 * Still another is to pass a flag, say in the engine's intflags that
14002 * would be checked each time before doing the debug output */
14003 }
14004#endif
14005
14006 assert(subpattern_re); /* Should have died if didn't compile successfully */
14007 return subpattern_re;
14008}
14009
14010STATIC I32
14011S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14012 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14013{
14014 I32 result;
14015 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14016
14017 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14018
14019 ENTER;
14020
14021 /* The compilation has set things up so that if the program doesn't want to
14022 * see the wildcard matching procedure, it will get the core execution
14023 * engine, which is subject only to -Dr. So we have to turn that off
14024 * around this procedure */
14025 if (! isDEBUG_WILDCARD) {
14026 /* Note! Casts away 'volatile' */
14027 SAVEI32(PL_debug);
14028 PL_debug &= ~ DEBUG_r_FLAG;
14029 }
14030
14031 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14032 NULL, nosave);
14033 LEAVE;
14034
14035 return result;
14036}
14037
14038SV *
14039S_handle_user_defined_property(pTHX_
14040
14041 /* Parses the contents of a user-defined property definition; returning the
14042 * expanded definition if possible. If so, the return is an inversion
14043 * list.
14044 *
14045 * If there are subroutines that are part of the expansion and which aren't
14046 * known at the time of the call to this function, this returns what
14047 * parse_uniprop_string() returned for the first one encountered.
14048 *
14049 * If an error was found, NULL is returned, and 'msg' gets a suitable
14050 * message appended to it. (Appending allows the back trace of how we got
14051 * to the faulty definition to be displayed through nested calls of
14052 * user-defined subs.)
14053 *
14054 * The caller IS responsible for freeing any returned SV.
14055 *
14056 * The syntax of the contents is pretty much described in perlunicode.pod,
14057 * but we also allow comments on each line */
14058
14059 const char * name, /* Name of property */
14060 const STRLEN name_len, /* The name's length in bytes */
14061 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14062 const bool to_fold, /* ? Is this under /i */
14063 const bool runtime, /* ? Are we in compile- or run-time */
14064 const bool deferrable, /* Is it ok for this property's full definition
14065 to be deferred until later? */
14066 SV* contents, /* The property's definition */
14067 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
14068 getting called unless this is thought to be
14069 a user-defined property */
14070 SV * msg, /* Any error or warning msg(s) are appended to
14071 this */
14072 const STRLEN level) /* Recursion level of this call */
14073{
14074 STRLEN len;
14075 const char * string = SvPV_const(contents, len);
14076 const char * const e = string + len;
14077 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14078 const STRLEN msgs_length_on_entry = SvCUR(msg);
14079
14080 const char * s0 = string; /* Points to first byte in the current line
14081 being parsed in 'string' */
14082 const char overflow_msg[] = "Code point too large in \"";
14083 SV* running_definition = NULL;
14084
14085 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14086
14087 *user_defined_ptr = TRUE;
14088
14089 /* Look at each line */
14090 while (s0 < e) {
14091 const char * s; /* Current byte */
14092 char op = '+'; /* Default operation is 'union' */
14093 IV min = 0; /* range begin code point */
14094 IV max = -1; /* and range end */
14095 SV* this_definition;
14096
14097 /* Skip comment lines */
14098 if (*s0 == '#') {
14099 s0 = strchr(s0, '\n');
14100 if (s0 == NULL) {
14101 break;
14102 }
14103 s0++;
14104 continue;
14105 }
14106
14107 /* For backcompat, allow an empty first line */
14108 if (*s0 == '\n') {
14109 s0++;
14110 continue;
14111 }
14112
14113 /* First character in the line may optionally be the operation */
14114 if ( *s0 == '+'
14115 || *s0 == '!'
14116 || *s0 == '-'
14117 || *s0 == '&')
14118 {
14119 op = *s0++;
14120 }
14121
14122 /* If the line is one or two hex digits separated by blank space, its
14123 * a range; otherwise it is either another user-defined property or an
14124 * error */
14125
14126 s = s0;
14127
14128 if (! isXDIGIT(*s)) {
14129 goto check_if_property;
14130 }
14131
14132 do { /* Each new hex digit will add 4 bits. */
14133 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14134 s = strchr(s, '\n');
14135 if (s == NULL) {
14136 s = e;
14137 }
14138 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14139 sv_catpv(msg, overflow_msg);
14140 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14141 UTF8fARG(is_contents_utf8, s - s0, s0));
14142 sv_catpvs(msg, "\"");
14143 goto return_failure;
14144 }
14145
14146 /* Accumulate this digit into the value */
14147 min = (min << 4) + READ_XDIGIT(s);
14148 } while (isXDIGIT(*s));
14149
14150 while (isBLANK(*s)) { s++; }
14151
14152 /* We allow comments at the end of the line */
14153 if (*s == '#') {
14154 s = strchr(s, '\n');
14155 if (s == NULL) {
14156 s = e;
14157 }
14158 s++;
14159 }
14160 else if (s < e && *s != '\n') {
14161 if (! isXDIGIT(*s)) {
14162 goto check_if_property;
14163 }
14164
14165 /* Look for the high point of the range */
14166 max = 0;
14167 do {
14168 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14169 s = strchr(s, '\n');
14170 if (s == NULL) {
14171 s = e;
14172 }
14173 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14174 sv_catpv(msg, overflow_msg);
14175 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14176 UTF8fARG(is_contents_utf8, s - s0, s0));
14177 sv_catpvs(msg, "\"");
14178 goto return_failure;
14179 }
14180
14181 max = (max << 4) + READ_XDIGIT(s);
14182 } while (isXDIGIT(*s));
14183
14184 while (isBLANK(*s)) { s++; }
14185
14186 if (*s == '#') {
14187 s = strchr(s, '\n');
14188 if (s == NULL) {
14189 s = e;
14190 }
14191 }
14192 else if (s < e && *s != '\n') {
14193 goto check_if_property;
14194 }
14195 }
14196
14197 if (max == -1) { /* The line only had one entry */
14198 max = min;
14199 }
14200 else if (max < min) {
14201 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14202 sv_catpvs(msg, "Illegal range in \"");
14203 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14204 UTF8fARG(is_contents_utf8, s - s0, s0));
14205 sv_catpvs(msg, "\"");
14206 goto return_failure;
14207 }
14208
14209# if 0 /* See explanation at definition above of get_extended_utf8_msg() */
14210
14211 if ( UNICODE_IS_PERL_EXTENDED(min)
14212 || UNICODE_IS_PERL_EXTENDED(max))
14213 {
14214 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14215
14216 /* If both code points are non-portable, warn only on the lower
14217 * one. */
14218 sv_catpv(msg, get_extended_utf8_msg(
14219 (UNICODE_IS_PERL_EXTENDED(min))
14220 ? min : max));
14221 sv_catpvs(msg, " in \"");
14222 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14223 UTF8fARG(is_contents_utf8, s - s0, s0));
14224 sv_catpvs(msg, "\"");
14225 }
14226
14227# endif
14228
14229 /* Here, this line contains a legal range */
14230 this_definition = sv_2mortal(_new_invlist(2));
14231 this_definition = _add_range_to_invlist(this_definition, min, max);
14232 goto calculate;
14233
14234 check_if_property:
14235
14236 /* Here it isn't a legal range line. See if it is a legal property
14237 * line. First find the end of the meat of the line */
14238 s = strpbrk(s, "#\n");
14239 if (s == NULL) {
14240 s = e;
14241 }
14242
14243 /* Ignore trailing blanks in keeping with the requirements of
14244 * parse_uniprop_string() */
14245 s--;
14246 while (s > s0 && isBLANK_A(*s)) {
14247 s--;
14248 }
14249 s++;
14250
14251 this_definition = parse_uniprop_string(s0, s - s0,
14252 is_utf8, to_fold, runtime,
14253 deferrable,
14254 NULL,
14255 user_defined_ptr, msg,
14256 (name_len == 0)
14257 ? level /* Don't increase level
14258 if input is empty */
14259 : level + 1
14260 );
14261 if (this_definition == NULL) {
14262 goto return_failure; /* 'msg' should have had the reason
14263 appended to it by the above call */
14264 }
14265
14266 if (! is_invlist(this_definition)) { /* Unknown at this time */
14267 return newSVsv(this_definition);
14268 }
14269
14270 if (*s != '\n') {
14271 s = strchr(s, '\n');
14272 if (s == NULL) {
14273 s = e;
14274 }
14275 }
14276
14277 calculate:
14278
14279 switch (op) {
14280 case '+':
14281 _invlist_union(running_definition, this_definition,
14282 &running_definition);
14283 break;
14284 case '-':
14285 _invlist_subtract(running_definition, this_definition,
14286 &running_definition);
14287 break;
14288 case '&':
14289 _invlist_intersection(running_definition, this_definition,
14290 &running_definition);
14291 break;
14292 case '!':
14293 _invlist_union_complement_2nd(running_definition,
14294 this_definition, &running_definition);
14295 break;
14296 default:
14297 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14298 __FILE__, __LINE__, op);
14299 break;
14300 }
14301
14302 /* Position past the '\n' */
14303 s0 = s + 1;
14304 } /* End of loop through the lines of 'contents' */
14305
14306 /* Here, we processed all the lines in 'contents' without error. If we
14307 * didn't add any warnings, simply return success */
14308 if (msgs_length_on_entry == SvCUR(msg)) {
14309
14310 /* If the expansion was empty, the answer isn't nothing: its an empty
14311 * inversion list */
14312 if (running_definition == NULL) {
14313 running_definition = _new_invlist(1);
14314 }
14315
14316 return running_definition;
14317 }
14318
14319 /* Otherwise, add some explanatory text, but we will return success */
14320 goto return_msg;
14321
14322 return_failure:
14323 running_definition = NULL;
14324
14325 return_msg:
14326
14327 if (name_len > 0) {
14328 sv_catpvs(msg, " in expansion of ");
14329 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14330 }
14331
14332 return running_definition;
14333}
14334
14335/* As explained below, certain operations need to take place in the first
14336 * thread created. These macros switch contexts */
14337# ifdef USE_ITHREADS
14338# define DECLARATION_FOR_GLOBAL_CONTEXT \
14339 PerlInterpreter * save_aTHX = aTHX;
14340# define SWITCH_TO_GLOBAL_CONTEXT \
14341 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14342# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
14343# define CUR_CONTEXT aTHX
14344# define ORIGINAL_CONTEXT save_aTHX
14345# else
14346# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
14347# define SWITCH_TO_GLOBAL_CONTEXT NOOP
14348# define RESTORE_CONTEXT NOOP
14349# define CUR_CONTEXT NULL
14350# define ORIGINAL_CONTEXT NULL
14351# endif
14352
14353STATIC void
14354S_delete_recursion_entry(pTHX_ void *key)
14355{
14356 /* Deletes the entry used to detect recursion when expanding user-defined
14357 * properties. This is a function so it can be set up to be called even if
14358 * the program unexpectedly quits */
14359
14360 SV ** current_entry;
14361 const STRLEN key_len = strlen((const char *) key);
14362 DECLARATION_FOR_GLOBAL_CONTEXT;
14363
14364 SWITCH_TO_GLOBAL_CONTEXT;
14365
14366 /* If the entry is one of these types, it is a permanent entry, and not the
14367 * one used to detect recursions. This function should delete only the
14368 * recursion entry */
14369 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14370 if ( current_entry
14371 && ! is_invlist(*current_entry)
14372 && ! SvPOK(*current_entry))
14373 {
14374 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14375 G_DISCARD);
14376 }
14377
14378 RESTORE_CONTEXT;
14379}
14380
14381STATIC SV *
14382S_get_fq_name(pTHX_
14383 const char * const name, /* The first non-blank in the \p{}, \P{} */
14384 const Size_t name_len, /* Its length in bytes, not including any trailing space */
14385 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14386 const bool has_colon_colon
14387 )
14388{
14389 /* Returns a mortal SV containing the fully qualified version of the input
14390 * name */
14391
14392 SV * fq_name;
14393
14394 fq_name = newSVpvs_flags("", SVs_TEMP);
14395
14396 /* Use the current package if it wasn't included in our input */
14397 if (! has_colon_colon) {
14398 const HV * pkg = (IN_PERL_COMPILETIME)
14399 ? PL_curstash
14400 : CopSTASH(PL_curcop);
14401 const char* pkgname = HvNAME(pkg);
14402
14403 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14404 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14405 sv_catpvs(fq_name, "::");
14406 }
14407
14408 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14409 UTF8fARG(is_utf8, name_len, name));
14410 return fq_name;
14411}
14412
14413STATIC SV *
14414S_parse_uniprop_string(pTHX_
14415
14416 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
14417 * now. If so, the return is an inversion list.
14418 *
14419 * If the property is user-defined, it is a subroutine, which in turn
14420 * may call other subroutines. This function will call the whole nest of
14421 * them to get the definition they return; if some aren't known at the time
14422 * of the call to this function, the fully qualified name of the highest
14423 * level sub is returned. It is an error to call this function at runtime
14424 * without every sub defined.
14425 *
14426 * If an error was found, NULL is returned, and 'msg' gets a suitable
14427 * message appended to it. (Appending allows the back trace of how we got
14428 * to the faulty definition to be displayed through nested calls of
14429 * user-defined subs.)
14430 *
14431 * The caller should NOT try to free any returned inversion list.
14432 *
14433 * Other parameters will be set on return as described below */
14434
14435 const char * const name, /* The first non-blank in the \p{}, \P{} */
14436 Size_t name_len, /* Its length in bytes, not including any
14437 trailing space */
14438 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14439 const bool to_fold, /* ? Is this under /i */
14440 const bool runtime, /* TRUE if this is being called at run time */
14441 const bool deferrable, /* TRUE if it's ok for the definition to not be
14442 known at this call */
14443 AV ** strings, /* To return string property values, like named
14444 sequences */
14445 bool *user_defined_ptr, /* Upon return from this function it will be
14446 set to TRUE if any component is a
14447 user-defined property */
14448 SV * msg, /* Any error or warning msg(s) are appended to
14449 this */
14450 const STRLEN level) /* Recursion level of this call */
14451{
14452 char* lookup_name; /* normalized name for lookup in our tables */
14453 unsigned lookup_len; /* Its length */
14454 enum { Not_Strict = 0, /* Some properties have stricter name */
14455 Strict, /* normalization rules, which we decide */
14456 As_Is /* upon based on parsing */
14457 } stricter = Not_Strict;
14458
14459 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14460 * (though it requires extra effort to download them from Unicode and
14461 * compile perl to know about them) */
14462 bool is_nv_type = FALSE;
14463
14464 unsigned int i = 0, i_zero = 0, j = 0;
14465 int equals_pos = -1; /* Where the '=' is found, or negative if none */
14466 int slash_pos = -1; /* Where the '/' is found, or negative if none */
14467 int table_index = 0; /* The entry number for this property in the table
14468 of all Unicode property names */
14469 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
14470 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
14471 the normalized name in certain situations */
14472 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
14473 part of a package name */
14474 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
14475 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
14476 property rather than a Unicode
14477 one. */
14478 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
14479 if an error. If it is an inversion list,
14480 it is the definition. Otherwise it is a
14481 string containing the fully qualified sub
14482 name of 'name' */
14483 SV * fq_name = NULL; /* For user-defined properties, the fully
14484 qualified name */
14485 bool invert_return = FALSE; /* ? Do we need to complement the result before
14486 returning it */
14487 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14488 explicit utf8:: package that we strip
14489 off */
14490 /* The expansion of properties that could be either user-defined or
14491 * official unicode ones is deferred until runtime, including a marker for
14492 * those that might be in the latter category. This boolean indicates if
14493 * we've seen that marker. If not, what we're parsing can't be such an
14494 * official Unicode property whose expansion was deferred */
14495 bool could_be_deferred_official = FALSE;
14496
14497 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14498
14499 /* The input will be normalized into 'lookup_name' */
14500 Newx(lookup_name, name_len, char);
14501 SAVEFREEPV(lookup_name);
14502
14503 /* Parse the input. */
14504 for (i = 0; i < name_len; i++) {
14505 char cur = name[i];
14506
14507 /* Most of the characters in the input will be of this ilk, being parts
14508 * of a name */
14509 if (isIDCONT_A(cur)) {
14510
14511 /* Case differences are ignored. Our lookup routine assumes
14512 * everything is lowercase, so normalize to that */
14513 if (isUPPER_A(cur)) {
14514 lookup_name[j++] = toLOWER_A(cur);
14515 continue;
14516 }
14517
14518 if (cur == '_') { /* Don't include these in the normalized name */
14519 continue;
14520 }
14521
14522 lookup_name[j++] = cur;
14523
14524 /* The first character in a user-defined name must be of this type.
14525 * */
14526 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14527 could_be_user_defined = FALSE;
14528 }
14529
14530 continue;
14531 }
14532
14533 /* Here, the character is not something typically in a name, But these
14534 * two types of characters (and the '_' above) can be freely ignored in
14535 * most situations. Later it may turn out we shouldn't have ignored
14536 * them, and we have to reparse, but we don't have enough information
14537 * yet to make that decision */
14538 if (cur == '-' || isSPACE_A(cur)) {
14539 could_be_user_defined = FALSE;
14540 continue;
14541 }
14542
14543 /* An equals sign or single colon mark the end of the first part of
14544 * the property name */
14545 if ( cur == '='
14546 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14547 {
14548 lookup_name[j++] = '='; /* Treat the colon as an '=' */
14549 equals_pos = j; /* Note where it occurred in the input */
14550 could_be_user_defined = FALSE;
14551 break;
14552 }
14553
14554 /* If this looks like it is a marker we inserted at compile time,
14555 * set a flag and otherwise ignore it. If it isn't in the final
14556 * position, keep it as it would have been user input. */
14557 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14558 && ! deferrable
14559 && could_be_user_defined
14560 && i == name_len - 1)
14561 {
14562 name_len--;
14563 could_be_deferred_official = TRUE;
14564 continue;
14565 }
14566
14567 /* Otherwise, this character is part of the name. */
14568 lookup_name[j++] = cur;
14569
14570 /* Here it isn't a single colon, so if it is a colon, it must be a
14571 * double colon */
14572 if (cur == ':') {
14573
14574 /* A double colon should be a package qualifier. We note its
14575 * position and continue. Note that one could have
14576 * pkg1::pkg2::...::foo
14577 * so that the position at the end of the loop will be just after
14578 * the final qualifier */
14579
14580 i++;
14581 non_pkg_begin = i + 1;
14582 lookup_name[j++] = ':';
14583 lun_non_pkg_begin = j;
14584 }
14585 else { /* Only word chars (and '::') can be in a user-defined name */
14586 could_be_user_defined = FALSE;
14587 }
14588 } /* End of parsing through the lhs of the property name (or all of it if
14589 no rhs) */
14590
14591 /* If there is a single package name 'utf8::', it is ambiguous. It could
14592 * be for a user-defined property, or it could be a Unicode property, as
14593 * all of them are considered to be for that package. For the purposes of
14594 * parsing the rest of the property, strip it off */
14595 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14596 lookup_name += STRLENs("utf8::");
14597 j -= STRLENs("utf8::");
14598 equals_pos -= STRLENs("utf8::");
14599 i_zero = STRLENs("utf8::"); /* When resetting 'i' to reparse
14600 from the beginning, it has to be
14601 set past what we're stripping
14602 off */
14603 stripped_utf8_pkg = TRUE;
14604 }
14605
14606 /* Here, we are either done with the whole property name, if it was simple;
14607 * or are positioned just after the '=' if it is compound. */
14608
14609 if (equals_pos >= 0) {
14610 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14611
14612 /* Space immediately after the '=' is ignored */
14613 i++;
14614 for (; i < name_len; i++) {
14615 if (! isSPACE_A(name[i])) {
14616 break;
14617 }
14618 }
14619
14620 /* Most punctuation after the equals indicates a subpattern, like
14621 * \p{foo=/bar/} */
14622 if ( isPUNCT_A(name[i])
14623 && name[i] != '-'
14624 && name[i] != '+'
14625 && name[i] != '_'
14626 && name[i] != '{'
14627 /* A backslash means the real delimiter is the next character,
14628 * but it must be punctuation */
14629 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14630 {
14631 bool special_property = memEQs(lookup_name, j - 1, "name")
14632 || memEQs(lookup_name, j - 1, "na");
14633 if (! special_property) {
14634 /* Find the property. The table includes the equals sign, so
14635 * we use 'j' as-is */
14636 table_index = do_uniprop_match(lookup_name, j);
14637 }
14638 if (special_property || table_index) {
14639 REGEXP * subpattern_re;
14640 char open = name[i++];
14641 char close;
14642 const char * pos_in_brackets;
14643 const char * const * prop_values;
14644 bool escaped = 0;
14645
14646 /* Backslash => delimiter is the character following. We
14647 * already checked that it is punctuation */
14648 if (open == '\\') {
14649 open = name[i++];
14650 escaped = 1;
14651 }
14652
14653 /* This data structure is constructed so that the matching
14654 * closing bracket is 3 past its matching opening. The second
14655 * set of closing is so that if the opening is something like
14656 * ']', the closing will be that as well. Something similar is
14657 * done in toke.c */
14658 pos_in_brackets = memCHRs("([<)]>)]>", open);
14659 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14660
14661 if ( i >= name_len
14662 || name[name_len-1] != close
14663 || (escaped && name[name_len-2] != '\\')
14664 /* Also make sure that there are enough characters.
14665 * e.g., '\\\' would show up incorrectly as legal even
14666 * though it is too short */
14667 || (SSize_t) (name_len - i - 1 - escaped) < 0)
14668 {
14669 sv_catpvs(msg, "Unicode property wildcard not terminated");
14670 goto append_name_to_msg;
14671 }
14672
14673 Perl_ck_warner_d(aTHX_
14674 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14675 "The Unicode property wildcards feature is experimental");
14676
14677 if (special_property) {
14678 const char * error_msg;
14679 const char * revised_name = name + i;
14680 Size_t revised_name_len = name_len - (i + 1 + escaped);
14681
14682 /* Currently, the only 'special_property' is name, which we
14683 * lookup in _charnames.pm */
14684
14685 if (! load_charnames(newSVpvs("placeholder"),
14686 revised_name, revised_name_len,
14687 &error_msg))
14688 {
14689 sv_catpv(msg, error_msg);
14690 goto append_name_to_msg;
14691 }
14692
14693 /* Farm this out to a function just to make the current
14694 * function less unwieldy */
14695 if (handle_names_wildcard(revised_name, revised_name_len,
14696 &prop_definition,
14697 strings))
14698 {
14699 return prop_definition;
14700 }
14701
14702 goto failed;
14703 }
14704
14705 prop_values = get_prop_values(table_index);
14706
14707 /* Now create and compile the wildcard subpattern. Use /i
14708 * because the property values are supposed to match with case
14709 * ignored. */
14710 subpattern_re = compile_wildcard(name + i,
14711 name_len - i - 1 - escaped,
14712 TRUE /* /i */
14713 );
14714
14715 /* For each legal property value, see if the supplied pattern
14716 * matches it. */
14717 while (*prop_values) {
14718 const char * const entry = *prop_values;
14719 const Size_t len = strlen(entry);
14720 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14721
14722 if (execute_wildcard(subpattern_re,
14723 (char *) entry,
14724 (char *) entry + len,
14725 (char *) entry, 0,
14726 entry_sv,
14727 0))
14728 { /* Here, matched. Add to the returned list */
14729 Size_t total_len = j + len;
14730 SV * sub_invlist = NULL;
14731 char * this_string;
14732
14733 /* We know this is a legal \p{property=value}. Call
14734 * the function to return the list of code points that
14735 * match it */
14736 Newxz(this_string, total_len + 1, char);
14737 Copy(lookup_name, this_string, j, char);
14738 my_strlcat(this_string, entry, total_len + 1);
14739 SAVEFREEPV(this_string);
14740 sub_invlist = parse_uniprop_string(this_string,
14741 total_len,
14742 is_utf8,
14743 to_fold,
14744 runtime,
14745 deferrable,
14746 NULL,
14747 user_defined_ptr,
14748 msg,
14749 level + 1);
14750 _invlist_union(prop_definition, sub_invlist,
14751 &prop_definition);
14752 }
14753
14754 prop_values++; /* Next iteration, look at next propvalue */
14755 } /* End of looking through property values; (the data
14756 structure is terminated by a NULL ptr) */
14757
14758 SvREFCNT_dec_NN(subpattern_re);
14759
14760 if (prop_definition) {
14761 return prop_definition;
14762 }
14763
14764 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14765 goto append_name_to_msg;
14766 }
14767
14768 /* Here's how khw thinks we should proceed to handle the properties
14769 * not yet done: Bidi Mirroring Glyph can map to ""
14770 Bidi Paired Bracket can map to ""
14771 Case Folding (both full and simple)
14772 Shouldn't /i be good enough for Full
14773 Decomposition Mapping
14774 Equivalent Unified Ideograph can map to ""
14775 Lowercase Mapping (both full and simple)
14776 NFKC Case Fold can map to ""
14777 Titlecase Mapping (both full and simple)
14778 Uppercase Mapping (both full and simple)
14779 * Handle these the same way Name is done, using say, _wild.pm, but
14780 * having both loose and full, like in charclass_invlists.h.
14781 * Perhaps move block and script to that as they are somewhat large
14782 * in charclass_invlists.h.
14783 * For properties where the default is the code point itself, such
14784 * as any of the case changing mappings, the string would otherwise
14785 * consist of all Unicode code points in UTF-8 strung together.
14786 * This would be impractical. So instead, examine their compiled
14787 * pattern, looking at the ssc. If none, reject the pattern as an
14788 * error. Otherwise run the pattern against every code point in
14789 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
14790 * And it might be good to create an API to return the ssc.
14791 * Or handle them like the algorithmic names are done
14792 */
14793 } /* End of is a wildcard subppattern */
14794
14795 /* \p{name=...} is handled specially. Instead of using the normal
14796 * mechanism involving charclass_invlists.h, it uses _charnames.pm
14797 * which has the necessary (huge) data accessible to it, and which
14798 * doesn't get loaded unless necessary. The legal syntax for names is
14799 * somewhat different than other properties due both to the vagaries of
14800 * a few outlier official names, and the fact that only a few ASCII
14801 * characters are permitted in them */
14802 if ( memEQs(lookup_name, j - 1, "name")
14803 || memEQs(lookup_name, j - 1, "na"))
14804 {
14805 dSP;
14806 HV * table;
14807 SV * character;
14808 const char * error_msg;
14809 CV* lookup_loose;
14810 SV * character_name;
14811 STRLEN character_len;
14812 UV cp;
14813
14814 stricter = As_Is;
14815
14816 /* Since the RHS (after skipping initial space) is passed unchanged
14817 * to charnames, and there are different criteria for what are
14818 * legal characters in the name, just parse it here. A character
14819 * name must begin with an ASCII alphabetic */
14820 if (! isALPHA(name[i])) {
14821 goto failed;
14822 }
14823 lookup_name[j++] = name[i];
14824
14825 for (++i; i < name_len; i++) {
14826 /* Official names can only be in the ASCII range, and only
14827 * certain characters */
14828 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14829 goto failed;
14830 }
14831 lookup_name[j++] = name[i];
14832 }
14833
14834 /* Finished parsing, save the name into an SV */
14835 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14836
14837 /* Make sure _charnames is loaded. (The parameters give context
14838 * for any errors generated */
14839 table = load_charnames(character_name, name, name_len, &error_msg);
14840 if (table == NULL) {
14841 sv_catpv(msg, error_msg);
14842 goto append_name_to_msg;
14843 }
14844
14845 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14846 if (! lookup_loose) {
14847 Perl_croak(aTHX_
14848 "panic: Can't find '_charnames::_loose_regcomp_lookup");
14849 }
14850
14851 PUSHSTACKi(PERLSI_REGCOMP);
14852 ENTER ;
14853 SAVETMPS;
14854 save_re_context();
14855
14856 PUSHMARK(SP) ;
14857 XPUSHs(character_name);
14858 PUTBACK;
14859 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14860
14861 SPAGAIN ;
14862
14863 character = POPs;
14864 SvREFCNT_inc_simple_void_NN(character);
14865
14866 PUTBACK ;
14867 FREETMPS ;
14868 LEAVE ;
14869 POPSTACK;
14870
14871 if (! SvOK(character)) {
14872 goto failed;
14873 }
14874
14875 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14876 if (character_len == SvCUR(character)) {
14877 prop_definition = add_cp_to_invlist(NULL, cp);
14878 }
14879 else {
14880 AV * this_string;
14881
14882 /* First of the remaining characters in the string. */
14883 char * remaining = SvPVX(character) + character_len;
14884
14885 if (strings == NULL) {
14886 goto failed; /* XXX Perhaps a specific msg instead, like
14887 'not available here' */
14888 }
14889
14890 if (*strings == NULL) {
14891 *strings = newAV();
14892 }
14893
14894 this_string = newAV();
14895 av_push_simple(this_string, newSVuv(cp));
14896
14897 do {
14898 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14899 av_push_simple(this_string, newSVuv(cp));
14900 remaining += character_len;
14901 } while (remaining < SvEND(character));
14902
14903 av_push_simple(*strings, (SV *) this_string);
14904 }
14905
14906 return prop_definition;
14907 }
14908
14909 /* Certain properties whose values are numeric need special handling.
14910 * They may optionally be prefixed by 'is'. Ignore that prefix for the
14911 * purposes of checking if this is one of those properties */
14912 if (memBEGINPs(lookup_name, j, "is")) {
14913 lookup_offset = 2;
14914 }
14915
14916 /* Then check if it is one of these specially-handled properties. The
14917 * possibilities are hard-coded because easier this way, and the list
14918 * is unlikely to change.
14919 *
14920 * All numeric value type properties are of this ilk, and are also
14921 * special in a different way later on. So find those first. There
14922 * are several numeric value type properties in the Unihan DB (which is
14923 * unlikely to be compiled with perl, but we handle it here in case it
14924 * does get compiled). They all end with 'numeric'. The interiors
14925 * aren't checked for the precise property. This would stop working if
14926 * a cjk property were to be created that ended with 'numeric' and
14927 * wasn't a numeric type */
14928 is_nv_type = memEQs(lookup_name + lookup_offset,
14929 j - 1 - lookup_offset, "numericvalue")
14930 || memEQs(lookup_name + lookup_offset,
14931 j - 1 - lookup_offset, "nv")
14932 || ( memENDPs(lookup_name + lookup_offset,
14933 j - 1 - lookup_offset, "numeric")
14934 && ( memBEGINPs(lookup_name + lookup_offset,
14935 j - 1 - lookup_offset, "cjk")
14936 || memBEGINPs(lookup_name + lookup_offset,
14937 j - 1 - lookup_offset, "k")));
14938 if ( is_nv_type
14939 || memEQs(lookup_name + lookup_offset,
14940 j - 1 - lookup_offset, "canonicalcombiningclass")
14941 || memEQs(lookup_name + lookup_offset,
14942 j - 1 - lookup_offset, "ccc")
14943 || memEQs(lookup_name + lookup_offset,
14944 j - 1 - lookup_offset, "age")
14945 || memEQs(lookup_name + lookup_offset,
14946 j - 1 - lookup_offset, "in")
14947 || memEQs(lookup_name + lookup_offset,
14948 j - 1 - lookup_offset, "presentin"))
14949 {
14950 unsigned int k;
14951
14952 /* Since the stuff after the '=' is a number, we can't throw away
14953 * '-' willy-nilly, as those could be a minus sign. Other stricter
14954 * rules also apply. However, these properties all can have the
14955 * rhs not be a number, in which case they contain at least one
14956 * alphabetic. In those cases, the stricter rules don't apply.
14957 * But the numeric type properties can have the alphas [Ee] to
14958 * signify an exponent, and it is still a number with stricter
14959 * rules. So look for an alpha that signifies not-strict */
14960 stricter = Strict;
14961 for (k = i; k < name_len; k++) {
14962 if ( isALPHA_A(name[k])
14963 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
14964 {
14965 stricter = Not_Strict;
14966 break;
14967 }
14968 }
14969 }
14970
14971 if (stricter) {
14972
14973 /* A number may have a leading '+' or '-'. The latter is retained
14974 * */
14975 if (name[i] == '+') {
14976 i++;
14977 }
14978 else if (name[i] == '-') {
14979 lookup_name[j++] = '-';
14980 i++;
14981 }
14982
14983 /* Skip leading zeros including single underscores separating the
14984 * zeros, or between the final leading zero and the first other
14985 * digit */
14986 for (; i < name_len - 1; i++) {
14987 if ( name[i] != '0'
14988 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
14989 {
14990 break;
14991 }
14992 }
14993
14994 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
14995 * underling libc implementation. */
14996 if ( i == name_len - 1
14997 && name[name_len-1] == '0'
14998 && lookup_name[j-1] == '-')
14999 {
15000 j--;
15001 }
15002 }
15003 }
15004 else { /* No '=' */
15005
15006 /* Only a few properties without an '=' should be parsed with stricter
15007 * rules. The list is unlikely to change. */
15008 if ( memBEGINPs(lookup_name, j, "perl")
15009 && memNEs(lookup_name + 4, j - 4, "space")
15010 && memNEs(lookup_name + 4, j - 4, "word"))
15011 {
15012 stricter = Strict;
15013
15014 /* We set the inputs back to 0 and the code below will reparse,
15015 * using strict */
15016 i = i_zero;
15017 j = 0;
15018 }
15019 }
15020
15021 /* Here, we have either finished the property, or are positioned to parse
15022 * the remainder, and we know if stricter rules apply. Finish out, if not
15023 * already done */
15024 for (; i < name_len; i++) {
15025 char cur = name[i];
15026
15027 /* In all instances, case differences are ignored, and we normalize to
15028 * lowercase */
15029 if (isUPPER_A(cur)) {
15030 lookup_name[j++] = toLOWER(cur);
15031 continue;
15032 }
15033
15034 /* An underscore is skipped, but not under strict rules unless it
15035 * separates two digits */
15036 if (cur == '_') {
15037 if ( stricter
15038 && ( i == i_zero || (int) i == equals_pos || i == name_len- 1
15039 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15040 {
15041 lookup_name[j++] = '_';
15042 }
15043 continue;
15044 }
15045
15046 /* Hyphens are skipped except under strict */
15047 if (cur == '-' && ! stricter) {
15048 continue;
15049 }
15050
15051 /* XXX Bug in documentation. It says white space skipped adjacent to
15052 * non-word char. Maybe we should, but shouldn't skip it next to a dot
15053 * in a number */
15054 if (isSPACE_A(cur) && ! stricter) {
15055 continue;
15056 }
15057
15058 lookup_name[j++] = cur;
15059
15060 /* Unless this is a non-trailing slash, we are done with it */
15061 if (i >= name_len - 1 || cur != '/') {
15062 continue;
15063 }
15064
15065 slash_pos = j;
15066
15067 /* A slash in the 'numeric value' property indicates that what follows
15068 * is a denominator. It can have a leading '+' and '0's that should be
15069 * skipped. But we have never allowed a negative denominator, so treat
15070 * a minus like every other character. (No need to rule out a second
15071 * '/', as that won't match anything anyway */
15072 if (is_nv_type) {
15073 i++;
15074 if (i < name_len && name[i] == '+') {
15075 i++;
15076 }
15077
15078 /* Skip leading zeros including underscores separating digits */
15079 for (; i < name_len - 1; i++) {
15080 if ( name[i] != '0'
15081 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15082 {
15083 break;
15084 }
15085 }
15086
15087 /* Store the first real character in the denominator */
15088 if (i < name_len) {
15089 lookup_name[j++] = name[i];
15090 }
15091 }
15092 }
15093
15094 /* Here are completely done parsing the input 'name', and 'lookup_name'
15095 * contains a copy, normalized.
15096 *
15097 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15098 * different from without the underscores. */
15099 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
15100 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15101 && UNLIKELY(name[name_len-1] == '_'))
15102 {
15103 lookup_name[j++] = '&';
15104 }
15105
15106 /* If the original input began with 'In' or 'Is', it could be a subroutine
15107 * call to a user-defined property instead of a Unicode property name. */
15108 if ( name_len - non_pkg_begin > 2
15109 && name[non_pkg_begin+0] == 'I'
15110 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15111 {
15112 /* Names that start with In have different characteristics than those
15113 * that start with Is */
15114 if (name[non_pkg_begin+1] == 's') {
15115 starts_with_Is = TRUE;
15116 }
15117 }
15118 else {
15119 could_be_user_defined = FALSE;
15120 }
15121
15122 if (could_be_user_defined) {
15123 CV* user_sub;
15124
15125 /* If the user defined property returns the empty string, it could
15126 * easily be because the pattern is being compiled before the data it
15127 * actually needs to compile is available. This could be argued to be
15128 * a bug in the perl code, but this is a change of behavior for Perl,
15129 * so we handle it. This means that intentionally returning nothing
15130 * will not be resolved until runtime */
15131 bool empty_return = FALSE;
15132
15133 /* Here, the name could be for a user defined property, which are
15134 * implemented as subs. */
15135 user_sub = get_cvn_flags(name, name_len, 0);
15136 if (! user_sub) {
15137
15138 /* Here, the property name could be a user-defined one, but there
15139 * is no subroutine to handle it (as of now). Defer handling it
15140 * until runtime. Otherwise, a block defined by Unicode in a later
15141 * release would get the synonym InFoo added for it, and existing
15142 * code that used that name would suddenly break if it referred to
15143 * the property before the sub was declared. See [perl #134146] */
15144 if (deferrable) {
15145 goto definition_deferred;
15146 }
15147
15148 /* Here, we are at runtime, and didn't find the user property. It
15149 * could be an official property, but only if no package was
15150 * specified, or just the utf8:: package. */
15151 if (could_be_deferred_official) {
15152 lookup_name += lun_non_pkg_begin;
15153 j -= lun_non_pkg_begin;
15154 }
15155 else if (! stripped_utf8_pkg) {
15156 goto unknown_user_defined;
15157 }
15158
15159 /* Drop down to look up in the official properties */
15160 }
15161 else {
15162 const char insecure[] = "Insecure user-defined property";
15163
15164 /* Here, there is a sub by the correct name. Normally we call it
15165 * to get the property definition */
15166 dSP;
15167 SV * user_sub_sv = MUTABLE_SV(user_sub);
15168 SV * error; /* Any error returned by calling 'user_sub' */
15169 SV * key; /* The key into the hash of user defined sub names
15170 */
15171 SV * placeholder;
15172 SV ** saved_user_prop_ptr; /* Hash entry for this property */
15173
15174 /* How many times to retry when another thread is in the middle of
15175 * expanding the same definition we want */
15176 PERL_INT_FAST8_T retry_countdown = 10;
15177
15178 DECLARATION_FOR_GLOBAL_CONTEXT;
15179
15180 /* If we get here, we know this property is user-defined */
15181 *user_defined_ptr = TRUE;
15182
15183 /* We refuse to call a potentially tainted subroutine; returning an
15184 * error instead */
15185 if (TAINT_get) {
15186 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15187 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15188 goto append_name_to_msg;
15189 }
15190
15191 /* In principal, we only call each subroutine property definition
15192 * once during the life of the program. This guarantees that the
15193 * property definition never changes. The results of the single
15194 * sub call are stored in a hash, which is used instead for future
15195 * references to this property. The property definition is thus
15196 * immutable. But, to allow the user to have a /i-dependent
15197 * definition, we call the sub once for non-/i, and once for /i,
15198 * should the need arise, passing the /i status as a parameter.
15199 *
15200 * We start by constructing the hash key name, consisting of the
15201 * fully qualified subroutine name, preceded by the /i status, so
15202 * that there is a key for /i and a different key for non-/i */
15203 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15204 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15205 non_pkg_begin != 0);
15206 sv_catsv(key, fq_name);
15207
15208 /* We only call the sub once throughout the life of the program
15209 * (with the /i, non-/i exception noted above). That means the
15210 * hash must be global and accessible to all threads. It is
15211 * created at program start-up, before any threads are created, so
15212 * is accessible to all children. But this creates some
15213 * complications.
15214 *
15215 * 1) The keys can't be shared, or else problems arise; sharing is
15216 * turned off at hash creation time
15217 * 2) All SVs in it are there for the remainder of the life of the
15218 * program, and must be created in the same interpreter context
15219 * as the hash, or else they will be freed from the wrong pool
15220 * at global destruction time. This is handled by switching to
15221 * the hash's context to create each SV going into it, and then
15222 * immediately switching back
15223 * 3) All accesses to the hash must be controlled by a mutex, to
15224 * prevent two threads from getting an unstable state should
15225 * they simultaneously be accessing it. The code below is
15226 * crafted so that the mutex is locked whenever there is an
15227 * access and unlocked only when the next stable state is
15228 * achieved.
15229 *
15230 * The hash stores either the definition of the property if it was
15231 * valid, or, if invalid, the error message that was raised. We
15232 * use the type of SV to distinguish.
15233 *
15234 * There's also the need to guard against the definition expansion
15235 * from infinitely recursing. This is handled by storing the aTHX
15236 * of the expanding thread during the expansion. Again the SV type
15237 * is used to distinguish this from the other two cases. If we
15238 * come to here and the hash entry for this property is our aTHX,
15239 * it means we have recursed, and the code assumes that we would
15240 * infinitely recurse, so instead stops and raises an error.
15241 * (Any recursion has always been treated as infinite recursion in
15242 * this feature.)
15243 *
15244 * If instead, the entry is for a different aTHX, it means that
15245 * that thread has gotten here first, and hasn't finished expanding
15246 * the definition yet. We just have to wait until it is done. We
15247 * sleep and retry a few times, returning an error if the other
15248 * thread doesn't complete. */
15249
15250 re_fetch:
15251 USER_PROP_MUTEX_LOCK;
15252
15253 /* If we have an entry for this key, the subroutine has already
15254 * been called once with this /i status. */
15255 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15256 SvPVX(key), SvCUR(key), 0);
15257 if (saved_user_prop_ptr) {
15258
15259 /* If the saved result is an inversion list, it is the valid
15260 * definition of this property */
15261 if (is_invlist(*saved_user_prop_ptr)) {
15262 prop_definition = *saved_user_prop_ptr;
15263
15264 /* The SV in the hash won't be removed until global
15265 * destruction, so it is stable and we can unlock */
15266 USER_PROP_MUTEX_UNLOCK;
15267
15268 /* The caller shouldn't try to free this SV */
15269 return prop_definition;
15270 }
15271
15272 /* Otherwise, if it is a string, it is the error message
15273 * that was returned when we first tried to evaluate this
15274 * property. Fail, and append the message */
15275 if (SvPOK(*saved_user_prop_ptr)) {
15276 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15277 sv_catsv(msg, *saved_user_prop_ptr);
15278
15279 /* The SV in the hash won't be removed until global
15280 * destruction, so it is stable and we can unlock */
15281 USER_PROP_MUTEX_UNLOCK;
15282
15283 return NULL;
15284 }
15285
15286 assert(SvIOK(*saved_user_prop_ptr));
15287
15288 /* Here, we have an unstable entry in the hash. Either another
15289 * thread is in the middle of expanding the property's
15290 * definition, or we are ourselves recursing. We use the aTHX
15291 * in it to distinguish */
15292 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15293
15294 /* Here, it's another thread doing the expanding. We've
15295 * looked as much as we are going to at the contents of the
15296 * hash entry. It's safe to unlock. */
15297 USER_PROP_MUTEX_UNLOCK;
15298
15299 /* Retry a few times */
15300 if (retry_countdown-- > 0) {
15301 PerlProc_sleep(1);
15302 goto re_fetch;
15303 }
15304
15305 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15306 sv_catpvs(msg, "Timeout waiting for another thread to "
15307 "define");
15308 goto append_name_to_msg;
15309 }
15310
15311 /* Here, we are recursing; don't dig any deeper */
15312 USER_PROP_MUTEX_UNLOCK;
15313
15314 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15315 sv_catpvs(msg,
15316 "Infinite recursion in user-defined property");
15317 goto append_name_to_msg;
15318 }
15319
15320 /* Here, this thread has exclusive control, and there is no entry
15321 * for this property in the hash. So we have the go ahead to
15322 * expand the definition ourselves. */
15323
15324 PUSHSTACKi(PERLSI_REGCOMP);
15325 ENTER;
15326
15327 /* Create a temporary placeholder in the hash to detect recursion
15328 * */
15329 SWITCH_TO_GLOBAL_CONTEXT;
15330 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15331 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15332 RESTORE_CONTEXT;
15333
15334 /* Now that we have a placeholder, we can let other threads
15335 * continue */
15336 USER_PROP_MUTEX_UNLOCK;
15337
15338 /* Make sure the placeholder always gets destroyed */
15339 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15340
15341 PUSHMARK(SP);
15342 SAVETMPS;
15343
15344 /* Call the user's function, with the /i status as a parameter.
15345 * Note that we have gone to a lot of trouble to keep this call
15346 * from being within the locked mutex region. */
15347 XPUSHs(boolSV(to_fold));
15348 PUTBACK;
15349
15350 /* The following block was taken from swash_init(). Presumably
15351 * they apply to here as well, though we no longer use a swash --
15352 * khw */
15353 SAVEHINTS();
15354 save_re_context();
15355 /* We might get here via a subroutine signature which uses a utf8
15356 * parameter name, at which point PL_subname will have been set
15357 * but not yet used. */
15358 save_item(PL_subname);
15359
15360 /* G_SCALAR guarantees a single return value */
15361 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15362
15363 SPAGAIN;
15364
15365 error = ERRSV;
15366 if (TAINT_get || SvTRUE(error)) {
15367 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15368 if (SvTRUE(error)) {
15369 sv_catpvs(msg, "Error \"");
15370 sv_catsv(msg, error);
15371 sv_catpvs(msg, "\"");
15372 }
15373 if (TAINT_get) {
15374 if (SvTRUE(error)) sv_catpvs(msg, "; ");
15375 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15376 }
15377
15378 if (name_len > 0) {
15379 sv_catpvs(msg, " in expansion of ");
15380 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15381 name_len,
15382 name));
15383 }
15384
15385 (void) POPs;
15386 prop_definition = NULL;
15387 }
15388 else {
15389 SV * contents = POPs;
15390
15391 /* The contents is supposed to be the expansion of the property
15392 * definition. If the definition is deferrable, and we got an
15393 * empty string back, set a flag to later defer it (after clean
15394 * up below). */
15395 if ( deferrable
15396 && (! SvPOK(contents) || SvCUR(contents) == 0))
15397 {
15398 empty_return = TRUE;
15399 }
15400 else { /* Otherwise, call a function to check for valid syntax,
15401 and handle it */
15402
15403 prop_definition = handle_user_defined_property(
15404 name, name_len,
15405 is_utf8, to_fold, runtime,
15406 deferrable,
15407 contents, user_defined_ptr,
15408 msg,
15409 level);
15410 }
15411 }
15412
15413 /* Here, we have the results of the expansion. Delete the
15414 * placeholder, and if the definition is now known, replace it with
15415 * that definition. We need exclusive access to the hash, and we
15416 * can't let anyone else in, between when we delete the placeholder
15417 * and add the permanent entry */
15418 USER_PROP_MUTEX_LOCK;
15419
15420 S_delete_recursion_entry(aTHX_ SvPVX(key));
15421
15422 if ( ! empty_return
15423 && (! prop_definition || is_invlist(prop_definition)))
15424 {
15425 /* If we got success we use the inversion list defining the
15426 * property; otherwise use the error message */
15427 SWITCH_TO_GLOBAL_CONTEXT;
15428 (void) hv_store_ent(PL_user_def_props,
15429 key,
15430 ((prop_definition)
15431 ? newSVsv(prop_definition)
15432 : newSVsv(msg)),
15433 0);
15434 RESTORE_CONTEXT;
15435 }
15436
15437 /* All done, and the hash now has a permanent entry for this
15438 * property. Give up exclusive control */
15439 USER_PROP_MUTEX_UNLOCK;
15440
15441 FREETMPS;
15442 LEAVE;
15443 POPSTACK;
15444
15445 if (empty_return) {
15446 goto definition_deferred;
15447 }
15448
15449 if (prop_definition) {
15450
15451 /* If the definition is for something not known at this time,
15452 * we toss it, and go return the main property name, as that's
15453 * the one the user will be aware of */
15454 if (! is_invlist(prop_definition)) {
15455 SvREFCNT_dec_NN(prop_definition);
15456 goto definition_deferred;
15457 }
15458
15459 sv_2mortal(prop_definition);
15460 }
15461
15462 /* And return */
15463 return prop_definition;
15464
15465 } /* End of calling the subroutine for the user-defined property */
15466 } /* End of it could be a user-defined property */
15467
15468 /* Here it wasn't a user-defined property that is known at this time. See
15469 * if it is a Unicode property */
15470
15471 lookup_len = j; /* This is a more mnemonic name than 'j' */
15472
15473 /* Get the index into our pointer table of the inversion list corresponding
15474 * to the property */
15475 table_index = do_uniprop_match(lookup_name, lookup_len);
15476
15477 /* If it didn't find the property ... */
15478 if (table_index == 0) {
15479
15480 /* Try again stripping off any initial 'Is'. This is because we
15481 * promise that an initial Is is optional. The same isn't true of
15482 * names that start with 'In'. Those can match only blocks, and the
15483 * lookup table already has those accounted for. The lookup table also
15484 * has already accounted for Perl extensions (without and = sign)
15485 * starting with 'i's'. */
15486 if (starts_with_Is && equals_pos >= 0) {
15487 lookup_name += 2;
15488 lookup_len -= 2;
15489 equals_pos -= 2;
15490 slash_pos -= 2;
15491
15492 table_index = do_uniprop_match(lookup_name, lookup_len);
15493 }
15494
15495 if (table_index == 0) {
15496 char * canonical;
15497
15498 /* Here, we didn't find it. If not a numeric type property, and
15499 * can't be a user-defined one, it isn't a legal property */
15500 if (! is_nv_type) {
15501 if (! could_be_user_defined) {
15502 goto failed;
15503 }
15504
15505 /* Here, the property name is legal as a user-defined one. At
15506 * compile time, it might just be that the subroutine for that
15507 * property hasn't been encountered yet, but at runtime, it's
15508 * an error to try to use an undefined one */
15509 if (! deferrable) {
15510 goto unknown_user_defined;;
15511 }
15512
15513 goto definition_deferred;
15514 } /* End of isn't a numeric type property */
15515
15516 /* The numeric type properties need more work to decide. What we
15517 * do is make sure we have the number in canonical form and look
15518 * that up. */
15519
15520 if (slash_pos < 0) { /* No slash */
15521
15522 /* When it isn't a rational, take the input, convert it to a
15523 * NV, then create a canonical string representation of that
15524 * NV. */
15525
15526 NV value;
15527 SSize_t value_len = lookup_len - equals_pos;
15528
15529 /* Get the value */
15530 if ( value_len <= 0
15531 || my_atof3(lookup_name + equals_pos, &value,
15532 value_len)
15533 != lookup_name + lookup_len)
15534 {
15535 goto failed;
15536 }
15537
15538 /* If the value is an integer, the canonical value is integral
15539 * */
15540 if (Perl_ceil(value) == value) {
15541 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15542 equals_pos, lookup_name, value);
15543 }
15544 else { /* Otherwise, it is %e with a known precision */
15545 char * exp_ptr;
15546
15547 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15548 equals_pos, lookup_name,
15549 PL_E_FORMAT_PRECISION, value);
15550
15551 /* The exponent generated is expecting two digits, whereas
15552 * %e on some systems will generate three. Remove leading
15553 * zeros in excess of 2 from the exponent. We start
15554 * looking for them after the '=' */
15555 exp_ptr = strchr(canonical + equals_pos, 'e');
15556 if (exp_ptr) {
15557 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15558 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15559
15560 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15561
15562 if (excess_exponent_len > 0) {
15563 SSize_t leading_zeros = strspn(cur_ptr, "0");
15564 SSize_t excess_leading_zeros
15565 = MIN(leading_zeros, excess_exponent_len);
15566 if (excess_leading_zeros > 0) {
15567 Move(cur_ptr + excess_leading_zeros,
15568 cur_ptr,
15569 strlen(cur_ptr) - excess_leading_zeros
15570 + 1, /* Copy the NUL as well */
15571 char);
15572 }
15573 }
15574 }
15575 }
15576 }
15577 else { /* Has a slash. Create a rational in canonical form */
15578 UV numerator, denominator, gcd, trial;
15579 const char * end_ptr;
15580 const char * sign = "";
15581
15582 /* We can't just find the numerator, denominator, and do the
15583 * division, then use the method above, because that is
15584 * inexact. And the input could be a rational that is within
15585 * epsilon (given our precision) of a valid rational, and would
15586 * then incorrectly compare valid.
15587 *
15588 * We're only interested in the part after the '=' */
15589 const char * this_lookup_name = lookup_name + equals_pos;
15590 lookup_len -= equals_pos;
15591 slash_pos -= equals_pos;
15592
15593 /* Handle any leading minus */
15594 if (this_lookup_name[0] == '-') {
15595 sign = "-";
15596 this_lookup_name++;
15597 lookup_len--;
15598 slash_pos--;
15599 }
15600
15601 /* Convert the numerator to numeric */
15602 end_ptr = this_lookup_name + slash_pos;
15603 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15604 goto failed;
15605 }
15606
15607 /* It better have included all characters before the slash */
15608 if (*end_ptr != '/') {
15609 goto failed;
15610 }
15611
15612 /* Set to look at just the denominator */
15613 this_lookup_name += slash_pos;
15614 lookup_len -= slash_pos;
15615 end_ptr = this_lookup_name + lookup_len;
15616
15617 /* Convert the denominator to numeric */
15618 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15619 goto failed;
15620 }
15621
15622 /* It better be the rest of the characters, and don't divide by
15623 * 0 */
15624 if ( end_ptr != this_lookup_name + lookup_len
15625 || denominator == 0)
15626 {
15627 goto failed;
15628 }
15629
15630 /* Get the greatest common denominator using
15631 https://en.wikipedia.org/wiki/Euclidean_algorithm */
15632 gcd = numerator;
15633 trial = denominator;
15634 while (trial != 0) {
15635 UV temp = trial;
15636 trial = gcd % trial;
15637 gcd = temp;
15638 }
15639
15640 /* If already in lowest possible terms, we have already tried
15641 * looking this up */
15642 if (gcd == 1) {
15643 goto failed;
15644 }
15645
15646 /* Reduce the rational, which should put it in canonical form
15647 * */
15648 numerator /= gcd;
15649 denominator /= gcd;
15650
15651 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15652 equals_pos, lookup_name, sign, numerator, denominator);
15653 }
15654
15655 /* Here, we have the number in canonical form. Try that */
15656 table_index = do_uniprop_match(canonical, strlen(canonical));
15657 if (table_index == 0) {
15658 goto failed;
15659 }
15660 } /* End of still didn't find the property in our table */
15661 } /* End of didn't find the property in our table */
15662
15663 /* Here, we have a non-zero return, which is an index into a table of ptrs.
15664 * A negative return signifies that the real index is the absolute value,
15665 * but the result needs to be inverted */
15666 if (table_index < 0) {
15667 invert_return = TRUE;
15668 table_index = -table_index;
15669 }
15670
15671 /* Out-of band indices indicate a deprecated property. The proper index is
15672 * modulo it with the table size. And dividing by the table size yields
15673 * an offset into a table constructed by regen/mk_invlists.pl to contain
15674 * the corresponding warning message */
15675 if (table_index > MAX_UNI_KEYWORD_INDEX) {
15676 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15677 table_index %= MAX_UNI_KEYWORD_INDEX;
15678 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15679 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15680 (int) name_len, name,
15681 get_deprecated_property_msg(warning_offset));
15682 }
15683
15684 /* In a few properties, a different property is used under /i. These are
15685 * unlikely to change, so are hard-coded here. */
15686 if (to_fold) {
15687 if ( table_index == UNI_XPOSIXUPPER
15688 || table_index == UNI_XPOSIXLOWER
15689 || table_index == UNI_TITLE)
15690 {
15691 table_index = UNI_CASED;
15692 }
15693 else if ( table_index == UNI_UPPERCASELETTER
15694 || table_index == UNI_LOWERCASELETTER
15695# ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
15696 || table_index == UNI_TITLECASELETTER
15697# endif
15698 ) {
15699 table_index = UNI_CASEDLETTER;
15700 }
15701 else if ( table_index == UNI_POSIXUPPER
15702 || table_index == UNI_POSIXLOWER)
15703 {
15704 table_index = UNI_POSIXALPHA;
15705 }
15706 }
15707
15708 /* Create and return the inversion list */
15709 prop_definition = get_prop_definition(table_index);
15710 sv_2mortal(prop_definition);
15711
15712 /* See if there is a private use override to add to this definition */
15713 {
15714 COPHH * hinthash = (IN_PERL_COMPILETIME)
15715 ? CopHINTHASH_get(&PL_compiling)
15716 : CopHINTHASH_get(PL_curcop);
15717 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15718
15719 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15720
15721 /* See if there is an element in the hints hash for this table */
15722 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15723 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15724
15725 if (pos) {
15726 bool dummy;
15727 SV * pu_definition;
15728 SV * pu_invlist;
15729 SV * expanded_prop_definition =
15730 sv_2mortal(invlist_clone(prop_definition, NULL));
15731
15732 /* If so, it's definition is the string from here to the next
15733 * \a character. And its format is the same as a user-defined
15734 * property */
15735 pos += SvCUR(pu_lookup);
15736 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15737 pu_invlist = handle_user_defined_property(lookup_name,
15738 lookup_len,
15739 0, /* Not UTF-8 */
15740 0, /* Not folded */
15741 runtime,
15742 deferrable,
15743 pu_definition,
15744 &dummy,
15745 msg,
15746 level);
15747 if (TAINT_get) {
15748 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15749 sv_catpvs(msg, "Insecure private-use override");
15750 goto append_name_to_msg;
15751 }
15752
15753 /* For now, as a safety measure, make sure that it doesn't
15754 * override non-private use code points */
15755 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15756
15757 /* Add it to the list to be returned */
15758 _invlist_union(prop_definition, pu_invlist,
15759 &expanded_prop_definition);
15760 prop_definition = expanded_prop_definition;
15761 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15762 }
15763 }
15764 }
15765
15766 if (invert_return) {
15767 _invlist_invert(prop_definition);
15768 }
15769 return prop_definition;
15770
15771 unknown_user_defined:
15772 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15773 sv_catpvs(msg, "Unknown user-defined property name");
15774 goto append_name_to_msg;
15775
15776 failed:
15777 if (non_pkg_begin != 0) {
15778 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15779 sv_catpvs(msg, "Illegal user-defined property name");
15780 }
15781 else {
15782 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15783 sv_catpvs(msg, "Can't find Unicode property definition");
15784 }
15785 /* FALLTHROUGH */
15786
15787 append_name_to_msg:
15788 {
15789 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
15790 const char * suffix = (runtime && level == 0) ? "}" : "\"";
15791
15792 sv_catpv(msg, prefix);
15793 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15794 sv_catpv(msg, suffix);
15795 }
15796
15797 return NULL;
15798
15799 definition_deferred:
15800
15801 {
15802 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
15803
15804 /* Here it could yet to be defined, so defer evaluation of this until
15805 * its needed at runtime. We need the fully qualified property name to
15806 * avoid ambiguity */
15807 if (! fq_name) {
15808 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15809 is_qualified);
15810 }
15811
15812 /* If it didn't come with a package, or the package is utf8::, this
15813 * actually could be an official Unicode property whose inclusion we
15814 * are deferring until runtime to make sure that it isn't overridden by
15815 * a user-defined property of the same name (which we haven't
15816 * encountered yet). Add a marker to indicate this possibility, for
15817 * use at such time when we first need the definition during pattern
15818 * matching execution */
15819 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15820 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15821 }
15822
15823 /* We also need a trailing newline */
15824 sv_catpvs(fq_name, "\n");
15825
15826 *user_defined_ptr = TRUE;
15827 return fq_name;
15828 }
15829}
15830
15831STATIC bool
15832S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15833 const STRLEN wname_len, /* Its length */
15834 SV ** prop_definition,
15835 AV ** strings)
15836{
15837 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15838 * any matches, adding them to prop_definition */
15839
15840 dSP;
15841
15842 CV * get_names_info; /* entry to charnames.pm to get info we need */
15843 SV * names_string; /* Contains all character names, except algo */
15844 SV * algorithmic_names; /* Contains info about algorithmically
15845 generated character names */
15846 REGEXP * subpattern_re; /* The user's pattern to match with */
15847 struct regexp * prog; /* The compiled pattern */
15848 char * all_names_start; /* lib/unicore/Name.pl string of every
15849 (non-algorithmic) character name */
15850 char * cur_pos; /* We match, effectively using /gc; this is
15851 where we are now */
15852 bool found_matches = FALSE; /* Did any name match so far? */
15853 SV * empty; /* For matching zero length names */
15854 SV * must_sv; /* Contains the substring, if any, that must be
15855 in a name for the subpattern to match */
15856 const char * must; /* The PV of 'must' */
15857 STRLEN must_len; /* And its length */
15858 SV * syllable_name = NULL; /* For Hangul syllables */
15859 const char hangul_prefix[] = "HANGUL SYLLABLE ";
15860 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15861
15862 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15863 * syllable name, and these are immutable and guaranteed by the Unicode
15864 * standard to never be extended */
15865 const STRLEN syl_max_len = hangul_prefix_len + 7;
15866
15867 IV i;
15868
15869 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15870
15871 /* Make sure _charnames is loaded. (The parameters give context
15872 * for any errors generated */
15873 get_names_info = get_cv("_charnames::_get_names_info", 0);
15874 if (! get_names_info) {
15875 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15876 }
15877
15878 /* Get the charnames data */
15879 PUSHSTACKi(PERLSI_REGCOMP);
15880 ENTER ;
15881 SAVETMPS;
15882 save_re_context();
15883
15884 PUSHMARK(SP) ;
15885 PUTBACK;
15886
15887 /* Special _charnames entry point that returns the info this routine
15888 * requires */
15889 call_sv(MUTABLE_SV(get_names_info), G_LIST);
15890
15891 SPAGAIN ;
15892
15893 /* Data structure for names which end in their very own code points */
15894 algorithmic_names = POPs;
15895 SvREFCNT_inc_simple_void_NN(algorithmic_names);
15896
15897 /* The lib/unicore/Name.pl string */
15898 names_string = POPs;
15899 SvREFCNT_inc_simple_void_NN(names_string);
15900
15901 PUTBACK ;
15902 FREETMPS ;
15903 LEAVE ;
15904 POPSTACK;
15905
15906 if ( ! SvROK(names_string)
15907 || ! SvROK(algorithmic_names))
15908 { /* Perhaps should panic instead XXX */
15909 SvREFCNT_dec(names_string);
15910 SvREFCNT_dec(algorithmic_names);
15911 return FALSE;
15912 }
15913
15914 names_string = sv_2mortal(SvRV(names_string));
15915 all_names_start = SvPVX(names_string);
15916 cur_pos = all_names_start;
15917
15918 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
15919
15920 /* Compile the subpattern consisting of the name being looked for */
15921 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
15922
15923 must_sv = re_intuit_string(subpattern_re);
15924 if (must_sv) {
15925 /* regexec.c can free the re_intuit_string() return. GH #17734 */
15926 must_sv = sv_2mortal(newSVsv(must_sv));
15927 must = SvPV(must_sv, must_len);
15928 }
15929 else {
15930 must = "";
15931 must_len = 0;
15932 }
15933
15934 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
15935 * This works because the NUL causes the function to return early, thus
15936 * showing that there are characters in it other than the acceptable ones,
15937 * which is our desired result.) */
15938
15939 prog = ReANY(subpattern_re);
15940
15941 /* If only nothing is matched, skip to where empty names are looked for */
15942 if (prog->maxlen == 0) {
15943 goto check_empty;
15944 }
15945
15946 /* And match against the string of all names /gc. Don't even try if it
15947 * must match a character not found in any name. */
15948 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
15949 {
15950 while (execute_wildcard(subpattern_re,
15951 cur_pos,
15952 SvEND(names_string),
15953 all_names_start, 0,
15954 names_string,
15955 0))
15956 { /* Here, matched. */
15957
15958 /* Note the string entries look like
15959 * 00001\nSTART OF HEADING\n\n
15960 * so we could match anywhere in that string. We have to rule out
15961 * matching a code point line */
15962 char * this_name_start = all_names_start
15963 + RX_OFFS_START(subpattern_re,0);
15964 char * this_name_end = all_names_start
15965 + RX_OFFS_END(subpattern_re,0);
15966 char * cp_start;
15967 char * cp_end;
15968 UV cp = 0; /* Silences some compilers */
15969 AV * this_string = NULL;
15970 bool is_multi = FALSE;
15971
15972 /* If matched nothing, advance to next possible match */
15973 if (this_name_start == this_name_end) {
15974 cur_pos = (char *) memchr(this_name_end + 1, '\n',
15975 SvEND(names_string) - this_name_end);
15976 if (cur_pos == NULL) {
15977 break;
15978 }
15979 }
15980 else {
15981 /* Position the next match to start beyond the current returned
15982 * entry */
15983 cur_pos = (char *) memchr(this_name_end, '\n',
15984 SvEND(names_string) - this_name_end);
15985 }
15986
15987 /* Back up to the \n just before the beginning of the character. */
15988 cp_end = (char *) my_memrchr(all_names_start,
15989 '\n',
15990 this_name_start - all_names_start);
15991
15992 /* If we didn't find a \n, it means it matched somewhere in the
15993 * initial '00000' in the string, so isn't a real match */
15994 if (cp_end == NULL) {
15995 continue;
15996 }
15997
15998 this_name_start = cp_end + 1; /* The name starts just after */
15999 cp_end--; /* the \n, and the code point */
16000 /* ends just before it */
16001
16002 /* All code points are 5 digits long */
16003 cp_start = cp_end - 4;
16004
16005 /* This shouldn't happen, as we found a \n, and the first \n is
16006 * further along than what we subtracted */
16007 assert(cp_start >= all_names_start);
16008
16009 if (cp_start == all_names_start) {
16010 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
16011 continue;
16012 }
16013
16014 /* If the character is a blank, we either have a named sequence, or
16015 * something is wrong */
16016 if (*(cp_start - 1) == ' ') {
16017 cp_start = (char *) my_memrchr(all_names_start,
16018 '\n',
16019 cp_start - all_names_start);
16020 cp_start++;
16021 }
16022
16023 assert(cp_start != NULL && cp_start >= all_names_start + 2);
16024
16025 /* Except for the first line in the string, the sequence before the
16026 * code point is \n\n. If that isn't the case here, we didn't
16027 * match the name of a character. (We could have matched a named
16028 * sequence, not currently handled */
16029 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16030 continue;
16031 }
16032
16033 /* We matched! Add this to the list */
16034 found_matches = TRUE;
16035
16036 /* Loop through all the code points in the sequence */
16037 while (cp_start < cp_end) {
16038
16039 /* Calculate this code point from its 5 digits */
16040 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16041 + (XDIGIT_VALUE(cp_start[1]) << 12)
16042 + (XDIGIT_VALUE(cp_start[2]) << 8)
16043 + (XDIGIT_VALUE(cp_start[3]) << 4)
16044 + XDIGIT_VALUE(cp_start[4]);
16045
16046 cp_start += 6; /* Go past any blank */
16047
16048 if (cp_start < cp_end || is_multi) {
16049 if (this_string == NULL) {
16050 this_string = newAV();
16051 }
16052
16053 is_multi = TRUE;
16054 av_push_simple(this_string, newSVuv(cp));
16055 }
16056 }
16057
16058 if (is_multi) { /* Was more than one code point */
16059 if (*strings == NULL) {
16060 *strings = newAV();
16061 }
16062
16063 av_push_simple(*strings, (SV *) this_string);
16064 }
16065 else { /* Only a single code point */
16066 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16067 }
16068 } /* End of loop through the non-algorithmic names string */
16069 }
16070
16071 /* There are also character names not in 'names_string'. These are
16072 * algorithmically generatable. Try this pattern on each possible one.
16073 * (khw originally planned to leave this out given the large number of
16074 * matches attempted; but the speed turned out to be quite acceptable
16075 *
16076 * There are plenty of opportunities to optimize to skip many of the tests.
16077 * beyond the rudimentary ones already here */
16078
16079 /* First see if the subpattern matches any of the algorithmic generatable
16080 * Hangul syllable names.
16081 *
16082 * We know none of these syllable names will match if the input pattern
16083 * requires more bytes than any syllable has, or if the input pattern only
16084 * matches an empty name, or if the pattern has something it must match and
16085 * one of the characters in that isn't in any Hangul syllable. */
16086 if ( prog->minlen <= (SSize_t) syl_max_len
16087 && prog->maxlen > 0
16088 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16089 {
16090 /* These constants, names, values, and algorithm are adapted from the
16091 * Unicode standard, version 5.1, section 3.12, and should never
16092 * change. */
16093 const char * JamoL[] = {
16094 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16095 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16096 };
16097 const int LCount = C_ARRAY_LENGTH(JamoL);
16098
16099 const char * JamoV[] = {
16100 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16101 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16102 "I"
16103 };
16104 const int VCount = C_ARRAY_LENGTH(JamoV);
16105
16106 const char * JamoT[] = {
16107 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16108 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16109 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16110 };
16111 const int TCount = C_ARRAY_LENGTH(JamoT);
16112
16113 int L, V, T;
16114
16115 /* This is the initial Hangul syllable code point; each time through the
16116 * inner loop, it maps to the next higher code point. For more info,
16117 * see the Hangul syllable section of the Unicode standard. */
16118 int cp = 0xAC00;
16119
16120 syllable_name = sv_2mortal(newSV(syl_max_len));
16121 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16122
16123 for (L = 0; L < LCount; L++) {
16124 for (V = 0; V < VCount; V++) {
16125 for (T = 0; T < TCount; T++) {
16126
16127 /* Truncate back to the prefix, which is unvarying */
16128 SvCUR_set(syllable_name, hangul_prefix_len);
16129
16130 sv_catpv(syllable_name, JamoL[L]);
16131 sv_catpv(syllable_name, JamoV[V]);
16132 sv_catpv(syllable_name, JamoT[T]);
16133
16134 if (execute_wildcard(subpattern_re,
16135 SvPVX(syllable_name),
16136 SvEND(syllable_name),
16137 SvPVX(syllable_name), 0,
16138 syllable_name,
16139 0))
16140 {
16141 *prop_definition = add_cp_to_invlist(*prop_definition,
16142 cp);
16143 found_matches = TRUE;
16144 }
16145
16146 cp++;
16147 }
16148 }
16149 }
16150 }
16151
16152 /* The rest of the algorithmically generatable names are of the form
16153 * "PREFIX-code_point". The prefixes and the code point limits of each
16154 * were returned to us in the array 'algorithmic_names' from data in
16155 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
16156 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16157 IV j;
16158
16159 /* Each element of the array is a hash, giving the details for the
16160 * series of names it covers. There is the base name of the characters
16161 * in the series, and the low and high code points in the series. And,
16162 * for optimization purposes a string containing all the legal
16163 * characters that could possibly be in a name in this series. */
16164 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16165 SV * prefix = * hv_fetchs(this_series, "name", 0);
16166 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16167 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16168 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16169
16170 /* Pre-allocate an SV with enough space */
16171 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16172 SvPVX(prefix)));
16173 if (high >= 0x10000) {
16174 sv_catpvs(algo_name, "0");
16175 }
16176
16177 /* This series can be skipped entirely if the pattern requires
16178 * something longer than any name in the series, or can only match an
16179 * empty name, or contains a character not found in any name in the
16180 * series */
16181 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
16182 && prog->maxlen > 0
16183 && (strspn(must, legal) == must_len))
16184 {
16185 for (j = low; j <= high; j++) { /* For each code point in the series */
16186
16187 /* Get its name, and see if it matches the subpattern */
16188 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16189 (unsigned) j);
16190
16191 if (execute_wildcard(subpattern_re,
16192 SvPVX(algo_name),
16193 SvEND(algo_name),
16194 SvPVX(algo_name), 0,
16195 algo_name,
16196 0))
16197 {
16198 *prop_definition = add_cp_to_invlist(*prop_definition, j);
16199 found_matches = TRUE;
16200 }
16201 }
16202 }
16203 }
16204
16205 check_empty:
16206 /* Finally, see if the subpattern matches an empty string */
16207 empty = newSVpvs("");
16208 if (execute_wildcard(subpattern_re,
16209 SvPVX(empty),
16210 SvEND(empty),
16211 SvPVX(empty), 0,
16212 empty,
16213 0))
16214 {
16215 /* Many code points have empty names. Currently these are the \p{GC=C}
16216 * ones, minus CC and CF */
16217
16218 SV * empty_names_ref = get_prop_definition(UNI_C);
16219 SV * empty_names = invlist_clone(empty_names_ref, NULL);
16220
16221 SV * subtract = get_prop_definition(UNI_CC);
16222
16223 _invlist_subtract(empty_names, subtract, &empty_names);
16224 SvREFCNT_dec_NN(empty_names_ref);
16225 SvREFCNT_dec_NN(subtract);
16226
16227 subtract = get_prop_definition(UNI_CF);
16228 _invlist_subtract(empty_names, subtract, &empty_names);
16229 SvREFCNT_dec_NN(subtract);
16230
16231 _invlist_union(*prop_definition, empty_names, prop_definition);
16232 found_matches = TRUE;
16233 SvREFCNT_dec_NN(empty_names);
16234 }
16235 SvREFCNT_dec_NN(empty);
16236
16237#if 0
16238 /* If we ever were to accept aliases for, say private use names, we would
16239 * need to do something fancier to find empty names. The code below works
16240 * (at the time it was written), and is slower than the above */
16241 const char empties_pat[] = "^.";
16242 if (strNE(name, empties_pat)) {
16243 SV * empty = newSVpvs("");
16244 if (execute_wildcard(subpattern_re,
16245 SvPVX(empty),
16246 SvEND(empty),
16247 SvPVX(empty), 0,
16248 empty,
16249 0))
16250 {
16251 SV * empties = NULL;
16252
16253 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16254
16255 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16256 SvREFCNT_dec_NN(empties);
16257
16258 found_matches = TRUE;
16259 }
16260 SvREFCNT_dec_NN(empty);
16261 }
16262#endif
16263
16264 SvREFCNT_dec_NN(subpattern_re);
16265 return found_matches;
16266}
16267
16268/*
16269 * ex: set ts=8 sts=4 sw=4 et:
16270 */