5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
35 * pregcomp and pregexec -- regsub and regerror are not used in perl
37 * Copyright (c) 1986 by University of Toronto.
38 * Written by Henry Spencer. Not derived from licensed software.
40 * Permission is granted to anyone to use this software for any
41 * purpose on any computer system, and to redistribute it freely,
42 * subject to the following restrictions:
44 * 1. The author is not responsible for the consequences of use of
45 * this software, no matter how awful, even if they arise
48 * 2. The origin of this software must not be misrepresented, either
49 * by explicit claim or by omission.
51 * 3. Altered versions must be plainly marked as such, and must not
52 * be misrepresented as being the original software.
55 **** Alterations to Henry's code are...
57 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
58 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
59 **** by Larry Wall and others
61 **** You may distribute under the terms of either the GNU General Public
62 **** License or the Artistic License, as specified in the README file.
65 * Beware that some of this code is subtly aware of the way operator
66 * precedence is structured in regular expressions. Serious changes in
67 * regular-expression syntax might require a total rethink.
70 /* Note on debug output:
72 * This is set up so that -Dr turns on debugging like all other flags that are
73 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
74 * all regular expressions encountered in a program, and gives a huge amount of
75 * output for all but the shortest programs.
77 * The ability to output pattern debugging information lexically, and with much
78 * finer grained control was added, with 'use re qw(Debug ....);' available even
79 * in non-DEBUGGING builds. This is accomplished by copying the contents of
80 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
81 * Those files are compiled and linked into the perl executable, and they are
82 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
85 * That would normally mean linking errors when two functions of the same name
86 * are attempted to be placed into the same executable. That is solved in one
88 * 1) Static functions aren't known outside the file they are in, so for the
89 * many functions of that type in this file, it just isn't a problem.
90 * 2) Most externally known functions are enclosed in
91 * #ifndef PERL_IN_XSUB_RE
94 * blocks, so there is only one definition for them in the whole
95 * executable, the one in regcomp.c (or regexec.c). The implication of
96 * that is any debugging info that comes from them is controlled only by
97 * -Dr. Further, any static function they call will also be the version
98 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
99 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
100 * have different names, so that what gets loaded in the executable is
101 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
102 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
103 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
104 * versions and their callees are under control of re.pm. The catch is
105 * that references to all these go through the regexp_engine structure,
106 * which is initialized in regcomp.h to the Perl_foo versions, and
107 * substituted out in lexical scopes where 'use re' is in effect to the
108 * 'my_foo' ones. That structure is public API, so it would be a hard
109 * sell to add any additional members.
110 * 4) For functions in regcomp.c and re_comp.c that are called only from,
111 * respectively, regexec.c and re_exec.c, they can have two different
112 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
115 * The bottom line is that if you add code to one of the public functions
116 * listed in ext/re/re_top.h, debugging automagically works. But if you write
117 * a new function that needs to do debugging or there is a chain of calls from
118 * it that need to do debugging, all functions in the chain should use options
121 * A function may have to be split so that debugging stuff is static, but it
122 * calls out to some other function that only gets compiled in regcomp.c to
123 * access data that we don't want to duplicate.
126 #ifdef PERL_EXT_RE_BUILD
131 #define PERL_IN_REGEX_ENGINE
132 #define PERL_IN_REGCOMP_ANY
133 #define PERL_IN_REGCOMP_C
136 #ifdef PERL_IN_XSUB_RE
137 # include "re_comp.h"
138 EXTERN_C const struct regexp_engine my_reg_engine;
139 EXTERN_C const struct regexp_engine wild_reg_engine;
141 # include "regcomp.h"
144 #include "invlist_inline.h"
145 #include "unicode_constants.h"
146 #include "regcomp_internal.h"
148 /* =========================================================
149 * BEGIN edit_distance stuff.
151 * This calculates how many single character changes of any type are needed to
152 * transform a string into another one. It is taken from version 3.1 of
154 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
157 /* Our unsorted dictionary linked list. */
158 /* Note we use UVs, not chars. */
163 struct dictionary* next;
165 typedef struct dictionary item;
168 PERL_STATIC_INLINE item*
169 push(UV key, item* curr)
180 PERL_STATIC_INLINE item*
181 find(item* head, UV key)
183 item* iterator = head;
185 if (iterator->key == key){
188 iterator = iterator->next;
194 PERL_STATIC_INLINE item*
195 uniquePush(item* head, UV key)
197 item* iterator = head;
200 if (iterator->key == key) {
203 iterator = iterator->next;
206 return push(key, head);
209 PERL_STATIC_INLINE void
210 dict_free(item* head)
212 item* iterator = head;
215 item* temp = iterator;
216 iterator = iterator->next;
223 /* End of Dictionary Stuff */
225 /* All calculations/work are done here */
227 S_edit_distance(const UV* src,
229 const STRLEN x, /* length of src[] */
230 const STRLEN y, /* length of tgt[] */
231 const SSize_t maxDistance
235 UV swapCount, swapScore, targetCharCount, i, j;
237 UV score_ceil = x + y;
239 PERL_ARGS_ASSERT_EDIT_DISTANCE;
241 /* initialize matrix start values */
242 Newx(scores, ( (x + 2) * (y + 2)), UV);
243 scores[0] = score_ceil;
244 scores[1 * (y + 2) + 0] = score_ceil;
245 scores[0 * (y + 2) + 1] = score_ceil;
246 scores[1 * (y + 2) + 1] = 0;
247 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
254 head = uniquePush(head, src[i]);
255 scores[(i+1) * (y + 2) + 1] = i;
256 scores[(i+1) * (y + 2) + 0] = score_ceil;
262 head = uniquePush(head, tgt[j]);
263 scores[1 * (y + 2) + (j + 1)] = j;
264 scores[0 * (y + 2) + (j + 1)] = score_ceil;
267 targetCharCount = find(head, tgt[j-1])->value;
268 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
270 if (src[i-1] != tgt[j-1]){
271 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
275 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
279 find(head, src[i-1])->value = i;
283 IV score = scores[(x+1) * (y + 2) + (y + 1)];
286 return (maxDistance != 0 && maxDistance < score)?(-1):score;
290 /* END of edit_distance() stuff
291 * ========================================================= */
293 /* add a data member to the struct reg_data attached to this regex, it should
294 * always return a non-zero return. the 's' argument is the type of the items
295 * being added and the n is the number of items. The length of 's' should match
296 * the number of items. */
298 Perl_reg_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
300 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 1;
302 PERL_ARGS_ASSERT_REG_ADD_DATA;
304 /* in the below expression we have (count + n - 1), the minus one is there
305 * because the struct that we allocate already contains a slot for 1 data
306 * item, so we do not need to allocate it the first time. IOW, the
307 * sizeof(*RExC_rxi->data) already accounts for one of the elements we need
308 * to allocate. See struct reg_data in regcomp.h
310 Renewc(RExC_rxi->data,
311 sizeof(*RExC_rxi->data) + (sizeof(void*) * (count + n - 1)),
312 char, struct reg_data);
313 /* however in the data->what expression we use (count + n) and do not
314 * subtract one from the result because the data structure contains a
315 * pointer to an array, and does not allocate the first element as part of
316 * the data struct. */
318 Renew(RExC_rxi->data->what, (count + n), U8);
320 /* when count == 1 it means we have not initialized anything.
321 * we always fill the 0 slot of the data array with a '%' entry, which
322 * means "zero" (all the other types are letters) which exists purely
323 * so the return from reg_add_data is ALWAYS true, so we can tell it apart
324 * from a "no value" idx=0 in places where we would return an index
325 * into reg_add_data. This is particularly important with the new "single
326 * pass, usually, but not always" strategy that we use, where the code
327 * will use a 0 to represent "not able to compute this yet".
329 Newx(RExC_rxi->data->what, n+1, U8);
330 /* fill in the placeholder slot of 0 with a what of '%', we use
331 * this because it sorta looks like a zero (0/0) and it is not a letter
332 * like any of the other "whats", this type should never be created
333 * any other way but here. '%' happens to also not appear in this
334 * file for any other reason (at the time of writing this comment)*/
335 RExC_rxi->data->what[0]= '%';
336 RExC_rxi->data->data[0]= NULL;
338 RExC_rxi->data->count = count + n;
339 Copy(s, RExC_rxi->data->what + count, n, U8);
344 /*XXX: todo make this not included in a non debugging perl, but appears to be
345 * used anyway there, in 'use re' */
346 #ifndef PERL_IN_XSUB_RE
348 Perl_reginitcolors(pTHX)
350 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
362 PL_colors[i] = t = (char *)"";
367 PL_colors[i++] = (char *)"";
374 #ifdef TRIE_STUDY_OPT
375 /* search for "restudy" in this file for a detailed explanation */
376 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
379 (data.flags & SCF_TRIE_RESTUDY) \
387 #define CHECK_RESTUDY_GOTO_butfirst
391 * pregcomp - compile a regular expression into internal code
393 * Decides which engine's compiler to call based on the hint currently in
397 #ifndef PERL_IN_XSUB_RE
399 /* return the currently in-scope regex engine (or the default if none) */
401 regexp_engine const *
402 Perl_current_re_engine(pTHX)
404 if (IN_PERL_COMPILETIME) {
405 HV * const table = GvHV(PL_hintgv);
408 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
409 return &PL_core_reg_engine;
410 ptr = hv_fetchs(table, "regcomp", FALSE);
411 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
412 return &PL_core_reg_engine;
413 return INT2PTR(regexp_engine*, SvIV(*ptr));
417 if (!PL_curcop->cop_hints_hash)
418 return &PL_core_reg_engine;
419 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
420 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
421 return &PL_core_reg_engine;
422 return INT2PTR(regexp_engine*, SvIV(ptr));
428 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
430 regexp_engine const *eng = current_re_engine();
431 DECLARE_AND_GET_RE_DEBUG_FLAGS;
433 PERL_ARGS_ASSERT_PREGCOMP;
435 /* Dispatch a request to compile a regexp to correct regexp engine. */
437 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
440 return CALLREGCOMP_ENG(eng, pattern, flags);
445 =for apidoc re_compile
447 Compile the regular expression pattern C<pattern>, returning a pointer to the
448 compiled object for later matching with the internal regex engine.
450 This function is typically used by a custom regexp engine C<.comp()> function
451 to hand off to the core regexp engine those patterns it doesn't want to handle
452 itself (typically passing through the same flags it was called with). In
453 almost all other cases, a regexp should be compiled by calling L</C<pregcomp>>
454 to compile using the currently active regexp engine.
456 If C<pattern> is already a C<REGEXP>, this function does nothing but return a
457 pointer to the input. Otherwise the PV is extracted and treated like a string
458 representing a pattern. See L<perlre>.
460 The possible flags for C<rx_flags> are documented in L<perlreapi>. Their names
461 all begin with C<RXf_>.
465 * public entry point for the perl core's own regex compiling code.
466 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
467 * pattern rather than a list of OPs, and uses the internal engine rather
468 * than the current one */
471 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
473 SV *pat = pattern; /* defeat constness! */
475 PERL_ARGS_ASSERT_RE_COMPILE;
477 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
478 #ifdef PERL_IN_XSUB_RE
483 NULL, NULL, rx_flags, 0);
487 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
491 if (--cbs->refcnt > 0)
493 for (n = 0; n < cbs->count; n++) {
494 REGEXP *rx = cbs->cb[n].src_regex;
496 cbs->cb[n].src_regex = NULL;
505 static struct reg_code_blocks *
506 S_alloc_code_blocks(pTHX_ int ncode)
508 struct reg_code_blocks *cbs;
509 Newx(cbs, 1, struct reg_code_blocks);
512 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
514 Newx(cbs->cb, ncode, struct reg_code_block);
521 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
522 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
523 * point to the realloced string and length.
525 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
529 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
530 char **pat_p, STRLEN *plen_p, int num_code_blocks)
532 U8 *const src = (U8*)*pat_p;
537 DECLARE_AND_GET_RE_DEBUG_FLAGS;
539 DEBUG_PARSE_r(Perl_re_printf( aTHX_
540 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
542 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
543 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
546 while (s < *plen_p) {
547 append_utf8_from_native_byte(src[s], &d);
549 if (n < num_code_blocks) {
550 assert(pRExC_state->code_blocks);
551 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
552 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
553 assert(*(d - 1) == '(');
556 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
557 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
558 assert(*(d - 1) == ')');
567 *pat_p = (char*) dst;
569 RExC_orig_utf8 = RExC_utf8 = 1;
574 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
575 * while recording any code block indices, and handling overloading,
576 * nested qr// objects etc. If pat is null, it will allocate a new
577 * string, or just return the first arg, if there's only one.
579 * Returns the malloced/updated pat.
580 * patternp and pat_count is the array of SVs to be concatted;
581 * oplist is the optional list of ops that generated the SVs;
582 * recompile_p is a pointer to a boolean that will be set if
583 * the regex will need to be recompiled.
584 * delim, if non-null is an SV that will be inserted between each element
588 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
589 SV *pat, SV ** const patternp, int pat_count,
590 OP *oplist, bool *recompile_p, SV *delim)
594 bool use_delim = FALSE;
595 bool alloced = FALSE;
597 /* if we know we have at least two args, create an empty string,
598 * then concatenate args to that. For no args, return an empty string */
599 if (!pat && pat_count != 1) {
605 for (svp = patternp; svp < patternp + pat_count; svp++) {
608 STRLEN orig_patlen = 0;
610 SV *msv = use_delim ? delim : *svp;
611 if (!msv) msv = &PL_sv_undef;
613 /* if we've got a delimiter, we go round the loop twice for each
614 * svp slot (except the last), using the delimiter the second
623 if (SvTYPE(msv) == SVt_PVAV) {
624 /* we've encountered an interpolated array within
625 * the pattern, e.g. /...@a..../. Expand the list of elements,
626 * then recursively append elements.
627 * The code in this block is based on S_pushav() */
629 AV *const av = (AV*)msv;
630 const SSize_t maxarg = AvFILL(av) + 1;
634 assert(oplist->op_type == OP_PADAV
635 || oplist->op_type == OP_RV2AV);
636 oplist = OpSIBLING(oplist);
639 if (SvRMAGICAL(av)) {
642 Newx(array, maxarg, SV*);
644 for (i=0; i < maxarg; i++) {
645 SV ** const svp = av_fetch(av, i, FALSE);
646 array[i] = svp ? *svp : &PL_sv_undef;
653 pat = S_concat_pat(aTHX_ pRExC_state, pat,
654 array, maxarg, NULL, recompile_p,
656 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
659 pat = newSVpvs_flags("", SVs_TEMP);
666 /* we make the assumption here that each op in the list of
667 * op_siblings maps to one SV pushed onto the stack,
668 * except for code blocks, with have both an OP_NULL and
670 * This allows us to match up the list of SVs against the
671 * list of OPs to find the next code block.
673 * Note that PUSHMARK PADSV PADSV ..
675 * PADRANGE PADSV PADSV ..
676 * so the alignment still works. */
679 if (oplist->op_type == OP_NULL
680 && (oplist->op_flags & OPf_SPECIAL))
682 assert(n < pRExC_state->code_blocks->count);
683 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
684 pRExC_state->code_blocks->cb[n].block = oplist;
685 pRExC_state->code_blocks->cb[n].src_regex = NULL;
688 oplist = OpSIBLING(oplist); /* skip CONST */
691 oplist = OpSIBLING(oplist);;
694 /* apply magic and QR overloading to arg */
697 if (SvROK(msv) && SvAMAGIC(msv)) {
698 SV *sv = AMG_CALLunary(msv, regexp_amg);
702 if (SvTYPE(sv) != SVt_REGEXP)
703 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
708 /* try concatenation overload ... */
709 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
710 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
713 /* overloading involved: all bets are off over literal
714 * code. Pretend we haven't seen it */
716 pRExC_state->code_blocks->count -= n;
720 /* ... or failing that, try "" overload */
722 && (sv = AMG_CALLunary(msv, string_amg))
726 && SvRV(msv) == SvRV(sv))
731 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
735 /* this is a partially unrolled
736 * sv_catsv_nomg(pat, msv);
737 * that allows us to adjust code block indices if
740 char *dst = SvPV_force_nomg(pat, dlen);
742 if (SvUTF8(msv) && !SvUTF8(pat)) {
743 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
744 sv_setpvn(pat, dst, dlen);
747 sv_catsv_nomg(pat, msv);
751 /* We have only one SV to process, but we need to verify
752 * it is properly null terminated or we will fail asserts
753 * later. In theory we probably shouldn't get such SV's,
754 * but if we do we should handle it gracefully. */
755 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
756 /* not a string, or a string with a trailing null */
759 /* a string with no trailing null, we need to copy it
760 * so it has a trailing null */
761 pat = sv_2mortal(newSVsv(msv));
766 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
769 /* extract any code blocks within any embedded qr//'s */
770 if (rx && SvTYPE(rx) == SVt_REGEXP
771 && RX_ENGINE((REGEXP*)rx)->op_comp)
774 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
775 if (ri->code_blocks && ri->code_blocks->count) {
777 /* the presence of an embedded qr// with code means
778 * we should always recompile: the text of the
779 * qr// may not have changed, but it may be a
780 * different closure than last time */
782 if (pRExC_state->code_blocks) {
783 int new_count = pRExC_state->code_blocks->count
784 + ri->code_blocks->count;
785 Renew(pRExC_state->code_blocks->cb,
786 new_count, struct reg_code_block);
787 pRExC_state->code_blocks->count = new_count;
790 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
791 ri->code_blocks->count);
793 for (i=0; i < ri->code_blocks->count; i++) {
794 struct reg_code_block *src, *dst;
795 STRLEN offset = orig_patlen
796 + ReANY((REGEXP *)rx)->pre_prefix;
797 assert(n < pRExC_state->code_blocks->count);
798 src = &ri->code_blocks->cb[i];
799 dst = &pRExC_state->code_blocks->cb[n];
800 dst->start = src->start + offset;
801 dst->end = src->end + offset;
802 dst->block = src->block;
803 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
812 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
821 /* see if there are any run-time code blocks in the pattern.
822 * False positives are allowed */
825 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
826 char *pat, STRLEN plen)
833 for (s = 0; s < plen; s++) {
834 if ( pRExC_state->code_blocks
835 && n < pRExC_state->code_blocks->count
836 && s == pRExC_state->code_blocks->cb[n].start)
838 s = pRExC_state->code_blocks->cb[n].end;
842 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
844 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
846 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
853 /* Handle run-time code blocks. We will already have compiled any direct
854 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
855 * copy of it, but with any literal code blocks blanked out and
856 * appropriate chars escaped; then feed it into
858 * eval "qr'modified_pattern'"
862 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
866 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
868 * After eval_sv()-ing that, grab any new code blocks from the returned qr
869 * and merge them with any code blocks of the original regexp.
871 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
872 * instead, just save the qr and return FALSE; this tells our caller that
873 * the original pattern needs upgrading to utf8.
877 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
878 char *pat, STRLEN plen)
882 DECLARE_AND_GET_RE_DEBUG_FLAGS;
884 if (pRExC_state->runtime_code_qr) {
885 /* this is the second time we've been called; this should
886 * only happen if the main pattern got upgraded to utf8
887 * during compilation; re-use the qr we compiled first time
888 * round (which should be utf8 too)
890 qr = pRExC_state->runtime_code_qr;
891 pRExC_state->runtime_code_qr = NULL;
892 assert(RExC_utf8 && SvUTF8(qr));
898 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
902 /* determine how many extra chars we need for ' and \ escaping */
903 for (s = 0; s < plen; s++) {
904 if (pat[s] == '\'' || pat[s] == '\\')
908 Newx(newpat, newlen, char);
910 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
912 for (s = 0; s < plen; s++) {
913 if ( pRExC_state->code_blocks
914 && n < pRExC_state->code_blocks->count
915 && s == pRExC_state->code_blocks->cb[n].start)
917 /* blank out literal code block so that they aren't
918 * recompiled: eg change from/to:
928 assert(pat[s] == '(');
929 assert(pat[s+1] == '?');
933 while (s < pRExC_state->code_blocks->cb[n].end) {
941 if (pat[s] == '\'' || pat[s] == '\\')
946 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
948 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
954 Perl_re_printf( aTHX_
955 "%sre-parsing pattern for runtime code:%s %s\n",
956 PL_colors[4], PL_colors[5], newpat);
959 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
965 PUSHSTACKi(PERLSI_REQUIRE);
966 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
967 * parsing qr''; normally only q'' does this. It also alters
969 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
975 SV * const errsv = ERRSV;
976 if (SvTRUE_NN(errsv))
978 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
980 assert(SvROK(qr_ref));
982 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
983 /* the leaving below frees the tmp qr_ref.
984 * Give qr a life of its own */
992 if (!RExC_utf8 && SvUTF8(qr)) {
993 /* first time through; the pattern got upgraded; save the
994 * qr for the next time through */
995 assert(!pRExC_state->runtime_code_qr);
996 pRExC_state->runtime_code_qr = qr;
1001 /* extract any code blocks within the returned qr// */
1004 /* merge the main (r1) and run-time (r2) code blocks into one */
1006 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
1007 struct reg_code_block *new_block, *dst;
1008 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
1012 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
1014 SvREFCNT_dec_NN(qr);
1018 if (!r1->code_blocks)
1019 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
1021 r1c = r1->code_blocks->count;
1022 r2c = r2->code_blocks->count;
1024 Newx(new_block, r1c + r2c, struct reg_code_block);
1028 while (i1 < r1c || i2 < r2c) {
1029 struct reg_code_block *src;
1033 src = &r2->code_blocks->cb[i2++];
1037 src = &r1->code_blocks->cb[i1++];
1038 else if ( r1->code_blocks->cb[i1].start
1039 < r2->code_blocks->cb[i2].start)
1041 src = &r1->code_blocks->cb[i1++];
1042 assert(src->end < r2->code_blocks->cb[i2].start);
1045 assert( r1->code_blocks->cb[i1].start
1046 > r2->code_blocks->cb[i2].start);
1047 src = &r2->code_blocks->cb[i2++];
1049 assert(src->end < r1->code_blocks->cb[i1].start);
1052 assert(pat[src->start] == '(');
1053 assert(pat[src->end] == ')');
1054 dst->start = src->start;
1055 dst->end = src->end;
1056 dst->block = src->block;
1057 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
1061 r1->code_blocks->count += r2c;
1062 Safefree(r1->code_blocks->cb);
1063 r1->code_blocks->cb = new_block;
1066 SvREFCNT_dec_NN(qr);
1072 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
1073 struct reg_substr_datum *rsd,
1074 struct scan_data_substrs *sub,
1075 STRLEN longest_length)
1077 /* This is the common code for setting up the floating and fixed length
1078 * string data extracted from Perl_re_op_compile() below. Returns a boolean
1079 * as to whether succeeded or not */
1083 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
1084 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
1086 if (! (longest_length
1087 || (eol /* Can't have SEOL and MULTI */
1088 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
1090 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
1091 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
1096 /* copy the information about the longest from the reg_scan_data
1097 over to the program. */
1098 if (SvUTF8(sub->str)) {
1100 rsd->utf8_substr = sub->str;
1102 rsd->substr = sub->str;
1103 rsd->utf8_substr = NULL;
1105 /* end_shift is how many chars that must be matched that
1106 follow this item. We calculate it ahead of time as once the
1107 lookbehind offset is added in we lose the ability to correctly
1109 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
1110 rsd->end_shift = ml - sub->min_offset
1112 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
1114 + (SvTAIL(sub->str) != 0)
1118 t = (eol/* Can't have SEOL and MULTI */
1119 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
1120 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
1126 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
1128 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
1129 * properly wrapped with the right modifiers */
1131 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
1132 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
1133 != REGEX_DEPENDS_CHARSET);
1135 /* The caret is output if there are any defaults: if not all the STD
1136 * flags are set, or if no character set specifier is needed */
1138 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
1140 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
1141 == REG_RUN_ON_COMMENT_SEEN);
1142 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
1143 >> RXf_PMf_STD_PMMOD_SHIFT);
1144 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
1146 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
1148 /* We output all the necessary flags; we never output a minus, as all
1149 * those are defaults, so are
1150 * covered by the caret */
1151 const STRLEN wraplen = pat_len + has_p + has_runon
1152 + has_default /* If needs a caret */
1153 + PL_bitcount[reganch] /* 1 char for each set standard flag */
1155 /* If needs a character set specifier */
1156 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
1157 + (sizeof("(?:)") - 1);
1159 PERL_ARGS_ASSERT_SET_REGEX_PV;
1161 /* make sure PL_bitcount bounds not exceeded */
1162 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
1164 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
1167 SvFLAGS(Rx) |= SVf_UTF8;
1170 /* If a default, cover it using the caret */
1172 *p++= DEFAULT_PAT_MOD;
1178 name = get_regex_charset_name(RExC_rx->extflags, &len);
1179 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
1181 name = UNICODE_PAT_MODS;
1182 len = sizeof(UNICODE_PAT_MODS) - 1;
1184 Copy(name, p, len, char);
1188 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
1191 while((ch = *fptr++)) {
1199 Copy(RExC_precomp, p, pat_len, char);
1200 assert ((RX_WRAPPED(Rx) - p) < 16);
1201 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
1204 /* Adding a trailing \n causes this to compile properly:
1205 my $R = qr / A B C # D E/x; /($R)/
1206 Otherwise the parens are considered part of the comment */
1211 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
1215 * Perl_re_op_compile - the perl internal RE engine's function to compile a
1216 * regular expression into internal code.
1217 * The pattern may be passed either as:
1218 * a list of SVs (patternp plus pat_count)
1219 * a list of OPs (expr)
1220 * If both are passed, the SV list is used, but the OP list indicates
1221 * which SVs are actually pre-compiled code blocks
1223 * The SVs in the list have magic and qr overloading applied to them (and
1224 * the list may be modified in-place with replacement SVs in the latter
1227 * If the pattern hasn't changed from old_re, then old_re will be
1230 * eng is the current engine. If that engine has an op_comp method, then
1231 * handle directly (i.e. we assume that op_comp was us); otherwise, just
1232 * do the initial concatenation of arguments and pass on to the external
1235 * If is_bare_re is not null, set it to a boolean indicating whether the
1236 * arg list reduced (after overloading) to a single bare regex which has
1237 * been returned (i.e. /$qr/).
1239 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
1241 * pm_flags contains the PMf_* flags, typically based on those from the
1242 * pm_flags field of the related PMOP. Currently we're only interested in
1243 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
1245 * For many years this code had an initial sizing pass that calculated
1246 * (sometimes incorrectly, leading to security holes) the size needed for the
1247 * compiled pattern. That was changed by commit
1248 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
1249 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
1250 * references to this sizing pass.
1252 * Now, an initial crude guess as to the size needed is made, based on the
1253 * length of the pattern. Patches welcome to improve that guess. That amount
1254 * of space is malloc'd and then immediately freed, and then clawed back node
1255 * by node. This design is to minimize, to the extent possible, memory churn
1256 * when doing the reallocs.
1258 * A separate parentheses counting pass may be needed in some cases.
1259 * (Previously the sizing pass did this.) Patches welcome to reduce the number
1262 * The existence of a sizing pass necessitated design decisions that are no
1263 * longer needed. There are potential areas of simplification.
1265 * Beware that the optimization-preparation code in here knows about some
1266 * of the structure of the compiled regexp. [I'll say.]
1270 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
1271 OP *expr, const regexp_engine* eng, REGEXP *old_re,
1272 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
1274 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
1282 SV** new_patternp = patternp;
1284 /* these are all flags - maybe they should be turned
1285 * into a single int with different bit masks */
1286 I32 sawlookahead = 0;
1291 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
1293 bool runtime_code = 0;
1295 RExC_state_t RExC_state;
1296 RExC_state_t * const pRExC_state = &RExC_state;
1297 #ifdef TRIE_STUDY_OPT
1298 /* search for "restudy" in this file for a detailed explanation */
1300 RExC_state_t copyRExC_state;
1302 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1304 PERL_ARGS_ASSERT_RE_OP_COMPILE;
1306 DEBUG_r(if (!PL_colorset) reginitcolors());
1309 pRExC_state->warn_text = NULL;
1310 pRExC_state->unlexed_names = NULL;
1311 pRExC_state->code_blocks = NULL;
1314 *is_bare_re = FALSE;
1316 if (expr && (expr->op_type == OP_LIST ||
1317 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
1318 /* allocate code_blocks if needed */
1322 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
1323 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
1324 ncode++; /* count of DO blocks */
1327 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
1331 /* compile-time pattern with just OP_CONSTs and DO blocks */
1336 /* find how many CONSTs there are */
1339 if (expr->op_type == OP_CONST)
1342 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1343 if (o->op_type == OP_CONST)
1347 /* fake up an SV array */
1349 assert(!new_patternp);
1350 Newx(new_patternp, n, SV*);
1351 SAVEFREEPV(new_patternp);
1355 if (expr->op_type == OP_CONST)
1356 new_patternp[n] = cSVOPx_sv(expr);
1358 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
1359 if (o->op_type == OP_CONST)
1360 new_patternp[n++] = cSVOPo_sv;
1365 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1366 "Assembling pattern from %d elements%s\n", pat_count,
1367 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1369 /* set expr to the first arg op */
1371 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
1372 && expr->op_type != OP_CONST)
1374 expr = cLISTOPx(expr)->op_first;
1375 assert( expr->op_type == OP_PUSHMARK
1376 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
1377 || expr->op_type == OP_PADRANGE);
1378 expr = OpSIBLING(expr);
1381 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
1382 expr, &recompile, NULL);
1384 /* handle bare (possibly after overloading) regex: foo =~ $re */
1389 if (SvTYPE(re) == SVt_REGEXP) {
1393 DEBUG_PARSE_r(Perl_re_printf( aTHX_
1394 "Precompiled pattern%s\n",
1395 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
1401 exp = SvPV_nomg(pat, plen);
1403 if (!eng->op_comp) {
1404 if ((SvUTF8(pat) && IN_BYTES)
1405 || SvGMAGICAL(pat) || SvAMAGIC(pat))
1407 /* make a temporary copy; either to convert to bytes,
1408 * or to avoid repeating get-magic / overloaded stringify */
1409 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
1410 (IN_BYTES ? 0 : SvUTF8(pat)));
1412 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
1415 /* ignore the utf8ness if the pattern is 0 length */
1416 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
1417 RExC_uni_semantics = 0;
1418 RExC_contains_locale = 0;
1419 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
1420 RExC_in_script_run = 0;
1421 RExC_study_started = 0;
1422 pRExC_state->runtime_code_qr = NULL;
1423 RExC_frame_head= NULL;
1424 RExC_frame_last= NULL;
1425 RExC_frame_count= 0;
1426 RExC_latest_warn_offset = 0;
1427 RExC_use_BRANCHJ = 0;
1428 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
1429 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
1430 RExC_logical_total_parens = 0;
1431 RExC_total_parens = 0;
1432 RExC_logical_to_parno = NULL;
1433 RExC_parno_to_logical = NULL;
1434 RExC_open_parens = NULL;
1435 RExC_close_parens = NULL;
1436 RExC_paren_names = NULL;
1438 RExC_seen_d_op = FALSE;
1440 RExC_paren_name_list = NULL;
1444 RExC_mysv1= sv_newmortal();
1445 RExC_mysv2= sv_newmortal();
1449 SV *dsv= sv_newmortal();
1450 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1451 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
1452 PL_colors[4], PL_colors[5], s);
1455 /* we jump here if we have to recompile, e.g., from upgrading the pattern
1458 if ((pm_flags & PMf_USE_RE_EVAL)
1459 /* this second condition covers the non-regex literal case,
1460 * i.e. $foo =~ '(?{})'. */
1461 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
1463 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
1466 /* return old regex if pattern hasn't changed */
1467 /* XXX: note in the below we have to check the flags as well as the
1470 * Things get a touch tricky as we have to compare the utf8 flag
1471 * independently from the compile flags. */
1475 && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
1476 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
1477 && RX_PRELEN(old_re) == plen
1478 && memEQ(RX_PRECOMP(old_re), exp, plen)
1479 && !runtime_code /* with runtime code, always recompile */ )
1482 SV *dsv= sv_newmortal();
1483 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
1484 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
1485 PL_colors[4], PL_colors[5], s);
1490 /* Allocate the pattern's SV */
1491 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
1492 RExC_rx = ReANY(Rx);
1493 if ( RExC_rx == NULL )
1494 FAIL("Regexp out of space");
1496 rx_flags = orig_rx_flags;
1498 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
1499 && initial_charset == REGEX_DEPENDS_CHARSET)
1502 /* Set to use unicode semantics if the pattern is in utf8 and has the
1503 * 'depends' charset specified, as it means unicode when utf8 */
1504 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
1505 RExC_uni_semantics = 1;
1508 RExC_pm_flags = pm_flags;
1511 assert(TAINTING_get || !TAINT_get);
1513 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
1515 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
1516 /* whoops, we have a non-utf8 pattern, whilst run-time code
1517 * got compiled as utf8. Try again with a utf8 pattern */
1518 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1519 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1523 assert(!pRExC_state->runtime_code_qr);
1529 RExC_in_lookaround = 0;
1530 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1531 RExC_recode_x_to_native = 0;
1532 RExC_in_multi_char_class = 0;
1534 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
1535 RExC_precomp_end = RExC_end = exp + plen;
1537 RExC_whilem_seen = 0;
1539 RExC_recurse = NULL;
1540 RExC_study_chunk_recursed = NULL;
1541 RExC_study_chunk_recursed_bytes= 0;
1542 RExC_recurse_count = 0;
1543 RExC_sets_depth = 0;
1544 pRExC_state->code_index = 0;
1546 /* Initialize the string in the compiled pattern. This is so that there is
1547 * something to output if necessary */
1548 set_regex_pv(pRExC_state, Rx);
1551 Perl_re_printf( aTHX_
1552 "Starting parse and generation\n");
1554 RExC_lastparse=NULL;
1557 /* Allocate space and zero-initialize. Note, the two step process
1558 of zeroing when in debug mode, thus anything assigned has to
1559 happen after that */
1562 /* On the first pass of the parse, we guess how big this will be. Then
1563 * we grow in one operation to that amount and then give it back. As
1564 * we go along, we re-allocate what we need.
1566 * XXX Currently the guess is essentially that the pattern will be an
1567 * EXACT node with one byte input, one byte output. This is crude, and
1568 * better heuristics are welcome.
1570 * On any subsequent passes, we guess what we actually computed in the
1571 * latest earlier pass. Such a pass probably didn't complete so is
1572 * missing stuff. We could improve those guesses by knowing where the
1573 * parse stopped, and use the length so far plus apply the above
1574 * assumption to what's left. */
1575 RExC_size = STR_SZ(RExC_end - RExC_start);
1578 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
1579 if ( RExC_rxi == NULL )
1580 FAIL("Regexp out of space");
1582 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
1583 RXi_SET( RExC_rx, RExC_rxi );
1585 /* We start from 0 (over from 0 in the case this is a reparse. The first
1586 * node parsed will give back any excess memory we have allocated so far).
1590 /* non-zero initialization begins here */
1591 RExC_rx->engine= eng;
1592 RExC_rx->extflags = rx_flags;
1593 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
1595 if (pm_flags & PMf_IS_QR) {
1596 RExC_rxi->code_blocks = pRExC_state->code_blocks;
1597 if (RExC_rxi->code_blocks) {
1598 RExC_rxi->code_blocks->refcnt++;
1602 RExC_rx->intflags = 0;
1604 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
1605 RExC_parse_set(exp);
1607 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
1608 * code makes sure the final byte is an uncounted NUL. But should this
1609 * ever not be the case, lots of things could read beyond the end of the
1610 * buffer: loops like
1611 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
1612 * strchr(RExC_parse, "foo");
1613 * etc. So it is worth noting. */
1614 assert(*RExC_end == '\0');
1618 RExC_logical_npar = 1;
1619 RExC_parens_buf_size = 0;
1620 RExC_emit_start = RExC_rxi->program;
1621 pRExC_state->code_index = 0;
1623 *((char*) RExC_emit_start) = (char) REG_MAGIC;
1624 RExC_emit = NODE_STEP_REGNODE;
1627 if (reg(pRExC_state, 0, &flags, 1)) {
1629 /* Success!, But we may need to redo the parse knowing how many parens
1630 * there actually are */
1631 if (IN_PARENS_PASS) {
1632 flags |= RESTART_PARSE;
1635 /* We have that number in RExC_npar */
1636 RExC_total_parens = RExC_npar;
1637 RExC_logical_total_parens = RExC_logical_npar;
1639 else if (! MUST_RESTART(flags)) {
1641 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
1644 /* Here, we either have success, or we have to redo the parse for some reason */
1645 if (MUST_RESTART(flags)) {
1647 /* It's possible to write a regexp in ascii that represents Unicode
1648 codepoints outside of the byte range, such as via \x{100}. If we
1649 detect such a sequence we have to convert the entire pattern to utf8
1650 and then recompile, as our sizing calculation will have been based
1651 on 1 byte == 1 character, but we will need to use utf8 to encode
1652 at least some part of the pattern, and therefore must convert the whole
1655 if (flags & NEED_UTF8) {
1657 /* We have stored the offset of the final warning output so far.
1658 * That must be adjusted. Any variant characters between the start
1659 * of the pattern and this warning count for 2 bytes in the final,
1660 * so just add them again */
1661 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
1662 RExC_latest_warn_offset +=
1663 variant_under_utf8_count((U8 *) exp, (U8 *) exp
1664 + RExC_latest_warn_offset);
1666 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
1667 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
1668 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
1671 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
1674 if (ALL_PARENS_COUNTED) {
1675 /* Make enough room for all the known parens, and zero it */
1676 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
1677 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
1678 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
1680 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
1681 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
1682 /* we do NOT reinitialize RExC_logical_to_parno and
1683 * RExC_parno_to_logical here. We need their data on the second
1686 else { /* Parse did not complete. Reinitialize the parentheses
1688 RExC_total_parens = 0;
1689 if (RExC_open_parens) {
1690 Safefree(RExC_open_parens);
1691 RExC_open_parens = NULL;
1693 if (RExC_close_parens) {
1694 Safefree(RExC_close_parens);
1695 RExC_close_parens = NULL;
1697 if (RExC_logical_to_parno) {
1698 Safefree(RExC_logical_to_parno);
1699 RExC_logical_to_parno = NULL;
1701 if (RExC_parno_to_logical) {
1702 Safefree(RExC_parno_to_logical);
1703 RExC_parno_to_logical = NULL;
1707 /* Clean up what we did in this parse */
1708 SvREFCNT_dec_NN(RExC_rx_sv);
1713 /* Here, we have successfully parsed and generated the pattern's program
1714 * for the regex engine. We are ready to finish things up and look for
1717 /* Update the string to compile, with correct modifiers, etc */
1718 set_regex_pv(pRExC_state, Rx);
1720 RExC_rx->nparens = RExC_total_parens - 1;
1721 RExC_rx->logical_nparens = RExC_logical_total_parens - 1;
1723 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
1724 if (RExC_whilem_seen > 15)
1725 RExC_whilem_seen = 15;
1728 Perl_re_printf( aTHX_
1729 "Required size %" IVdf " nodes\n", (IV)RExC_size);
1731 RExC_lastparse=NULL;
1734 SetProgLen(RExC_rxi,RExC_size);
1736 DEBUG_DUMP_PRE_OPTIMIZE_r({
1737 SV * const sv = sv_newmortal();
1738 RXi_GET_DECL(RExC_rx, ri);
1740 Perl_re_printf( aTHX_ "Program before optimization:\n");
1742 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
1747 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
1750 /* XXXX To minimize changes to RE engine we always allocate
1751 3-units-long substrs field. */
1752 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
1753 if (RExC_recurse_count) {
1754 Newx(RExC_recurse, RExC_recurse_count, regnode *);
1755 SAVEFREEPV(RExC_recurse);
1758 if (RExC_seen & REG_RECURSE_SEEN) {
1759 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
1760 * So its 1 if there are no parens. */
1761 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
1762 ((RExC_total_parens & 0x07) != 0);
1763 Newx(RExC_study_chunk_recursed,
1764 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1765 SAVEFREEPV(RExC_study_chunk_recursed);
1769 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
1771 RExC_study_chunk_recursed_count= 0;
1773 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
1774 if (RExC_study_chunk_recursed) {
1775 Zero(RExC_study_chunk_recursed,
1776 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
1780 #ifdef TRIE_STUDY_OPT
1781 /* search for "restudy" in this file for a detailed explanation */
1783 StructCopy(&zero_scan_data, &data, scan_data_t);
1784 copyRExC_state = RExC_state;
1787 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
1789 RExC_state = copyRExC_state;
1790 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
1791 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
1793 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
1794 StructCopy(&zero_scan_data, &data, scan_data_t);
1797 StructCopy(&zero_scan_data, &data, scan_data_t);
1800 /* Dig out information for optimizations. */
1801 RExC_rx->extflags = RExC_flags; /* was pm_op */
1802 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
1805 SvUTF8_on(Rx); /* Unicode in it? */
1806 RExC_rxi->regstclass = NULL;
1807 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
1808 RExC_rx->intflags |= PREGf_NAUGHTY;
1809 scan = RExC_rxi->program + 1; /* First BRANCH. */
1811 /* testing for BRANCH here tells us whether there is "must appear"
1812 data in the pattern. If there is then we can use it for optimisations */
1813 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
1815 SSize_t fake_deltap;
1816 STRLEN longest_length[2];
1817 regnode_ssc ch_class; /* pointed to by data */
1819 SSize_t last_close = 0; /* pointed to by data */
1820 regnode *first= scan;
1821 regnode *first_next= regnext(first);
1822 regnode *last_close_op= NULL;
1826 * Skip introductions and multiplicators >= 1
1827 * so that we can extract the 'meat' of the pattern that must
1828 * match in the large if() sequence following.
1829 * NOTE that EXACT is NOT covered here, as it is normally
1830 * picked up by the optimiser separately.
1832 * This is unfortunate as the optimiser isnt handling lookahead
1833 * properly currently.
1838 if (OP(first) == OPEN)
1841 if (OP(first) == IFMATCH && !FLAGS(first))
1842 /* for now we can't handle lookbehind IFMATCH */
1845 if (OP(first) == PLUS)
1848 if (OP(first) == MINMOD)
1852 /* An OR of *one* alternative - should not happen now. */
1853 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
1854 /* An {n,m} with n>0 */
1855 (REGNODE_TYPE(OP(first)) == CURLY && ARG1i(first) > 0) ||
1856 (OP(first) == NOTHING && REGNODE_TYPE(OP(first_next)) != END)
1861 first = REGNODE_AFTER(first);
1862 first_next= regnext(first);
1865 /* Starting-point info. */
1867 DEBUG_PEEP("first:", first, 0, 0);
1868 /* Ignore EXACT as we deal with it later. */
1869 if (REGNODE_TYPE(OP(first)) == EXACT) {
1870 if (! isEXACTFish(OP(first))) {
1871 NOOP; /* Empty, get anchored substr later. */
1874 RExC_rxi->regstclass = first;
1877 else if (REGNODE_TYPE(OP(first)) == TRIE &&
1878 ((reg_trie_data *)RExC_rxi->data->data[ ARG1u(first) ])->minlen>0)
1880 /* this can happen only on restudy
1881 * Search for "restudy" in this file to find
1882 * a comment with details. */
1883 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
1886 else if (REGNODE_SIMPLE(OP(first)))
1887 RExC_rxi->regstclass = first;
1888 else if (REGNODE_TYPE(OP(first)) == BOUND ||
1889 REGNODE_TYPE(OP(first)) == NBOUND)
1890 RExC_rxi->regstclass = first;
1891 else if (REGNODE_TYPE(OP(first)) == BOL) {
1892 RExC_rx->intflags |= (OP(first) == MBOL
1895 first = REGNODE_AFTER(first);
1898 else if (OP(first) == GPOS) {
1899 RExC_rx->intflags |= PREGf_ANCH_GPOS;
1900 first = REGNODE_AFTER_type(first,tregnode_GPOS);
1903 else if ((!sawopen || !RExC_sawback) &&
1905 (OP(first) == STAR &&
1906 REGNODE_TYPE(OP(REGNODE_AFTER(first))) == REG_ANY) &&
1907 !(RExC_rx->intflags & PREGf_ANCH) && !(RExC_seen & REG_PESSIMIZE_SEEN))
1909 /* turn .* into ^.* with an implied $*=1 */
1911 (OP(REGNODE_AFTER(first)) == REG_ANY)
1914 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
1915 first = REGNODE_AFTER(first);
1918 if (sawplus && !sawminmod && !sawlookahead
1919 && (!sawopen || !RExC_sawback)
1920 && !(RExC_seen & REG_PESSIMIZE_SEEN)) /* May examine pos and $& */
1921 /* x+ must match at the 1st pos of run of x's */
1922 RExC_rx->intflags |= PREGf_SKIP;
1924 /* Scan is after the zeroth branch, first is atomic matcher. */
1925 #ifdef TRIE_STUDY_OPT
1926 /* search for "restudy" in this file for a detailed explanation */
1929 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1930 (IV)(first - scan + 1))
1934 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
1935 (IV)(first - scan + 1))
1941 * If there's something expensive in the r.e., find the
1942 * longest literal string that must appear and make it the
1943 * regmust. Resolve ties in favor of later strings, since
1944 * the regstart check works with the beginning of the r.e.
1945 * and avoiding duplication strengthens checking. Not a
1946 * strong reason, but sufficient in the absence of others.
1947 * [Now we resolve ties in favor of the earlier string if
1948 * it happens that c_offset_min has been invalidated, since the
1949 * earlier string may buy us something the later one won't.]
1952 data.substrs[0].str = newSVpvs("");
1953 data.substrs[1].str = newSVpvs("");
1954 data.last_found = newSVpvs("");
1955 data.cur_is_floating = 0; /* initially any found substring is fixed */
1956 ENTER_with_name("study_chunk");
1957 SAVEFREESV(data.substrs[0].str);
1958 SAVEFREESV(data.substrs[1].str);
1959 SAVEFREESV(data.last_found);
1961 if (!RExC_rxi->regstclass) {
1962 ssc_init(pRExC_state, &ch_class);
1963 data.start_class = &ch_class;
1964 stclass_flag = SCF_DO_STCLASS_AND;
1965 } else /* XXXX Check for BOUND? */
1967 data.last_closep = &last_close;
1968 data.last_close_opp = &last_close_op;
1972 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
1973 * (NO top level branches)
1975 minlen = study_chunk(pRExC_state, &first, &minlen, &fake_deltap,
1976 scan + RExC_size, /* Up to end */
1978 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
1979 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
1981 /* search for "restudy" in this file for a detailed explanation
1982 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
1985 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
1988 if ( RExC_total_parens == 1 && !data.cur_is_floating
1989 && data.last_start_min == 0 && data.last_end > 0
1990 && !RExC_seen_zerolen
1991 && !(RExC_seen & REG_VERBARG_SEEN)
1992 && !(RExC_seen & REG_GPOS_SEEN)
1994 RExC_rx->extflags |= RXf_CHECK_ALL;
1996 scan_commit(pRExC_state, &data,&minlen, 0);
1999 /* XXX this is done in reverse order because that's the way the
2000 * code was before it was parameterised. Don't know whether it
2001 * actually needs doing in reverse order. DAPM */
2002 for (i = 1; i >= 0; i--) {
2003 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
2006 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
2007 && data.substrs[0].min_offset
2008 == data.substrs[1].min_offset
2009 && SvCUR(data.substrs[0].str)
2010 == SvCUR(data.substrs[1].str)
2012 && S_setup_longest (aTHX_ pRExC_state,
2013 &(RExC_rx->substrs->data[i]),
2017 RExC_rx->substrs->data[i].min_offset =
2018 data.substrs[i].min_offset - data.substrs[i].lookbehind;
2020 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
2021 /* Don't offset infinity */
2022 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
2023 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
2024 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
2027 RExC_rx->substrs->data[i].substr = NULL;
2028 RExC_rx->substrs->data[i].utf8_substr = NULL;
2029 longest_length[i] = 0;
2033 LEAVE_with_name("study_chunk");
2035 if (RExC_rxi->regstclass
2036 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
2037 RExC_rxi->regstclass = NULL;
2039 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
2040 || RExC_rx->substrs->data[0].min_offset)
2042 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2043 && is_ssc_worth_it(pRExC_state, data.start_class))
2045 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2047 ssc_finalize(pRExC_state, data.start_class);
2049 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2050 StructCopy(data.start_class,
2051 (regnode_ssc*)RExC_rxi->data->data[n],
2053 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2054 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2055 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
2056 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2057 Perl_re_printf( aTHX_
2058 "synthetic stclass \"%s\".\n",
2059 SvPVX_const(sv));});
2060 data.start_class = NULL;
2063 /* A temporary algorithm prefers floated substr to fixed one of
2064 * same length to dig more info. */
2065 i = (longest_length[0] <= longest_length[1]);
2066 RExC_rx->substrs->check_ix = i;
2067 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
2068 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
2069 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
2070 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
2071 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
2072 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
2073 RExC_rx->intflags |= PREGf_NOSCAN;
2075 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
2076 RExC_rx->extflags |= RXf_USE_INTUIT;
2077 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
2078 RExC_rx->extflags |= RXf_INTUIT_TAIL;
2081 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
2082 if ( (STRLEN)minlen < longest_length[1] )
2083 minlen= longest_length[1];
2084 if ( (STRLEN)minlen < longest_length[0] )
2085 minlen= longest_length[0];
2089 /* Several toplevels. Best we can is to set minlen. */
2090 SSize_t fake_deltap;
2091 regnode_ssc ch_class;
2092 SSize_t last_close = 0;
2093 regnode *last_close_op = NULL;
2095 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
2097 scan = RExC_rxi->program + 1;
2098 ssc_init(pRExC_state, &ch_class);
2099 data.start_class = &ch_class;
2100 data.last_closep = &last_close;
2101 data.last_close_opp = &last_close_op;
2105 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
2106 * (patterns WITH top level branches)
2108 minlen = study_chunk(pRExC_state,
2109 &scan, &minlen, &fake_deltap, scan + RExC_size, &data, -1, 0, NULL,
2110 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
2111 ? SCF_TRIE_DOING_RESTUDY
2114 /* search for "restudy" in this file for a detailed explanation
2115 * of 'restudied' and SCF_TRIE_DOING_RESTUDY */
2117 CHECK_RESTUDY_GOTO_butfirst(NOOP);
2119 RExC_rx->check_substr = NULL;
2120 RExC_rx->check_utf8 = NULL;
2121 RExC_rx->substrs->data[0].substr = NULL;
2122 RExC_rx->substrs->data[0].utf8_substr = NULL;
2123 RExC_rx->substrs->data[1].substr = NULL;
2124 RExC_rx->substrs->data[1].utf8_substr = NULL;
2126 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
2127 && is_ssc_worth_it(pRExC_state, data.start_class))
2129 const U32 n = reg_add_data(pRExC_state, STR_WITH_LEN("f"));
2131 ssc_finalize(pRExC_state, data.start_class);
2133 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
2134 StructCopy(data.start_class,
2135 (regnode_ssc*)RExC_rxi->data->data[n],
2137 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
2138 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
2139 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
2140 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
2141 Perl_re_printf( aTHX_
2142 "synthetic stclass \"%s\".\n",
2143 SvPVX_const(sv));});
2144 data.start_class = NULL;
2148 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
2149 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
2150 RExC_rx->maxlen = REG_INFTY;
2153 RExC_rx->maxlen = RExC_maxlen;
2156 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
2157 the "real" pattern. */
2159 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
2160 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
2162 RExC_rx->minlenret = minlen;
2163 if (RExC_rx->minlen < minlen)
2164 RExC_rx->minlen = minlen;
2166 if (RExC_seen & REG_RECURSE_SEEN ) {
2167 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
2168 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
2170 if (RExC_seen & REG_GPOS_SEEN)
2171 RExC_rx->intflags |= PREGf_GPOS_SEEN;
2173 if (RExC_seen & REG_PESSIMIZE_SEEN)
2174 RExC_rx->intflags |= PREGf_PESSIMIZE_SEEN;
2176 if (RExC_seen & REG_LOOKBEHIND_SEEN)
2177 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
2179 if (pRExC_state->code_blocks)
2180 RExC_rx->extflags |= RXf_EVAL_SEEN;
2182 if (RExC_seen & REG_VERBARG_SEEN) {
2183 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
2184 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
2187 if (RExC_seen & REG_CUTGROUP_SEEN)
2188 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
2190 if (pm_flags & PMf_USE_RE_EVAL)
2191 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
2193 if (RExC_paren_names)
2194 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
2196 RXp_PAREN_NAMES(RExC_rx) = NULL;
2198 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
2199 * so it can be used in pp.c */
2200 if (RExC_rx->intflags & PREGf_ANCH)
2201 RExC_rx->extflags |= RXf_IS_ANCHORED;
2205 /* this is used to identify "special" patterns that might result
2206 * in Perl NOT calling the regex engine and instead doing the match "itself",
2207 * particularly special cases in split//. By having the regex compiler
2208 * do this pattern matching at a regop level (instead of by inspecting the pattern)
2209 * we avoid weird issues with equivalent patterns resulting in different behavior,
2210 * AND we allow non Perl engines to get the same optimizations by the setting the
2211 * flags appropriately - Yves */
2212 regnode *first = RExC_rxi->program + 1;
2214 regnode *next = NULL;
2216 if (fop == NOTHING || fop == MBOL || fop == SBOL || fop == PLUS) {
2217 next = REGNODE_AFTER(first);
2220 /* It's safe to read through *next only if OP(first) is a regop of
2221 * the right type (not EXACT, for example).
2223 if (REGNODE_TYPE(fop) == NOTHING && nop == END)
2224 RExC_rx->extflags |= RXf_NULL;
2225 else if ((fop == MBOL || (fop == SBOL && !FLAGS(first))) && nop == END)
2226 /* when fop is SBOL first->flags will be true only when it was
2227 * produced by parsing /\A/, and not when parsing /^/. This is
2228 * very important for the split code as there we want to
2229 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
2230 * See rt #122761 for more details. -- Yves */
2231 RExC_rx->extflags |= RXf_START_ONLY;
2232 else if (fop == PLUS
2233 && REGNODE_TYPE(nop) == POSIXD && FLAGS(next) == CC_SPACE_
2234 && OP(regnext(first)) == END)
2235 RExC_rx->extflags |= RXf_WHITE;
2236 else if ( RExC_rx->extflags & RXf_SPLIT
2237 && (REGNODE_TYPE(fop) == EXACT && ! isEXACTFish(fop))
2238 && STR_LEN(first) == 1
2239 && *(STRING(first)) == ' '
2240 && OP(regnext(first)) == END )
2241 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
2245 if (RExC_contains_locale) {
2246 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
2250 if (RExC_paren_names) {
2251 RExC_rxi->name_list_idx = reg_add_data( pRExC_state, STR_WITH_LEN("a"));
2252 RExC_rxi->data->data[RExC_rxi->name_list_idx]
2253 = (void*)SvREFCNT_inc(RExC_paren_name_list);
2256 RExC_rxi->name_list_idx = 0;
2258 while ( RExC_recurse_count > 0 ) {
2259 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
2261 * This data structure is set up in study_chunk() and is used
2262 * to calculate the distance between a GOSUB regopcode and
2263 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
2266 * If for some reason someone writes code that optimises
2267 * away a GOSUB opcode then the assert should be changed to
2268 * an if(scan) to guard the ARG2i_SET() - Yves
2271 assert(scan && OP(scan) == GOSUB);
2272 ARG2i_SET( scan, RExC_open_parens[ARG1u(scan)] - REGNODE_OFFSET(scan));
2274 if (RExC_logical_total_parens != RExC_total_parens) {
2275 Newxz(RExC_parno_to_logical_next, RExC_total_parens, I32);
2276 /* we rebuild this below */
2277 Zero(RExC_logical_to_parno, RExC_total_parens, I32);
2278 for( int parno = RExC_total_parens-1 ; parno > 0 ; parno-- ) {
2279 int logical_parno= RExC_parno_to_logical[parno];
2280 assert(logical_parno);
2281 RExC_parno_to_logical_next[parno]= RExC_logical_to_parno[logical_parno];
2282 RExC_logical_to_parno[logical_parno] = parno;
2284 RExC_rx->logical_to_parno = RExC_logical_to_parno;
2285 RExC_rx->parno_to_logical = RExC_parno_to_logical;
2286 RExC_rx->parno_to_logical_next = RExC_parno_to_logical_next;
2287 RExC_logical_to_parno = NULL;
2288 RExC_parno_to_logical = NULL;
2289 RExC_parno_to_logical_next = NULL;
2291 RExC_rx->logical_to_parno = NULL;
2292 RExC_rx->parno_to_logical = NULL;
2293 RExC_rx->parno_to_logical_next = NULL;
2296 Newxz(RXp_OFFSp(RExC_rx), RExC_total_parens, regexp_paren_pair);
2297 /* assume we don't need to swap parens around before we match */
2299 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
2300 (unsigned long)RExC_study_chunk_recursed_count);
2304 Perl_re_printf( aTHX_ "Final program:\n");
2308 if (RExC_open_parens) {
2309 Safefree(RExC_open_parens);
2310 RExC_open_parens = NULL;
2312 if (RExC_close_parens) {
2313 Safefree(RExC_close_parens);
2314 RExC_close_parens = NULL;
2316 if (RExC_logical_to_parno) {
2317 Safefree(RExC_logical_to_parno);
2318 RExC_logical_to_parno = NULL;
2320 if (RExC_parno_to_logical) {
2321 Safefree(RExC_parno_to_logical);
2322 RExC_parno_to_logical = NULL;
2326 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
2327 * by setting the regexp SV to readonly-only instead. If the
2328 * pattern's been recompiled, the USEDness should remain. */
2329 if (old_re && SvREADONLY(old_re))
2338 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
2340 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
2341 PERL_UNUSED_ARG(rx);
2345 return newSVpvs("Regexp");
2348 /* Scans the name of a named buffer from the pattern.
2349 * If flags is REG_RSN_RETURN_NULL returns null.
2350 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
2351 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
2352 * to the parsed name as looked up in the RExC_paren_names hash.
2353 * If there is an error throws a vFAIL().. type exception.
2356 #define REG_RSN_RETURN_NULL 0
2357 #define REG_RSN_RETURN_NAME 1
2358 #define REG_RSN_RETURN_DATA 2
2361 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
2363 char *name_start = RExC_parse;
2366 PERL_ARGS_ASSERT_REG_SCAN_NAME;
2368 assert (RExC_parse <= RExC_end);
2369 if (RExC_parse == RExC_end) NOOP;
2370 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
2371 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
2372 * using do...while */
2375 RExC_parse_inc_utf8();
2376 } while ( RExC_parse < RExC_end
2377 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
2380 RExC_parse_inc_by(1);
2381 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
2383 RExC_parse_inc_by(1); /* so the <- from the vFAIL is after the offending
2385 vFAIL("Group name must start with a non-digit word character");
2387 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
2388 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
2389 if ( flags == REG_RSN_RETURN_NAME)
2391 else if (flags==REG_RSN_RETURN_DATA) {
2394 if ( ! sv_name ) /* should not happen*/
2395 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
2396 if (RExC_paren_names)
2397 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
2399 sv_dat = HeVAL(he_str);
2400 if ( ! sv_dat ) { /* Didn't find group */
2402 /* It might be a forward reference; we can't fail until we
2403 * know, by completing the parse to get all the groups, and
2405 if (ALL_PARENS_COUNTED) {
2406 vFAIL("Reference to nonexistent named group");
2409 REQUIRE_PARENS_PASS;
2415 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
2416 (unsigned long) flags);
2419 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
2420 if (RExC_lastparse!=RExC_parse) { \
2421 Perl_re_printf( aTHX_ "%s", \
2422 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
2423 RExC_end - RExC_parse, 16, \
2425 PERL_PV_ESCAPE_UNI_DETECT | \
2426 PERL_PV_PRETTY_ELLIPSES | \
2427 PERL_PV_PRETTY_LTGT | \
2428 PERL_PV_ESCAPE_RE | \
2429 PERL_PV_PRETTY_EXACTSIZE \
2433 Perl_re_printf( aTHX_ "%16s",""); \
2435 if (RExC_lastnum!=RExC_emit) \
2436 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
2438 Perl_re_printf( aTHX_ "|%4s",""); \
2439 Perl_re_printf( aTHX_ "|%*s%-4s", \
2440 (int)((depth*2)), "", \
2443 RExC_lastnum=RExC_emit; \
2444 RExC_lastparse=RExC_parse; \
2449 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
2450 DEBUG_PARSE_MSG((funcname)); \
2451 Perl_re_printf( aTHX_ "%4s","\n"); \
2453 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
2454 DEBUG_PARSE_MSG((funcname)); \
2455 Perl_re_printf( aTHX_ fmt "\n",args); \
2460 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
2462 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
2463 * constructs, and updates RExC_flags with them. On input, RExC_parse
2464 * should point to the first flag; it is updated on output to point to the
2465 * final ')' or ':'. There needs to be at least one flag, or this will
2468 /* for (?g), (?gc), and (?o) warnings; warning
2469 about (?c) will warn about (?g) -- japhy */
2471 #define WASTED_O 0x01
2472 #define WASTED_G 0x02
2473 #define WASTED_C 0x04
2474 #define WASTED_GC (WASTED_G|WASTED_C)
2475 I32 wastedflags = 0x00;
2476 U32 posflags = 0, negflags = 0;
2477 U32 *flagsp = &posflags;
2478 char has_charset_modifier = '\0';
2480 bool has_use_defaults = FALSE;
2481 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
2482 int x_mod_count = 0;
2484 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
2486 /* '^' as an initial flag sets certain defaults */
2487 if (UCHARAT(RExC_parse) == '^') {
2488 RExC_parse_inc_by(1);
2489 has_use_defaults = TRUE;
2490 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
2491 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2492 ? REGEX_UNICODE_CHARSET
2493 : REGEX_DEPENDS_CHARSET;
2494 set_regex_charset(&RExC_flags, cs);
2497 cs = get_regex_charset(RExC_flags);
2498 if ( cs == REGEX_DEPENDS_CHARSET
2499 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
2501 cs = REGEX_UNICODE_CHARSET;
2505 while (RExC_parse < RExC_end) {
2506 /* && memCHRs("iogcmsx", *RExC_parse) */
2507 /* (?g), (?gc) and (?o) are useless here
2508 and must be globally applied -- japhy */
2509 if ((RExC_pm_flags & PMf_WILDCARD)) {
2510 if (flagsp == & negflags) {
2511 if (*RExC_parse == 'm') {
2512 RExC_parse_inc_by(1);
2513 /* diag_listed_as: Use of %s is not allowed in Unicode
2514 property wildcard subpatterns in regex; marked by <--
2516 vFAIL("Use of modifier '-m' is not allowed in Unicode"
2517 " property wildcard subpatterns");
2521 if (*RExC_parse == 's') {
2522 goto modifier_illegal_in_wildcard;
2527 switch (*RExC_parse) {
2529 /* Code for the imsxn flags */
2530 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
2532 case LOCALE_PAT_MOD:
2533 if (has_charset_modifier) {
2534 goto excess_modifier;
2536 else if (flagsp == &negflags) {
2539 cs = REGEX_LOCALE_CHARSET;
2540 has_charset_modifier = LOCALE_PAT_MOD;
2542 case UNICODE_PAT_MOD:
2543 if (has_charset_modifier) {
2544 goto excess_modifier;
2546 else if (flagsp == &negflags) {
2549 cs = REGEX_UNICODE_CHARSET;
2550 has_charset_modifier = UNICODE_PAT_MOD;
2552 case ASCII_RESTRICT_PAT_MOD:
2553 if (flagsp == &negflags) {
2556 if (has_charset_modifier) {
2557 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
2558 goto excess_modifier;
2560 /* Doubled modifier implies more restricted */
2561 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
2564 cs = REGEX_ASCII_RESTRICTED_CHARSET;
2566 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
2568 case DEPENDS_PAT_MOD:
2569 if (has_use_defaults) {
2570 goto fail_modifiers;
2572 else if (flagsp == &negflags) {
2575 else if (has_charset_modifier) {
2576 goto excess_modifier;
2579 /* The dual charset means unicode semantics if the
2580 * pattern (or target, not known until runtime) are
2581 * utf8, or something in the pattern indicates unicode
2583 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
2584 ? REGEX_UNICODE_CHARSET
2585 : REGEX_DEPENDS_CHARSET;
2586 has_charset_modifier = DEPENDS_PAT_MOD;
2589 RExC_parse_inc_by(1);
2590 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
2591 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
2593 else if (has_charset_modifier == *(RExC_parse - 1)) {
2594 vFAIL2("Regexp modifier \"%c\" may not appear twice",
2598 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
2600 NOT_REACHED; /*NOTREACHED*/
2602 RExC_parse_inc_by(1);
2603 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
2605 NOT_REACHED; /*NOTREACHED*/
2606 case GLOBAL_PAT_MOD: /* 'g' */
2607 if (RExC_pm_flags & PMf_WILDCARD) {
2608 goto modifier_illegal_in_wildcard;
2611 case ONCE_PAT_MOD: /* 'o' */
2612 if (ckWARN(WARN_REGEXP)) {
2613 const I32 wflagbit = *RExC_parse == 'o'
2616 if (! (wastedflags & wflagbit) ) {
2617 wastedflags |= wflagbit;
2618 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2621 "Useless (%s%c) - %suse /%c modifier",
2622 flagsp == &negflags ? "?-" : "?",
2624 flagsp == &negflags ? "don't " : "",
2631 case CONTINUE_PAT_MOD: /* 'c' */
2632 if (RExC_pm_flags & PMf_WILDCARD) {
2633 goto modifier_illegal_in_wildcard;
2635 if (ckWARN(WARN_REGEXP)) {
2636 if (! (wastedflags & WASTED_C) ) {
2637 wastedflags |= WASTED_GC;
2638 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
2641 "Useless (%sc) - %suse /gc modifier",
2642 flagsp == &negflags ? "?-" : "?",
2643 flagsp == &negflags ? "don't " : ""
2648 case KEEPCOPY_PAT_MOD: /* 'p' */
2649 if (RExC_pm_flags & PMf_WILDCARD) {
2650 goto modifier_illegal_in_wildcard;
2652 if (flagsp == &negflags) {
2653 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
2655 *flagsp |= RXf_PMf_KEEPCOPY;
2659 /* A flag is a default iff it is following a minus, so
2660 * if there is a minus, it means will be trying to
2661 * re-specify a default which is an error */
2662 if (has_use_defaults || flagsp == &negflags) {
2663 goto fail_modifiers;
2666 wastedflags = 0; /* reset so (?g-c) warns twice */
2672 if ( (RExC_pm_flags & PMf_WILDCARD)
2673 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
2675 RExC_parse_inc_by(1);
2676 /* diag_listed_as: Use of %s is not allowed in Unicode
2677 property wildcard subpatterns in regex; marked by <--
2679 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
2680 " property wildcard subpatterns",
2681 has_charset_modifier);
2684 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
2685 negflags |= RXf_PMf_EXTENDED_MORE;
2687 RExC_flags |= posflags;
2689 if (negflags & RXf_PMf_EXTENDED) {
2690 negflags |= RXf_PMf_EXTENDED_MORE;
2692 RExC_flags &= ~negflags;
2693 set_regex_charset(&RExC_flags, cs);
2698 RExC_parse_inc_if_char();
2699 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
2700 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
2701 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
2702 NOT_REACHED; /*NOTREACHED*/
2708 vFAIL("Sequence (?... not terminated");
2710 modifier_illegal_in_wildcard:
2711 RExC_parse_inc_by(1);
2712 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
2713 subpatterns in regex; marked by <-- HERE in m/%s/ */
2714 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
2715 " subpatterns", *(RExC_parse - 1));
2719 - reg - regular expression, i.e. main body or parenthesized thing
2721 * Caller must absorb opening parenthesis.
2723 * Combining parenthesis handling with the base level of regular expression
2724 * is a trifle forced, but the need to tie the tails of the branches to what
2725 * follows makes it hard to avoid.
2728 STATIC regnode_offset
2729 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
2731 char * backref_parse_start,
2736 char* name_start = RExC_parse;
2738 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
2739 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2741 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
2743 if (RExC_parse != name_start && ch == '}') {
2744 while (isBLANK(*RExC_parse)) {
2745 RExC_parse_inc_by(1);
2748 if (RExC_parse == name_start || *RExC_parse != ch) {
2749 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
2750 vFAIL2("Sequence %.3s... not terminated", backref_parse_start);
2754 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
2755 RExC_rxi->data->data[num]=(void*)sv_dat;
2756 SvREFCNT_inc_simple_void_NN(sv_dat);
2759 ret = reg2node(pRExC_state,
2762 : (ASCII_FOLD_RESTRICTED)
2764 : (AT_LEAST_UNI_SEMANTICS)
2769 num, RExC_nestroot);
2770 if (RExC_nestroot && num >= (U32)RExC_nestroot)
2771 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
2774 nextchar(pRExC_state);
2780 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2781 * NOTHING regop when the construct is empty.
2783 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2785 * Checks for unterminated constructs and throws a "not terminated" error
2786 * with the appropriate type if necessary
2788 * Assuming it does not throw an exception increments RExC_seen_zerolen.
2790 * If the construct is empty generates a NOTHING op and returns its
2791 * regnode_offset, which the caller would then return to its caller.
2793 * If the construct is not empty increments RExC_in_lookaround, and turns
2794 * on any flags provided in RExC_seen, and then returns 0 to signify
2795 * that parsing should continue.
2797 * PS: I would have called this reg_parse_lookaround_NOTHING() but then
2798 * any use of it would have had to be broken onto multiple lines, hence
2801 STATIC regnode_offset
2802 S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2806 PERL_ARGS_ASSERT_REG_LA_NOTHING;
2808 /* false below so we do not force /x */
2809 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2811 if (RExC_parse >= RExC_end)
2812 vFAIL2("Sequence (%s... not terminated", type);
2814 /* Always increment as NOTHING regops are zerolen */
2815 RExC_seen_zerolen++;
2817 if (*RExC_parse == ')') {
2818 regnode_offset ret= reg_node(pRExC_state, NOTHING);
2819 nextchar(pRExC_state);
2824 RExC_in_lookaround++;
2825 return 0; /* keep parsing! */
2830 * Maybe parse a parenthesized lookaround construct that is equivalent to a
2831 * OPFAIL regop when the construct is empty.
2833 * Calls skip_to_be_ignored_text() before checking if the construct is empty.
2835 * Checks for unterminated constructs and throws a "not terminated" error
2838 * If the construct is empty generates an OPFAIL op and returns its
2839 * regnode_offset which the caller should then return to its caller.
2841 * If the construct is not empty increments RExC_in_lookaround, and also
2842 * increments RExC_seen_zerolen, and turns on the flags provided in
2843 * RExC_seen, and then returns 0 to signify that parsing should continue.
2845 * PS: I would have called this reg_parse_lookaround_OPFAIL() but then
2846 * any use of it would have had to be broken onto multiple lines, hence
2850 STATIC regnode_offset
2851 S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
2855 PERL_ARGS_ASSERT_REG_LA_OPFAIL;
2857 /* FALSE so we don't force to /x below */;
2858 skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE);
2860 if (RExC_parse >= RExC_end)
2861 vFAIL2("Sequence (%s... not terminated", type);
2863 if (*RExC_parse == ')') {
2864 regnode_offset ret= reg1node(pRExC_state, OPFAIL, 0);
2865 nextchar(pRExC_state);
2866 return ret; /* return produced regop */
2869 /* only increment zerolen *after* we check if we produce an OPFAIL
2870 * as an OPFAIL does not match a zero length construct, as it
2871 * does not match ever. */
2872 RExC_seen_zerolen++;
2874 RExC_in_lookaround++;
2875 return 0; /* keep parsing! */
2878 /* Below are the main parsing routines.
2880 * S_reg() parses a whole pattern or subpattern. It itself handles things
2881 * like the 'xyz' in '(?xyz:...)', and calls S_regbranch for each
2882 * alternation '|' in the '...' pattern.
2883 * S_regbranch() effectively implements the concatenation operator, handling
2884 * one alternative of '|', repeatedly calling S_regpiece on each
2885 * segment of the input.
2886 * S_regpiece() calls S_regatom to handle the next atomic chunk of the input,
2887 * and then adds any quantifier for that chunk.
2888 * S_regatom() parses the next chunk of the input, returning when it
2889 * determines it has found a complete atomic chunk. The chunk may
2890 * be a nested subpattern, in which case S_reg is called
2893 * The functions generate regnodes as they go along, appending each to the
2894 * pattern data structure so far. They return the offset of the current final
2895 * node into that structure, or 0 on failure.
2897 * There are three parameters common to all of them:
2898 * pRExC_state is a structure with much information about the current
2899 * state of the parse. It's easy to add new elements to
2900 * convey new information, but beware that an error return may
2901 * require clearing the element.
2902 * flagp is a pointer to bit flags set in a lower level to pass up
2903 * to higher levels information, such as the cause of a
2904 * failure, or some characteristic about the generated node
2905 * depth is roughly the recursion depth, mostly unused except for
2906 * pretty printing debugging info.
2908 * There are ancillary functions that these may farm work out to, using the
2911 * The protocol for handling flags is that each function will, before
2912 * returning, add into *flagp the flags it needs to pass up. Each function has
2913 * a second flags variable, typically named 'flags', which it sets and clears
2914 * at will. Flag bits in it are used in that function, and it calls the next
2915 * layer down with its 'flagp' parameter set to '&flags'. Thus, upon return,
2916 * 'flags' will contain whatever it had before the call, plus whatever that
2917 * function passed up. If it wants to pass any of these up to its caller, it
2918 * has to add them to its *flagp. This means that it takes extra steps to keep
2919 * passing a flag upwards, and otherwise the flag bit is cleared for higher
2923 /* On success, returns the offset at which any next node should be placed into
2924 * the regex engine program being compiled.
2926 * Returns 0 otherwise, with *flagp set to indicate why:
2927 * TRYAGAIN at the end of (?) that only sets flags.
2928 * RESTART_PARSE if the parse needs to be restarted, or'd with
2929 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
2930 * Otherwise would only return 0 if regbranch() returns 0, which cannot
2932 STATIC regnode_offset
2933 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
2934 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
2935 * 2 is like 1, but indicates that nextchar() has been called to advance
2936 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
2937 * this flag alerts us to the need to check for that */
2939 regnode_offset ret = 0; /* Will be the head of the group. */
2941 regnode_offset lastbr;
2942 regnode_offset ender = 0;
2943 I32 logical_parno = 0;
2946 U32 oregflags = RExC_flags;
2947 bool have_branch = 0;
2949 I32 freeze_paren = 0;
2950 I32 after_freeze = 0;
2951 I32 num; /* numeric backreferences */
2952 SV * max_open; /* Max number of unclosed parens */
2953 I32 was_in_lookaround = RExC_in_lookaround;
2954 I32 fake_eval = 0; /* matches paren */
2956 /* The difference between the following variables can be seen with *
2957 * the broken pattern /(?:foo/ where segment_parse_start will point *
2958 * at the 'f', and reg_parse_start will point at the '(' */
2960 /* the following is used for unmatched '(' errors */
2961 char * const reg_parse_start = RExC_parse;
2963 /* the following is used to track where various segments of
2964 * the pattern that we parse out started. */
2965 char * segment_parse_start = RExC_parse;
2967 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2969 PERL_ARGS_ASSERT_REG;
2970 DEBUG_PARSE("reg ");
2972 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
2974 if (!SvIOK(max_open)) {
2975 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
2977 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
2979 vFAIL("Too many nested open parens");
2982 *flagp = 0; /* Initialize. */
2984 /* Having this true makes it feasible to have a lot fewer tests for the
2985 * parse pointer being in scope. For example, we can write
2986 * while(isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2988 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse_inc_by(1);
2990 assert(*RExC_end == '\0');
2992 /* Make an OPEN node, if parenthesized. */
2995 /* Under /x, space and comments can be gobbled up between the '(' and
2996 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
2997 * intervening space, as the sequence is a token, and a token should be
2999 bool has_intervening_patws = (paren == 2)
3000 && *(RExC_parse - 1) != '(';
3002 if (RExC_parse >= RExC_end) {
3003 vFAIL("Unmatched (");
3006 if (paren == 'r') { /* Atomic script run */
3010 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
3011 if (RExC_parse[1] == '{') { /* (*{ ... }) optimistic EVAL */
3016 char *start_verb = RExC_parse + 1;
3018 char *start_arg = NULL;
3019 unsigned char op = 0;
3020 int arg_required = 0;
3021 int internal_argval = -1; /* if > -1 no argument allowed */
3022 bool has_upper = FALSE;
3023 U32 seen_flag_set = 0; /* RExC_seen flags we must set */
3025 if (has_intervening_patws) {
3026 RExC_parse_inc_by(1); /* past the '*' */
3028 /* For strict backwards compatibility, don't change the message
3029 * now that we also have lowercase operands */
3030 if (isUPPER(*RExC_parse)) {
3031 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
3034 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
3037 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
3038 if ( *RExC_parse == ':' ) {
3039 start_arg = RExC_parse + 1;
3043 if (isUPPER(*RExC_parse)) {
3046 RExC_parse_inc_by(1);
3049 RExC_parse_inc_utf8();
3052 verb_len = RExC_parse - start_verb;
3054 if (RExC_parse >= RExC_end) {
3055 goto unterminated_verb_pattern;
3059 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
3062 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3063 unterminated_verb_pattern:
3065 vFAIL("Unterminated verb pattern argument");
3068 vFAIL("Unterminated '(*...' argument");
3072 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
3074 vFAIL("Unterminated verb pattern");
3077 vFAIL("Unterminated '(*...' construct");
3082 /* Here, we know that RExC_parse < RExC_end */
3084 switch ( *start_verb ) {
3085 case 'A': /* (*ACCEPT) */
3086 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
3088 internal_argval = RExC_nestroot;
3091 case 'C': /* (*COMMIT) */
3092 if ( memEQs(start_verb, verb_len,"COMMIT") )
3095 case 'F': /* (*FAIL) */
3096 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
3100 case ':': /* (*:NAME) */
3101 case 'M': /* (*MARK:NAME) */
3102 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
3107 case 'P': /* (*PRUNE) */
3108 if ( memEQs(start_verb, verb_len,"PRUNE") )
3111 case 'S': /* (*SKIP) */
3112 if ( memEQs(start_verb, verb_len,"SKIP") )
3115 case 'T': /* (*THEN) */
3116 /* [19:06] <TimToady> :: is then */
3117 if ( memEQs(start_verb, verb_len,"THEN") ) {
3119 RExC_seen |= REG_CUTGROUP_SEEN;
3123 if ( memEQs(start_verb, verb_len, "asr")
3124 || memEQs(start_verb, verb_len, "atomic_script_run"))
3126 paren = 'r'; /* Mnemonic: recursed run */
3129 else if (memEQs(start_verb, verb_len, "atomic")) {
3130 paren = 't'; /* AtOMIC */
3131 goto alpha_assertions;
3135 if ( memEQs(start_verb, verb_len, "plb")
3136 || memEQs(start_verb, verb_len, "positive_lookbehind"))
3139 goto lookbehind_alpha_assertions;
3141 else if ( memEQs(start_verb, verb_len, "pla")
3142 || memEQs(start_verb, verb_len, "positive_lookahead"))
3145 goto alpha_assertions;
3149 if ( memEQs(start_verb, verb_len, "nlb")
3150 || memEQs(start_verb, verb_len, "negative_lookbehind"))
3153 goto lookbehind_alpha_assertions;
3155 else if ( memEQs(start_verb, verb_len, "nla")
3156 || memEQs(start_verb, verb_len, "negative_lookahead"))
3159 goto alpha_assertions;
3163 if ( memEQs(start_verb, verb_len, "sr")
3164 || memEQs(start_verb, verb_len, "script_run"))
3166 regnode_offset atomic;
3172 /* This indicates Unicode rules. */
3173 REQUIRE_UNI_RULES(flagp, 0);
3179 RExC_parse_set(start_arg);
3181 if (RExC_in_script_run) {
3183 /* Nested script runs are treated as no-ops, because
3184 * if the nested one fails, the outer one must as
3185 * well. It could fail sooner, and avoid (??{} with
3186 * side effects, but that is explicitly documented as
3187 * undefined behavior. */
3196 /* But, the atomic part of a nested atomic script run
3197 * isn't a no-op, but can be treated just like a '(?>'
3204 /* Here, we're starting a new regular script run */
3205 ret = reg_node(pRExC_state, SROPEN);
3206 RExC_in_script_run = 1;
3211 /* Here, we are starting an atomic script run. This is
3212 * handled by recursing to deal with the atomic portion
3213 * separately, enclosed in SROPEN ... SRCLOSE nodes */
3215 ret = reg_node(pRExC_state, SROPEN);
3217 RExC_in_script_run = 1;
3219 atomic = reg(pRExC_state, 'r', &flags, depth);
3220 if (flags & (RESTART_PARSE|NEED_UTF8)) {
3221 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
3225 if (! REGTAIL(pRExC_state, ret, atomic)) {
3226 REQUIRE_BRANCHJ(flagp, 0);
3229 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
3232 REQUIRE_BRANCHJ(flagp, 0);
3235 RExC_in_script_run = 0;
3241 lookbehind_alpha_assertions:
3242 seen_flag_set = REG_LOOKBEHIND_SEEN;
3251 if ( RExC_parse == start_arg ) {
3252 if ( paren == 'A' || paren == 'B' ) {
3253 /* An empty negative lookaround assertion is failure.
3254 * See also: S_reg_la_OPFAIL() */
3256 /* Note: OPFAIL is *not* zerolen. */
3257 ret = reg1node(pRExC_state, OPFAIL, 0);
3258 nextchar(pRExC_state);
3262 if ( paren == 'a' || paren == 'b' ) {
3263 /* An empty positive lookaround assertion is success.
3264 * See also: S_reg_la_NOTHING() */
3266 /* Note: NOTHING is zerolen, so increment here */
3267 RExC_seen_zerolen++;
3268 ret = reg_node(pRExC_state, NOTHING);
3269 nextchar(pRExC_state);
3274 RExC_seen_zerolen++;
3275 RExC_in_lookaround++;
3276 RExC_seen |= seen_flag_set;
3278 RExC_parse_set(start_arg);
3282 vFAIL2utf8f( "'(*%" UTF8f "' requires a terminating ':'",
3283 UTF8fARG(UTF, verb_len, start_verb));
3284 NOT_REACHED; /*NOTREACHED*/
3286 } /* End of switch */
3288 RExC_parse_inc_safe();
3289 if (has_upper || verb_len == 0) {
3290 vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'",
3291 UTF8fARG(UTF, verb_len, start_verb));
3294 vFAIL2utf8f( "Unknown '(*...)' construct '%" UTF8f "'",
3295 UTF8fARG(UTF, verb_len, start_verb));
3298 if ( RExC_parse == start_arg ) {
3301 if ( arg_required && !start_arg ) {
3302 vFAIL3( "Verb pattern '%.*s' has a mandatory argument",
3303 (int) verb_len, start_verb);
3305 if (internal_argval == -1) {
3306 ret = reg1node(pRExC_state, op, 0);
3308 ret = reg2node(pRExC_state, op, 0, internal_argval);
3310 RExC_seen |= REG_VERBARG_SEEN;
3312 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
3313 ARG1u(REGNODE_p(ret)) = reg_add_data( pRExC_state,
3315 RExC_rxi->data->data[ARG1u(REGNODE_p(ret))]=(void*)sv;
3316 FLAGS(REGNODE_p(ret)) = 1;
3318 FLAGS(REGNODE_p(ret)) = 0;
3320 if ( internal_argval != -1 )
3321 ARG2i_SET(REGNODE_p(ret), internal_argval);
3322 nextchar(pRExC_state);
3325 else if (*RExC_parse == '?') { /* (?...) */
3327 ; /* make sure the label has a statement associated with it*/
3328 bool is_logical = 0, is_optimistic = 0;
3329 const char * const seqstart = RExC_parse;
3330 const char * endptr;
3331 const char non_existent_group_msg[]
3332 = "Reference to nonexistent group";
3333 const char impossible_group[] = "Invalid reference to group";
3335 if (has_intervening_patws) {
3336 RExC_parse_inc_by(1);
3337 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
3340 RExC_parse_inc_by(1); /* past the '?' */
3342 paren = *RExC_parse; /* might be a trailing NUL, if not
3350 if (RExC_parse > RExC_end) {
3353 ret = 0; /* For look-ahead/behind. */
3356 case 'P': /* (?P...) variants for those used to PCRE/Python */
3357 paren = *RExC_parse;
3358 if ( paren == '<') { /* (?P<...>) named capture */
3359 RExC_parse_inc_by(1);
3360 if (RExC_parse >= RExC_end) {
3361 vFAIL("Sequence (?P<... not terminated");
3365 else if (paren == '>') { /* (?P>name) named recursion */
3366 RExC_parse_inc_by(1);
3367 if (RExC_parse >= RExC_end) {
3368 vFAIL("Sequence (?P>... not terminated");
3370 goto named_recursion;
3372 else if (paren == '=') { /* (?P=...) named backref */
3373 RExC_parse_inc_by(1);
3374 return handle_named_backref(pRExC_state, flagp,
3375 segment_parse_start, ')');
3377 RExC_parse_inc_if_char();
3378 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3379 vFAIL3("Sequence (%.*s...) not recognized",
3380 (int) (RExC_parse - seqstart), seqstart);
3381 NOT_REACHED; /*NOTREACHED*/
3382 case '<': /* (?<...) */
3383 /* If you want to support (?<*...), first reconcile with GH #17363 */
3384 if (*RExC_parse == '!') {
3385 paren = ','; /* negative lookbehind (?<! ... ) */
3386 RExC_parse_inc_by(1);
3387 if ((ret= reg_la_OPFAIL(pRExC_state,REG_LB_SEEN,"?<!")))
3392 if (*RExC_parse == '=') {
3393 /* paren = '<' - negative lookahead (?<= ... ) */
3394 RExC_parse_inc_by(1);
3395 if ((ret= reg_la_NOTHING(pRExC_state,REG_LB_SEEN,"?<=")))
3406 case '\'': /* (?'...') */
3407 name_start = RExC_parse;
3408 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
3409 if ( RExC_parse == name_start
3410 || RExC_parse >= RExC_end
3411 || *RExC_parse != paren)
3413 vFAIL2("Sequence (?%c... not terminated",
3414 paren=='>' ? '<' : (char) paren);
3419 if (!svname) /* shouldn't happen */
3421 "panic: reg_scan_name returned NULL");
3422 if (!RExC_paren_names) {
3423 RExC_paren_names= newHV();
3424 sv_2mortal(MUTABLE_SV(RExC_paren_names));
3426 RExC_paren_name_list= newAV();
3427 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
3430 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
3432 sv_dat = HeVAL(he_str);
3434 /* croak baby croak */
3436 "panic: paren_name hash element allocation failed");
3437 } else if ( SvPOK(sv_dat) ) {
3438 /* (?|...) can mean we have dupes so scan to check
3439 its already been stored. Maybe a flag indicating
3440 we are inside such a construct would be useful,
3441 but the arrays are likely to be quite small, so
3442 for now we punt -- dmq */
3443 IV count = SvIV(sv_dat);
3444 I32 *pv = (I32*)SvPVX(sv_dat);
3446 for ( i = 0 ; i < count ; i++ ) {
3447 if ( pv[i] == RExC_npar ) {
3453 pv = (I32*)SvGROW(sv_dat,
3454 SvCUR(sv_dat) + sizeof(I32)+1);
3455 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
3456 pv[count] = RExC_npar;
3457 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
3460 (void)SvUPGRADE(sv_dat, SVt_PVNV);
3461 sv_setpvn(sv_dat, (char *)&(RExC_npar),
3464 SvIV_set(sv_dat, 1);
3467 /* No, this does not cause a memory leak under
3468 * debugging. RExC_paren_name_list is freed later
3469 * on in the dump process. - Yves
3471 if (!av_store(RExC_paren_name_list,
3472 RExC_npar, SvREFCNT_inc_NN(svname)))
3473 SvREFCNT_dec_NN(svname);
3477 nextchar(pRExC_state);
3479 goto capturing_parens;
3481 NOT_REACHED; /*NOTREACHED*/
3482 case '=': /* (?=...) */
3483 if ((ret= reg_la_NOTHING(pRExC_state, 0, "?=")))
3486 case '!': /* (?!...) */
3487 if ((ret= reg_la_OPFAIL(pRExC_state, 0, "?!")))
3490 case '|': /* (?|...) */
3491 /* branch reset, behave like a (?:...) except that
3492 buffers in alternations share the same numbers */
3494 after_freeze = freeze_paren = RExC_logical_npar;
3496 /* XXX This construct currently requires an extra pass.
3497 * Investigation would be required to see if that could be
3499 REQUIRE_PARENS_PASS;
3501 case ':': /* (?:...) */
3502 case '>': /* (?>...) */
3504 case '$': /* (?$...) */
3505 case '@': /* (?@...) */
3506 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
3508 case '0' : /* (?0) */
3509 case 'R' : /* (?R) */
3510 if (RExC_parse == RExC_end || *RExC_parse != ')')
3511 FAIL("Sequence (?R) not terminated");
3513 RExC_seen |= REG_RECURSE_SEEN;
3515 /* XXX These constructs currently require an extra pass.
3516 * It probably could be changed */
3517 REQUIRE_PARENS_PASS;
3519 *flagp |= POSTPONED;
3520 goto gen_recurse_regop;
3522 /* named and numeric backreferences */
3523 case '&': /* (?&NAME) */
3524 segment_parse_start = RExC_parse - 1;
3527 SV *sv_dat = reg_scan_name(pRExC_state,
3528 REG_RSN_RETURN_DATA);
3529 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
3531 if (RExC_parse >= RExC_end || *RExC_parse != ')')
3532 vFAIL("Sequence (?&... not terminated");
3533 goto gen_recurse_regop;
3536 if (! inRANGE(RExC_parse[0], '1', '9')) {
3537 RExC_parse_inc_by(1);
3538 vFAIL("Illegal pattern");
3540 goto parse_recursion;
3542 case '-': /* (?-1) */
3543 if (! inRANGE(RExC_parse[0], '1', '9')) {
3544 RExC_parse--; /* rewind to let it be handled later */
3548 case '1': case '2': case '3': case '4': /* (?1) */
3549 case '5': case '6': case '7': case '8': case '9':
3550 RExC_parse_set((char *) seqstart + 1); /* Point to the digit */
3553 bool is_neg = FALSE;
3555 segment_parse_start = RExC_parse - 1;
3556 if (*RExC_parse == '-') {
3557 RExC_parse_inc_by(1);
3561 if (grok_atoUV(RExC_parse, &unum, &endptr)
3565 RExC_parse_set((char*)endptr);
3567 else { /* Overflow, or something like that. Position
3568 beyond all digits for the message */
3569 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
3570 RExC_parse_inc_by(1);
3572 vFAIL(impossible_group);
3575 /* -num is always representable on 1 and 2's complement
3580 if (*RExC_parse!=')')
3581 vFAIL("Expecting close bracket");
3583 if (paren == '-' || paren == '+') {
3585 /* Don't overflow */
3586 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
3587 RExC_parse_inc_by(1);
3588 vFAIL(impossible_group);
3592 Diagram of capture buffer numbering.
3593 Top line is the normal capture buffer numbers
3594 Bottom line is the negative indexing as from
3598 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
3599 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
3602 Resolve to absolute group. Recall that RExC_npar is +1 of
3603 the actual parenthesis group number. For lookahead, we
3604 have to compensate for that. Using the above example, when
3605 we get to Y in the parse, num is 2 and RExC_npar is 6. We
3606 want 7 for +2, and 4 for -2.
3608 if ( paren == '+' ) {
3614 if (paren == '-' && num < 1) {
3615 RExC_parse_inc_by(1);
3616 vFAIL(non_existent_group_msg);
3620 if (num && num < RExC_logical_npar) {
3621 num = RExC_logical_to_parno[num];
3624 if (ALL_PARENS_COUNTED) {
3625 if (num < RExC_logical_total_parens) {
3626 num = RExC_logical_to_parno[num];
3629 RExC_parse_inc_by(1);
3630 vFAIL(non_existent_group_msg);
3634 REQUIRE_PARENS_PASS;
3639 if (num >= RExC_npar) {
3641 /* It might be a forward reference; we can't fail until we
3642 * know, by completing the parse to get all the groups, and
3644 if (ALL_PARENS_COUNTED) {
3645 if (num >= RExC_total_parens) {
3646 RExC_parse_inc_by(1);
3647 vFAIL(non_existent_group_msg);
3651 REQUIRE_PARENS_PASS;
3655 /* We keep track how many GOSUB items we have produced.
3656 To start off the ARG2i() of the GOSUB holds its "id",
3657 which is used later in conjunction with RExC_recurse
3658 to calculate the offset we need to jump for the GOSUB,
3659 which it will store in the final representation.
3660 We have to defer the actual calculation until much later
3661 as the regop may move.
3663 ret = reg2node(pRExC_state, GOSUB, num, RExC_recurse_count);
3664 RExC_recurse_count++;
3665 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
3666 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
3667 22, "| |", (int)(depth * 2 + 1), "",
3668 (UV)ARG1u(REGNODE_p(ret)),
3669 (IV)ARG2i(REGNODE_p(ret))));
3670 RExC_seen |= REG_RECURSE_SEEN;
3672 *flagp |= POSTPONED;
3673 assert(*RExC_parse == ')');
3674 nextchar(pRExC_state);
3679 case '?': /* (??...) */
3681 if (*RExC_parse != '{') {
3682 RExC_parse_inc_if_char();
3683 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
3685 "Sequence (%" UTF8f "...) not recognized",
3686 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
3687 NOT_REACHED; /*NOTREACHED*/
3689 *flagp |= POSTPONED;
3691 RExC_parse_inc_by(1);
3693 case '{': /* (?{...}) */
3696 struct reg_code_block *cb;
3699 RExC_seen_zerolen++;
3701 if ( !pRExC_state->code_blocks
3702 || pRExC_state->code_index
3703 >= pRExC_state->code_blocks->count
3704 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
3705 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
3708 if (RExC_pm_flags & PMf_USE_RE_EVAL)
3709 FAIL("panic: Sequence (?{...}): no code block found\n");
3710 FAIL("Eval-group not allowed at runtime, use re 'eval'");
3712 /* this is a pre-compiled code block (?{...}) */
3713 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
3714 RExC_parse_set(RExC_start + cb->end);
3716 if (cb->src_regex) {
3717 n = reg_add_data(pRExC_state, STR_WITH_LEN("rl"));
3718 RExC_rxi->data->data[n] =
3719 (void*)SvREFCNT_inc((SV*)cb->src_regex);
3720 RExC_rxi->data->data[n+1] = (void*)o;
3723 n = reg_add_data(pRExC_state,
3724 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
3725 RExC_rxi->data->data[n] = (void*)o;
3727 pRExC_state->code_index++;
3728 nextchar(pRExC_state);
3730 RExC_seen |= REG_PESSIMIZE_SEEN;
3733 regnode_offset eval;
3734 ret = reg_node(pRExC_state, LOGICAL);
3735 FLAGS(REGNODE_p(ret)) = 2;
3737 eval = reg2node(pRExC_state, EVAL,
3740 /* for later propagation into (??{})
3742 RExC_flags & RXf_PMf_COMPILETIME
3744 FLAGS(REGNODE_p(eval)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3745 if (! REGTAIL(pRExC_state, ret, eval)) {
3746 REQUIRE_BRANCHJ(flagp, 0);
3750 ret = reg2node(pRExC_state, EVAL, n, 0);
3751 FLAGS(REGNODE_p(ret)) = is_optimistic * EVAL_OPTIMISTIC_FLAG;
3755 case '(': /* (?(?{...})...) and (?(?=...)...) */
3758 const int DEFINE_len = sizeof("DEFINE") - 1;
3759 if ( RExC_parse < RExC_end - 1
3760 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
3761 && ( RExC_parse[1] == '='
3762 || RExC_parse[1] == '!'
3763 || RExC_parse[1] == '<'
3764 || RExC_parse[1] == '{'))
3765 || ( RExC_parse[0] == '*' /* (?(*...)) */
3766 && ( RExC_parse[1] == '{'
3767 || ( memBEGINs(RExC_parse + 1,
3768 (Size_t) (RExC_end - (RExC_parse + 1)),
3770 || memBEGINs(RExC_parse + 1,
3771 (Size_t) (RExC_end - (RExC_parse + 1)),
3773 || memBEGINs(RExC_parse + 1,
3774 (Size_t) (RExC_end - (RExC_parse + 1)),
3776 || memBEGINs(RExC_parse + 1,
3777 (Size_t) (RExC_end - (RExC_parse + 1)),
3779 || memBEGINs(RExC_parse + 1,
3780 (Size_t) (RExC_end - (RExC_parse + 1)),
3781 "positive_lookahead:")
3782 || memBEGINs(RExC_parse + 1,
3783 (Size_t) (RExC_end - (RExC_parse + 1)),
3784 "positive_lookbehind:")
3785 || memBEGINs(RExC_parse + 1,
3786 (Size_t) (RExC_end - (RExC_parse + 1)),
3787 "negative_lookahead:")
3788 || memBEGINs(RExC_parse + 1,
3789 (Size_t) (RExC_end - (RExC_parse + 1)),
3790 "negative_lookbehind:")))))
3791 ) { /* Lookahead or eval. */
3793 regnode_offset tail;
3795 ret = reg_node(pRExC_state, LOGICAL);
3796 FLAGS(REGNODE_p(ret)) = 1;
3798 tail = reg(pRExC_state, 1, &flag, depth+1);
3799 RETURN_FAIL_ON_RESTART(flag, flagp);
3800 if (! REGTAIL(pRExC_state, ret, tail)) {
3801 REQUIRE_BRANCHJ(flagp, 0);
3805 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
3806 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
3808 char ch = RExC_parse[0] == '<' ? '>' : '\'';
3809 char *name_start= RExC_parse;
3810 RExC_parse_inc_by(1);
3812 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
3813 if ( RExC_parse == name_start
3814 || RExC_parse >= RExC_end
3815 || *RExC_parse != ch)
3817 vFAIL2("Sequence (?(%c... not terminated",
3818 (ch == '>' ? '<' : ch));
3820 RExC_parse_inc_by(1);
3822 num = reg_add_data( pRExC_state, STR_WITH_LEN("S"));
3823 RExC_rxi->data->data[num]=(void*)sv_dat;
3824 SvREFCNT_inc_simple_void_NN(sv_dat);
3826 ret = reg1node(pRExC_state, GROUPPN, num);
3827 goto insert_if_check_paren;
3829 else if (memBEGINs(RExC_parse,
3830 (STRLEN) (RExC_end - RExC_parse),
3833 ret = reg1node(pRExC_state, DEFINEP, 0);
3834 RExC_parse_inc_by(DEFINE_len);
3836 goto insert_if_check_paren;
3838 else if (RExC_parse[0] == 'R') {
3839 RExC_parse_inc_by(1);
3840 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
3841 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
3842 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
3845 if (RExC_parse[0] == '0') {
3847 RExC_parse_inc_by(1);
3849 else if (inRANGE(RExC_parse[0], '1', '9')) {
3852 if (grok_atoUV(RExC_parse, &uv, &endptr)
3855 parno = (I32)uv + 1;
3856 RExC_parse_set((char*)endptr);
3858 /* else "Switch condition not recognized" below */
3859 } else if (RExC_parse[0] == '&') {
3861 RExC_parse_inc_by(1);
3862 sv_dat = reg_scan_name(pRExC_state,
3863 REG_RSN_RETURN_DATA);
3865 parno = 1 + *((I32 *)SvPVX(sv_dat));
3867 ret = reg1node(pRExC_state, INSUBP, parno);
3868 goto insert_if_check_paren;
3870 else if (inRANGE(RExC_parse[0], '1', '9')) {
3875 if (grok_atoUV(RExC_parse, &uv, &endptr)
3879 RExC_parse_set((char*)endptr);
3882 vFAIL("panic: grok_atoUV returned FALSE");
3884 ret = reg1node(pRExC_state, GROUPP, parno);
3886 insert_if_check_paren:
3887 if (UCHARAT(RExC_parse) != ')') {
3888 RExC_parse_inc_safe();
3889 vFAIL("Switch condition not recognized");
3891 nextchar(pRExC_state);
3893 if (! REGTAIL(pRExC_state, ret, reg1node(pRExC_state,
3896 REQUIRE_BRANCHJ(flagp, 0);
3898 br = regbranch(pRExC_state, &flags, 1, depth+1);
3900 RETURN_FAIL_ON_RESTART(flags,flagp);
3901 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3904 if (! REGTAIL(pRExC_state, br, reg1node(pRExC_state,
3907 REQUIRE_BRANCHJ(flagp, 0);
3909 c = UCHARAT(RExC_parse);
3910 nextchar(pRExC_state);
3915 vFAIL("(?(DEFINE)....) does not allow branches");
3917 /* Fake one for optimizer. */
3918 lastbr = reg1node(pRExC_state, IFTHEN, 0);
3920 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
3921 RETURN_FAIL_ON_RESTART(flags, flagp);
3922 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
3925 if (! REGTAIL(pRExC_state, ret, lastbr)) {
3926 REQUIRE_BRANCHJ(flagp, 0);
3930 c = UCHARAT(RExC_parse);
3931 nextchar(pRExC_state);
3936 if (RExC_parse >= RExC_end)
3937 vFAIL("Switch (?(condition)... not terminated");
3939 vFAIL("Switch (?(condition)... contains too many branches");
3941 ender = reg_node(pRExC_state, TAIL);
3942 if (! REGTAIL(pRExC_state, br, ender)) {
3943 REQUIRE_BRANCHJ(flagp, 0);
3946 if (! REGTAIL(pRExC_state, lastbr, ender)) {
3947 REQUIRE_BRANCHJ(flagp, 0);
3949 if (! REGTAIL(pRExC_state,
3951 REGNODE_AFTER(REGNODE_p(lastbr))),
3954 REQUIRE_BRANCHJ(flagp, 0);
3958 if (! REGTAIL(pRExC_state, ret, ender)) {
3959 REQUIRE_BRANCHJ(flagp, 0);
3961 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
3962 RExC_size++; /* XXX WHY do we need this?!!
3963 For large programs it seems to be required
3964 but I can't figure out why. -- dmq*/
3968 RExC_parse_inc_safe();
3969 vFAIL("Unknown switch condition (?(...))");
3971 case '[': /* (?[ ... ]) */
3972 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1);
3974 RExC_parse--; /* for vFAIL to print correctly */
3975 vFAIL("Sequence (? incomplete");
3979 if (RExC_strict) { /* [perl #132851] */
3980 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
3983 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
3985 default: /* e.g., (?i) */
3986 RExC_parse_set((char *) seqstart + 1);
3988 parse_lparen_question_flags(pRExC_state);
3989 if (UCHARAT(RExC_parse) != ':') {
3990 if (RExC_parse < RExC_end)
3991 nextchar(pRExC_state);
3996 nextchar(pRExC_state);
4001 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
4005 if (RExC_npar >= U16_MAX)
4006 FAIL2("Too many capture groups (limit is %" UVuf ")", (UV)RExC_npar);
4008 logical_parno = RExC_logical_npar;
4009 RExC_logical_npar++;
4010 if (! ALL_PARENS_COUNTED) {
4011 /* If we are in our first pass through (and maybe only pass),
4012 * we need to allocate memory for the capturing parentheses
4016 if (!RExC_parens_buf_size) {
4017 /* first guess at number of parens we might encounter */
4018 RExC_parens_buf_size = 10;
4020 /* setup RExC_open_parens, which holds the address of each
4021 * OPEN tag, and to make things simpler for the 0 index the
4022 * start of the program - this is used later for offsets */
4023 Newxz(RExC_open_parens, RExC_parens_buf_size,
4025 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
4027 /* setup RExC_close_parens, which holds the address of each
4028 * CLOSE tag, and to make things simpler for the 0 index
4029 * the end of the program - this is used later for offsets
4031 Newxz(RExC_close_parens, RExC_parens_buf_size,
4033 /* we don't know where end op starts yet, so we don't need to
4034 * set RExC_close_parens[0] like we do RExC_open_parens[0]
4037 Newxz(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4038 Newxz(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4040 else if (RExC_npar > RExC_parens_buf_size) {
4041 I32 old_size = RExC_parens_buf_size;
4043 RExC_parens_buf_size *= 2;
4045 Renew(RExC_open_parens, RExC_parens_buf_size,
4047 Zero(RExC_open_parens + old_size,
4048 RExC_parens_buf_size - old_size, regnode_offset);
4050 Renew(RExC_close_parens, RExC_parens_buf_size,
4052 Zero(RExC_close_parens + old_size,
4053 RExC_parens_buf_size - old_size, regnode_offset);
4055 Renew(RExC_logical_to_parno, RExC_parens_buf_size, I32);
4056 Zero(RExC_logical_to_parno + old_size,
4057 RExC_parens_buf_size - old_size, I32);
4059 Renew(RExC_parno_to_logical, RExC_parens_buf_size, I32);
4060 Zero(RExC_parno_to_logical + old_size,
4061 RExC_parens_buf_size - old_size, I32);
4065 ret = reg1node(pRExC_state, OPEN, parno);
4067 RExC_nestroot = parno;
4068 if (RExC_open_parens && !RExC_open_parens[parno])
4070 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4071 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
4072 22, "| |", (int)(depth * 2 + 1), "",
4074 RExC_open_parens[parno]= ret;
4076 if (RExC_parno_to_logical) {
4077 RExC_parno_to_logical[parno] = logical_parno;
4078 if (RExC_logical_to_parno && !RExC_logical_to_parno[logical_parno])
4079 RExC_logical_to_parno[logical_parno] = parno;
4083 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
4092 /* Pick up the branches, linking them together. */
4093 segment_parse_start = RExC_parse;
4094 I32 npar_before_regbranch = RExC_npar - 1;
4095 br = regbranch(pRExC_state, &flags, 1, depth+1);
4097 /* branch_len = (paren != 0); */
4100 RETURN_FAIL_ON_RESTART(flags, flagp);
4101 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4103 if (*RExC_parse == '|') {
4104 if (RExC_use_BRANCHJ) {
4105 reginsert(pRExC_state, BRANCHJ, br, depth+1);
4106 ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
4107 ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4110 reginsert(pRExC_state, BRANCH, br, depth+1);
4111 ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
4112 ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
4116 else if (paren == ':') {
4117 *flagp |= flags&SIMPLE;
4119 if (is_open) { /* Starts with OPEN. */
4120 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
4121 REQUIRE_BRANCHJ(flagp, 0);
4124 else if (paren != '?') /* Not Conditional */
4126 *flagp |= flags & (HASWIDTH | POSTPONED);
4128 while (*RExC_parse == '|') {
4129 if (RExC_use_BRANCHJ) {
4132 ender = reg1node(pRExC_state, LONGJMP, 0);
4134 /* Append to the previous. */
4135 shut_gcc_up = REGTAIL(pRExC_state,
4136 REGNODE_OFFSET(REGNODE_AFTER(REGNODE_p(lastbr))),
4138 PERL_UNUSED_VAR(shut_gcc_up);
4140 nextchar(pRExC_state);
4142 if (RExC_logical_npar > after_freeze)
4143 after_freeze = RExC_logical_npar;
4144 RExC_logical_npar = freeze_paren;
4146 br = regbranch(pRExC_state, &flags, 0, depth+1);
4149 RETURN_FAIL_ON_RESTART(flags, flagp);
4150 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
4152 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
4153 REQUIRE_BRANCHJ(flagp, 0);
4155 assert(OP(REGNODE_p(br)) == BRANCH || OP(REGNODE_p(br))==BRANCHJ);
4156 assert(OP(REGNODE_p(lastbr)) == BRANCH || OP(REGNODE_p(lastbr))==BRANCHJ);
4157 if (OP(REGNODE_p(br)) == BRANCH) {
4158 if (OP(REGNODE_p(lastbr)) == BRANCH)
4159 ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4161 ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
4164 if (OP(REGNODE_p(br)) == BRANCHJ) {
4165 if (OP(REGNODE_p(lastbr)) == BRANCH)
4166 ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4168 ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
4172 *flagp |= flags & (HASWIDTH | POSTPONED);
4175 if (have_branch || paren != ':') {
4178 /* Make a closing node, and hook it on the end. */
4181 ender = reg_node(pRExC_state, TAIL);
4184 ender = reg1node(pRExC_state, CLOSE, parno);
4185 if ( RExC_close_parens ) {
4186 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4187 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
4188 22, "| |", (int)(depth * 2 + 1), "",
4190 RExC_close_parens[parno]= ender;
4191 if (RExC_nestroot == parno)
4196 ender = reg_node(pRExC_state, SRCLOSE);
4197 RExC_in_script_run = 0;
4199 /* LOOKBEHIND ops (not sure why these are duplicated - Yves) */
4200 case 'b': /* (*positive_lookbehind: ... ) (*plb: ... ) */
4201 case 'B': /* (*negative_lookbehind: ... ) (*nlb: ... ) */
4202 case '<': /* (?<= ... ) */
4203 case ',': /* (?<! ... ) */
4204 *flagp &= ~HASWIDTH;
4205 ender = reg_node(pRExC_state, LOOKBEHIND_END);
4207 /* LOOKAHEAD ops (not sure why these are duplicated - Yves) */
4212 *flagp &= ~HASWIDTH;
4214 case 't': /* aTomic */
4216 ender = reg_node(pRExC_state, SUCCEED);
4219 ender = reg_node(pRExC_state, END);
4220 assert(!RExC_end_op); /* there can only be one! */
4221 RExC_end_op = REGNODE_p(ender);
4222 if (RExC_close_parens) {
4223 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
4224 "%*s%*s Setting close paren #0 (END) to %zu\n",
4225 22, "| |", (int)(depth * 2 + 1), "",
4228 RExC_close_parens[0]= ender;
4233 DEBUG_PARSE_MSG("lsbr");
4234 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
4235 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
4236 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4237 SvPV_nolen_const(RExC_mysv1),
4239 SvPV_nolen_const(RExC_mysv2),
4241 (IV)(ender - lastbr)
4244 if (OP(REGNODE_p(lastbr)) == BRANCH) {
4245 ARG1b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4248 if (OP(REGNODE_p(lastbr)) == BRANCHJ) {
4249 ARG2b_SET(REGNODE_p(lastbr),(U16)RExC_npar-1);
4252 if (! REGTAIL(pRExC_state, lastbr, ender)) {
4253 REQUIRE_BRANCHJ(flagp, 0);
4259 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
4261 /* Hook the tails of the branches to the closing node. */
4262 for (br = REGNODE_p(ret); br; br = regnext(br)) {
4263 const U8 op = REGNODE_TYPE(OP(br));
4264 regnode *nextoper = REGNODE_AFTER(br);
4266 if (! REGTAIL_STUDY(pRExC_state,
4267 REGNODE_OFFSET(nextoper),
4270 REQUIRE_BRANCHJ(flagp, 0);
4272 if ( OP(nextoper) != NOTHING
4273 || regnext(nextoper) != REGNODE_p(ender))
4276 else if (op == BRANCHJ) {
4277 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
4278 REGNODE_OFFSET(nextoper),
4280 PERL_UNUSED_VAR(shut_gcc_up);
4281 /* for now we always disable this optimisation * /
4282 regnode *nopr= REGNODE_AFTER_type(br,tregnode_BRANCHJ);
4283 if ( OP(nopr) != NOTHING
4284 || regnext(nopr) != REGNODE_p(ender))
4290 regnode * ret_as_regnode = REGNODE_p(ret);
4291 br= REGNODE_TYPE(OP(ret_as_regnode)) != BRANCH
4292 ? regnext(ret_as_regnode)
4295 DEBUG_PARSE_MSG("NADA");
4296 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
4298 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
4300 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
4301 SvPV_nolen_const(RExC_mysv1),
4302 (IV)REG_NODE_NUM(ret_as_regnode),
4303 SvPV_nolen_const(RExC_mysv2),
4309 if (OP(REGNODE_p(ender)) == TAIL) {
4311 RExC_emit= REGNODE_OFFSET(br) + NODE_STEP_REGNODE;
4314 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
4316 NEXT_OFF(br)= REGNODE_p(ender) - br;
4324 /* Even/odd or x=don't care: 010101x10x */
4325 static const char parens[] = "=!aA<,>Bbt";
4326 /* flag below is set to 0 up through 'A'; 1 for larger */
4328 if (paren && (p = strchr(parens, paren))) {
4329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
4330 int flag = (p - parens) > 3;
4332 if (paren == '>' || paren == 't') {
4333 node = SUSPEND, flag = 0;
4336 reginsert(pRExC_state, node, ret, depth+1);
4337 FLAGS(REGNODE_p(ret)) = flag;
4338 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
4340 REQUIRE_BRANCHJ(flagp, 0);
4345 /* Check for proper termination. */
4347 /* restore original flags, but keep (?p) and, if we've encountered
4348 * something in the parse that changes /d rules into /u, keep the /u */
4349 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
4350 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
4351 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
4353 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
4354 RExC_parse_set(reg_parse_start);
4355 vFAIL("Unmatched (");
4357 nextchar(pRExC_state);
4359 else if (!paren && RExC_parse < RExC_end) {
4360 if (*RExC_parse == ')') {
4361 RExC_parse_inc_by(1);
4362 vFAIL("Unmatched )");
4365 FAIL("Junk on end of regexp"); /* "Can't happen". */
4366 NOT_REACHED; /* NOTREACHED */
4369 if (after_freeze > RExC_logical_npar)
4370 RExC_logical_npar = after_freeze;
4372 RExC_in_lookaround = was_in_lookaround;
4378 - regbranch - one alternative of an | operator
4380 * Implements the concatenation operator.
4382 * On success, returns the offset at which any next node should be placed into
4383 * the regex engine program being compiled.
4385 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
4386 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
4389 STATIC regnode_offset
4390 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
4393 regnode_offset chain = 0;
4394 regnode_offset latest;
4395 regnode *branch_node = NULL;
4396 I32 flags = 0, c = 0;
4397 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4399 PERL_ARGS_ASSERT_REGBRANCH;
4401 DEBUG_PARSE("brnc");
4406 if (RExC_use_BRANCHJ) {
4407 ret = reg2node(pRExC_state, BRANCHJ, 0, 0);
4408 branch_node = REGNODE_p(ret);
4409 ARG2a_SET(branch_node, (U16)RExC_npar-1);
4411 ret = reg1node(pRExC_state, BRANCH, 0);
4412 branch_node = REGNODE_p(ret);
4413 ARG1a_SET(branch_node, (U16)RExC_npar-1);
4417 *flagp = 0; /* Initialize. */
4419 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
4420 FALSE /* Don't force to /x */ );
4421 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
4423 latest = regpiece(pRExC_state, &flags, depth+1);
4425 if (flags & TRYAGAIN)
4427 RETURN_FAIL_ON_RESTART(flags, flagp);
4428 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
4432 *flagp |= flags&(HASWIDTH|POSTPONED);
4434 /* FIXME adding one for every branch after the first is probably
4435 * excessive now we have TRIE support. (hv) */
4437 if (! REGTAIL(pRExC_state, chain, latest)) {
4438 /* XXX We could just redo this branch, but figuring out what
4439 * bookkeeping needs to be reset is a pain, and it's likely
4440 * that other branches that goto END will also be too large */
4441 REQUIRE_BRANCHJ(flagp, 0);
4447 if (chain == 0) { /* Loop ran zero times. */
4448 chain = reg_node(pRExC_state, NOTHING);
4453 *flagp |= flags & SIMPLE;
4464 #ifndef PERL_IN_XSUB_RE
4466 Perl_regcurly(const char *s, const char *e, const char * result[5])
4468 /* This function matches a {m,n} quantifier. When called with a NULL final
4469 * argument, it simply parses the input from 's' up through 'e-1', and
4470 * returns a boolean as to whether or not this input is syntactically a
4473 * When called with a non-NULL final parameter, and when the function
4474 * returns TRUE, it additionally stores information into the array
4475 * specified by that parameter about what it found in the parse. The
4476 * parameter must be a pointer into a 5 element array of 'const char *'
4477 * elements. The returned information is as follows:
4478 * result[RBRACE] points to the closing brace
4479 * result[MIN_S] points to the first byte of the lower bound
4480 * result[MIN_E] points to one beyond the final byte of the lower bound
4481 * result[MAX_S] points to the first byte of the upper bound
4482 * result[MAX_E] points to one beyond the final byte of the upper bound
4484 * If the quantifier is of the form {m,} (meaning an infinite upper
4485 * bound), result[MAX_E] is set to result[MAX_S]; what they actually point
4486 * to is irrelevant, just that it's the same place
4488 * If instead the quantifier is of the form {m} there is actually only
4489 * one bound, and both the upper and lower result[] elements are set to
4492 * This function checks only for syntactic validity; it leaves checking for
4493 * semantic validity and raising any diagnostics to the caller. This
4494 * function is called in multiple places to check for syntax, but only from
4495 * one for semantics. It makes it as simple as possible for the
4496 * syntax-only callers, while furnishing just enough information for the
4500 const char * min_start = NULL;
4501 const char * max_start = NULL;
4502 const char * min_end = NULL;
4503 const char * max_end = NULL;
4505 bool has_comma = FALSE;
4507 PERL_ARGS_ASSERT_REGCURLY;
4509 if (s >= e || *s++ != '{')
4512 while (s < e && isBLANK(*s)) {
4520 } while (s < e && isDIGIT(*s));
4524 while (s < e && isBLANK(*s)) {
4532 while (s < e && isBLANK(*s)) {
4540 } while (s < e && isDIGIT(*s));
4545 while (s < e && isBLANK(*s)) {
4548 /* Need at least one number */
4549 if (s >= e || *s != '}' || (! min_start && ! max_end)) {
4557 result[MIN_S] = min_start;
4558 result[MIN_E] = min_end;
4561 result[MAX_S] = max_start;
4562 result[MAX_E] = max_end;
4565 /* Having no value after the comma is signalled by setting
4566 * start and end to the same value. What that value is isn't
4567 * relevant; NULL is chosen simply because it will fail if the
4568 * caller mistakenly uses it */
4569 result[MAX_S] = result[MAX_E] = NULL;
4572 else { /* No comma means lower and upper bounds are the same */
4573 result[MAX_S] = min_start;
4574 result[MAX_E] = min_end;
4583 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state,
4584 const char * start, const char * end)
4586 /* This is a helper function for regpiece() to compute, given the
4587 * quantifier {m,n}, the value of either m or n, based on the starting
4588 * position 'start' in the string, through the byte 'end-1', returning it
4589 * if valid, and failing appropriately if not. It knows the restrictions
4590 * imposed on quantifier values */
4593 STATIC_ASSERT_DECL(REG_INFTY <= U32_MAX);
4595 PERL_ARGS_ASSERT_GET_QUANTIFIER_VALUE;
4597 if (grok_atoUV(start, &uv, &end)) {
4598 if (uv < REG_INFTY) { /* A valid, small-enough number */
4602 else if (*start == '0') { /* grok_atoUV() fails for only two reasons:
4603 leading zeros or overflow */
4604 RExC_parse_set((char * ) end);
4606 /* Perhaps too generic a msg for what is only failure from having
4607 * leading zeros, but this is how it's always behaved. */
4608 vFAIL("Invalid quantifier in {,}");
4609 NOT_REACHED; /*NOTREACHED*/
4612 /* Here, found a quantifier, but was too large; either it overflowed or was
4613 * too big a legal number */
4614 RExC_parse_set((char * ) end);
4615 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
4617 NOT_REACHED; /*NOTREACHED*/
4618 return U32_MAX; /* Perhaps some compilers will be expecting a return */
4622 - regpiece - something followed by possible quantifier * + ? {n,m}
4624 * Note that the branching code sequences used for ? and the general cases
4625 * of * and + are somewhat optimized: they use the same NOTHING node as
4626 * both the endmarker for their branch list and the body of the last branch.
4627 * It might seem that this node could be dispensed with entirely, but the
4628 * endmarker role is not redundant.
4630 * On success, returns the offset at which any next node should be placed into
4631 * the regex engine program being compiled.
4633 * Returns 0 otherwise, with *flagp set to indicate why:
4634 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
4635 * RESTART_PARSE if the parse needs to be restarted, or'd with
4636 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
4638 STATIC regnode_offset
4639 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
4644 const char * const origparse = RExC_parse;
4646 I32 max = REG_INFTY;
4647 I32 npar_before = RExC_npar-1;
4649 /* Save the original in case we change the emitted regop to a FAIL. */
4650 const regnode_offset orig_emit = RExC_emit;
4652 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4654 PERL_ARGS_ASSERT_REGPIECE;
4656 DEBUG_PARSE("piec");
4658 ret = regatom(pRExC_state, &flags, depth+1);
4660 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
4661 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
4663 I32 npar_after = RExC_npar-1;
4667 const char * regcurly_return[5];
4670 nextchar(pRExC_state);
4675 nextchar(pRExC_state);
4680 nextchar(pRExC_state);
4684 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
4685 to determine which */
4686 if (regcurly(RExC_parse, RExC_end, regcurly_return)) {
4687 const char * min_start = regcurly_return[MIN_S];
4688 const char * min_end = regcurly_return[MIN_E];
4689 const char * max_start = regcurly_return[MAX_S];
4690 const char * max_end = regcurly_return[MAX_E];
4693 min = get_quantifier_value(pRExC_state, min_start, min_end);
4699 if (max_start == max_end) { /* Was of the form {m,} */
4702 else if (max_start == min_start) { /* Was of the form {m} */
4705 else { /* Was of the form {m,n} */
4706 assert(max_end >= max_start);
4708 max = get_quantifier_value(pRExC_state, max_start, max_end);
4711 RExC_parse_set((char *) regcurly_return[RBRACE]);
4712 nextchar(pRExC_state);
4714 if (max < min) { /* If can't match, warn and optimize to fail
4716 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
4717 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
4718 NEXT_OFF(REGNODE_p(orig_emit)) =
4719 REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
4722 else if (min == max && *RExC_parse == '?') {
4723 ckWARN2reg(RExC_parse + 1,
4724 "Useless use of greediness modifier '%c'",
4729 } /* End of is {m,n} */
4731 /* Here was a '{', but what followed it didn't form a quantifier. */
4737 NOT_REACHED; /*NOTREACHED*/
4740 /* Here we have a quantifier, and have calculated 'min' and 'max'.
4742 * Check and possibly adjust a zero width operand */
4743 if (! (flags & (HASWIDTH|POSTPONED))) {
4744 if (max > REG_INFTY/3) {
4745 ckWARN2reg(RExC_parse,
4746 "%" UTF8f " matches null string many times",
4747 UTF8fARG(UTF, (RExC_parse >= origparse
4748 ? RExC_parse - origparse
4753 /* There's no point in trying to match something 0 length more than
4754 * once except for extra side effects, which we don't have here since
4764 /* If this is a code block pass it up */
4765 *flagp |= (flags & POSTPONED);
4768 *flagp |= (flags & HASWIDTH);
4769 if (max == REG_INFTY)
4770 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
4773 /* 'SIMPLE' operands don't require full generality */
4774 if ((flags&SIMPLE)) {
4775 if (max == REG_INFTY) {
4777 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
4778 goto min0_maxINF_wildcard_forbidden;
4781 reginsert(pRExC_state, STAR, ret, depth+1);
4785 else if (min == 1) {
4786 reginsert(pRExC_state, PLUS, ret, depth+1);
4792 /* Here, SIMPLE, but not the '*' and '+' special cases */
4794 MARK_NAUGHTY_EXP(2, 2);
4795 reginsert(pRExC_state, CURLY, ret, depth+1);
4797 else { /* not SIMPLE */
4798 const regnode_offset w = reg_node(pRExC_state, WHILEM);
4800 FLAGS(REGNODE_p(w)) = 0;
4801 if (! REGTAIL(pRExC_state, ret, w)) {
4802 REQUIRE_BRANCHJ(flagp, 0);
4804 if (RExC_use_BRANCHJ) {
4805 reginsert(pRExC_state, LONGJMP, ret, depth+1);
4806 reginsert(pRExC_state, NOTHING, ret, depth+1);
4807 REGNODE_STEP_OVER(ret,tregnode_NOTHING,tregnode_LONGJMP);
4809 reginsert(pRExC_state, CURLYX, ret, depth+1);
4810 if (RExC_use_BRANCHJ)
4811 /* Go over NOTHING to LONGJMP. */
4812 REGNODE_STEP_OVER(ret,tregnode_CURLYX,tregnode_NOTHING);
4814 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
4817 REQUIRE_BRANCHJ(flagp, 0);
4820 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
4823 /* Finish up the CURLY/CURLYX case */
4824 FLAGS(REGNODE_p(ret)) = 0;
4826 ARG1i_SET(REGNODE_p(ret), min);
4827 ARG2i_SET(REGNODE_p(ret), max);
4829 /* if we had a npar_after then we need to increment npar_before,
4830 * we want to track the range of parens we need to reset each iteration
4832 if (npar_after!=npar_before) {
4833 ARG3a_SET(REGNODE_p(ret), (U16)npar_before+1);
4834 ARG3b_SET(REGNODE_p(ret), (U16)npar_after);
4836 ARG3a_SET(REGNODE_p(ret), 0);
4837 ARG3b_SET(REGNODE_p(ret), 0);
4842 /* Process any greediness modifiers */
4843 if (*RExC_parse == '?') {
4844 nextchar(pRExC_state);
4845 reginsert(pRExC_state, MINMOD, ret, depth+1);
4846 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
4847 REQUIRE_BRANCHJ(flagp, 0);
4850 else if (*RExC_parse == '+') {
4851 regnode_offset ender;
4852 nextchar(pRExC_state);
4853 ender = reg_node(pRExC_state, SUCCEED);
4854 if (! REGTAIL(pRExC_state, ret, ender)) {
4855 REQUIRE_BRANCHJ(flagp, 0);
4857 reginsert(pRExC_state, SUSPEND, ret, depth+1);
4858 ender = reg_node(pRExC_state, TAIL);
4859 if (! REGTAIL(pRExC_state, ret, ender)) {
4860 REQUIRE_BRANCHJ(flagp, 0);
4864 /* Forbid extra quantifiers */
4865 if (isQUANTIFIER(RExC_parse, RExC_end)) {
4866 RExC_parse_inc_by(1);
4867 vFAIL("Nested quantifiers");
4872 min0_maxINF_wildcard_forbidden:
4874 /* Here we are in a wildcard match, and the minimum match length is 0, and
4875 * the max could be infinity. This is currently forbidden. The only
4876 * reason is to make it harder to write patterns that take a long long time
4877 * to halt, and because the use of this construct isn't necessary in
4878 * matching Unicode property values */
4879 RExC_parse_inc_by(1);
4880 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
4881 subpatterns in regex; marked by <-- HERE in m/%s/
4883 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
4886 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
4887 * legal at all in wildcards, so can't get this far */
4889 NOT_REACHED; /*NOTREACHED*/
4893 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
4894 regnode_offset * node_p,
4902 /* This routine teases apart the various meanings of \N and returns
4903 * accordingly. The input parameters constrain which meaning(s) is/are valid
4904 * in the current context.
4906 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
4908 * If <code_point_p> is not NULL, the context is expecting the result to be a
4909 * single code point. If this \N instance turns out to a single code point,
4910 * the function returns TRUE and sets *code_point_p to that code point.
4912 * If <node_p> is not NULL, the context is expecting the result to be one of
4913 * the things representable by a regnode. If this \N instance turns out to be
4914 * one such, the function generates the regnode, returns TRUE and sets *node_p
4915 * to point to the offset of that regnode into the regex engine program being
4918 * If this instance of \N isn't legal in any context, this function will
4919 * generate a fatal error and not return.
4921 * On input, RExC_parse should point to the first char following the \N at the
4922 * time of the call. On successful return, RExC_parse will have been updated
4923 * to point to just after the sequence identified by this routine. Also
4924 * *flagp has been updated as needed.
4926 * When there is some problem with the current context and this \N instance,
4927 * the function returns FALSE, without advancing RExC_parse, nor setting
4928 * *node_p, nor *code_point_p, nor *flagp.
4930 * If <cp_count> is not NULL, the caller wants to know the length (in code
4931 * points) that this \N sequence matches. This is set, and the input is
4932 * parsed for errors, even if the function returns FALSE, as detailed below.
4934 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
4936 * Probably the most common case is for the \N to specify a single code point.
4937 * *cp_count will be set to 1, and *code_point_p will be set to that code
4940 * Another possibility is for the input to be an empty \N{}. This is no
4941 * longer accepted, and will generate a fatal error.
4943 * Another possibility is for a custom charnames handler to be in effect which
4944 * translates the input name to an empty string. *cp_count will be set to 0.
4945 * *node_p will be set to a generated NOTHING node.
4947 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
4948 * set to 0. *node_p will be set to a generated REG_ANY node.
4950 * The fifth possibility is that \N resolves to a sequence of more than one
4951 * code points. *cp_count will be set to the number of code points in the
4952 * sequence. *node_p will be set to a generated node returned by this
4953 * function calling S_reg().
4955 * The sixth and final possibility is that it is premature to be calling this
4956 * function; the parse needs to be restarted. This can happen when this
4957 * changes from /d to /u rules, or when the pattern needs to be upgraded to
4958 * UTF-8. The latter occurs only when the fifth possibility would otherwise
4959 * be in effect, and is because one of those code points requires the pattern
4960 * to be recompiled as UTF-8. The function returns FALSE, and sets the
4961 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
4962 * happens, the caller needs to desist from continuing parsing, and return
4963 * this information to its caller. This is not set for when there is only one
4964 * code point, as this can be called as part of an ANYOF node, and they can
4965 * store above-Latin1 code points without the pattern having to be in UTF-8.
4967 * For non-single-quoted regexes, the tokenizer has resolved character and
4968 * sequence names inside \N{...} into their Unicode values, normalizing the
4969 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
4970 * hex-represented code points in the sequence. This is done there because
4971 * the names can vary based on what charnames pragma is in scope at the time,
4972 * so we need a way to take a snapshot of what they resolve to at the time of
4973 * the original parse. [perl #56444].
4975 * That parsing is skipped for single-quoted regexes, so here we may get
4976 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
4977 * like '\N{U+41}', that code point is Unicode, and has to be translated into
4978 * the native character set for non-ASCII platforms. The other possibilities
4979 * are already native, so no translation is done. */
4981 char * endbrace; /* points to '}' following the name */
4982 char * e; /* points to final non-blank before endbrace */
4983 char* p = RExC_parse; /* Temporary */
4985 SV * substitute_parse = NULL;
4990 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4992 PERL_ARGS_ASSERT_GROK_BSLASH_N;
4994 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
4995 assert(! (node_p && cp_count)); /* At most 1 should be set */
4997 if (cp_count) { /* Initialize return for the most common case */
5001 /* The [^\n] meaning of \N ignores spaces and comments under the /x
5002 * modifier. The other meanings do not (except blanks adjacent to and
5003 * within the braces), so use a temporary until we find out which we are
5004 * being called with */
5005 skip_to_be_ignored_text(pRExC_state, &p,
5006 FALSE /* Don't force to /x */ );
5008 /* Disambiguate between \N meaning a named character versus \N meaning
5009 * [^\n]. The latter is assumed when the {...} following the \N is a legal
5010 * quantifier, or if there is no '{' at all */
5011 if (*p != '{' || regcurly(p, RExC_end, NULL)) {
5021 *node_p = reg_node(pRExC_state, REG_ANY);
5022 *flagp |= HASWIDTH|SIMPLE;
5027 /* The test above made sure that the next real character is a '{', but
5028 * under the /x modifier, it could be separated by space (or a comment and
5029 * \n) and this is not allowed (for consistency with \x{...} and the
5030 * tokenizer handling of \N{NAME}). */
5031 if (*RExC_parse != '{') {
5032 vFAIL("Missing braces on \\N{}");
5035 RExC_parse_inc_by(1); /* Skip past the '{' */
5037 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
5038 if (! endbrace) { /* no trailing brace */
5039 vFAIL2("Missing right brace on \\%c{}", 'N');
5042 /* Here, we have decided it should be a named character or sequence. These
5043 * imply Unicode semantics */
5044 REQUIRE_UNI_RULES(flagp, FALSE);
5046 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
5047 * nothing at all (not allowed under strict) */
5048 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
5049 RExC_parse_set(endbrace);
5051 RExC_parse_inc_by(1); /* Position after the "}" */
5052 vFAIL("Zero length \\N{}");
5058 nextchar(pRExC_state);
5063 *node_p = reg_node(pRExC_state, NOTHING);
5067 while (isBLANK(*RExC_parse)) {
5068 RExC_parse_inc_by(1);
5072 while (RExC_parse < e && isBLANK(*(e-1))) {
5076 if (e - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
5078 /* Here, the name isn't of the form U+.... This can happen if the
5079 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
5080 * is the time to find out what the name means */
5082 const STRLEN name_len = e - RExC_parse;
5083 SV * value_sv; /* What does this name evaluate to */
5085 const U8 * value; /* string of name's value */
5086 STRLEN value_len; /* and its length */
5088 /* RExC_unlexed_names is a hash of names that weren't evaluated by
5089 * toke.c, and their values. Make sure is initialized */
5090 if (! RExC_unlexed_names) {
5091 RExC_unlexed_names = newHV();
5094 /* If we have already seen this name in this pattern, use that. This
5095 * allows us to only call the charnames handler once per name per
5096 * pattern. A broken or malicious handler could return something
5097 * different each time, which could cause the results to vary depending
5098 * on if something gets added or subtracted from the pattern that
5099 * causes the number of passes to change, for example */
5100 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
5103 value_sv = *value_svp;
5105 else { /* Otherwise we have to go out and get the name */
5106 const char * error_msg = NULL;
5107 value_sv = get_and_check_backslash_N_name(RExC_parse, e,
5111 RExC_parse_set(endbrace);
5115 /* If no error message, should have gotten a valid return */
5118 /* Save the name's meaning for later use */
5119 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
5122 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
5126 /* Here, we have the value the name evaluates to in 'value_sv' */
5127 value = (U8 *) SvPV(value_sv, value_len);
5129 /* See if the result is one code point vs 0 or multiple */
5130 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
5134 /* Here, exactly one code point. If that isn't what is wanted,
5136 if (! code_point_p) {
5141 /* Convert from string to numeric code point */
5142 *code_point_p = (SvUTF8(value_sv))
5143 ? valid_utf8_to_uvchr(value, NULL)
5146 /* Have parsed this entire single code point \N{...}. *cp_count
5147 * has already been set to 1, so don't do it again. */
5148 RExC_parse_set(endbrace);
5149 nextchar(pRExC_state);
5151 } /* End of is a single code point */
5153 /* Count the code points, if caller desires. The API says to do this
5154 * even if we will later return FALSE */
5158 *cp_count = (SvUTF8(value_sv))
5159 ? utf8_length(value, value + value_len)
5163 /* Fail if caller doesn't want to handle a multi-code-point sequence.
5164 * But don't back the pointer up if the caller wants to know how many
5165 * code points there are (they need to handle it themselves in this
5174 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
5175 * reg recursively to parse it. That way, it retains its atomicness,
5176 * while not having to worry about any special handling that some code
5177 * points may have. */
5179 substitute_parse = newSVpvs("?:");
5180 sv_catsv(substitute_parse, value_sv);
5181 sv_catpv(substitute_parse, ")");
5183 /* The value should already be native, so no need to convert on EBCDIC
5185 assert(! RExC_recode_x_to_native);
5188 else { /* \N{U+...} */
5189 Size_t count = 0; /* code point count kept internally */
5191 /* We can get to here when the input is \N{U+...} or when toke.c has
5192 * converted a name to the \N{U+...} form. This include changing a
5193 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
5195 RExC_parse_inc_by(2); /* Skip past the 'U+' */
5197 /* Code points are separated by dots. The '}' terminates the whole
5200 do { /* Loop until the ending brace */
5201 I32 flags = PERL_SCAN_SILENT_OVERFLOW
5202 | PERL_SCAN_SILENT_ILLDIGIT
5203 | PERL_SCAN_NOTIFY_ILLDIGIT
5204 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
5205 | PERL_SCAN_DISALLOW_PREFIX;
5206 STRLEN len = e - RExC_parse;
5208 char * start_digit = RExC_parse;
5209 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
5212 RExC_parse_inc_by(1);
5214 vFAIL("Invalid hexadecimal number in \\N{U+...}");
5217 RExC_parse_inc_by(len);
5219 if (cp > MAX_LEGAL_CP) {
5220 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
5223 if (RExC_parse >= e) { /* Got to the closing '}' */
5228 /* Here, is a single code point; fail if doesn't want that */
5229 if (! code_point_p) {
5234 /* A single code point is easy to handle; just return it */
5235 *code_point_p = UNI_TO_NATIVE(cp);
5236 RExC_parse_set(endbrace);
5237 nextchar(pRExC_state);
5241 /* Here, the parse stopped bfore the ending brace. This is legal
5242 * only if that character is a dot separating code points, like a
5243 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
5244 * So the next character must be a dot (and the one after that
5245 * can't be the ending brace, or we'd have something like
5248 if (*RExC_parse != '.' || RExC_parse + 1 >= e) {
5249 /*point to after 1st invalid */
5250 RExC_parse_incf(RExC_orig_utf8);
5251 /*Guard against malformed utf8*/
5252 RExC_parse_set(MIN(e, RExC_parse));
5256 /* Here, looks like its really a multiple character sequence. Fail
5257 * if that's not what the caller wants. But continue with counting
5258 * and error checking if they still want a count */
5259 if (! node_p && ! cp_count) {
5263 /* What is done here is to convert this to a sub-pattern of the
5264 * form \x{char1}\x{char2}... and then call reg recursively to
5265 * parse it (enclosing in "(?: ... )" ). That way, it retains its
5266 * atomicness, while not having to worry about special handling
5267 * that some code points may have. We don't create a subpattern,
5268 * but go through the motions of code point counting and error
5269 * checking, if the caller doesn't want a node returned. */
5271 if (node_p && ! substitute_parse) {
5272 substitute_parse = newSVpvs("?:");
5278 /* Convert to notation the rest of the code understands */
5279 sv_catpvs(substitute_parse, "\\x{");
5280 sv_catpvn(substitute_parse, start_digit,
5281 RExC_parse - start_digit);
5282 sv_catpvs(substitute_parse, "}");
5285 /* Move to after the dot (or ending brace the final time through.)
5287 RExC_parse_inc_by(1);
5290 } while (RExC_parse < e);
5292 if (! node_p) { /* Doesn't want the node */
5299 sv_catpvs(substitute_parse, ")");
5301 /* The values are Unicode, and therefore have to be converted to native
5302 * on a non-Unicode (meaning non-ASCII) platform. */
5303 SET_recode_x_to_native(1);
5306 /* Here, we have the string the name evaluates to, ready to be parsed,
5307 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
5308 * constructs. This can be called from within a substitute parse already.
5309 * The error reporting mechanism doesn't work for 2 levels of this, but the
5310 * code above has validated this new construct, so there should be no
5311 * errors generated by the below. And this isn't an exact copy, so the
5312 * mechanism to seamlessly deal with this won't work, so turn off warnings
5314 save_start = RExC_start;
5315 orig_end = RExC_end;
5317 RExC_start = SvPVX(substitute_parse);
5318 RExC_parse_set(RExC_start);
5319 RExC_end = RExC_parse + SvCUR(substitute_parse);
5320 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
5322 *node_p = reg(pRExC_state, 1, &flags, depth+1);
5324 /* Restore the saved values */
5326 RExC_start = save_start;
5327 RExC_parse_set(endbrace);
5328 RExC_end = orig_end;
5329 SET_recode_x_to_native(0);
5331 SvREFCNT_dec_NN(substitute_parse);
5334 RETURN_FAIL_ON_RESTART(flags, flagp);
5335 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
5338 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5340 nextchar(pRExC_state);
5347 S_compute_EXACTish(RExC_state_t *pRExC_state)
5351 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
5359 op = get_regex_charset(RExC_flags);
5360 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
5361 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
5362 been, so there is no hole */
5368 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
5369 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
5372 S_backref_value(char *p, char *e)
5374 const char* endptr = e;
5376 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
5383 - regatom - the lowest level
5385 Try to identify anything special at the start of the current parse position.
5386 If there is, then handle it as required. This may involve generating a
5387 single regop, such as for an assertion; or it may involve recursing, such as
5388 to handle a () structure.
5390 If the string doesn't start with something special then we gobble up
5391 as much literal text as we can. If we encounter a quantifier, we have to
5392 back off the final literal character, as that quantifier applies to just it
5393 and not to the whole string of literals.
5395 Once we have been able to handle whatever type of thing started the
5396 sequence, we return the offset into the regex engine program being compiled
5397 at which any next regnode should be placed.
5399 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
5400 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
5401 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
5402 Otherwise does not return 0.
5404 Note: we have to be careful with escapes, as they can be both literal
5405 and special, and in the case of \10 and friends, context determines which.
5407 A summary of the code structure is:
5409 switch (first_byte) {
5410 cases for each special:
5411 handle this special;
5415 cases for each unambiguous special:
5416 handle this special;
5418 cases for each ambiguous special/literal:
5420 if (special) handle here
5422 default: // unambiguously literal:
5425 default: // is a literal char
5428 create EXACTish node for literal;
5429 while (more input and node isn't full) {
5430 switch (input_byte) {
5431 cases for each special;
5432 make sure parse pointer is set so that the next call to
5433 regatom will see this special first
5434 goto loopdone; // EXACTish node terminated by prev. char
5436 append char to EXACTISH node;
5438 get next input byte;
5442 return the generated node;
5444 Specifically there are two separate switches for handling
5445 escape sequences, with the one for handling literal escapes requiring
5446 a dummy entry for all of the special escapes that are actually handled
5451 STATIC regnode_offset
5452 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
5454 regnode_offset ret = 0;
5456 char *atom_parse_start;
5460 DECLARE_AND_GET_RE_DEBUG_FLAGS;
5462 *flagp = 0; /* Initialize. */
5464 DEBUG_PARSE("atom");
5466 PERL_ARGS_ASSERT_REGATOM;
5469 atom_parse_start = RExC_parse;
5470 assert(RExC_parse < RExC_end);
5471 switch ((U8)*RExC_parse) {
5473 RExC_seen_zerolen++;
5474 nextchar(pRExC_state);
5475 if (RExC_flags & RXf_PMf_MULTILINE)
5476 ret = reg_node(pRExC_state, MBOL);
5478 ret = reg_node(pRExC_state, SBOL);
5481 nextchar(pRExC_state);
5483 RExC_seen_zerolen++;
5484 if (RExC_flags & RXf_PMf_MULTILINE)
5485 ret = reg_node(pRExC_state, MEOL);
5487 ret = reg_node(pRExC_state, SEOL);
5490 nextchar(pRExC_state);
5491 if (RExC_flags & RXf_PMf_SINGLELINE)
5492 ret = reg_node(pRExC_state, SANY);
5494 ret = reg_node(pRExC_state, REG_ANY);
5495 *flagp |= HASWIDTH|SIMPLE;
5500 char * const cc_parse_start = ++RExC_parse;
5501 ret = regclass(pRExC_state, flagp, depth+1,
5502 FALSE, /* means parse the whole char class */
5503 TRUE, /* allow multi-char folds */
5504 FALSE, /* don't silence non-portable warnings. */
5506 TRUE, /* Allow an optimized regnode result */
5509 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5510 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5513 if (*RExC_parse != ']') {
5514 RExC_parse_set(cc_parse_start);
5515 vFAIL("Unmatched [");
5517 nextchar(pRExC_state);
5521 nextchar(pRExC_state);
5522 ret = reg(pRExC_state, 2, &flags, depth+1);
5524 if (flags & TRYAGAIN) {
5525 if (RExC_parse >= RExC_end) {
5526 /* Make parent create an empty node if needed. */
5532 RETURN_FAIL_ON_RESTART(flags, flagp);
5533 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
5536 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
5540 if (flags & TRYAGAIN) {
5544 vFAIL("Internal urp");
5545 /* Supposed to be caught earlier. */
5550 RExC_parse_inc_by(1);
5551 vFAIL("Quantifier follows nothing");
5556 This switch handles escape sequences that resolve to some kind
5557 of special regop and not to literal text. Escape sequences that
5558 resolve to literal text are handled below in the switch marked
5561 Every entry in this switch *must* have a corresponding entry
5562 in the literal escape switch. However, the opposite is not
5563 required, as the default for this switch is to jump to the
5564 literal text handling code.
5566 RExC_parse_inc_by(1);
5567 switch ((U8)*RExC_parse) {
5568 /* Special Escapes */
5570 RExC_seen_zerolen++;
5571 /* Under wildcards, this is changed to match \n; should be
5572 * invisible to the user, as they have to compile under /m */
5573 if (RExC_pm_flags & PMf_WILDCARD) {
5574 ret = reg_node(pRExC_state, MBOL);
5577 ret = reg_node(pRExC_state, SBOL);
5578 /* SBOL is shared with /^/ so we set the flags so we can tell
5579 * /\A/ from /^/ in split. */
5580 FLAGS(REGNODE_p(ret)) = 1;
5582 goto finish_meta_pat;
5584 if (RExC_pm_flags & PMf_WILDCARD) {
5585 RExC_parse_inc_by(1);
5586 /* diag_listed_as: Use of %s is not allowed in Unicode property
5587 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
5589 vFAIL("Use of '\\G' is not allowed in Unicode property"
5590 " wildcard subpatterns");
5592 ret = reg_node(pRExC_state, GPOS);
5593 RExC_seen |= REG_GPOS_SEEN;
5594 goto finish_meta_pat;
5596 if (!RExC_in_lookaround) {
5597 RExC_seen_zerolen++;
5598 ret = reg_node(pRExC_state, KEEPS);
5599 /* XXX:dmq : disabling in-place substitution seems to
5600 * be necessary here to avoid cases of memory corruption, as
5601 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
5603 RExC_seen |= REG_LOOKBEHIND_SEEN;
5604 goto finish_meta_pat;
5607 ++RExC_parse; /* advance past the 'K' */
5608 vFAIL("\\K not permitted in lookahead/lookbehind");
5611 if (RExC_pm_flags & PMf_WILDCARD) {
5612 /* See comment under \A above */
5613 ret = reg_node(pRExC_state, MEOL);
5616 ret = reg_node(pRExC_state, SEOL);
5618 RExC_seen_zerolen++; /* Do not optimize RE away */
5619 goto finish_meta_pat;
5621 if (RExC_pm_flags & PMf_WILDCARD) {
5622 /* See comment under \A above */
5623 ret = reg_node(pRExC_state, MEOL);
5626 ret = reg_node(pRExC_state, EOS);
5628 RExC_seen_zerolen++; /* Do not optimize RE away */
5629 goto finish_meta_pat;
5631 vFAIL("\\C no longer supported");
5633 ret = reg_node(pRExC_state, CLUMP);
5635 goto finish_meta_pat;
5643 regex_charset charset = get_regex_charset(RExC_flags);
5645 RExC_seen_zerolen++;
5646 RExC_seen |= REG_LOOKBEHIND_SEEN;
5647 op = BOUND + charset;
5649 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
5650 flags = TRADITIONAL_BOUND;
5651 if (op > BOUNDA) { /* /aa is same as /a */
5657 char name = *RExC_parse;
5658 char * endbrace = (char *) memchr(RExC_parse, '}',
5659 RExC_end - RExC_parse);
5660 char * e = endbrace;
5662 RExC_parse_inc_by(2);
5665 vFAIL2("Missing right brace on \\%c{}", name);
5668 while (isBLANK(*RExC_parse)) {
5669 RExC_parse_inc_by(1);
5672 while (RExC_parse < e && isBLANK(*(e - 1))) {
5676 if (e == RExC_parse) {
5677 RExC_parse_set(endbrace + 1); /* After the '}' */
5678 vFAIL2("Empty \\%c{}", name);
5681 length = e - RExC_parse;
5683 switch (*RExC_parse) {
5686 && (memNEs(RExC_parse + 1, length - 1, "cb")))
5688 goto bad_bound_type;
5693 if (length != 2 || *(RExC_parse + 1) != 'b') {
5694 goto bad_bound_type;
5699 if (length != 2 || *(RExC_parse + 1) != 'b') {
5700 goto bad_bound_type;
5705 if (length != 2 || *(RExC_parse + 1) != 'b') {
5706 goto bad_bound_type;
5714 "'%" UTF8f "' is an unknown bound type",
5715 UTF8fARG(UTF, length, e - length));
5716 NOT_REACHED; /*NOTREACHED*/
5718 RExC_parse_set(endbrace);
5719 REQUIRE_UNI_RULES(flagp, 0);
5724 else if (op >= BOUNDA) { /* /aa is same as /a */
5728 /* Don't have to worry about UTF-8, in this message because
5729 * to get here the contents of the \b must be ASCII */
5730 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
5731 "Using /u for '%.*s' instead of /%s",
5733 endbrace - length + 1,
5734 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
5735 ? ASCII_RESTRICT_PAT_MODS
5736 : ASCII_MORE_RESTRICT_PAT_MODS);
5741 RExC_seen_d_op = TRUE;
5743 else if (op == BOUNDL) {
5744 RExC_contains_locale = 1;
5748 op += NBOUND - BOUND;
5751 ret = reg_node(pRExC_state, op);
5752 FLAGS(REGNODE_p(ret)) = flags;
5754 goto finish_meta_pat;
5758 ret = reg_node(pRExC_state, LNBREAK);
5759 *flagp |= HASWIDTH|SIMPLE;
5760 goto finish_meta_pat;
5774 /* These all have the same meaning inside [brackets], and it knows
5775 * how to do the best optimizations for them. So, pretend we found
5776 * these within brackets, and let it do the work */
5779 ret = regclass(pRExC_state, flagp, depth+1,
5780 TRUE, /* means just parse this element */
5781 FALSE, /* don't allow multi-char folds */
5782 FALSE, /* don't silence non-portable warnings. It
5783 would be a bug if these returned
5786 TRUE, /* Allow an optimized regnode result */
5788 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5789 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
5790 * multi-char folds are allowed. */
5792 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
5795 RExC_parse--; /* regclass() leaves this one too far ahead */
5798 /* The escapes above that don't take a parameter can't be
5799 * followed by a '{'. But 'pX', 'p{foo}' and
5800 * correspondingly 'P' can be */
5801 if ( RExC_parse - atom_parse_start == 1
5802 && UCHARAT(RExC_parse + 1) == '{'
5803 && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL)))
5805 RExC_parse_inc_by(2);
5806 vFAIL("Unescaped left brace in regex is illegal here");
5808 nextchar(pRExC_state);
5811 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
5812 * \N{...} evaluates to a sequence of more than one code points).
5813 * The function call below returns a regnode, which is our result.
5814 * The parameters cause it to fail if the \N{} evaluates to a
5815 * single code point; we handle those like any other literal. The
5816 * reason that the multicharacter case is handled here and not as
5817 * part of the EXACtish code is because of quantifiers. In
5818 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
5819 * this way makes that Just Happen. dmq.
5820 * join_exact() will join this up with adjacent EXACTish nodes
5821 * later on, if appropriate. */
5823 if (grok_bslash_N(pRExC_state,
5824 &ret, /* Want a regnode returned */
5825 NULL, /* Fail if evaluates to a single code
5827 NULL, /* Don't need a count of how many code
5836 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
5838 /* Here, evaluates to a single code point. Go get that */
5839 RExC_parse_set(atom_parse_start);
5842 case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */
5843 parse_named_seq: /* Also handle non-numeric \g{...} */
5846 if ( RExC_parse >= RExC_end - 1
5847 || (( ch = RExC_parse[1]) != '<'
5851 RExC_parse_inc_by(1);
5852 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
5853 vFAIL2("Sequence %.2s... not terminated", atom_parse_start);
5855 RExC_parse_inc_by(2);
5857 while (isBLANK(*RExC_parse)) {
5858 RExC_parse_inc_by(1);
5861 ret = handle_named_backref(pRExC_state,
5873 case '1': case '2': case '3': case '4':
5874 case '5': case '6': case '7': case '8': case '9':
5877 char * endbrace = NULL;
5878 char * s = RExC_parse;
5879 char * e = RExC_end;
5886 endbrace = (char *) memchr(s, '}', RExC_end - s);
5889 /* Missing '}'. Position after the number to give
5890 * a better indication to the user of where the
5897 /* If it looks to be a name and not a number, go
5898 * handle it there */
5899 if (! isDIGIT(*s)) {
5900 goto parse_named_seq;
5905 } while isDIGIT(*s);
5908 vFAIL("Unterminated \\g{...} pattern");
5911 s++; /* Past the '{' */
5913 while (isBLANK(*s)) {
5917 /* Ignore trailing blanks */
5919 while (s < e && isBLANK(*(e - 1))) {
5924 /* Here, have isolated the meat of the construct from any
5925 * surrounding braces */
5932 if (endbrace && !isDIGIT(*s)) {
5933 goto parse_named_seq;
5937 num = S_backref_value(RExC_parse, RExC_end);
5939 vFAIL("Reference to invalid group 0");
5940 else if (num == I32_MAX) {
5941 if (isDIGIT(*RExC_parse))
5942 vFAIL("Reference to nonexistent group");
5944 vFAIL("Unterminated \\g... pattern");
5948 num = RExC_npar - num;
5950 vFAIL("Reference to nonexistent or unclosed group");
5953 if (num < RExC_logical_npar) {
5954 num = RExC_logical_to_parno[num];
5957 if (ALL_PARENS_COUNTED) {
5958 if (num < RExC_logical_total_parens)
5959 num = RExC_logical_to_parno[num];
5965 REQUIRE_PARENS_PASS;
5969 num = S_backref_value(RExC_parse, RExC_end);
5970 /* bare \NNN might be backref or octal - if it is larger
5971 * than or equal RExC_npar then it is assumed to be an
5972 * octal escape. Note RExC_npar is +1 from the actual
5973 * number of parens. */
5974 /* Note we do NOT check if num == I32_MAX here, as that is
5975 * handled by the RExC_npar check */
5977 if ( /* any numeric escape < 10 is always a backref */
5979 /* any numeric escape < RExC_npar is a backref */
5980 && num >= RExC_logical_npar
5981 /* cannot be an octal escape if it starts with [89]
5983 && ! inRANGE(*RExC_parse, '8', '9')
5985 /* Probably not meant to be a backref, instead likely
5986 * to be an octal character escape, e.g. \35 or \777.
5987 * The above logic should make it obvious why using
5988 * octal escapes in patterns is problematic. - Yves */
5989 RExC_parse_set(atom_parse_start);
5992 if (num < RExC_logical_npar) {
5993 num = RExC_logical_to_parno[num];
5996 if (ALL_PARENS_COUNTED) {
5997 if (num < RExC_logical_total_parens) {
5998 num = RExC_logical_to_parno[num];
6003 REQUIRE_PARENS_PASS;
6007 /* At this point RExC_parse points at a numeric escape like
6008 * \12 or \88 or the digits in \g{34} or \g34 or something
6009 * similar, which we should NOT treat as an octal escape. It
6010 * may or may not be a valid backref escape. For instance
6011 * \88888888 is unlikely to be a valid backref.
6013 * We've already figured out what value the digits represent.
6014 * Now, move the parse to beyond them. */
6016 RExC_parse_set(endbrace + 1);
6018 else while (isDIGIT(*RExC_parse)) {
6019 RExC_parse_inc_by(1);
6022 vFAIL("Reference to nonexistent group");
6024 if (num >= (I32)RExC_npar) {
6025 /* It might be a forward reference; we can't fail until we
6026 * know, by completing the parse to get all the groups, and
6028 if (ALL_PARENS_COUNTED) {
6029 if (num >= RExC_total_parens) {
6030 vFAIL("Reference to nonexistent group");
6034 REQUIRE_PARENS_PASS;
6038 ret = reg2node(pRExC_state,
6041 : (ASCII_FOLD_RESTRICTED)
6043 : (AT_LEAST_UNI_SEMANTICS)
6048 num, RExC_nestroot);
6049 if (RExC_nestroot && num >= RExC_nestroot)
6050 FLAGS(REGNODE_p(ret)) = VOLATILE_REF;
6051 if (OP(REGNODE_p(ret)) == REFF) {
6052 RExC_seen_d_op = TRUE;
6056 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
6057 FALSE /* Don't force to /x */ );
6061 if (RExC_parse >= RExC_end)
6062 FAIL("Trailing \\");
6065 /* Do not generate "unrecognized" warnings here, we fall
6066 back into the quick-grab loop below */
6067 RExC_parse_set(atom_parse_start);
6069 } /* end of switch on a \foo sequence */
6074 /* '#' comments should have been spaced over before this function was
6076 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
6078 if (RExC_flags & RXf_PMf_EXTENDED) {
6079 RExC_parse_set( reg_skipcomment( pRExC_state, RExC_parse ) );
6080 if (RExC_parse < RExC_end)
6090 /* Here, we have determined that the next thing is probably a
6091 * literal character. RExC_parse points to the first byte of its
6092 * definition. (It still may be an escape sequence that evaluates
6093 * to a single character) */
6098 char *s, *old_s = NULL, *old_old_s = NULL;
6100 U32 max_string_len = 255;
6102 /* We may have to reparse the node, artificially stopping filling
6103 * it early, based on info gleaned in the first parse. This
6104 * variable gives where we stop. Make it above the normal stopping
6105 * place first time through; otherwise it would stop too early */
6106 U32 upper_fill = max_string_len + 1;
6108 /* We start out as an EXACT node, even if under /i, until we find a
6109 * character which is in a fold. The algorithm now segregates into
6110 * separate nodes, characters that fold from those that don't under
6111 * /i. (This hopefully will create nodes that are fixed strings
6112 * even under /i, giving the optimizer something to grab on to.)
6113 * So, if a node has something in it and the next character is in
6114 * the opposite category, that node is closed up, and the function
6115 * returns. Then regatom is called again, and a new node is
6116 * created for the new category. */
6117 U8 node_type = EXACT;
6119 /* Assume the node will be fully used; the excess is given back at
6120 * the end. Under /i, we may need to temporarily add the fold of
6121 * an extra character or two at the end to check for splitting
6122 * multi-char folds, so allocate extra space for that. We can't
6123 * make any other length assumptions, as a byte input sequence
6124 * could shrink down. */
6125 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
6129 ? UTF8_MAXBYTES_CASE
6130 /* Max non-UTF-8 expansion is 2 */ : 2)));
6132 bool next_is_quantifier;
6135 /* We can convert EXACTF nodes to EXACTFU if they contain only
6136 * characters that match identically regardless of the target
6137 * string's UTF8ness. The reason to do this is that EXACTF is not
6138 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
6141 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
6142 * contain only above-Latin1 characters (hence must be in UTF8),
6143 * which don't participate in folds with Latin1-range characters,
6144 * as the latter's folds aren't known until runtime. */
6145 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6147 /* Single-character EXACTish nodes are almost always SIMPLE. This
6148 * allows us to override this as encountered */
6149 U8 maybe_SIMPLE = SIMPLE;
6151 /* Does this node contain something that can't match unless the
6152 * target string is (also) in UTF-8 */
6153 bool requires_utf8_target = FALSE;
6155 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
6156 bool has_ss = FALSE;
6158 /* So is the MICRO SIGN */
6159 bool has_micro_sign = FALSE;
6161 /* Set when we fill up the current node and there is still more
6162 * text to process */
6165 /* Allocate an EXACT node. The node_type may change below to
6166 * another EXACTish node, but since the size of the node doesn't
6167 * change, it works */
6168 ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes);
6169 FILL_NODE(ret, node_type);
6170 RExC_emit += NODE_STEP_REGNODE;
6172 s = STRING(REGNODE_p(ret));
6183 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
6184 maybe_SIMPLE = SIMPLE;
6185 requires_utf8_target = FALSE;
6187 has_micro_sign = FALSE;
6191 /* This breaks under rare circumstances. If folding, we do not
6192 * want to split a node at a character that is a non-final in a
6193 * multi-char fold, as an input string could just happen to want to
6194 * match across the node boundary. The code at the end of the loop
6195 * looks for this, and backs off until it finds not such a
6196 * character, but it is possible (though extremely, extremely
6197 * unlikely) for all characters in the node to be non-final fold
6198 * ones, in which case we just leave the node fully filled, and
6199 * hope that it doesn't match the string in just the wrong place */
6201 assert( ! UTF /* Is at the beginning of a character */
6202 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
6203 || UTF8_IS_START(UCHARAT(RExC_parse)));
6207 /* Here, we have a literal character. Find the maximal string of
6208 * them in the input that we can fit into a single EXACTish node.
6209 * We quit at the first non-literal or when the node gets full, or
6210 * under /i the categorization of folding/non-folding character
6212 while (p < RExC_end && len < upper_fill) {
6214 /* In most cases each iteration adds one byte to the output.
6215 * The exceptions override this */
6216 Size_t added_len = 1;
6222 /* White space has already been ignored */
6223 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
6224 || ! is_PATWS_safe((p), RExC_end, UTF));
6227 const char* message;
6240 /* Literal Escapes Switch
6242 This switch is meant to handle escape sequences that
6243 resolve to a literal character.
6245 Every escape sequence that represents something
6246 else, like an assertion or a char class, is handled
6247 in the switch marked 'Special Escapes' above in this
6248 routine, but also has an entry here as anything that
6249 isn't explicitly mentioned here will be treated as
6250 an unescaped equivalent literal.
6255 /* These are all the special escapes. */
6256 case 'A': /* Start assertion */
6257 case 'b': case 'B': /* Word-boundary assertion*/
6258 case 'C': /* Single char !DANGEROUS! */
6259 case 'd': case 'D': /* digit class */
6260 case 'g': case 'G': /* generic-backref, pos assertion */
6261 case 'h': case 'H': /* HORIZWS */
6262 case 'k': case 'K': /* named backref, keep marker */
6263 case 'p': case 'P': /* Unicode property */
6264 case 'R': /* LNBREAK */
6265 case 's': case 'S': /* space class */
6266 case 'v': case 'V': /* VERTWS */
6267 case 'w': case 'W': /* word class */
6268 case 'X': /* eXtended Unicode "combining
6269 character sequence" */
6270 case 'z': case 'Z': /* End of line/string assertion */
6274 /* Anything after here is an escape that resolves to a
6275 literal. (Except digits, which may or may not)
6281 case 'N': /* Handle a single-code point named character. */
6282 RExC_parse_set( p + 1 );
6283 if (! grok_bslash_N(pRExC_state,
6284 NULL, /* Fail if evaluates to
6285 anything other than a
6286 single code point */
6287 &ender, /* The returned single code
6289 NULL, /* Don't need a count of
6290 how many code points */
6295 if (*flagp & NEED_UTF8)
6296 FAIL("panic: grok_bslash_N set NEED_UTF8");
6297 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
6299 /* Here, it wasn't a single code point. Go close
6300 * up this EXACTish node. The switch() prior to
6301 * this switch handles the other cases */
6307 RExC_parse_set(atom_parse_start);
6309 /* The \N{} means the pattern, if previously /d,
6310 * becomes /u. That means it can't be an EXACTF node,
6312 if (node_type == EXACTF) {
6313 node_type = EXACTFU;
6315 /* If the node already contains something that
6316 * differs between EXACTF and EXACTFU, reparse it
6318 if (! maybe_exactfu) {
6347 if (! grok_bslash_o(&p,
6353 FALSE, /* No illegal cp's */
6356 RExC_parse_set(p); /* going to die anyway; point to
6357 exact spot of failure */
6361 if (message && TO_OUTPUT_WARNINGS(p)) {
6362 warn_non_literal_string(p, packed_warn, message);
6366 if (! grok_bslash_x(&p,
6372 FALSE, /* No illegal cp's */
6375 RExC_parse_set(p); /* going to die anyway; point
6376 to exact spot of failure */
6380 if (message && TO_OUTPUT_WARNINGS(p)) {
6381 warn_non_literal_string(p, packed_warn, message);
6385 if (ender < 0x100) {
6386 if (RExC_recode_x_to_native) {
6387 ender = LATIN1_TO_NATIVE(ender);
6394 if (! grok_bslash_c(*p, &grok_c_char,
6395 &message, &packed_warn))
6397 /* going to die anyway; point to exact spot of
6399 char *new_p= p + ((UTF)
6400 ? UTF8_SAFE_SKIP(p, RExC_end)
6402 RExC_parse_set(new_p);
6406 ender = grok_c_char;
6408 if (message && TO_OUTPUT_WARNINGS(p)) {
6409 warn_non_literal_string(p, packed_warn, message);
6413 case '8': case '9': /* must be a backreference */
6415 /* we have an escape like \8 which cannot be an octal escape
6416 * so we exit the loop, and let the outer loop handle this
6417 * escape which may or may not be a legitimate backref. */
6419 case '1': case '2': case '3':case '4':
6420 case '5': case '6': case '7':
6422 /* When we parse backslash escapes there is ambiguity
6423 * between backreferences and octal escapes. Any escape
6424 * from \1 - \9 is a backreference, any multi-digit
6425 * escape which does not start with 0 and which when
6426 * evaluated as decimal could refer to an already
6427 * parsed capture buffer is a back reference. Anything
6430 * Note this implies that \118 could be interpreted as
6431 * 118 OR as "\11" . "8" depending on whether there
6432 * were 118 capture buffers defined already in the
6435 /* NOTE, RExC_npar is 1 more than the actual number of
6436 * parens we have seen so far, hence the "<" as opposed
6438 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
6439 { /* Not to be treated as an octal constant, go
6447 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
6448 | PERL_SCAN_NOTIFY_ILLDIGIT;
6450 ender = grok_oct(p, &numlen, &flags, NULL);
6452 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
6453 && isDIGIT(*p) /* like \08, \178 */
6454 && ckWARN(WARN_REGEXP))
6456 reg_warn_non_literal_string(
6458 form_alien_digit_msg(8, numlen, p,
6459 RExC_end, UTF, FALSE));
6465 FAIL("Trailing \\");
6468 if (isALPHANUMERIC(*p)) {
6469 /* An alpha followed by '{' is going to fail next
6470 * iteration, so don't output this warning in that
6472 if (! isALPHA(*p) || *(p + 1) != '{') {
6473 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
6474 " passed through", p);
6477 goto normal_default;
6478 } /* End of switch on '\' */
6481 /* Trying to gain new uses for '{' without breaking too
6482 * much existing code is hard. The solution currently
6484 * 1) If there is no ambiguity that a '{' should always
6485 * be taken literally, at the start of a construct, we
6487 * 2) If the literal '{' conflicts with our desired use
6488 * of it as a metacharacter, we die. The deprecation
6489 * cycles for this have come and gone.
6490 * 3) If there is ambiguity, we raise a simple warning.
6491 * This could happen, for example, if the user
6492 * intended it to introduce a quantifier, but slightly
6493 * misspelled the quantifier. Without this warning,
6494 * the quantifier would silently be taken as a literal
6495 * string of characters instead of a meta construct */
6496 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
6498 || ( p > atom_parse_start + 1
6499 && isALPHA_A(*(p - 1))
6500 && *(p - 2) == '\\'))
6502 RExC_parse_set(p + 1);
6503 vFAIL("Unescaped left brace in regex is "
6506 ckWARNreg(p + 1, "Unescaped left brace in regex is"
6509 goto normal_default;
6512 if (p > RExC_parse && RExC_strict) {
6513 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
6516 default: /* A literal character */
6518 if (! UTF8_IS_INVARIANT(*p) && UTF) {
6520 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
6521 &numlen, UTF8_ALLOW_DEFAULT);
6527 } /* End of switch on the literal */
6529 /* Here, have looked at the literal character, and <ender>
6530 * contains its ordinal; <p> points to the character after it.
6534 REQUIRE_UTF8(flagp);
6535 if ( UNICODE_IS_PERL_EXTENDED(ender)
6536 && TO_OUTPUT_WARNINGS(p))
6538 ckWARN2_non_literal_string(p,
6539 packWARN(WARN_PORTABLE),
6540 PL_extended_cp_format,
6545 /* We need to check if the next non-ignored thing is a
6546 * quantifier. Move <p> to after anything that should be
6547 * ignored, which, as a side effect, positions <p> for the next
6549 skip_to_be_ignored_text(pRExC_state, &p,
6550 FALSE /* Don't force to /x */ );
6552 /* If the next thing is a quantifier, it applies to this
6553 * character only, which means that this character has to be in
6554 * its own node and can't just be appended to the string in an
6555 * existing node, so if there are already other characters in
6556 * the node, close the node with just them, and set up to do
6557 * this character again next time through, when it will be the
6558 * only thing in its new node */
6560 next_is_quantifier = LIKELY(p < RExC_end)
6561 && UNLIKELY(isQUANTIFIER(p, RExC_end));
6563 if (next_is_quantifier && LIKELY(len)) {
6568 /* Ready to add 'ender' to the node */
6570 if (! FOLD) { /* The simple case, just append the literal */
6573 /* Don't output if it would overflow */
6574 if (UNLIKELY(len > max_string_len - ((UTF)
6582 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
6583 *(s++) = (char) ender;
6586 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
6587 added_len = (char *) new_s - s;
6591 requires_utf8_target = TRUE;
6595 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
6597 /* Here are folding under /l, and the code point is
6598 * problematic. If this is the first character in the
6599 * node, change the node type to folding. Otherwise, if
6600 * this is the first problematic character, close up the
6601 * existing node, so can start a new node with this one */
6603 node_type = EXACTFL;
6604 RExC_contains_locale = 1;
6606 else if (node_type == EXACT) {
6611 /* This problematic code point means we can't simplify
6613 maybe_exactfu = FALSE;
6615 /* Although these two characters have folds that are
6616 * locale-problematic, they also have folds to above Latin1
6617 * that aren't a problem. Doing these now helps at
6619 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
6620 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
6625 /* Here, we are adding a problematic fold character.
6626 * "Problematic" in this context means that its fold isn't
6627 * known until runtime. (The non-problematic code points
6628 * are the above-Latin1 ones that fold to also all
6629 * above-Latin1. Their folds don't vary no matter what the
6630 * locale is.) But here we have characters whose fold
6631 * depends on the locale. We just add in the unfolded
6632 * character, and wait until runtime to fold it */
6633 goto not_fold_common;
6635 else /* regular fold; see if actually is in a fold */
6636 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
6638 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
6640 /* Here, folding, but the character isn't in a fold.
6642 * Start a new node if previous characters in the node were
6644 if (len && node_type != EXACT) {
6649 /* Here, continuing a node with non-folded characters. Add
6651 goto not_fold_common;
6653 else { /* Here, does participate in some fold */
6655 /* If this is the first character in the node, change its
6656 * type to folding. Otherwise, if this is the first
6657 * folding character in the node, close up the existing
6658 * node, so can start a new node with this one. */
6660 node_type = compute_EXACTish(pRExC_state);
6662 else if (node_type == EXACT) {
6667 if (UTF) { /* Alway use the folded value for UTF-8
6669 if (UVCHR_IS_INVARIANT(ender)) {
6670 if (UNLIKELY(len + 1 > max_string_len)) {
6675 *(s)++ = (U8) toFOLD(ender);
6681 folded = _to_uni_fold_flags(
6683 (U8 *) s, /* We have allocated extra space
6684 in 's' so can't run off the
6688 | (( ASCII_FOLD_RESTRICTED
6689 || node_type == EXACTFL)
6690 ? FOLD_FLAGS_NOMIX_ASCII
6692 if (UNLIKELY(len + added_len > max_string_len)) {
6700 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
6702 /* U+B5 folds to the MU, so its possible for a
6703 * non-UTF-8 target to match it */
6704 requires_utf8_target = TRUE;
6708 else { /* Here is non-UTF8. */
6710 /* The fold will be one or (rarely) two characters.
6711 * Check that there's room for at least a single one
6712 * before setting any flags, etc. Because otherwise an
6713 * overflowing character could cause a flag to be set
6714 * even though it doesn't end up in this node. (For
6715 * the two character fold, we check again, before
6716 * setting any flags) */
6717 if (UNLIKELY(len + 1 > max_string_len)) {
6722 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
6723 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
6724 || UNICODE_DOT_DOT_VERSION > 0)
6726 /* On non-ancient Unicodes, check for the only possible
6727 * multi-char fold */
6728 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
6730 /* This potential multi-char fold means the node
6731 * can't be simple (because it could match more
6732 * than a single char). And in some cases it will
6733 * match 'ss', so set that flag */
6737 /* It can't change to be an EXACTFU (unless already
6738 * is one). We fold it iff under /u rules. */
6739 if (node_type != EXACTFU) {
6740 maybe_exactfu = FALSE;
6743 if (UNLIKELY(len + 2 > max_string_len)) {
6752 goto done_with_this_char;
6755 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
6757 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
6759 /* Also, the sequence 'ss' is special when not
6760 * under /u. If the target string is UTF-8, it
6761 * should match SHARP S; otherwise it won't. So,
6762 * here we have to exclude the possibility of this
6763 * node moving to /u.*/
6765 maybe_exactfu = FALSE;
6768 /* Here, the fold will be a single character */
6770 if (UNLIKELY(ender == MICRO_SIGN)) {
6771 has_micro_sign = TRUE;
6773 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
6775 /* If the character's fold differs between /d and
6776 * /u, this can't change to be an EXACTFU node */
6777 maybe_exactfu = FALSE;
6780 *(s++) = (DEPENDS_SEMANTICS)
6781 ? (char) toFOLD(ender)
6783 /* Under /u, the fold of any character in
6784 * the 0-255 range happens to be its
6785 * lowercase equivalent, except for LATIN
6786 * SMALL LETTER SHARP S, which was handled
6787 * above, and the MICRO SIGN, whose fold
6788 * requires UTF-8 to represent. */
6789 : (char) toLOWER_L1(ender);
6791 } /* End of adding current character to the node */
6793 done_with_this_char:
6797 if (next_is_quantifier) {
6799 /* Here, the next input is a quantifier, and to get here,
6800 * the current character is the only one in the node. */
6804 } /* End of loop through literal characters */
6806 /* Here we have either exhausted the input or run out of room in
6807 * the node. If the former, we are done. (If we encountered a
6808 * character that can't be in the node, transfer is made directly
6809 * to <loopdone>, and so we wouldn't have fallen off the end of the
6811 if (LIKELY(! overflowed)) {
6815 /* Here we have run out of room. We can grow plain EXACT and
6816 * LEXACT nodes. If the pattern is gigantic enough, though,
6817 * eventually we'll have to artificially chunk the pattern into
6818 * multiple nodes. */
6819 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
6820 Size_t overhead = 1 + REGNODE_ARG_LEN(OP(REGNODE_p(ret)));
6821 Size_t overhead_expansion = 0;
6823 Size_t max_nodes_for_string;
6827 /* Here we couldn't fit the final character in the current
6828 * node, so it will have to be reparsed, no matter what else we
6832 /* If would have overflowed a regular EXACT node, switch
6833 * instead to an LEXACT. The code below is structured so that
6834 * the actual growing code is common to changing from an EXACT
6835 * or just increasing the LEXACT size. This means that we have
6836 * to save the string in the EXACT case before growing, and
6837 * then copy it afterwards to its new location */
6838 if (node_type == EXACT) {
6839 overhead_expansion = REGNODE_ARG_LEN(LEXACT) - REGNODE_ARG_LEN(EXACT);
6840 RExC_emit += overhead_expansion;
6841 Copy(s0, temp, len, char);
6844 /* Ready to grow. If it was a plain EXACT, the string was
6845 * saved, and the first few bytes of it overwritten by adding
6846 * an argument field. We assume, as we do elsewhere in this
6847 * file, that one byte of remaining input will translate into
6848 * one byte of output, and if that's too small, we grow again,
6849 * if too large the excess memory is freed at the end */
6851 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
6852 achievable = MIN(max_nodes_for_string,
6853 current_string_nodes + STR_SZ(RExC_end - p));
6854 delta = achievable - current_string_nodes;
6856 /* If there is just no more room, go finish up this chunk of
6862 change_engine_size(pRExC_state, delta + overhead_expansion);
6863 current_string_nodes += delta;
6865 = sizeof(struct regnode) * current_string_nodes;
6866 upper_fill = max_string_len + 1;
6868 /* If the length was small, we know this was originally an
6869 * EXACT node now converted to LEXACT, and the string has to be
6870 * restored. Otherwise the string was untouched. 260 is just
6871 * a number safely above 255 so don't have to worry about
6872 * getting it precise */
6875 FILL_NODE(ret, node_type);
6876 s0 = STRING(REGNODE_p(ret));
6877 Copy(temp, s0, len, char);
6881 goto continue_parse;
6884 bool splittable = FALSE;
6885 bool backed_up = FALSE;
6886 char * e; /* should this be U8? */
6887 char * s_start; /* should this be U8? */
6889 /* Here is /i. Running out of room creates a problem if we are
6890 * folding, and the split happens in the middle of a
6891 * multi-character fold, as a match that should have occurred,
6892 * won't, due to the way nodes are matched, and our artificial
6893 * boundary. So back off until we aren't splitting such a
6894 * fold. If there is no such place to back off to, we end up
6895 * taking the entire node as-is. This can happen if the node
6896 * consists entirely of 'f' or entirely of 's' characters (or
6897 * things that fold to them) as 'ff' and 'ss' are
6898 * multi-character folds.
6900 * The Unicode standard says that multi character folds consist
6901 * of either two or three characters. That means we would be
6902 * splitting one if the final character in the node is at the
6903 * beginning of either type, or is the second of a three
6907 * ender is the code point of the character that won't fit
6909 * s points to just beyond the final byte in the node.
6910 * It's where we would place ender if there were
6911 * room, and where in fact we do place ender's fold
6912 * in the code below, as we've over-allocated space
6913 * for s0 (hence s) to allow for this
6914 * e starts at 's' and advances as we append things.
6915 * old_s is the same as 's'. (If ender had fit, 's' would
6916 * have been advanced to beyond it).
6917 * old_old_s points to the beginning byte of the final
6918 * character in the node
6919 * p points to the beginning byte in the input of the
6920 * character beyond 'ender'.
6921 * oldp points to the beginning byte in the input of
6924 * In the case of /il, we haven't folded anything that could be
6925 * affected by the locale. That means only above-Latin1
6926 * characters that fold to other above-latin1 characters get
6927 * folded at compile time. To check where a good place to
6928 * split nodes is, everything in it will have to be folded.
6929 * The boolean 'maybe_exactfu' keeps track in /il if there are
6930 * any unfolded characters in the node. */
6931 bool need_to_fold_loc = LOC && ! maybe_exactfu;
6933 /* If we do need to fold the node, we need a place to store the
6934 * folded copy, and a way to map back to the unfolded original
6936 char * locfold_buf = NULL;
6937 Size_t * loc_correspondence = NULL;
6939 if (! need_to_fold_loc) { /* The normal case. Just
6940 initialize to the actual node */
6943 s = old_old_s; /* Point to the beginning of the final char
6944 that fits in the node */
6948 /* Here, we have filled a /il node, and there are unfolded
6949 * characters in it. If the runtime locale turns out to be
6950 * UTF-8, there are possible multi-character folds, just
6951 * like when not under /l. The node hence can't terminate
6952 * in the middle of such a fold. To determine this, we
6953 * have to create a folded copy of this node. That means
6954 * reparsing the node, folding everything assuming a UTF-8
6955 * locale. (If at runtime it isn't such a locale, the
6956 * actions here wouldn't have been necessary, but we have
6957 * to assume the worst case.) If we find we need to back
6958 * off the folded string, we do so, and then map that
6959 * position back to the original unfolded node, which then
6960 * gets output, truncated at that spot */
6962 char * redo_p = RExC_parse;
6966 /* Allow enough space assuming a single byte input folds to
6967 * a single byte output, plus assume that the two unparsed
6968 * characters (that we may need) fold to the largest number
6969 * of bytes possible, plus extra for one more worst case
6970 * scenario. In the loop below, if we start eating into
6971 * that final spare space, we enlarge this initial space */
6972 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
6974 Newxz(locfold_buf, size, char);
6975 Newxz(loc_correspondence, size, Size_t);
6977 /* Redo this node's parse, folding into 'locfold_buf' */
6978 redo_p = RExC_parse;
6979 old_redo_e = redo_e = locfold_buf;
6980 while (redo_p <= oldp) {
6982 old_redo_e = redo_e;
6983 loc_correspondence[redo_e - locfold_buf]
6984 = redo_p - RExC_parse;
6989 (void) _to_utf8_fold_flags((U8 *) redo_p,
6994 redo_e += added_len;
6995 redo_p += UTF8SKIP(redo_p);
6999 /* Note that if this code is run on some ancient
7000 * Unicode versions, SHARP S doesn't fold to 'ss',
7001 * but rather than clutter the code with #ifdef's,
7002 * as is done above, we ignore that possibility.
7003 * This is ok because this code doesn't affect what
7004 * gets matched, but merely where the node gets
7006 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
7007 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
7017 /* If we're getting so close to the end that a
7018 * worst-case fold in the next character would cause us
7019 * to overflow, increase, assuming one byte output byte
7020 * per one byte input one, plus room for another worst
7023 && redo_e > locfold_buf + size
7024 - (UTF8_MAXBYTES_CASE + 1))
7026 Size_t new_size = size
7028 + UTF8_MAXBYTES_CASE + 1;
7029 Ptrdiff_t e_offset = redo_e - locfold_buf;
7031 Renew(locfold_buf, new_size, char);
7032 Renew(loc_correspondence, new_size, Size_t);
7035 redo_e = locfold_buf + e_offset;
7039 /* Set so that things are in terms of the folded, temporary
7042 s_start = locfold_buf;
7047 /* Here, we have 's', 's_start' and 'e' set up to point to the
7048 * input that goes into the node, folded.
7050 * If the final character of the node and the fold of ender
7051 * form the first two characters of a three character fold, we
7052 * need to peek ahead at the next (unparsed) character in the
7053 * input to determine if the three actually do form such a
7054 * fold. Just looking at that character is not generally
7055 * sufficient, as it could be, for example, an escape sequence
7056 * that evaluates to something else, and it needs to be folded.
7058 * khw originally thought to just go through the parse loop one
7059 * extra time, but that doesn't work easily as that iteration
7060 * could cause things to think that the parse is over and to
7061 * goto loopdone. The character could be a '$' for example, or
7062 * the character beyond could be a quantifier, and other
7065 * The solution used here for peeking ahead is to look at that
7066 * next character. If it isn't ASCII punctuation, then it will
7067 * be something that would continue on in an EXACTish node if
7068 * there were space. We append the fold of it to s, having
7069 * reserved enough room in s0 for the purpose. If we can't
7070 * reasonably peek ahead, we instead assume the worst case:
7071 * that it is something that would form the completion of a
7074 * If we can't split between s and ender, we work backwards
7075 * character-by-character down to s0. At each current point
7076 * see if we are at the beginning of a multi-char fold. If so,
7077 * that means we would be splitting the fold across nodes, and
7078 * so we back up one and try again.
7080 * If we're not at the beginning, we still could be at the
7081 * final two characters of a (rare) three character fold. We
7082 * check if the sequence starting at the character before the
7083 * current position (and including the current and next
7084 * characters) is a three character fold. If not, the node can
7085 * be split here. If it is, we have to backup two characters
7088 * Otherwise, the node can be split at the current position.
7090 * The same logic is used for UTF-8 patterns and not */
7094 /* Append the fold of ender */
7095 (void) _to_uni_fold_flags(
7099 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7100 ? FOLD_FLAGS_NOMIX_ASCII
7104 /* 's' and the character folded to by ender may be the
7105 * first two of a three-character fold, in which case the
7106 * node should not be split here. That may mean examining
7107 * the so-far unparsed character starting at 'p'. But if
7108 * ender folded to more than one character, we already have
7109 * three characters to look at. Also, we first check if
7110 * the sequence consisting of s and the next character form
7111 * the first two of some three character fold. If not,
7112 * there's no need to peek ahead. */
7113 if ( added_len <= UTF8SKIP(e - added_len)
7114 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
7116 /* Here, the two do form the beginning of a potential
7117 * three character fold. The unexamined character may
7118 * or may not complete it. Peek at it. It might be
7119 * something that ends the node or an escape sequence,
7120 * in which case we don't know without a lot of work
7121 * what it evaluates to, so we have to assume the worst
7122 * case: that it does complete the fold, and so we
7123 * can't split here. All such instances will have
7124 * that character be an ASCII punctuation character,
7125 * like a backslash. So, for that case, backup one and
7126 * drop down to try at that position */
7128 s = (char *) utf8_hop_back((U8 *) s, -1,
7133 /* Here, since it's not punctuation, it must be a
7134 * real character, and we can append its fold to
7135 * 'e' (having deliberately reserved enough space
7136 * for this eventuality) and drop down to check if
7137 * the three actually do form a folded sequence */
7138 (void) _to_utf8_fold_flags(
7139 (U8 *) p, (U8 *) RExC_end,
7142 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
7143 ? FOLD_FLAGS_NOMIX_ASCII
7149 /* Here, we either have three characters available in
7150 * sequence starting at 's', or we have two characters and
7151 * know that the following one can't possibly be part of a
7152 * three character fold. We go through the node backwards
7153 * until we find a place where we can split it without
7154 * breaking apart a multi-character fold. At any given
7155 * point we have to worry about if such a fold begins at
7156 * the current 's', and also if a three-character fold
7157 * begins at s-1, (containing s and s+1). Splitting in
7158 * either case would break apart a fold */
7160 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
7163 /* If is a multi-char fold, can't split here. Backup
7164 * one char and try again */
7165 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
7171 /* If the two characters beginning at 's' are part of a
7172 * three character fold starting at the character
7173 * before s, we can't split either before or after s.
7174 * Backup two chars and try again */
7175 if ( LIKELY(s > s_start)
7176 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
7179 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
7184 /* Here there's no multi-char fold between s and the
7185 * next character following it. We can split */
7189 } while (s > s_start); /* End of loops backing up through the node */
7191 /* Here we either couldn't find a place to split the node,
7192 * or else we broke out of the loop setting 'splittable' to
7193 * true. In the latter case, the place to split is between
7194 * the first and second characters in the sequence starting
7200 else { /* Pattern not UTF-8 */
7201 if ( ender != LATIN_SMALL_LETTER_SHARP_S
7202 || ASCII_FOLD_RESTRICTED)
7204 assert( toLOWER_L1(ender) < 256 );
7205 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7213 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
7220 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
7221 || ASCII_FOLD_RESTRICTED)
7223 assert( toLOWER_L1(ender) < 256 );
7224 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
7234 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
7240 if ( LIKELY(s > s_start)
7241 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
7251 } while (s > s_start);
7258 /* Here, we are done backing up. If we didn't backup at all
7259 * (the likely case), just proceed */
7262 /* If we did find a place to split, reparse the entire node
7263 * stopping where we have calculated. */
7266 /* If we created a temporary folded string under /l, we
7267 * have to map that back to the original */
7268 if (need_to_fold_loc) {
7269 upper_fill = loc_correspondence[s - s_start];
7270 if (upper_fill == 0) {
7271 FAIL2("panic: loc_correspondence[%d] is 0",
7272 (int) (s - s_start));
7274 Safefree(locfold_buf);
7275 Safefree(loc_correspondence);
7278 upper_fill = s - s0;
7283 /* Here the node consists entirely of non-final multi-char
7284 * folds. (Likely it is all 'f's or all 's's.) There's no
7285 * decent place to split it, so give up and just take the
7290 if (need_to_fold_loc) {
7291 Safefree(locfold_buf);
7292 Safefree(loc_correspondence);
7294 } /* End of verifying node ends with an appropriate char */
7296 /* We need to start the next node at the character that didn't fit
7300 loopdone: /* Jumped to when encounters something that shouldn't be
7303 /* Free up any over-allocated space; cast is to silence bogus
7304 * warning in MS VC */
7305 change_engine_size(pRExC_state,
7306 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
7308 /* I (khw) don't know if you can get here with zero length, but the
7309 * old code handled this situation by creating a zero-length EXACT
7310 * node. Might as well be NOTHING instead */
7312 OP(REGNODE_p(ret)) = NOTHING;
7316 /* If the node type is EXACT here, check to see if it
7317 * should be EXACTL, or EXACT_REQ8. */
7318 if (node_type == EXACT) {
7322 else if (requires_utf8_target) {
7323 node_type = EXACT_REQ8;
7326 else if (node_type == LEXACT) {
7327 if (requires_utf8_target) {
7328 node_type = LEXACT_REQ8;
7332 if ( UNLIKELY(has_micro_sign || has_ss)
7333 && (node_type == EXACTFU || ( node_type == EXACTF
7335 { /* These two conditions are problematic in non-UTF-8
7338 node_type = EXACTFUP;
7340 else if (node_type == EXACTFL) {
7342 /* 'maybe_exactfu' is deliberately set above to
7343 * indicate this node type, where all code points in it
7345 if (maybe_exactfu) {
7346 node_type = EXACTFLU8;
7349 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
7351 /* A character that folds to more than one will
7352 * match multiple characters, so can't be SIMPLE.
7353 * We don't have to worry about this with EXACTFLU8
7354 * nodes just above, as they have already been
7355 * folded (since the fold doesn't vary at run
7356 * time). Here, if the final character in the node
7357 * folds to multiple, it can't be simple. (This
7358 * only has an effect if the node has only a single
7359 * character, hence the final one, as elsewhere we
7360 * turn off simple for nodes whose length > 1 */
7364 else if (node_type == EXACTF) { /* Means is /di */
7366 /* This intermediate variable is needed solely because
7367 * the asserts in the macro where used exceed Win32's
7368 * literal string capacity */
7369 char first_char = * STRING(REGNODE_p(ret));
7371 /* If 'maybe_exactfu' is clear, then we need to stay
7372 * /di. If it is set, it means there are no code
7373 * points that match differently depending on UTF8ness
7374 * of the target string, so it can become an EXACTFU
7376 if (! maybe_exactfu) {
7377 RExC_seen_d_op = TRUE;
7379 else if ( isALPHA_FOLD_EQ(first_char, 's')
7380 || isALPHA_FOLD_EQ(ender, 's'))
7382 /* But, if the node begins or ends in an 's' we
7383 * have to defer changing it into an EXACTFU, as
7384 * the node could later get joined with another one
7385 * that ends or begins with 's' creating an 'ss'
7386 * sequence which would then wrongly match the
7387 * sharp s without the target being UTF-8. We
7388 * create a special node that we resolve later when
7389 * we join nodes together */
7391 node_type = EXACTFU_S_EDGE;
7394 node_type = EXACTFU;
7398 if (requires_utf8_target && node_type == EXACTFU) {
7399 node_type = EXACTFU_REQ8;
7403 OP(REGNODE_p(ret)) = node_type;
7404 setSTR_LEN(REGNODE_p(ret), len);
7405 RExC_emit += STR_SZ(len);
7407 /* If the node isn't a single character, it can't be SIMPLE */
7408 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
7412 *flagp |= HASWIDTH | maybe_SIMPLE;
7418 /* len is STRLEN which is unsigned, need to copy to signed */
7421 vFAIL("Internal disaster");
7424 } /* End of label 'defchar:' */
7426 } /* End of giant switch on input character */
7428 /* Position parse to next real character */
7429 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
7430 FALSE /* Don't force to /x */ );
7431 if ( *RExC_parse == '{'
7432 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse, RExC_end, NULL))
7435 RExC_parse_inc_by(1);
7436 vFAIL("Unescaped left brace in regex is illegal here");
7438 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
7447 Perl_populate_anyof_bitmap_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
7449 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
7450 * sets up the bitmap and any flags, removing those code points from the
7451 * inversion list, setting it to NULL should it become completely empty */
7454 PERL_ARGS_ASSERT_POPULATE_ANYOF_BITMAP_FROM_INVLIST;
7456 /* There is no bitmap for this node type */
7457 if (REGNODE_TYPE(OP(node)) != ANYOF) {
7461 ANYOF_BITMAP_ZERO(node);
7464 /* This gets set if we actually need to modify things */
7465 bool change_invlist = FALSE;
7469 /* Start looking through *invlist_ptr */
7470 invlist_iterinit(*invlist_ptr);
7471 while (invlist_iternext(*invlist_ptr, &start, &end)) {
7475 /* Quit if are above what we should change */
7476 if (start >= NUM_ANYOF_CODE_POINTS) {
7480 change_invlist = TRUE;
7482 /* Set all the bits in the range, up to the max that we are doing */
7483 high = (end < NUM_ANYOF_CODE_POINTS - 1)
7485 : NUM_ANYOF_CODE_POINTS - 1;
7486 for (i = start; i <= (int) high; i++) {
7487 ANYOF_BITMAP_SET(node, i);
7490 invlist_iterfinish(*invlist_ptr);
7492 /* Done with loop; remove any code points that are in the bitmap from
7494 if (change_invlist) {
7495 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
7498 /* If have completely emptied it, remove it completely */
7499 if (_invlist_len(*invlist_ptr) == 0) {
7500 SvREFCNT_dec_NN(*invlist_ptr);
7501 *invlist_ptr = NULL;
7506 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7507 Character classes ([:foo:]) can also be negated ([:^foo:]).
7508 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7509 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7510 but trigger failures because they are currently unimplemented. */
7512 #define POSIXCC_DONE(c) ((c) == ':')
7513 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7514 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7515 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
7517 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
7518 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
7519 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
7521 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
7523 /* 'posix_warnings' and 'warn_text' are names of variables in the following
7525 #define ADD_POSIX_WARNING(p, text) STMT_START { \
7526 if (posix_warnings) { \
7527 if (! RExC_warn_text ) RExC_warn_text = \
7528 (AV *) sv_2mortal((SV *) newAV()); \
7529 av_push_simple(RExC_warn_text, Perl_newSVpvf(aTHX_ \
7533 REPORT_LOCATION_ARGS(p))); \
7536 #define CLEAR_POSIX_WARNINGS() \
7538 if (posix_warnings && RExC_warn_text) \
7539 av_clear(RExC_warn_text); \
7542 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
7544 CLEAR_POSIX_WARNINGS(); \
7549 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
7551 const char * const s, /* Where the putative posix class begins.
7552 Normally, this is one past the '['. This
7553 parameter exists so it can be somewhere
7554 besides RExC_parse. */
7555 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
7557 AV ** posix_warnings, /* Where to place any generated warnings, or
7559 const bool check_only /* Don't die if error */
7562 /* This parses what the caller thinks may be one of the three POSIX
7564 * 1) a character class, like [:blank:]
7565 * 2) a collating symbol, like [. .]
7566 * 3) an equivalence class, like [= =]
7567 * In the latter two cases, it croaks if it finds a syntactically legal
7568 * one, as these are not handled by Perl.
7570 * The main purpose is to look for a POSIX character class. It returns:
7571 * a) the class number
7572 * if it is a completely syntactically and semantically legal class.
7573 * 'updated_parse_ptr', if not NULL, is set to point to just after the
7574 * closing ']' of the class
7576 * if it appears that one of the three POSIX constructs was meant, but
7577 * its specification was somehow defective. 'updated_parse_ptr', if
7578 * not NULL, is set to point to the character just after the end
7579 * character of the class. See below for handling of warnings.
7580 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
7581 * if it doesn't appear that a POSIX construct was intended.
7582 * 'updated_parse_ptr' is not changed. No warnings nor errors are
7585 * In b) there may be errors or warnings generated. If 'check_only' is
7586 * TRUE, then any errors are discarded. Warnings are returned to the
7587 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
7588 * instead it is NULL, warnings are suppressed.
7590 * The reason for this function, and its complexity is that a bracketed
7591 * character class can contain just about anything. But it's easy to
7592 * mistype the very specific posix class syntax but yielding a valid
7593 * regular bracketed class, so it silently gets compiled into something
7596 * The solution adopted here maintains backward compatibility except that
7597 * it adds a warning if it looks like a posix class was intended but
7598 * improperly specified. The warning is not raised unless what is input
7599 * very closely resembles one of the 14 legal posix classes. To do this,
7600 * it uses fuzzy parsing. It calculates how many single-character edits it
7601 * would take to transform what was input into a legal posix class. Only
7602 * if that number is quite small does it think that the intention was a
7603 * posix class. Obviously these are heuristics, and there will be cases
7604 * where it errs on one side or another, and they can be tweaked as
7605 * experience informs.
7607 * The syntax for a legal posix class is:
7609 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
7611 * What this routine considers syntactically to be an intended posix class
7612 * is this (the comments indicate some restrictions that the pattern
7615 * qr/(?x: \[? # The left bracket, possibly
7617 * \h* # possibly followed by blanks
7618 * (?: \^ \h* )? # possibly a misplaced caret
7619 * [:;]? # The opening class character,
7620 * # possibly omitted. A typo
7621 * # semi-colon can also be used.
7623 * \^? # possibly a correctly placed
7624 * # caret, but not if there was also
7627 * .{3,15} # The class name. If there are
7628 * # deviations from the legal syntax,
7629 * # its edit distance must be close
7630 * # to a real class name in order
7631 * # for it to be considered to be
7632 * # an intended posix class.
7634 * [[:punct:]]? # The closing class character,
7635 * # possibly omitted. If not a colon
7636 * # nor semi colon, the class name
7637 * # must be even closer to a valid
7640 * \]? # The right bracket, possibly
7644 * In the above, \h must be ASCII-only.
7646 * These are heuristics, and can be tweaked as field experience dictates.
7647 * There will be cases when someone didn't intend to specify a posix class
7648 * that this warns as being so. The goal is to minimize these, while
7649 * maximizing the catching of things intended to be a posix class that
7650 * aren't parsed as such.
7654 const char * const e = RExC_end;
7655 unsigned complement = 0; /* If to complement the class */
7656 bool found_problem = FALSE; /* Assume OK until proven otherwise */
7657 bool has_opening_bracket = FALSE;
7658 bool has_opening_colon = FALSE;
7659 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
7661 const char * possible_end = NULL; /* used for a 2nd parse pass */
7662 const char* name_start; /* ptr to class name first char */
7664 /* If the number of single-character typos the input name is away from a
7665 * legal name is no more than this number, it is considered to have meant
7667 int max_distance = 2;
7669 /* to store the name. The size determines the maximum length before we
7670 * decide that no posix class was intended. Should be at least
7671 * sizeof("alphanumeric") */
7673 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
7675 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
7677 CLEAR_POSIX_WARNINGS();
7680 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
7683 if (*(p - 1) != '[') {
7684 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
7685 found_problem = TRUE;
7688 has_opening_bracket = TRUE;
7691 /* They could be confused and think you can put spaces between the
7694 found_problem = TRUE;
7698 } while (p < e && isBLANK(*p));
7700 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7703 /* For [. .] and [= =]. These are quite different internally from [: :],
7704 * so they are handled separately. */
7705 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
7706 and 1 for at least one char in it
7709 const char open_char = *p;
7710 const char * temp_ptr = p + 1;
7712 /* These two constructs are not handled by perl, and if we find a
7713 * syntactically valid one, we croak. khw, who wrote this code, finds
7714 * this explanation of them very unclear:
7715 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
7716 * And searching the rest of the internet wasn't very helpful either.
7717 * It looks like just about any byte can be in these constructs,
7718 * depending on the locale. But unless the pattern is being compiled
7719 * under /l, which is very rare, Perl runs under the C or POSIX locale.
7720 * In that case, it looks like [= =] isn't allowed at all, and that
7721 * [. .] could be any single code point, but for longer strings the
7722 * constituent characters would have to be the ASCII alphabetics plus
7723 * the minus-hyphen. Any sensible locale definition would limit itself
7724 * to these. And any portable one definitely should. Trying to parse
7725 * the general case is a nightmare (see [perl #127604]). So, this code
7726 * looks only for interiors of these constructs that match:
7728 * Using \w relaxes the apparent rules a little, without adding much
7729 * danger of mistaking something else for one of these constructs.
7731 * [. .] in some implementations described on the internet is usable to
7732 * escape a character that otherwise is special in bracketed character
7733 * classes. For example [.].] means a literal right bracket instead of
7734 * the ending of the class
7736 * [= =] can legitimately contain a [. .] construct, but we don't
7737 * handle this case, as that [. .] construct will later get parsed
7738 * itself and croak then. And [= =] is checked for even when not under
7739 * /l, as Perl has long done so.
7741 * The code below relies on there being a trailing NUL, so it doesn't
7742 * have to keep checking if the parse ptr < e.
7744 if (temp_ptr[1] == open_char) {
7747 else while ( temp_ptr < e
7748 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
7753 if (*temp_ptr == open_char) {
7755 if (*temp_ptr == ']') {
7757 if (! found_problem && ! check_only) {
7758 RExC_parse_set((char *) temp_ptr);
7759 vFAIL3("POSIX syntax [%c %c] is reserved for future "
7760 "extensions", open_char, open_char);
7763 /* Here, the syntax wasn't completely valid, or else the call
7764 * is to check-only */
7765 if (updated_parse_ptr) {
7766 *updated_parse_ptr = (char *) temp_ptr;
7769 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
7773 /* If we find something that started out to look like one of these
7774 * constructs, but isn't, we continue below so that it can be checked
7775 * for being a class name with a typo of '.' or '=' instead of a colon.
7779 /* Here, we think there is a possibility that a [: :] class was meant, and
7780 * we have the first real character. It could be they think the '^' comes
7783 found_problem = TRUE;
7784 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
7789 found_problem = TRUE;
7793 } while (p < e && isBLANK(*p));
7795 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7799 /* But the first character should be a colon, which they could have easily
7800 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
7801 * distinguish from a colon, so treat that as a colon). */
7804 has_opening_colon = TRUE;
7806 else if (*p == ';') {
7807 found_problem = TRUE;
7809 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7810 has_opening_colon = TRUE;
7813 found_problem = TRUE;
7814 ADD_POSIX_WARNING(p, "there must be a starting ':'");
7816 /* Consider an initial punctuation (not one of the recognized ones) to
7817 * be a left terminator */
7818 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
7823 /* They may think that you can put spaces between the components */
7825 found_problem = TRUE;
7829 } while (p < e && isBLANK(*p));
7831 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7836 /* We consider something like [^:^alnum:]] to not have been intended to
7837 * be a posix class, but XXX maybe we should */
7839 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7846 /* Again, they may think that you can put spaces between the components */
7848 found_problem = TRUE;
7852 } while (p < e && isBLANK(*p));
7854 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
7859 /* XXX This ']' may be a typo, and something else was meant. But
7860 * treating it as such creates enough complications, that that
7861 * possibility isn't currently considered here. So we assume that the
7862 * ']' is what is intended, and if we've already found an initial '[',
7863 * this leaves this construct looking like [:] or [:^], which almost
7864 * certainly weren't intended to be posix classes */
7865 if (has_opening_bracket) {
7866 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7869 /* But this function can be called when we parse the colon for
7870 * something like qr/[alpha:]]/, so we back up to look for the
7875 found_problem = TRUE;
7876 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
7878 else if (*p != ':') {
7880 /* XXX We are currently very restrictive here, so this code doesn't
7881 * consider the possibility that, say, /[alpha.]]/ was intended to
7882 * be a posix class. */
7883 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
7886 /* Here we have something like 'foo:]'. There was no initial colon,
7887 * and we back up over 'foo. XXX Unlike the going forward case, we
7888 * don't handle typos of non-word chars in the middle */
7889 has_opening_colon = FALSE;
7892 while (p > RExC_start && isWORDCHAR(*p)) {
7897 /* Here, we have positioned ourselves to where we think the first
7898 * character in the potential class is */
7901 /* Now the interior really starts. There are certain key characters that
7902 * can end the interior, or these could just be typos. To catch both
7903 * cases, we may have to do two passes. In the first pass, we keep on
7904 * going unless we come to a sequence that matches
7905 * qr/ [[:punct:]] [[:blank:]]* \] /xa
7906 * This means it takes a sequence to end the pass, so two typos in a row if
7907 * that wasn't what was intended. If the class is perfectly formed, just
7908 * this one pass is needed. We also stop if there are too many characters
7909 * being accumulated, but this number is deliberately set higher than any
7910 * real class. It is set high enough so that someone who thinks that
7911 * 'alphanumeric' is a correct name would get warned that it wasn't.
7912 * While doing the pass, we keep track of where the key characters were in
7913 * it. If we don't find an end to the class, and one of the key characters
7914 * was found, we redo the pass, but stop when we get to that character.
7915 * Thus the key character was considered a typo in the first pass, but a
7916 * terminator in the second. If two key characters are found, we stop at
7917 * the second one in the first pass. Again this can miss two typos, but
7918 * catches a single one
7920 * In the first pass, 'possible_end' starts as NULL, and then gets set to
7921 * point to the first key character. For the second pass, it starts as -1.
7927 bool has_blank = FALSE;
7928 bool has_upper = FALSE;
7929 bool has_terminating_colon = FALSE;
7930 bool has_terminating_bracket = FALSE;
7931 bool has_semi_colon = FALSE;
7932 unsigned int name_len = 0;
7933 int punct_count = 0;
7937 /* Squeeze out blanks when looking up the class name below */
7940 found_problem = TRUE;
7945 /* The name will end with a punctuation */
7947 const char * peek = p + 1;
7949 /* Treat any non-']' punctuation followed by a ']' (possibly
7950 * with intervening blanks) as trying to terminate the class.
7951 * ']]' is very likely to mean a class was intended (but
7952 * missing the colon), but the warning message that gets
7953 * generated shows the error position better if we exit the
7954 * loop at the bottom (eventually), so skip it here. */
7956 if (peek < e && isBLANK(*peek)) {
7958 found_problem = TRUE;
7961 } while (peek < e && isBLANK(*peek));
7964 if (peek < e && *peek == ']') {
7965 has_terminating_bracket = TRUE;
7967 has_terminating_colon = TRUE;
7969 else if (*p == ';') {
7970 has_semi_colon = TRUE;
7971 has_terminating_colon = TRUE;
7974 found_problem = TRUE;
7981 /* Here we have punctuation we thought didn't end the class.
7982 * Keep track of the position of the key characters that are
7983 * more likely to have been class-enders */
7984 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
7986 /* Allow just one such possible class-ender not actually
7987 * ending the class. */
7994 /* If we have too many punctuation characters, no use in
7996 if (++punct_count > max_distance) {
8000 /* Treat the punctuation as a typo. */
8001 input_text[name_len++] = *p;
8004 else if (isUPPER(*p)) { /* Use lowercase for lookup */
8005 input_text[name_len++] = toLOWER(*p);
8007 found_problem = TRUE;
8009 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
8010 input_text[name_len++] = *p;
8014 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
8018 /* The declaration of 'input_text' is how long we allow a potential
8019 * class name to be, before saying they didn't mean a class name at
8021 if (name_len >= C_ARRAY_LENGTH(input_text)) {
8026 /* We get to here when the possible class name hasn't been properly
8027 * terminated before:
8028 * 1) we ran off the end of the pattern; or
8029 * 2) found two characters, each of which might have been intended to
8030 * be the name's terminator
8031 * 3) found so many punctuation characters in the purported name,
8032 * that the edit distance to a valid one is exceeded
8033 * 4) we decided it was more characters than anyone could have
8034 * intended to be one. */
8036 found_problem = TRUE;
8038 /* In the final two cases, we know that looking up what we've
8039 * accumulated won't lead to a match, even a fuzzy one. */
8040 if ( name_len >= C_ARRAY_LENGTH(input_text)
8041 || punct_count > max_distance)
8043 /* If there was an intermediate key character that could have been
8044 * an intended end, redo the parse, but stop there */
8045 if (possible_end && possible_end != (char *) -1) {
8046 possible_end = (char *) -1; /* Special signal value to say
8047 we've done a first pass */
8052 /* Otherwise, it can't have meant to have been a class */
8053 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8056 /* If we ran off the end, and the final character was a punctuation
8057 * one, back up one, to look at that final one just below. Later, we
8058 * will restore the parse pointer if appropriate */
8059 if (name_len && p == e && isPUNCT(*(p-1))) {
8064 if (p < e && isPUNCT(*p)) {
8066 has_terminating_bracket = TRUE;
8068 /* If this is a 2nd ']', and the first one is just below this
8069 * one, consider that to be the real terminator. This gives a
8070 * uniform and better positioning for the warning message */
8072 && possible_end != (char *) -1
8073 && *possible_end == ']'
8074 && name_len && input_text[name_len - 1] == ']')
8079 /* And this is actually equivalent to having done the 2nd
8080 * pass now, so set it to not try again */
8081 possible_end = (char *) -1;
8086 has_terminating_colon = TRUE;
8088 else if (*p == ';') {
8089 has_semi_colon = TRUE;
8090 has_terminating_colon = TRUE;
8098 /* Here, we have a class name to look up. We can short circuit the
8099 * stuff below for short names that can't possibly be meant to be a
8100 * class name. (We can do this on the first pass, as any second pass
8101 * will yield an even shorter name) */
8103 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8106 /* Find which class it is. Initially switch on the length of the name.
8110 if (memEQs(name_start, 4, "word")) {
8111 /* this is not POSIX, this is the Perl \w */
8112 class_number = ANYOF_WORDCHAR;
8116 /* Names all of length 5: alnum alpha ascii blank cntrl digit
8117 * graph lower print punct space upper
8118 * Offset 4 gives the best switch position. */
8119 switch (name_start[4]) {
8121 if (memBEGINs(name_start, 5, "alph")) /* alpha */
8122 class_number = ANYOF_ALPHA;
8125 if (memBEGINs(name_start, 5, "spac")) /* space */
8126 class_number = ANYOF_SPACE;
8129 if (memBEGINs(name_start, 5, "grap")) /* graph */
8130 class_number = ANYOF_GRAPH;
8133 if (memBEGINs(name_start, 5, "asci")) /* ascii */
8134 class_number = ANYOF_ASCII;
8137 if (memBEGINs(name_start, 5, "blan")) /* blank */
8138 class_number = ANYOF_BLANK;
8141 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
8142 class_number = ANYOF_CNTRL;
8145 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
8146 class_number = ANYOF_ALPHANUMERIC;
8149 if (memBEGINs(name_start, 5, "lowe")) /* lower */
8150 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
8151 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
8152 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
8155 if (memBEGINs(name_start, 5, "digi")) /* digit */
8156 class_number = ANYOF_DIGIT;
8157 else if (memBEGINs(name_start, 5, "prin")) /* print */
8158 class_number = ANYOF_PRINT;
8159 else if (memBEGINs(name_start, 5, "punc")) /* punct */
8160 class_number = ANYOF_PUNCT;
8165 if (memEQs(name_start, 6, "xdigit"))
8166 class_number = ANYOF_XDIGIT;
8170 /* If the name exactly matches a posix class name the class number will
8171 * here be set to it, and the input almost certainly was meant to be a
8172 * posix class, so we can skip further checking. If instead the syntax
8173 * is exactly correct, but the name isn't one of the legal ones, we
8174 * will return that as an error below. But if neither of these apply,
8175 * it could be that no posix class was intended at all, or that one
8176 * was, but there was a typo. We tease these apart by doing fuzzy
8177 * matching on the name */
8178 if (class_number == OOB_NAMEDCLASS && found_problem) {
8179 const UV posix_names[][6] = {
8180 { 'a', 'l', 'n', 'u', 'm' },
8181 { 'a', 'l', 'p', 'h', 'a' },
8182 { 'a', 's', 'c', 'i', 'i' },
8183 { 'b', 'l', 'a', 'n', 'k' },
8184 { 'c', 'n', 't', 'r', 'l' },
8185 { 'd', 'i', 'g', 'i', 't' },
8186 { 'g', 'r', 'a', 'p', 'h' },
8187 { 'l', 'o', 'w', 'e', 'r' },
8188 { 'p', 'r', 'i', 'n', 't' },
8189 { 'p', 'u', 'n', 'c', 't' },
8190 { 's', 'p', 'a', 'c', 'e' },
8191 { 'u', 'p', 'p', 'e', 'r' },
8192 { 'w', 'o', 'r', 'd' },
8193 { 'x', 'd', 'i', 'g', 'i', 't' }
8195 /* The names of the above all have added NULs to make them the same
8196 * size, so we need to also have the real lengths */
8197 const UV posix_name_lengths[] = {
8198 sizeof("alnum") - 1,
8199 sizeof("alpha") - 1,
8200 sizeof("ascii") - 1,
8201 sizeof("blank") - 1,
8202 sizeof("cntrl") - 1,
8203 sizeof("digit") - 1,
8204 sizeof("graph") - 1,
8205 sizeof("lower") - 1,
8206 sizeof("print") - 1,
8207 sizeof("punct") - 1,
8208 sizeof("space") - 1,
8209 sizeof("upper") - 1,
8214 int temp_max = max_distance; /* Use a temporary, so if we
8215 reparse, we haven't changed the
8218 /* Use a smaller max edit distance if we are missing one of the
8220 if ( has_opening_bracket + has_opening_colon < 2
8221 || has_terminating_bracket + has_terminating_colon < 2)
8226 /* See if the input name is close to a legal one */
8227 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
8229 /* Short circuit call if the lengths are too far apart to be
8231 if (abs( (int) (name_len - posix_name_lengths[i]))
8237 if (edit_distance(input_text,
8240 posix_name_lengths[i],
8244 { /* If it is close, it probably was intended to be a class */
8245 goto probably_meant_to_be;
8249 /* Here the input name is not close enough to a valid class name
8250 * for us to consider it to be intended to be a posix class. If
8251 * we haven't already done so, and the parse found a character that
8252 * could have been terminators for the name, but which we absorbed
8253 * as typos during the first pass, repeat the parse, signalling it
8254 * to stop at that character */
8255 if (possible_end && possible_end != (char *) -1) {
8256 possible_end = (char *) -1;
8261 /* Here neither pass found a close-enough class name */
8262 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
8265 probably_meant_to_be:
8267 /* Here we think that a posix specification was intended. Update any
8269 if (updated_parse_ptr) {
8270 *updated_parse_ptr = (char *) p;
8273 /* If a posix class name was intended but incorrectly specified, we
8274 * output or return the warnings */
8275 if (found_problem) {
8277 /* We set flags for these issues in the parse loop above instead of
8278 * adding them to the list of warnings, because we can parse it
8279 * twice, and we only want one warning instance */
8281 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
8284 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
8286 if (has_semi_colon) {
8287 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
8289 else if (! has_terminating_colon) {
8290 ADD_POSIX_WARNING(p, "there is no terminating ':'");
8292 if (! has_terminating_bracket) {
8293 ADD_POSIX_WARNING(p, "there is no terminating ']'");
8298 && av_count(RExC_warn_text) > 0)
8300 *posix_warnings = RExC_warn_text;
8303 else if (class_number != OOB_NAMEDCLASS) {
8304 /* If it is a known class, return the class. The class number
8305 * #defines are structured so each complement is +1 to the normal
8307 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
8309 else if (! check_only) {
8311 /* Here, it is an unrecognized class. This is an error (unless the
8312 * call is to check only, which we've already handled above) */
8313 const char * const complement_string = (complement)
8316 RExC_parse_set((char *) p);
8317 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
8319 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
8323 return OOB_NAMEDCLASS;
8325 #undef ADD_POSIX_WARNING
8328 S_regex_set_precedence(const U8 my_operator) {
8330 /* Returns the precedence in the (?[...]) construct of the input operator,
8331 * specified by its character representation. The precedence follows
8332 * general Perl rules, but it extends this so that ')' and ']' have (low)
8333 * precedence even though they aren't really operators */
8335 switch (my_operator) {
8351 NOT_REACHED; /* NOTREACHED */
8352 return 0; /* Silence compiler warning */
8355 STATIC regnode_offset
8356 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
8357 I32 *flagp, U32 depth)
8359 /* Handle the (?[...]) construct to do set operations */
8361 U8 curchar; /* Current character being parsed */
8362 UV start, end; /* End points of code point ranges */
8363 SV* final = NULL; /* The end result inversion list */
8364 SV* result_string; /* 'final' stringified */
8365 AV* stack; /* stack of operators and operands not yet
8367 AV* fence_stack = NULL; /* A stack containing the positions in
8368 'stack' of where the undealt-with left
8369 parens would be if they were actually
8371 /* The 'volatile' is a workaround for an optimiser bug
8372 * in Solaris Studio 12.3. See RT #127455 */
8373 volatile IV fence = 0; /* Position of where most recent undealt-
8374 with left paren in stack is; -1 if none.
8376 STRLEN len; /* Temporary */
8377 regnode_offset node; /* Temporary, and final regnode returned by
8379 const bool save_fold = FOLD; /* Temporary */
8380 char *save_end, *save_parse; /* Temporaries */
8381 const bool in_locale = LOC; /* we turn off /l during processing */
8383 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8385 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
8387 DEBUG_PARSE("xcls");
8390 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
8393 /* The use of this operator implies /u. This is required so that the
8394 * compile time values are valid in all runtime cases */
8395 REQUIRE_UNI_RULES(flagp, 0);
8397 /* Everything in this construct is a metacharacter. Operands begin with
8398 * either a '\' (for an escape sequence), or a '[' for a bracketed
8399 * character class. Any other character should be an operator, or
8400 * parenthesis for grouping. Both types of operands are handled by calling
8401 * regclass() to parse them. It is called with a parameter to indicate to
8402 * return the computed inversion list. The parsing here is implemented via
8403 * a stack. Each entry on the stack is a single character representing one
8404 * of the operators; or else a pointer to an operand inversion list. */
8406 #define IS_OPERATOR(a) SvIOK(a)
8407 #define IS_OPERAND(a) (! IS_OPERATOR(a))
8409 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
8410 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
8411 * with pronouncing it called it Reverse Polish instead, but now that YOU
8412 * know how to pronounce it you can use the correct term, thus giving due
8413 * credit to the person who invented it, and impressing your geek friends.
8414 * Wikipedia says that the pronunciation of "Ł" has been changing so that
8415 * it is now more like an English initial W (as in wonk) than an L.)
8417 * This means that, for example, 'a | b & c' is stored on the stack as
8425 * where the numbers in brackets give the stack [array] element number.
8426 * In this implementation, parentheses are not stored on the stack.
8427 * Instead a '(' creates a "fence" so that the part of the stack below the
8428 * fence is invisible except to the corresponding ')' (this allows us to
8429 * replace testing for parens, by using instead subtraction of the fence
8430 * position). As new operands are processed they are pushed onto the stack
8431 * (except as noted in the next paragraph). New operators of higher
8432 * precedence than the current final one are inserted on the stack before
8433 * the lhs operand (so that when the rhs is pushed next, everything will be
8434 * in the correct positions shown above. When an operator of equal or
8435 * lower precedence is encountered in parsing, all the stacked operations
8436 * of equal or higher precedence are evaluated, leaving the result as the
8437 * top entry on the stack. This makes higher precedence operations
8438 * evaluate before lower precedence ones, and causes operations of equal
8439 * precedence to left associate.
8441 * The only unary operator '!' is immediately pushed onto the stack when
8442 * encountered. When an operand is encountered, if the top of the stack is
8443 * a '!", the complement is immediately performed, and the '!' popped. The
8444 * resulting value is treated as a new operand, and the logic in the
8445 * previous paragraph is executed. Thus in the expression
8447 * the stack looks like
8453 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
8460 * A ')' is treated as an operator with lower precedence than all the
8461 * aforementioned ones, which causes all operations on the stack above the
8462 * corresponding '(' to be evaluated down to a single resultant operand.
8463 * Then the fence for the '(' is removed, and the operand goes through the
8464 * algorithm above, without the fence.
8466 * A separate stack is kept of the fence positions, so that the position of
8467 * the latest so-far unbalanced '(' is at the top of it.
8469 * The ']' ending the construct is treated as the lowest operator of all,
8470 * so that everything gets evaluated down to a single operand, which is the
8473 stack = (AV*)newSV_type_mortal(SVt_PVAV);
8474 fence_stack = (AV*)newSV_type_mortal(SVt_PVAV);
8476 while (RExC_parse < RExC_end) {
8477 I32 top_index; /* Index of top-most element in 'stack' */
8478 SV** top_ptr; /* Pointer to top 'stack' element */
8479 SV* current = NULL; /* To contain the current inversion list
8481 SV* only_to_avoid_leaks;
8483 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
8484 TRUE /* Force /x */ );
8485 if (RExC_parse >= RExC_end) { /* Fail */
8489 curchar = UCHARAT(RExC_parse);
8493 #ifdef ENABLE_REGEX_SETS_DEBUGGING
8494 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
8495 DEBUG_U(dump_regex_sets_structures(pRExC_state,
8496 stack, fence, fence_stack));
8499 top_index = av_tindex_skip_len_mg(stack);
8502 SV** stacked_ptr; /* Ptr to something already on 'stack' */
8503 char stacked_operator; /* The topmost operator on the 'stack'. */
8504 SV* lhs; /* Operand to the left of the operator */
8505 SV* rhs; /* Operand to the right of the operator */
8506 SV* fence_ptr; /* Pointer to top element of the fence
8510 if ( RExC_parse < RExC_end - 2
8511 && UCHARAT(RExC_parse + 1) == '?'
8512 && strchr("^" STD_PAT_MODS, *(RExC_parse + 2)))
8514 const regnode_offset orig_emit = RExC_emit;
8515 SV * resultant_invlist;
8517 /* Here it could be an embedded '(?flags:(?[...])'.
8518 * This happens when we have some thing like
8520 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
8522 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
8524 * Here we would be handling the interpolated
8525 * '$thai_or_lao'. We handle this by a recursive call to
8526 * reg which returns the inversion list the
8527 * interpolated expression evaluates to. Actually, the
8528 * return is a special regnode containing a pointer to that
8529 * inversion list. If the return isn't that regnode alone,
8530 * we know that this wasn't such an interpolation, which is
8531 * an error: we need to get a single inversion list back
8532 * from the recursion */
8534 RExC_parse_inc_by(1);
8537 node = reg(pRExC_state, 2, flagp, depth+1);
8538 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8540 if ( OP(REGNODE_p(node)) != REGEX_SET
8541 /* If more than a single node returned, the nested
8542 * parens evaluated to more than just a (?[...]),
8543 * which isn't legal */
8544 || RExC_emit != orig_emit
8546 + REGNODE_ARG_LEN(REGEX_SET))
8548 vFAIL("Expecting interpolated extended charclass");
8550 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
8551 current = invlist_clone(resultant_invlist, NULL);
8552 SvREFCNT_dec(resultant_invlist);
8555 RExC_emit = orig_emit;
8556 goto handle_operand;
8559 /* A regular '('. Look behind for illegal syntax */
8560 if (top_index - fence >= 0) {
8561 /* If the top entry on the stack is an operator, it had
8562 * better be a '!', otherwise the entry below the top
8563 * operand should be an operator */
8564 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
8565 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
8566 || ( IS_OPERAND(*top_ptr)
8567 && ( top_index - fence < 1
8568 || ! (stacked_ptr = av_fetch(stack,
8571 || ! IS_OPERATOR(*stacked_ptr))))
8573 RExC_parse_inc_by(1);
8574 vFAIL("Unexpected '(' with no preceding operator");
8578 /* Stack the position of this undealt-with left paren */
8579 av_push_simple(fence_stack, newSViv(fence));
8580 fence = top_index + 1;
8584 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8585 * multi-char folds are allowed. */
8586 if (!regclass(pRExC_state, flagp, depth+1,
8587 TRUE, /* means parse just the next thing */
8588 FALSE, /* don't allow multi-char folds */
8589 FALSE, /* don't silence non-portable warnings. */
8591 FALSE, /* Require return to be an ANYOF */
8594 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8595 goto regclass_failed;
8600 /* regclass() will return with parsing just the \ sequence,
8601 * leaving the parse pointer at the next thing to parse */
8603 goto handle_operand;
8605 case '[': /* Is a bracketed character class */
8607 /* See if this is a [:posix:] class. */
8608 bool is_posix_class = (OOB_NAMEDCLASS
8609 < handle_possible_posix(pRExC_state,
8613 TRUE /* checking only */));
8614 /* If it is a posix class, leave the parse pointer at the '['
8615 * to fool regclass() into thinking it is part of a
8617 if (! is_posix_class) {
8618 RExC_parse_inc_by(1);
8621 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
8622 * multi-char folds are allowed. */
8623 if (!regclass(pRExC_state, flagp, depth+1,
8624 is_posix_class, /* parse the whole char
8627 FALSE, /* don't allow multi-char folds */
8628 TRUE, /* silence non-portable warnings. */
8630 FALSE, /* Require return to be an ANYOF */
8633 RETURN_FAIL_ON_RESTART(*flagp, flagp);
8634 goto regclass_failed;
8639 /* function call leaves parse pointing to the ']', except if we
8641 if (is_posix_class) {
8645 goto handle_operand;
8649 if (top_index >= 1) {
8650 goto join_operators;
8653 /* Only a single operand on the stack: are done */
8657 if (av_tindex_skip_len_mg(fence_stack) < 0) {
8658 if (UCHARAT(RExC_parse - 1) == ']') {
8661 RExC_parse_inc_by(1);
8662 vFAIL("Unexpected ')'");
8665 /* If nothing after the fence, is missing an operand */
8666 if (top_index - fence < 0) {
8667 RExC_parse_inc_by(1);
8670 /* If at least two things on the stack, treat this as an
8672 if (top_index - fence >= 1) {
8673 goto join_operators;
8676 /* Here only a single thing on the fenced stack, and there is a
8677 * fence. Get rid of it */
8678 fence_ptr = av_pop(fence_stack);
8680 fence = SvIV(fence_ptr);
8681 SvREFCNT_dec_NN(fence_ptr);
8688 /* Having gotten rid of the fence, we pop the operand at the
8689 * stack top and process it as a newly encountered operand */
8690 current = av_pop(stack);
8691 if (IS_OPERAND(current)) {
8692 goto handle_operand;
8695 RExC_parse_inc_by(1);
8704 /* These binary operators should have a left operand already
8706 if ( top_index - fence < 0
8707 || top_index - fence == 1
8708 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
8709 || ! IS_OPERAND(*top_ptr))
8711 goto unexpected_binary;
8714 /* If only the one operand is on the part of the stack visible
8715 * to us, we just place this operator in the proper position */
8716 if (top_index - fence < 2) {
8718 /* Place the operator before the operand */
8720 SV* lhs = av_pop(stack);
8721 av_push_simple(stack, newSVuv(curchar));
8722 av_push_simple(stack, lhs);
8726 /* But if there is something else on the stack, we need to
8727 * process it before this new operator if and only if the
8728 * stacked operation has equal or higher precedence than the
8733 /* The operator on the stack is supposed to be below both its
8735 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
8736 || IS_OPERAND(*stacked_ptr))
8738 /* But if not, it's legal and indicates we are completely
8739 * done if and only if we're currently processing a ']',
8740 * which should be the final thing in the expression */
8741 if (curchar == ']') {
8746 RExC_parse_inc_by(1);
8747 vFAIL2("Unexpected binary operator '%c' with no "
8748 "preceding operand", curchar);
8750 stacked_operator = (char) SvUV(*stacked_ptr);
8752 if (regex_set_precedence(curchar)
8753 > regex_set_precedence(stacked_operator))
8755 /* Here, the new operator has higher precedence than the
8756 * stacked one. This means we need to add the new one to
8757 * the stack to await its rhs operand (and maybe more
8758 * stuff). We put it before the lhs operand, leaving
8759 * untouched the stacked operator and everything below it
8761 lhs = av_pop(stack);
8762 assert(IS_OPERAND(lhs));
8763 av_push_simple(stack, newSVuv(curchar));
8764 av_push_simple(stack, lhs);
8768 /* Here, the new operator has equal or lower precedence than
8769 * what's already there. This means the operation already
8770 * there should be performed now, before the new one. */
8772 rhs = av_pop(stack);
8773 if (! IS_OPERAND(rhs)) {
8775 /* This can happen when a ! is not followed by an operand,
8776 * like in /(?[\t &!])/ */
8780 lhs = av_pop(stack);
8782 if (! IS_OPERAND(lhs)) {
8784 /* This can happen when there is an empty (), like in
8789 switch (stacked_operator) {
8791 _invlist_intersection(lhs, rhs, &rhs);
8796 _invlist_union(lhs, rhs, &rhs);
8800 _invlist_subtract(lhs, rhs, &rhs);
8803 case '^': /* The union minus the intersection */
8808 _invlist_union(lhs, rhs, &u);
8809 _invlist_intersection(lhs, rhs, &i);
8810 _invlist_subtract(u, i, &rhs);
8818 /* Here, the higher precedence operation has been done, and the
8819 * result is in 'rhs'. We overwrite the stacked operator with
8820 * the result. Then we redo this code to either push the new
8821 * operator onto the stack or perform any higher precedence
8822 * stacked operation */
8823 only_to_avoid_leaks = av_pop(stack);
8824 SvREFCNT_dec(only_to_avoid_leaks);
8825 av_push_simple(stack, rhs);
8828 case '!': /* Highest priority, right associative */
8830 /* If what's already at the top of the stack is another '!",
8831 * they just cancel each other out */
8832 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
8833 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
8835 only_to_avoid_leaks = av_pop(stack);
8836 SvREFCNT_dec(only_to_avoid_leaks);
8838 else { /* Otherwise, since it's right associative, just push
8840 av_push_simple(stack, newSVuv(curchar));
8846 if (RExC_parse >= RExC_end) {
8849 vFAIL("Unexpected character");
8853 /* Here 'current' is the operand. If something is already on the
8854 * stack, we have to check if it is a !. But first, the code above
8855 * may have altered the stack in the time since we earlier set
8858 top_index = av_tindex_skip_len_mg(stack);
8859 if (top_index - fence >= 0) {
8860 /* If the top entry on the stack is an operator, it had better
8861 * be a '!', otherwise the entry below the top operand should
8863 top_ptr = av_fetch(stack, top_index, FALSE);
8865 if (IS_OPERATOR(*top_ptr)) {
8867 /* The only permissible operator at the top of the stack is
8868 * '!', which is applied immediately to this operand. */
8869 curchar = (char) SvUV(*top_ptr);
8870 if (curchar != '!') {
8871 SvREFCNT_dec(current);
8872 vFAIL2("Unexpected binary operator '%c' with no "
8873 "preceding operand", curchar);
8876 _invlist_invert(current);
8878 only_to_avoid_leaks = av_pop(stack);
8879 SvREFCNT_dec(only_to_avoid_leaks);
8881 /* And we redo with the inverted operand. This allows
8882 * handling multiple ! in a row */
8883 goto handle_operand;
8885 /* Single operand is ok only for the non-binary ')'
8887 else if ((top_index - fence == 0 && curchar != ')')
8888 || (top_index - fence > 0
8889 && (! (stacked_ptr = av_fetch(stack,
8892 || IS_OPERAND(*stacked_ptr))))
8894 SvREFCNT_dec(current);
8895 vFAIL("Operand with no preceding operator");
8899 /* Here there was nothing on the stack or the top element was
8900 * another operand. Just add this new one */
8901 av_push_simple(stack, current);
8903 } /* End of switch on next parse token */
8906 } /* End of loop parsing through the construct */
8908 vFAIL("Syntax error in (?[...])");
8912 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
8913 if (RExC_parse < RExC_end) {
8914 RExC_parse_inc_by(1);
8917 vFAIL("Unexpected ']' with no following ')' in (?[...");
8920 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
8921 vFAIL("Unmatched (");
8924 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
8925 || ((final = av_pop(stack)) == NULL)
8926 || ! IS_OPERAND(final)
8927 || ! is_invlist(final)
8928 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
8931 SvREFCNT_dec(final);
8932 vFAIL("Incomplete expression within '(?[ ])'");
8935 /* Here, 'final' is the resultant inversion list from evaluating the
8936 * expression. Return it if so requested */
8937 if (return_invlist) {
8938 *return_invlist = final;
8942 if (RExC_sets_depth) { /* If within a recursive call, return in a special
8944 RExC_parse_inc_by(1);
8945 node = regpnode(pRExC_state, REGEX_SET, final);
8949 /* Otherwise generate a resultant node, based on 'final'. regclass()
8950 * is expecting a string of ranges and individual code points */
8951 invlist_iterinit(final);
8952 result_string = newSVpvs("");
8953 while (invlist_iternext(final, &start, &end)) {
8955 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
8958 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
8959 UVXf "}", start, end);
8963 /* About to generate an ANYOF (or similar) node from the inversion list
8964 * we have calculated */
8965 save_parse = RExC_parse;
8966 RExC_parse_set(SvPV(result_string, len));
8967 save_end = RExC_end;
8968 RExC_end = RExC_parse + len;
8969 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
8971 /* We turn off folding around the call, as the class we have
8972 * constructed already has all folding taken into consideration, and we
8973 * don't want regclass() to add to that */
8974 RExC_flags &= ~RXf_PMf_FOLD;
8975 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
8976 * folds are allowed. */
8977 node = regclass(pRExC_state, flagp, depth+1,
8978 FALSE, /* means parse the whole char class */
8979 FALSE, /* don't allow multi-char folds */
8980 TRUE, /* silence non-portable warnings. The above may
8981 very well have generated non-portable code
8982 points, but they're valid on this machine */
8983 FALSE, /* similarly, no need for strict */
8985 /* We can optimize into something besides an ANYOF,
8986 * except under /l, which needs to be ANYOF because of
8987 * runtime checks for locale sanity, etc */
8993 RExC_parse_set(save_parse + 1);
8994 RExC_end = save_end;
8995 SvREFCNT_dec_NN(final);
8996 SvREFCNT_dec_NN(result_string);
8999 RExC_flags |= RXf_PMf_FOLD;
9003 RETURN_FAIL_ON_RESTART(*flagp, flagp);
9004 goto regclass_failed;
9007 /* Fix up the node type if we are in locale. (We have pretended we are
9008 * under /u for the purposes of regclass(), as this construct will only
9009 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
9010 * (so as to cause any warnings about bad locales to be output in
9011 * regexec.c), and add the flag that indicates to check if not in a
9012 * UTF-8 locale. The reason we above forbid optimization into
9013 * something other than an ANYOF node is simply to minimize the number
9014 * of code changes in regexec.c. Otherwise we would have to create new
9015 * EXACTish node types and deal with them. This decision could be
9016 * revisited should this construct become popular.
9018 * (One might think we could look at the resulting ANYOF node and
9019 * suppress the flag if everything is above 255, as those would be
9020 * UTF-8 only, but this isn't true, as the components that led to that
9021 * result could have been locale-affected, and just happen to cancel
9022 * each other out under UTF-8 locales.) */
9024 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
9026 assert(OP(REGNODE_p(node)) == ANYOF);
9028 OP(REGNODE_p(node)) = ANYOFL;
9029 ANYOF_FLAGS(REGNODE_p(node)) |= ANYOFL_UTF8_LOCALE_REQD;
9033 nextchar(pRExC_state);
9037 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
9041 #ifdef ENABLE_REGEX_SETS_DEBUGGING
9044 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
9045 AV * stack, const IV fence, AV * fence_stack)
9046 { /* Dumps the stacks in handle_regex_sets() */
9048 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
9049 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
9052 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
9054 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
9056 if (stack_top < 0) {
9057 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
9060 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
9061 for (i = stack_top; i >= 0; i--) {
9062 SV ** element_ptr = av_fetch(stack, i, FALSE);
9063 if (! element_ptr) {
9066 if (IS_OPERATOR(*element_ptr)) {
9067 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
9068 (int) i, (int) SvIV(*element_ptr));
9071 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
9072 sv_dump(*element_ptr);
9077 if (fence_stack_top < 0) {
9078 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
9081 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
9082 for (i = fence_stack_top; i >= 0; i--) {
9083 SV ** element_ptr = av_fetch_simple(fence_stack, i, FALSE);
9084 if (! element_ptr) {
9087 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
9088 (int) i, (int) SvIV(*element_ptr));
9099 Perl_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
9101 /* This adds the Latin1/above-Latin1 folding rules.
9103 * This should be called only for a Latin1-range code points, cp, which is
9104 * known to be involved in a simple fold with other code points above
9105 * Latin1. It would give false results if /aa has been specified.
9106 * Multi-char folds are outside the scope of this, and must be handled
9109 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
9111 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
9113 /* The rules that are valid for all Unicode versions are hard-coded in */
9118 add_cp_to_invlist(*invlist, KELVIN_SIGN);
9122 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
9125 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
9126 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
9128 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
9129 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
9130 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
9132 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
9133 *invlist = add_cp_to_invlist(*invlist,
9134 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
9137 default: /* Other code points are checked against the data for the
9138 current Unicode version */
9142 const U32 * remaining_folds;
9146 folded_cp = toFOLD(cp);
9149 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
9151 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
9154 if (folded_cp > 255) {
9155 *invlist = add_cp_to_invlist(*invlist, folded_cp);
9158 folds_count = _inverse_folds(folded_cp, &first_fold,
9160 if (folds_count == 0) {
9162 /* Use deprecated warning to increase the chances of this being
9164 ckWARN2reg_d(RExC_parse,
9165 "Perl folding rules are not up-to-date for 0x%02X;"
9166 " please use the perlbug utility to report;", cp);
9171 if (first_fold > 255) {
9172 *invlist = add_cp_to_invlist(*invlist, first_fold);
9174 for (i = 0; i < folds_count - 1; i++) {
9175 if (remaining_folds[i] > 255) {
9176 *invlist = add_cp_to_invlist(*invlist,
9177 remaining_folds[i]);
9187 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
9189 /* Output the elements of the array given by '*posix_warnings' as REGEXP
9193 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
9195 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
9197 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
9198 CLEAR_POSIX_WARNINGS();
9202 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
9203 if (first_is_fatal) { /* Avoid leaking this */
9204 av_undef(posix_warnings); /* This isn't necessary if the
9205 array is mortal, but is a
9207 (void) sv_2mortal(msg);
9210 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
9211 SvREFCNT_dec_NN(msg);
9214 UPDATE_WARNINGS_LOC(RExC_parse);
9217 PERL_STATIC_INLINE Size_t
9218 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
9220 const U8 * const start = s1;
9221 const U8 * const send = start + max;
9223 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
9225 while (s1 < send && *s1 == *s2) {
9233 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
9235 /* This adds the string scalar <multi_string> to the array
9236 * <multi_char_matches>. <multi_string> is known to have exactly
9237 * <cp_count> code points in it. This is used when constructing a
9238 * bracketed character class and we find something that needs to match more
9239 * than a single character.
9241 * <multi_char_matches> is actually an array of arrays. Each top-level
9242 * element is an array that contains all the strings known so far that are
9243 * the same length. And that length (in number of code points) is the same
9244 * as the index of the top-level array. Hence, the [2] element is an
9245 * array, each element thereof is a string containing TWO code points;
9246 * while element [3] is for strings of THREE characters, and so on. Since
9247 * this is for multi-char strings there can never be a [0] nor [1] element.
9249 * When we rewrite the character class below, we will do so such that the
9250 * longest strings are written first, so that it prefers the longest
9251 * matching strings first. This is done even if it turns out that any
9252 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
9253 * Christiansen has agreed that this is ok. This makes the test for the
9254 * ligature 'ffi' come before the test for 'ff', for example */
9257 AV** this_array_ptr;
9259 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
9261 if (! multi_char_matches) {
9262 multi_char_matches = newAV();
9265 if (av_exists(multi_char_matches, cp_count)) {
9266 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches, cp_count, FALSE);
9267 this_array = *this_array_ptr;
9270 this_array = newAV();
9271 av_store_simple(multi_char_matches, cp_count,
9274 av_push_simple(this_array, multi_string);
9276 return multi_char_matches;
9279 /* The names of properties whose definitions are not known at compile time are
9280 * stored in this SV, after a constant heading. So if the length has been
9281 * changed since initialization, then there is a run-time definition. */
9282 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
9283 (SvCUR(listsv) != initial_listsv_len)
9285 /* There is a restricted set of white space characters that are legal when
9286 * ignoring white space in a bracketed character class. This generates the
9287 * code to skip them.
9289 * There is a line below that uses the same white space criteria but is outside
9290 * this macro. Both here and there must use the same definition */
9291 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
9294 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
9301 STATIC regnode_offset
9302 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
9303 const bool stop_at_1, /* Just parse the next thing, don't
9304 look for a full character class */
9305 bool allow_mutiple_chars,
9306 const bool silence_non_portable, /* Don't output warnings
9310 bool optimizable, /* ? Allow a non-ANYOF return
9312 SV** ret_invlist /* Return an inversion list, not a node */
9315 /* parse a bracketed class specification. Most of these will produce an
9316 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
9317 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
9318 * under /i with multi-character folds: it will be rewritten following the
9319 * paradigm of this example, where the <multi-fold>s are characters which
9320 * fold to multiple character sequences:
9321 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
9322 * gets effectively rewritten as:
9323 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
9324 * reg() gets called (recursively) on the rewritten version, and this
9325 * function will return what it constructs. (Actually the <multi-fold>s
9326 * aren't physically removed from the [abcdefghi], it's just that they are
9327 * ignored in the recursion by means of a flag:
9328 * <RExC_in_multi_char_class>.)
9330 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
9331 * characters, with the corresponding bit set if that character is in the
9332 * list. For characters above this, an inversion list is used. There
9333 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
9334 * determinable at compile time
9336 * On success, returns the offset at which any next node should be placed
9337 * into the regex engine program being compiled.
9339 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
9340 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
9344 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
9346 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
9347 regnode_offset ret = -1; /* Initialized to an illegal value */
9349 int namedclass = OOB_NAMEDCLASS;
9350 char *rangebegin = NULL;
9351 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
9352 aren't available at the time this was called */
9353 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
9354 than just initialized. */
9355 SV* properties = NULL; /* Code points that match \p{} \P{} */
9356 SV* posixes = NULL; /* Code points that match classes like [:word:],
9357 extended beyond the Latin1 range. These have to
9358 be kept separate from other code points for much
9359 of this function because their handling is
9360 different under /i, and for most classes under
9362 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
9363 separate for a while from the non-complemented
9364 versions because of complications with /d
9366 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
9367 treated more simply than the general case,
9368 leading to less compilation and execution
9370 UV element_count = 0; /* Number of distinct elements in the class.
9371 Optimizations may be possible if this is tiny */
9372 AV * multi_char_matches = NULL; /* Code points that fold to more than one
9373 character; used under /i */
9375 char * stop_ptr = RExC_end; /* where to stop parsing */
9377 /* ignore unescaped whitespace? */
9378 const bool skip_white = cBOOL( ret_invlist
9379 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
9381 /* inversion list of code points this node matches only when the target
9382 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
9384 SV* upper_latin1_only_utf8_matches = NULL;
9386 /* Inversion list of code points this node matches regardless of things
9387 * like locale, folding, utf8ness of the target string */
9390 /* Like cp_list, but code points on this list need to be checked for things
9391 * that fold to/from them under /i */
9392 SV* cp_foldable_list = NULL;
9394 /* Like cp_list, but code points on this list are valid only when the
9395 * runtime locale is UTF-8 */
9396 SV* only_utf8_locale_list = NULL;
9398 /* In a range, if one of the endpoints is non-character-set portable,
9399 * meaning that it hard-codes a code point that may mean a different
9400 * character in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
9401 * mnemonic '\t' which each mean the same character no matter which
9402 * character set the platform is on. */
9403 unsigned int non_portable_endpoint = 0;
9405 /* Is the range unicode? which means on a platform that isn't 1-1 native
9406 * to Unicode (i.e. non-ASCII), each code point in it should be considered
9407 * to be a Unicode value. */
9408 bool unicode_range = FALSE;
9409 bool invert = FALSE; /* Is this class to be complemented */
9411 bool warn_super = ALWAYS_WARN_SUPER;
9413 const char * orig_parse = RExC_parse;
9415 /* This variable is used to mark where the end in the input is of something
9416 * that looks like a POSIX construct but isn't. During the parse, when
9417 * something looks like it could be such a construct is encountered, it is
9418 * checked for being one, but not if we've already checked this area of the
9419 * input. Only after this position is reached do we check again */
9420 char *not_posix_region_end = RExC_parse - 1;
9422 AV* posix_warnings = NULL;
9423 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
9424 U8 op = ANYOF; /* The returned node-type, initialized to the expected
9426 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
9427 U32 posixl = 0; /* bit field of posix classes matched under /l */
9430 /* Flags as to what things aren't knowable until runtime. (Note that these are
9431 * mutually exclusive.) */
9432 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
9433 haven't been defined as of yet */
9434 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
9436 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
9438 U32 has_runtime_dependency = 0; /* OR of the above flags */
9440 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9442 PERL_ARGS_ASSERT_REGCLASS;
9444 PERL_UNUSED_ARG(depth);
9447 assert(! (ret_invlist && allow_mutiple_chars));
9449 /* If wants an inversion list returned, we can't optimize to something
9452 optimizable = FALSE;
9455 DEBUG_PARSE("clas");
9457 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
9458 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
9459 && UNICODE_DOT_DOT_VERSION == 0)
9460 allow_mutiple_chars = FALSE;
9463 /* We include the /i status at the beginning of this so that we can
9464 * know it at runtime */
9465 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
9466 initial_listsv_len = SvCUR(listsv);
9467 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
9469 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9471 assert(RExC_parse <= RExC_end);
9473 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
9474 RExC_parse_inc_by(1);
9476 allow_mutiple_chars = FALSE;
9478 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9481 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
9482 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
9483 int maybe_class = handle_possible_posix(pRExC_state,
9485 ¬_posix_region_end,
9487 TRUE /* checking only */);
9488 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
9489 ckWARN4reg(not_posix_region_end,
9490 "POSIX syntax [%c %c] belongs inside character classes%s",
9491 *RExC_parse, *RExC_parse,
9492 (maybe_class == OOB_NAMEDCLASS)
9493 ? ((POSIXCC_NOTYET(*RExC_parse))
9494 ? " (but this one isn't implemented)"
9495 : " (but this one isn't fully valid)")
9501 /* If the caller wants us to just parse a single element, accomplish this
9502 * by faking the loop ending condition */
9503 if (stop_at_1 && RExC_end > RExC_parse) {
9504 stop_ptr = RExC_parse + 1;
9507 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
9508 if (UCHARAT(RExC_parse) == ']')
9514 && av_tindex_skip_len_mg(posix_warnings) >= 0
9515 && RExC_parse > not_posix_region_end)
9517 /* Warnings about posix class issues are considered tentative until
9518 * we are far enough along in the parse that we can no longer
9519 * change our mind, at which point we output them. This is done
9520 * each time through the loop so that a later class won't zap them
9521 * before they have been dealt with. */
9522 output_posix_warnings(pRExC_state, posix_warnings);
9525 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
9527 if (RExC_parse >= stop_ptr) {
9531 if (UCHARAT(RExC_parse) == ']') {
9537 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
9539 save_prevvalue = prevvalue;
9542 rangebegin = RExC_parse;
9544 non_portable_endpoint = 0;
9546 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
9547 value = utf8n_to_uvchr((U8*)RExC_parse,
9548 RExC_end - RExC_parse,
9549 &numlen, UTF8_ALLOW_DEFAULT);
9550 RExC_parse_inc_by(numlen);
9553 value = UCHARAT(RExC_parse);
9554 RExC_parse_inc_by(1);
9558 char * posix_class_end;
9559 namedclass = handle_possible_posix(pRExC_state,
9562 do_posix_warnings ? &posix_warnings : NULL,
9563 FALSE /* die if error */);
9564 if (namedclass > OOB_NAMEDCLASS) {
9566 /* If there was an earlier attempt to parse this particular
9567 * posix class, and it failed, it was a false alarm, as this
9568 * successful one proves */
9570 && av_tindex_skip_len_mg(posix_warnings) >= 0
9571 && not_posix_region_end >= RExC_parse
9572 && not_posix_region_end <= posix_class_end)
9574 av_undef(posix_warnings);
9577 RExC_parse_set(posix_class_end);
9579 else if (namedclass == OOB_NAMEDCLASS) {
9580 not_posix_region_end = posix_class_end;
9583 namedclass = OOB_NAMEDCLASS;
9586 else if ( RExC_parse - 1 > not_posix_region_end
9587 && MAYBE_POSIXCC(value))
9589 (void) handle_possible_posix(
9591 RExC_parse - 1, /* -1 because parse has already been
9593 ¬_posix_region_end,
9594 do_posix_warnings ? &posix_warnings : NULL,
9595 TRUE /* checking only */);
9597 else if ( strict && ! skip_white
9598 && ( generic_isCC_(value, CC_VERTSPACE_)
9599 || is_VERTWS_cp_high(value)))
9601 vFAIL("Literal vertical space in [] is illegal except under /x");
9603 else if (value == '\\') {
9604 /* Is a backslash; get the code point of the char after it */
9606 if (RExC_parse >= RExC_end) {
9607 vFAIL("Unmatched [");
9610 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
9611 value = utf8n_to_uvchr((U8*)RExC_parse,
9612 RExC_end - RExC_parse,
9613 &numlen, UTF8_ALLOW_DEFAULT);
9614 RExC_parse_inc_by(numlen);
9617 value = UCHARAT(RExC_parse);
9618 RExC_parse_inc_by(1);
9621 /* Some compilers cannot handle switching on 64-bit integer
9622 * values, therefore value cannot be an UV. Yes, this will
9623 * be a problem later if we want switch on Unicode.
9624 * A similar issue a little bit later when switching on
9625 * namedclass. --jhi */
9627 /* If the \ is escaping white space when white space is being
9628 * skipped, it means that that white space is wanted literally, and
9629 * is already in 'value'. Otherwise, need to translate the escape
9630 * into what it signifies. */
9631 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
9632 const char * message;
9636 case 'w': namedclass = ANYOF_WORDCHAR; break;
9637 case 'W': namedclass = ANYOF_NWORDCHAR; break;
9638 case 's': namedclass = ANYOF_SPACE; break;
9639 case 'S': namedclass = ANYOF_NSPACE; break;
9640 case 'd': namedclass = ANYOF_DIGIT; break;
9641 case 'D': namedclass = ANYOF_NDIGIT; break;
9642 case 'v': namedclass = ANYOF_VERTWS; break;
9643 case 'V': namedclass = ANYOF_NVERTWS; break;
9644 case 'h': namedclass = ANYOF_HORIZWS; break;
9645 case 'H': namedclass = ANYOF_NHORIZWS; break;
9646 case 'N': /* Handle \N{NAME} in class */
9648 const char * const backslash_N_beg = RExC_parse - 2;
9651 if (! grok_bslash_N(pRExC_state,
9652 NULL, /* No regnode */
9653 &value, /* Yes single value */
9654 &cp_count, /* Multiple code pt count */
9660 if (*flagp & NEED_UTF8)
9661 FAIL("panic: grok_bslash_N set NEED_UTF8");
9663 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
9666 vFAIL("\\N in a character class must be a named character: \\N{...}");
9668 else if (cp_count == 0) {
9669 ckWARNreg(RExC_parse,
9670 "Ignoring zero length \\N{} in character class");
9672 else { /* cp_count > 1 */
9673 assert(cp_count > 1);
9674 if (! RExC_in_multi_char_class) {
9675 if ( ! allow_mutiple_chars
9678 || *RExC_parse == '-')
9682 vFAIL("\\N{} here is restricted to one character");
9684 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
9685 break; /* <value> contains the first code
9686 point. Drop out of the switch to
9690 SV * multi_char_N = newSVpvn(backslash_N_beg,
9691 RExC_parse - backslash_N_beg);
9693 = add_multi_match(multi_char_matches,
9698 } /* End of cp_count != 1 */
9700 /* This element should not be processed further in this
9704 prevvalue = save_prevvalue;
9705 continue; /* Back to top of loop to get next char */
9708 /* Here, is a single code point, and <value> contains it */
9709 unicode_range = TRUE; /* \N{} are Unicode */
9717 if (RExC_pm_flags & PMf_WILDCARD) {
9718 RExC_parse_inc_by(1);
9719 /* diag_listed_as: Use of %s is not allowed in Unicode
9720 property wildcard subpatterns in regex; marked by <--
9722 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
9723 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
9726 /* \p means they want Unicode semantics */
9727 REQUIRE_UNI_RULES(flagp, 0);
9729 if (RExC_parse >= RExC_end)
9730 vFAIL2("Empty \\%c", (U8)value);
9731 if (*RExC_parse == '{') {
9732 const U8 c = (U8)value;
9733 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
9735 RExC_parse_inc_by(1);
9736 vFAIL2("Missing right brace on \\%c{}", c);
9739 RExC_parse_inc_by(1);
9741 /* White space is allowed adjacent to the braces and after
9742 * any '^', even when not under /x */
9743 while (isSPACE(*RExC_parse)) {
9744 RExC_parse_inc_by(1);
9747 if (UCHARAT(RExC_parse) == '^') {
9749 /* toggle. (The rhs xor gets the single bit that
9750 * differs between P and p; the other xor inverts just
9754 RExC_parse_inc_by(1);
9755 while (isSPACE(*RExC_parse)) {
9756 RExC_parse_inc_by(1);
9760 if (e == RExC_parse)
9761 vFAIL2("Empty \\%c{}", c);
9764 while (isSPACE(*(RExC_parse + n - 1)))
9767 } /* The \p isn't immediately followed by a '{' */
9768 else if (! isALPHA(*RExC_parse)) {
9769 RExC_parse_inc_safe();
9770 vFAIL2("Character following \\%c must be '{' or a "
9771 "single-character Unicode property name",
9779 char* name = RExC_parse;
9781 /* Any message returned about expanding the definition */
9782 SV* msg = newSVpvs_flags("", SVs_TEMP);
9784 /* If set TRUE, the property is user-defined as opposed to
9785 * official Unicode */
9786 bool user_defined = FALSE;
9787 AV * strings = NULL;
9789 SV * prop_definition = parse_uniprop_string(
9791 FALSE, /* This is compile-time */
9793 /* We can't defer this defn when
9794 * the full result is required in
9796 ! cBOOL(ret_invlist),
9803 if (SvCUR(msg)) { /* Assumes any error causes a msg */
9804 assert(prop_definition == NULL);
9805 RExC_parse_set(e + 1);
9806 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
9807 thing so, or else the display is
9811 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
9812 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
9813 SvCUR(msg), SvPVX(msg)));
9816 assert(prop_definition || strings);
9820 if (! prop_definition) {
9821 RExC_parse_set(e + 1);
9822 vFAIL("Unicode string properties are not implemented in (?[...])");
9826 "Using just the single character results"
9827 " returned by \\p{} in (?[...])");
9830 else if (! RExC_in_multi_char_class) {
9831 if (invert ^ (value == 'P')) {
9832 RExC_parse_set(e + 1);
9833 vFAIL("Inverting a character class which contains"
9834 " a multi-character sequence is illegal");
9837 /* For each multi-character string ... */
9838 while (av_count(strings) > 0) {
9839 /* ... Each entry is itself an array of code
9841 AV * this_string = (AV *) av_shift( strings);
9842 STRLEN cp_count = av_count(this_string);
9843 SV * final = newSV(cp_count ? cp_count * 4 : 1);
9844 SvPVCLEAR_FRESH(final);
9846 /* Create another string of sequences of \x{...} */
9847 while (av_count(this_string) > 0) {
9848 SV * character = av_shift(this_string);
9849 UV cp = SvUV(character);
9852 REQUIRE_UTF8(flagp);
9854 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
9856 SvREFCNT_dec_NN(character);
9858 SvREFCNT_dec_NN(this_string);
9860 /* And add that to the list of such things */
9862 = add_multi_match(multi_char_matches,
9867 SvREFCNT_dec_NN(strings);
9870 if (! prop_definition) { /* If we got only a string,
9871 this iteration didn't really
9875 else if (! is_invlist(prop_definition)) {
9877 /* Here, the definition isn't known, so we have gotten
9878 * returned a string that will be evaluated if and when
9879 * encountered at runtime. We add it to the list of
9880 * such properties, along with whether it should be
9881 * complemented or not */
9883 sv_catpvs(listsv, "!");
9886 sv_catpvs(listsv, "+");
9888 sv_catsv(listsv, prop_definition);
9890 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
9892 /* We don't know yet what this matches, so have to flag
9894 anyof_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
9897 assert (prop_definition && is_invlist(prop_definition));
9899 /* Here we do have the complete property definition
9901 * Temporary workaround for [GH #16520]. For this
9902 * precise input that is in the .t that is failing,
9903 * load utf8.pm, which is what the test wants, so that
9905 if ( memEQs(RExC_start, e + 1 - RExC_start,
9907 && ! hv_common(GvHVn(PL_incgv),
9909 "utf8.pm", sizeof("utf8.pm") - 1,
9910 0, HV_FETCH_ISEXISTS, NULL, 0))
9912 require_pv("utf8.pm");
9915 if (! user_defined &&
9916 /* We warn on matching an above-Unicode code point
9917 * if the match would return true, except don't
9918 * warn for \p{All}, which has exactly one element
9920 (_invlist_contains_cp(prop_definition, 0x110000)
9921 && (! (_invlist_len(prop_definition) == 1
9922 && *invlist_array(prop_definition) == 0))))
9927 /* Invert if asking for the complement */
9929 _invlist_union_complement_2nd(properties,
9934 _invlist_union(properties, prop_definition, &properties);
9939 RExC_parse_set(e + 1);
9940 namedclass = ANYOF_UNIPROP; /* no official name, but it's
9944 case 'n': value = '\n'; break;
9945 case 'r': value = '\r'; break;
9946 case 't': value = '\t'; break;
9947 case 'f': value = '\f'; break;
9948 case 'b': value = '\b'; break;
9949 case 'e': value = ESC_NATIVE; break;
9950 case 'a': value = '\a'; break;
9952 RExC_parse--; /* function expects to be pointed at the 'o' */
9953 if (! grok_bslash_o(&RExC_parse,
9959 cBOOL(range), /* MAX_UV allowed for range
9965 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9966 warn_non_literal_string(RExC_parse, packed_warn, message);
9970 non_portable_endpoint++;
9974 RExC_parse--; /* function expects to be pointed at the 'x' */
9975 if (! grok_bslash_x(&RExC_parse,
9981 cBOOL(range), /* MAX_UV allowed for range
9987 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
9988 warn_non_literal_string(RExC_parse, packed_warn, message);
9992 non_portable_endpoint++;
9996 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
9999 /* going to die anyway; point to exact spot of
10001 RExC_parse_inc_safe();
10005 value = grok_c_char;
10006 RExC_parse_inc_by(1);
10007 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
10008 warn_non_literal_string(RExC_parse, packed_warn, message);
10011 non_portable_endpoint++;
10013 case '0': case '1': case '2': case '3': case '4':
10014 case '5': case '6': case '7':
10016 /* Take 1-3 octal digits */
10017 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
10018 | PERL_SCAN_NOTIFY_ILLDIGIT;
10019 numlen = (strict) ? 4 : 3;
10020 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10021 RExC_parse_inc_by(numlen);
10024 RExC_parse_inc_safe();
10025 vFAIL("Need exactly 3 octal digits");
10027 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
10028 && RExC_parse < RExC_end
10029 && isDIGIT(*RExC_parse)
10030 && ckWARN(WARN_REGEXP))
10032 reg_warn_non_literal_string(
10034 form_alien_digit_msg(8, numlen, RExC_parse,
10035 RExC_end, UTF, FALSE));
10039 non_portable_endpoint++;
10044 /* Allow \_ to not give an error */
10045 if (isWORDCHAR(value) && value != '_') {
10047 vFAIL2("Unrecognized escape \\%c in character class",
10051 ckWARN2reg(RExC_parse,
10052 "Unrecognized escape \\%c in character class passed through",
10057 } /* End of switch on char following backslash */
10058 } /* end of handling backslash escape sequences */
10060 /* Here, we have the current token in 'value' */
10062 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10065 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
10066 * literal, as is the character that began the false range, i.e.
10067 * the 'a' in the examples */
10069 const int w = (RExC_parse >= rangebegin)
10070 ? RExC_parse - rangebegin
10074 "False [] range \"%" UTF8f "\"",
10075 UTF8fARG(UTF, w, rangebegin));
10078 ckWARN2reg(RExC_parse,
10079 "False [] range \"%" UTF8f "\"",
10080 UTF8fARG(UTF, w, rangebegin));
10081 cp_list = add_cp_to_invlist(cp_list, '-');
10082 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
10086 range = 0; /* this was not a true range */
10087 element_count += 2; /* So counts for three values */
10090 classnum = namedclass_to_classnum(namedclass);
10092 if (LOC && namedclass < ANYOF_POSIXL_MAX
10093 #ifndef HAS_ISASCII
10094 && classnum != CC_ASCII_
10097 SV* scratch_list = NULL;
10099 /* What the Posix classes (like \w, [:space:]) match isn't
10100 * generally knowable under locale until actual match time. A
10101 * special node is used for these which has extra space for a
10102 * bitmap, with a bit reserved for each named class that is to
10103 * be matched against. (This isn't needed for \p{} and
10104 * pseudo-classes, as they are not affected by locale, and
10105 * hence are dealt with separately.) However, if a named class
10106 * and its complement are both present, then it matches
10107 * everything, and there is no runtime dependency. Odd numbers
10108 * are the complements of the next lower number, so xor works.
10109 * (Note that something like [\w\D] should match everything,
10110 * because \d should be a proper subset of \w. But rather than
10111 * trust that the locale is well behaved, we leave this to
10112 * runtime to sort out) */
10113 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
10114 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
10115 POSIXL_ZERO(posixl);
10116 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
10117 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
10118 continue; /* We could ignore the rest of the class, but
10119 best to parse it for any errors */
10121 else { /* Here, isn't the complement of any already parsed
10123 POSIXL_SET(posixl, namedclass);
10124 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
10125 anyof_flags |= ANYOF_MATCHES_POSIXL;
10127 /* The above-Latin1 characters are not subject to locale
10128 * rules. Just add them to the unconditionally-matched
10131 /* Get the list of the above-Latin1 code points this
10133 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
10134 PL_XPosix_ptrs[classnum],
10136 /* Odd numbers are complements,
10137 * like NDIGIT, NASCII, ... */
10138 namedclass % 2 != 0,
10140 /* Checking if 'cp_list' is NULL first saves an extra
10141 * clone. Its reference count will be decremented at the
10142 * next union, etc, or if this is the only instance, at the
10143 * end of the routine */
10145 cp_list = scratch_list;
10148 _invlist_union(cp_list, scratch_list, &cp_list);
10149 SvREFCNT_dec_NN(scratch_list);
10151 continue; /* Go get next character */
10156 /* Here, is not /l, or is a POSIX class for which /l doesn't
10157 * matter (or is a Unicode property, which is skipped here). */
10158 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
10159 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
10161 /* Here, should be \h, \H, \v, or \V. None of /d, /i
10162 * nor /l make a difference in what these match,
10163 * therefore we just add what they match to cp_list. */
10164 if (classnum != CC_VERTSPACE_) {
10165 assert( namedclass == ANYOF_HORIZWS
10166 || namedclass == ANYOF_NHORIZWS);
10168 /* It turns out that \h is just a synonym for
10170 classnum = CC_BLANK_;
10173 _invlist_union_maybe_complement_2nd(
10175 PL_XPosix_ptrs[classnum],
10176 namedclass % 2 != 0, /* Complement if odd
10177 (NHORIZWS, NVERTWS)
10182 else if ( AT_LEAST_UNI_SEMANTICS
10183 || classnum == CC_ASCII_
10184 || (DEPENDS_SEMANTICS && ( classnum == CC_DIGIT_
10185 || classnum == CC_XDIGIT_)))
10187 /* We usually have to worry about /d affecting what POSIX
10188 * classes match, with special code needed because we won't
10189 * know until runtime what all matches. But there is no
10190 * extra work needed under /u and /a; and [:ascii:] is
10191 * unaffected by /d; and :digit: and :xdigit: don't have
10192 * runtime differences under /d. So we can special case
10193 * these, and avoid some extra work below, and at runtime.
10195 _invlist_union_maybe_complement_2nd(
10197 ((AT_LEAST_ASCII_RESTRICTED)
10198 ? PL_Posix_ptrs[classnum]
10199 : PL_XPosix_ptrs[classnum]),
10200 namedclass % 2 != 0,
10203 else { /* Garden variety class. If is NUPPER, NALPHA, ...
10204 complement and use nposixes */
10205 SV** posixes_ptr = namedclass % 2 == 0
10208 _invlist_union_maybe_complement_2nd(
10210 PL_XPosix_ptrs[classnum],
10211 namedclass % 2 != 0,
10215 } /* end of namedclass \blah */
10217 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
10219 /* If 'range' is set, 'value' is the ending of a range--check its
10220 * validity. (If value isn't a single code point in the case of a
10221 * range, we should have figured that out above in the code that
10222 * catches false ranges). Later, we will handle each individual code
10223 * point in the range. If 'range' isn't set, this could be the
10224 * beginning of a range, so check for that by looking ahead to see if
10225 * the next real character to be processed is the range indicator--the
10230 /* For unicode ranges, we have to test that the Unicode as opposed
10231 * to the native values are not decreasing. (Above 255, there is
10232 * no difference between native and Unicode) */
10233 if (unicode_range && prevvalue < 255 && value < 255) {
10234 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
10235 goto backwards_range;
10240 if (prevvalue > value) /* b-a */ {
10245 w = RExC_parse - rangebegin;
10247 "Invalid [] range \"%" UTF8f "\"",
10248 UTF8fARG(UTF, w, rangebegin));
10249 NOT_REACHED; /* NOTREACHED */
10253 prevvalue = value; /* save the beginning of the potential range */
10254 if (! stop_at_1 /* Can't be a range if parsing just one thing */
10255 && *RExC_parse == '-')
10257 char* next_char_ptr = RExC_parse + 1;
10259 /* Get the next real char after the '-' */
10260 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
10262 /* If the '-' is at the end of the class (just before the ']',
10263 * it is a literal minus; otherwise it is a range */
10264 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
10265 RExC_parse_set(next_char_ptr);
10267 /* a bad range like \w-, [:word:]- ? */
10268 if (namedclass > OOB_NAMEDCLASS) {
10269 if (strict || ckWARN(WARN_REGEXP)) {
10270 const int w = RExC_parse >= rangebegin
10271 ? RExC_parse - rangebegin
10274 vFAIL4("False [] range \"%*.*s\"",
10279 "False [] range \"%*.*s\"",
10283 cp_list = add_cp_to_invlist(cp_list, '-');
10286 range = 1; /* yeah, it's a range! */
10287 continue; /* but do it the next time */
10292 if (namedclass > OOB_NAMEDCLASS) {
10296 /* Here, we have a single value this time through the loop, and
10297 * <prevvalue> is the beginning of the range, if any; or <value> if
10300 /* non-Latin1 code point implies unicode semantics. */
10302 if (value > MAX_LEGAL_CP && ( value != UV_MAX
10303 || prevvalue > MAX_LEGAL_CP))
10305 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
10307 REQUIRE_UNI_RULES(flagp, 0);
10308 if ( ! silence_non_portable
10309 && UNICODE_IS_PERL_EXTENDED(value)
10310 && TO_OUTPUT_WARNINGS(RExC_parse))
10312 ckWARN2_non_literal_string(RExC_parse,
10313 packWARN(WARN_PORTABLE),
10314 PL_extended_cp_format,
10319 /* Ready to process either the single value, or the completed range.
10320 * For single-valued non-inverted ranges, we consider the possibility
10321 * of multi-char folds. (We made a conscious decision to not do this
10322 * for the other cases because it can often lead to non-intuitive
10323 * results. For example, you have the peculiar case that:
10324 * "s s" =~ /^[^\xDF]+$/i => Y
10325 * "ss" =~ /^[^\xDF]+$/i => N
10327 * See [perl #89750] */
10328 if (FOLD && allow_mutiple_chars && value == prevvalue) {
10329 if ( value == LATIN_SMALL_LETTER_SHARP_S
10330 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
10333 /* Here <value> is indeed a multi-char fold. Get what it is */
10335 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10338 UV folded = _to_uni_fold_flags(
10342 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
10343 ? FOLD_FLAGS_NOMIX_ASCII
10347 /* Here, <folded> should be the first character of the
10348 * multi-char fold of <value>, with <foldbuf> containing the
10349 * whole thing. But, if this fold is not allowed (because of
10350 * the flags), <fold> will be the same as <value>, and should
10351 * be processed like any other character, so skip the special
10353 if (folded != value) {
10355 /* Skip if we are recursed, currently parsing the class
10356 * again. Otherwise add this character to the list of
10357 * multi-char folds. */
10358 if (! RExC_in_multi_char_class) {
10359 STRLEN cp_count = utf8_length(foldbuf,
10360 foldbuf + foldlen);
10361 SV* multi_fold = newSVpvs_flags("", SVs_TEMP);
10363 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
10366 = add_multi_match(multi_char_matches,
10372 /* This element should not be processed further in this
10375 value = save_value;
10376 prevvalue = save_prevvalue;
10382 if (strict && ckWARN(WARN_REGEXP)) {
10385 /* If the range starts above 255, everything is portable and
10386 * likely to be so for any forseeable character set, so don't
10388 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
10389 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
10391 else if (prevvalue != value) {
10393 /* Under strict, ranges that stop and/or end in an ASCII
10394 * printable should have each end point be a portable value
10395 * for it (preferably like 'A', but we don't warn if it is
10396 * a (portable) Unicode name or code point), and the range
10397 * must be all digits or all letters of the same case.
10398 * Otherwise, the range is non-portable and unclear as to
10399 * what it contains */
10400 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
10401 && ( non_portable_endpoint
10402 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
10403 || (isLOWER_A(prevvalue) && isLOWER_A(value))
10404 || (isUPPER_A(prevvalue) && isUPPER_A(value))
10406 vWARN(RExC_parse, "Ranges of ASCII printables should"
10407 " be some subset of \"0-9\","
10408 " \"A-Z\", or \"a-z\"");
10410 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
10411 SSize_t index_start;
10412 SSize_t index_final;
10414 /* But the nature of Unicode and languages mean we
10415 * can't do the same checks for above-ASCII ranges,
10416 * except in the case of digit ones. These should
10417 * contain only digits from the same group of 10. The
10418 * ASCII case is handled just above. Hence here, the
10419 * range could be a range of digits. First some
10420 * unlikely special cases. Grandfather in that a range
10421 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
10422 * if its starting value is one of the 10 digits prior
10423 * to it. This is because it is an alternate way of
10424 * writing 19D1, and some people may expect it to be in
10425 * that group. But it is bad, because it won't give
10426 * the expected results. In Unicode 5.2 it was
10427 * considered to be in that group (of 11, hence), but
10428 * this was fixed in the next version */
10430 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
10431 goto warn_bad_digit_range;
10433 else if (UNLIKELY( prevvalue >= 0x1D7CE
10434 && value <= 0x1D7FF))
10436 /* This is the only other case currently in Unicode
10437 * where the algorithm below fails. The code
10438 * points just above are the end points of a single
10439 * range containing only decimal digits. It is 5
10440 * different series of 0-9. All other ranges of
10441 * digits currently in Unicode are just a single
10442 * series. (And mktables will notify us if a later
10443 * Unicode version breaks this.)
10445 * If the range being checked is at most 9 long,
10446 * and the digit values represented are in
10447 * numerical order, they are from the same series.
10449 if ( value - prevvalue > 9
10450 || ((( value - 0x1D7CE) % 10)
10451 <= (prevvalue - 0x1D7CE) % 10))
10453 goto warn_bad_digit_range;
10458 /* For all other ranges of digits in Unicode, the
10459 * algorithm is just to check if both end points
10460 * are in the same series, which is the same range.
10462 index_start = _invlist_search(
10463 PL_XPosix_ptrs[CC_DIGIT_],
10466 /* Warn if the range starts and ends with a digit,
10467 * and they are not in the same group of 10. */
10468 if ( index_start >= 0
10469 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
10471 _invlist_search(PL_XPosix_ptrs[CC_DIGIT_],
10472 value)) != index_start
10473 && index_final >= 0
10474 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
10476 warn_bad_digit_range:
10477 vWARN(RExC_parse, "Ranges of digits should be"
10478 " from the same group of"
10485 if ((! range || prevvalue == value) && non_portable_endpoint) {
10486 if (isPRINT_A(value)) {
10489 if (isBACKSLASHED_PUNCT(value)) {
10490 literal[d++] = '\\';
10492 literal[d++] = (char) value;
10493 literal[d++] = '\0';
10496 "\"%.*s\" is more clearly written simply as \"%s\"",
10497 (int) (RExC_parse - rangebegin),
10502 else if (isMNEMONIC_CNTRL(value)) {
10504 "\"%.*s\" is more clearly written simply as \"%s\"",
10505 (int) (RExC_parse - rangebegin),
10507 cntrl_to_mnemonic((U8) value)
10513 /* Deal with this element of the class */
10516 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10519 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
10520 * that don't require special handling, we can just add the range like
10521 * we do for ASCII platforms */
10522 if ((UNLIKELY(prevvalue == 0) && value >= 255)
10523 || ! (prevvalue < 256
10525 || (! non_portable_endpoint
10526 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
10527 || (isUPPER_A(prevvalue)
10528 && isUPPER_A(value)))))))
10530 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10534 /* Here, requires special handling. This can be because it is a
10535 * range whose code points are considered to be Unicode, and so
10536 * must be individually translated into native, or because its a
10537 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
10538 * EBCDIC, but we have defined them to include only the "expected"
10539 * upper or lower case ASCII alphabetics. Subranges above 255 are
10540 * the same in native and Unicode, so can be added as a range */
10541 U8 start = NATIVE_TO_LATIN1(prevvalue);
10543 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
10544 for (j = start; j <= end; j++) {
10545 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
10548 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
10554 range = 0; /* this range (if it was one) is done now */
10555 } /* End of loop through all the text within the brackets */
10557 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
10558 output_posix_warnings(pRExC_state, posix_warnings);
10561 /* If anything in the class expands to more than one character, we have to
10562 * deal with them by building up a substitute parse string, and recursively
10563 * calling reg() on it, instead of proceeding */
10564 if (multi_char_matches) {
10565 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
10568 char *save_end = RExC_end;
10569 char *save_parse = RExC_parse;
10570 char *save_start = RExC_start;
10571 Size_t constructed_prefix_len = 0; /* This gives the length of the
10572 constructed portion of the
10573 substitute parse. */
10574 bool first_time = TRUE; /* First multi-char occurrence doesn't get
10579 /* Only one level of recursion allowed */
10580 assert(RExC_copy_start_in_constructed == RExC_precomp);
10582 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
10583 because too confusing */
10585 sv_catpvs(substitute_parse, "(?:");
10589 /* Look at the longest strings first */
10590 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
10595 if (av_exists(multi_char_matches, cp_count)) {
10596 AV** this_array_ptr;
10599 this_array_ptr = (AV**) av_fetch_simple(multi_char_matches,
10601 while ((this_sequence = av_pop(*this_array_ptr)) !=
10604 if (! first_time) {
10605 sv_catpvs(substitute_parse, "|");
10607 first_time = FALSE;
10609 sv_catpv(substitute_parse, SvPVX(this_sequence));
10614 /* If the character class contains anything else besides these
10615 * multi-character strings, have to include it in recursive parsing */
10616 if (element_count) {
10617 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
10619 sv_catpvs(substitute_parse, "|");
10620 if (has_l_bracket) { /* Add an [ if the original had one */
10621 sv_catpvs(substitute_parse, "[");
10623 constructed_prefix_len = SvCUR(substitute_parse);
10624 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
10626 /* Put in a closing ']' to match any opening one, but not if going
10627 * off the end, as otherwise we are adding something that really
10629 if (has_l_bracket && RExC_parse < RExC_end) {
10630 sv_catpvs(substitute_parse, "]");
10634 sv_catpvs(substitute_parse, ")");
10637 /* This is a way to get the parse to skip forward a whole named
10638 * sequence instead of matching the 2nd character when it fails the
10640 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
10644 /* Set up the data structure so that any errors will be properly
10645 * reported. See the comments at the definition of
10646 * REPORT_LOCATION_ARGS for details */
10647 RExC_copy_start_in_input = (char *) orig_parse;
10648 RExC_start = SvPV(substitute_parse, len);
10649 RExC_parse_set( RExC_start );
10650 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
10651 RExC_end = RExC_parse + len;
10652 RExC_in_multi_char_class = 1;
10654 ret = reg(pRExC_state, 1, ®_flags, depth+1);
10656 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
10658 /* And restore so can parse the rest of the pattern */
10659 RExC_parse_set(save_parse);
10660 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
10661 RExC_end = save_end;
10662 RExC_in_multi_char_class = 0;
10663 SvREFCNT_dec_NN(multi_char_matches);
10664 SvREFCNT_dec(properties);
10665 SvREFCNT_dec(cp_list);
10666 SvREFCNT_dec(simple_posixes);
10667 SvREFCNT_dec(posixes);
10668 SvREFCNT_dec(nposixes);
10669 SvREFCNT_dec(cp_foldable_list);
10673 /* If folding, we calculate all characters that could fold to or from the
10674 * ones already on the list */
10675 if (cp_foldable_list) {
10677 UV start, end; /* End points of code point ranges */
10679 SV* fold_intersection = NULL;
10682 /* Our calculated list will be for Unicode rules. For locale
10683 * matching, we have to keep a separate list that is consulted at
10684 * runtime only when the locale indicates Unicode rules (and we
10685 * don't include potential matches in the ASCII/Latin1 range, as
10686 * any code point could fold to any other, based on the run-time
10687 * locale). For non-locale, we just use the general list */
10689 use_list = &only_utf8_locale_list;
10692 use_list = &cp_list;
10695 /* Only the characters in this class that participate in folds need
10696 * be checked. Get the intersection of this class and all the
10697 * possible characters that are foldable. This can quickly narrow
10698 * down a large class */
10699 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
10700 &fold_intersection);
10702 /* Now look at the foldable characters in this class individually */
10703 invlist_iterinit(fold_intersection);
10704 while (invlist_iternext(fold_intersection, &start, &end)) {
10708 /* Look at every character in the range */
10709 for (j = start; j <= end; j++) {
10710 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
10713 Size_t folds_count;
10715 const U32 * remaining_folds;
10719 /* Under /l, we don't know what code points below 256
10720 * fold to, except we do know the MICRO SIGN folds to
10721 * an above-255 character if the locale is UTF-8, so we
10722 * add it to the special list (in *use_list) Otherwise
10723 * we know now what things can match, though some folds
10724 * are valid under /d only if the target is UTF-8.
10725 * Those go in a separate list */
10726 if ( IS_IN_SOME_FOLD_L1(j)
10727 && ! (LOC && j != MICRO_SIGN))
10730 /* ASCII is always matched; non-ASCII is matched
10731 * only under Unicode rules (which could happen
10732 * under /l if the locale is a UTF-8 one */
10733 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
10734 *use_list = add_cp_to_invlist(*use_list,
10735 PL_fold_latin1[j]);
10737 else if (j != PL_fold_latin1[j]) {
10738 upper_latin1_only_utf8_matches
10739 = add_cp_to_invlist(
10740 upper_latin1_only_utf8_matches,
10741 PL_fold_latin1[j]);
10745 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
10746 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
10748 add_above_Latin1_folds(pRExC_state,
10755 /* Here is an above Latin1 character. We don't have the
10756 * rules hard-coded for it. First, get its fold. This is
10757 * the simple fold, as the multi-character folds have been
10758 * handled earlier and separated out */
10759 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
10760 (ASCII_FOLD_RESTRICTED)
10761 ? FOLD_FLAGS_NOMIX_ASCII
10764 /* Single character fold of above Latin1. Add everything
10765 * in its fold closure to the list that this node should
10767 folds_count = _inverse_folds(folded, &first_fold,
10769 for (k = 0; k <= folds_count; k++) {
10770 UV c = (k == 0) /* First time through use itself */
10772 : (k == 1) /* 2nd time use, the first fold */
10775 /* Then the remaining ones */
10776 : remaining_folds[k-2];
10778 /* /aa doesn't allow folds between ASCII and non- */
10779 if (( ASCII_FOLD_RESTRICTED
10780 && (isASCII(c) != isASCII(j))))
10785 /* Folds under /l which cross the 255/256 boundary are
10786 * added to a separate list. (These are valid only
10787 * when the locale is UTF-8.) */
10788 if (c < 256 && LOC) {
10789 *use_list = add_cp_to_invlist(*use_list, c);
10793 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
10795 cp_list = add_cp_to_invlist(cp_list, c);
10798 /* Similarly folds involving non-ascii Latin1
10799 * characters under /d are added to their list */
10800 upper_latin1_only_utf8_matches
10801 = add_cp_to_invlist(
10802 upper_latin1_only_utf8_matches,
10808 SvREFCNT_dec_NN(fold_intersection);
10811 /* Now that we have finished adding all the folds, there is no reason
10812 * to keep the foldable list separate */
10813 _invlist_union(cp_list, cp_foldable_list, &cp_list);
10814 SvREFCNT_dec_NN(cp_foldable_list);
10817 /* And combine the result (if any) with any inversion lists from posix
10818 * classes. The lists are kept separate up to now because we don't want to
10819 * fold the classes */
10820 if (simple_posixes) { /* These are the classes known to be unaffected by
10823 _invlist_union(cp_list, simple_posixes, &cp_list);
10824 SvREFCNT_dec_NN(simple_posixes);
10827 cp_list = simple_posixes;
10830 if (posixes || nposixes) {
10831 if (! DEPENDS_SEMANTICS) {
10833 /* For everything but /d, we can just add the current 'posixes' and
10834 * 'nposixes' to the main list */
10837 _invlist_union(cp_list, posixes, &cp_list);
10838 SvREFCNT_dec_NN(posixes);
10846 _invlist_union(cp_list, nposixes, &cp_list);
10847 SvREFCNT_dec_NN(nposixes);
10850 cp_list = nposixes;
10855 /* Under /d, things like \w match upper Latin1 characters only if
10856 * the target string is in UTF-8. But things like \W match all the
10857 * upper Latin1 characters if the target string is not in UTF-8.
10859 * Handle the case with something like \W separately */
10861 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
10863 /* A complemented posix class matches all upper Latin1
10864 * characters if not in UTF-8. And it matches just certain
10865 * ones when in UTF-8. That means those certain ones are
10866 * matched regardless, so can just be added to the
10867 * unconditional list */
10869 _invlist_union(cp_list, nposixes, &cp_list);
10870 SvREFCNT_dec_NN(nposixes);
10874 cp_list = nposixes;
10877 /* Likewise for 'posixes' */
10878 _invlist_union(posixes, cp_list, &cp_list);
10879 SvREFCNT_dec(posixes);
10881 /* Likewise for anything else in the range that matched only
10883 if (upper_latin1_only_utf8_matches) {
10884 _invlist_union(cp_list,
10885 upper_latin1_only_utf8_matches,
10887 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10888 upper_latin1_only_utf8_matches = NULL;
10891 /* If we don't match all the upper Latin1 characters regardless
10892 * of UTF-8ness, we have to set a flag to match the rest when
10894 _invlist_subtract(only_non_utf8_list, cp_list,
10895 &only_non_utf8_list);
10896 if (_invlist_len(only_non_utf8_list) != 0) {
10897 anyof_flags |= ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared;
10899 SvREFCNT_dec_NN(only_non_utf8_list);
10902 /* Here there were no complemented posix classes. That means
10903 * the upper Latin1 characters in 'posixes' match only when the
10904 * target string is in UTF-8. So we have to add them to the
10905 * list of those types of code points, while adding the
10906 * remainder to the unconditional list.
10908 * First calculate what they are */
10909 SV* nonascii_but_latin1_properties = NULL;
10910 _invlist_intersection(posixes, PL_UpperLatin1,
10911 &nonascii_but_latin1_properties);
10913 /* And add them to the final list of such characters. */
10914 _invlist_union(upper_latin1_only_utf8_matches,
10915 nonascii_but_latin1_properties,
10916 &upper_latin1_only_utf8_matches);
10918 /* Remove them from what now becomes the unconditional list */
10919 _invlist_subtract(posixes, nonascii_but_latin1_properties,
10922 /* And add those unconditional ones to the final list */
10924 _invlist_union(cp_list, posixes, &cp_list);
10925 SvREFCNT_dec_NN(posixes);
10932 SvREFCNT_dec(nonascii_but_latin1_properties);
10934 /* Get rid of any characters from the conditional list that we
10935 * now know are matched unconditionally, which may make that
10937 _invlist_subtract(upper_latin1_only_utf8_matches,
10939 &upper_latin1_only_utf8_matches);
10940 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
10941 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
10942 upper_latin1_only_utf8_matches = NULL;
10948 /* And combine the result (if any) with any inversion list from properties.
10949 * The lists are kept separate up to now so that we can distinguish the two
10950 * in regards to matching above-Unicode. A run-time warning is generated
10951 * if a Unicode property is matched against a non-Unicode code point. But,
10952 * we allow user-defined properties to match anything, without any warning,
10953 * and we also suppress the warning if there is a portion of the character
10954 * class that isn't a Unicode property, and which matches above Unicode, \W
10955 * or [\x{110000}] for example.
10956 * (Note that in this case, unlike the Posix one above, there is no
10957 * <upper_latin1_only_utf8_matches>, because having a Unicode property
10958 * forces Unicode semantics */
10962 /* If it matters to the final outcome, see if a non-property
10963 * component of the class matches above Unicode. If so, the
10964 * warning gets suppressed. This is true even if just a single
10965 * such code point is specified, as, though not strictly correct if
10966 * another such code point is matched against, the fact that they
10967 * are using above-Unicode code points indicates they should know
10968 * the issues involved */
10970 warn_super = ! (invert
10971 ^ (UNICODE_IS_SUPER(invlist_highest(cp_list))));
10974 _invlist_union(properties, cp_list, &cp_list);
10975 SvREFCNT_dec_NN(properties);
10978 cp_list = properties;
10982 anyof_flags |= ANYOF_WARN_SUPER__shared;
10984 /* Because an ANYOF node is the only one that warns, this node
10985 * can't be optimized into something else */
10986 optimizable = FALSE;
10990 /* Here, we have calculated what code points should be in the character
10993 * Now we can see about various optimizations. Fold calculation (which we
10994 * did above) needs to take place before inversion. Otherwise /[^k]/i
10995 * would invert to include K, which under /i would match k, which it
10996 * shouldn't. Therefore we can't invert folded locale now, as it won't be
10997 * folded until runtime */
10999 /* If we didn't do folding, it's because some information isn't available
11000 * until runtime; set the run-time fold flag for these We know to set the
11001 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
11002 * at least one 0-255 range code point */
11005 /* Some things on the list might be unconditionally included because of
11006 * other components. Remove them, and clean up the list if it goes to
11008 if (only_utf8_locale_list && cp_list) {
11009 _invlist_subtract(only_utf8_locale_list, cp_list,
11010 &only_utf8_locale_list);
11012 if (_invlist_len(only_utf8_locale_list) == 0) {
11013 SvREFCNT_dec_NN(only_utf8_locale_list);
11014 only_utf8_locale_list = NULL;
11017 if ( only_utf8_locale_list
11019 && ( _invlist_contains_cp(cp_list,
11020 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
11021 || _invlist_contains_cp(cp_list,
11022 LATIN_SMALL_LETTER_DOTLESS_I))))
11024 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11025 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11027 else if (cp_list && invlist_lowest(cp_list) < 256) {
11028 /* If nothing is below 256, has no locale dependency; otherwise it
11030 anyof_flags |= ANYOFL_FOLD;
11031 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
11033 /* In a Turkish locale these could match, notify the run-time code
11034 * to check for that */
11035 if ( _invlist_contains_cp(cp_list, 'I')
11036 || _invlist_contains_cp(cp_list, 'i'))
11038 anyof_flags |= ANYOFL_FOLD|ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11042 else if ( DEPENDS_SEMANTICS
11043 && ( upper_latin1_only_utf8_matches
11045 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)))
11047 RExC_seen_d_op = TRUE;
11048 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
11051 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
11055 && ! has_runtime_dependency)
11057 _invlist_invert(cp_list);
11059 /* Clear the invert flag since have just done it here */
11063 /* All possible optimizations below still have these characteristics.
11064 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
11066 *flagp |= HASWIDTH|SIMPLE;
11069 *ret_invlist = cp_list;
11071 return (cp_list) ? RExC_emit : 0;
11074 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
11075 RExC_contains_locale = 1;
11080 /* Some character classes are equivalent to other nodes. Such nodes
11081 * take up less room, and some nodes require fewer operations to
11082 * execute, than ANYOF nodes. EXACTish nodes may be joinable with
11083 * adjacent nodes to improve efficiency. */
11084 op = optimize_regclass(pRExC_state, cp_list,
11085 only_utf8_locale_list,
11086 upper_latin1_only_utf8_matches,
11087 has_runtime_dependency,
11089 &anyof_flags, &invert, &ret, flagp);
11090 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
11092 /* If optimized to something else and emitted, clean up and return */
11094 SvREFCNT_dec(cp_list);;
11095 SvREFCNT_dec(only_utf8_locale_list);
11096 SvREFCNT_dec(upper_latin1_only_utf8_matches);
11100 /* If no optimization was found, an END was returned and we will now
11107 /* Here are going to emit an ANYOF; set the particular type */
11109 if (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) {
11120 ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
11121 FILL_NODE(ret, op); /* We set the argument later */
11122 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
11123 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
11125 /* Here, <cp_list> contains all the code points we can determine at
11126 * compile time that match under all conditions. Go through it, and
11127 * for things that belong in the bitmap, put them there, and delete from
11128 * <cp_list>. While we are at it, see if everything above 255 is in the
11129 * list, and if so, set a flag to speed up execution */
11131 populate_anyof_bitmap_from_invlist(REGNODE_p(ret), &cp_list);
11134 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
11138 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
11141 /* Here, the bitmap has been populated with all the Latin1 code points that
11142 * always match. Can now add to the overall list those that match only
11143 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
11145 if (upper_latin1_only_utf8_matches) {
11147 _invlist_union(cp_list,
11148 upper_latin1_only_utf8_matches,
11150 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
11153 cp_list = upper_latin1_only_utf8_matches;
11155 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
11158 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
11159 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
11162 only_utf8_locale_list);
11164 SvREFCNT_dec(cp_list);;
11165 SvREFCNT_dec(only_utf8_locale_list);
11170 S_optimize_regclass(pTHX_
11171 RExC_state_t *pRExC_state,
11173 SV* only_utf8_locale_list,
11174 SV* upper_latin1_only_utf8_matches,
11175 const U32 has_runtime_dependency,
11179 regnode_offset * ret,
11183 /* This function exists just to make S_regclass() smaller. It extracts out
11184 * the code that looks for potential optimizations away from a full generic
11185 * ANYOF node. The parameter names are the same as the corresponding
11186 * variables in S_regclass.
11188 * It returns the new op (the impossible END one if no optimization found)
11189 * and sets *ret to any created regnode. If the new op is sufficiently
11190 * like plain ANYOF, it leaves *ret unchanged for allocation in S_regclass.
11192 * Certain of the parameters may be updated as a result of the changes
11195 U8 op = END; /* The returned node-type, initialized to an impossible
11198 PERL_UINT_FAST8_T i;
11199 UV partial_cp_count = 0;
11200 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
11201 UV end[MAX_FOLD_FROMS+1] = { 0 };
11202 bool single_range = FALSE;
11203 UV lowest_cp = 0, highest_cp = 0;
11205 PERL_ARGS_ASSERT_OPTIMIZE_REGCLASS;
11207 if (cp_list) { /* Count the code points in enough ranges that we would see
11208 all the ones possible in any fold in this version of
11211 invlist_iterinit(cp_list);
11212 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
11213 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
11216 partial_cp_count += end[i] - start[i] + 1;
11220 single_range = TRUE;
11222 invlist_iterfinish(cp_list);
11224 /* If we know at compile time that this matches every possible code
11225 * point, any run-time dependencies don't matter */
11226 if (start[0] == 0 && end[0] == UV_MAX) {
11228 goto return_OPFAIL;
11235 /* Use a clearer mnemonic for below */
11236 lowest_cp = start[0];
11238 highest_cp = invlist_highest(cp_list);
11241 /* Similarly, for /l posix classes, if both a class and its complement
11242 * match, any run-time dependencies don't matter */
11245 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; namedclass += 2) {
11246 if ( POSIXL_TEST(posixl, namedclass) /* class */
11247 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
11250 goto return_OPFAIL;
11256 /* For well-behaved locales, some classes are subsets of others, so
11257 * complementing the subset and including the non-complemented superset
11258 * should match everything, like [\D[:alnum:]], and
11259 * [[:^alpha:][:alnum:]], but some implementations of locales are
11260 * buggy, and khw thinks its a bad idea to have optimization change
11261 * behavior, even if it avoids an OS bug in a given case */
11263 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
11265 /* If is a single posix /l class, can optimize to just that op. Such a
11266 * node will not match anything in the Latin1 range, as that is not
11267 * determinable until runtime, but will match whatever the class does
11268 * outside that range. (Note that some classes won't match anything
11269 * outside the range, like [:ascii:]) */
11270 if ( isSINGLE_BIT_SET(posixl)
11271 && (partial_cp_count == 0 || lowest_cp > 255))
11274 SV * class_above_latin1 = NULL;
11275 bool already_inverted;
11276 bool are_equivalent;
11279 namedclass = single_1bit_pos32(posixl);
11280 classnum = namedclass_to_classnum(namedclass);
11282 /* The named classes are such that the inverted number is one
11283 * larger than the non-inverted one */
11284 already_inverted = namedclass - classnum_to_namedclass(classnum);
11286 /* Create an inversion list of the official property, inverted if
11287 * the constructed node list is inverted, and restricted to only
11288 * the above latin1 code points, which are the only ones known at
11290 _invlist_intersection_maybe_complement_2nd(
11292 PL_XPosix_ptrs[classnum],
11294 &class_above_latin1);
11295 are_equivalent = _invlistEQ(class_above_latin1, cp_list, FALSE);
11296 SvREFCNT_dec_NN(class_above_latin1);
11298 if (are_equivalent) {
11300 /* Resolve the run-time inversion flag with this possibly
11301 * inverted class */
11302 *invert = *invert ^ already_inverted;
11304 op = POSIXL + *invert * (NPOSIXL - POSIXL);
11305 *ret = reg_node(pRExC_state, op);
11306 FLAGS(REGNODE_p(*ret)) = classnum;
11312 /* khw can't think of any other possible transformation involving these. */
11313 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
11317 if (! has_runtime_dependency) {
11319 /* If the list is empty, nothing matches. This happens, for example,
11320 * when a Unicode property that doesn't match anything is the only
11321 * element in the character class (perluniprops.pod notes such
11323 if (partial_cp_count == 0) {
11328 goto return_OPFAIL;
11332 /* If matches everything but \n */
11333 if ( start[0] == 0 && end[0] == '\n' - 1
11334 && start[1] == '\n' + 1 && end[1] == UV_MAX)
11336 assert (! *invert);
11338 *ret = reg_node(pRExC_state, op);
11344 /* Next see if can optimize classes that contain just a few code points
11345 * into an EXACTish node. The reason to do this is to let the optimizer
11346 * join this node with adjacent EXACTish ones, and ANYOF nodes require
11347 * runtime conversion to code point from UTF-8, which we'd like to avoid.
11349 * An EXACTFish node can be generated even if not under /i, and vice versa.
11350 * But care must be taken. An EXACTFish node has to be such that it only
11351 * matches precisely the code points in the class, but we want to generate
11352 * the least restrictive one that does that, to increase the odds of being
11353 * able to join with an adjacent node. For example, if the class contains
11354 * [kK], we have to make it an EXACTFAA node to prevent the KELVIN SIGN
11355 * from matching. Whether we are under /i or not is irrelevant in this
11356 * case. Less obvious is the pattern qr/[\x{02BC}]n/i. U+02BC is MODIFIER
11357 * LETTER APOSTROPHE. That is supposed to match the single character U+0149
11358 * LATIN SMALL LETTER N PRECEDED BY APOSTROPHE. And so even though there
11359 * is no simple fold that includes \X{02BC}, there is a multi-char fold
11360 * that does, and so the node generated for it must be an EXACTFish one.
11361 * On the other hand qr/:/i should generate a plain EXACT node since the
11362 * colon participates in no fold whatsoever, and having it be EXACT tells
11363 * the optimizer the target string cannot match unless it has a colon in
11368 /* Only try if there are no more code points in the class than in
11369 * the max possible fold */
11370 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
11372 /* We can always make a single code point class into an EXACTish node.
11374 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) {
11377 /* Here is /l: Use EXACTL, except if there is a fold not known
11378 * until runtime so shows as only a single code point here.
11379 * For code points above 255, we know which can cause problems
11380 * by having a potential fold to the Latin1 range. */
11382 || ( lowest_cp > 255
11383 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(lowest_cp)))
11391 else if (! FOLD) { /* Not /l and not /i */
11392 op = (lowest_cp < 256) ? EXACT : EXACT_REQ8;
11394 else if (lowest_cp < 256) { /* /i, not /l, and the code point is
11397 /* Under /i, it gets a little tricky. A code point that
11398 * doesn't participate in a fold should be an EXACT node. We
11399 * know this one isn't the result of a simple fold, or there'd
11400 * be more than one code point in the list, but it could be
11401 * part of a multi-character fold. In that case we better not
11402 * create an EXACT node, as we would wrongly be telling the
11403 * optimizer that this code point must be in the target string,
11404 * and that is wrong. This is because if the sequence around
11405 * this code point forms a multi-char fold, what needs to be in
11406 * the string could be the code point that folds to the
11409 * This handles the case of below-255 code points, as we have
11410 * an easy look up for those. The next clause handles the
11412 op = IS_IN_SOME_FOLD_L1(lowest_cp)
11416 else { /* /i, larger code point. Since we are under /i, and have
11417 just this code point, we know that it can't fold to
11418 something else, so PL_InMultiCharFold applies to it */
11419 op = (_invlist_contains_cp(PL_InMultiCharFold, lowest_cp))
11426 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
11427 && _invlist_contains_cp(PL_in_some_fold, lowest_cp))
11429 /* Here, the only runtime dependency, if any, is from /d, and the
11430 * class matches more than one code point, and the lowest code
11431 * point participates in some fold. It might be that the other
11432 * code points are /i equivalent to this one, and hence they would
11433 * be representable by an EXACTFish node. Above, we eliminated
11434 * classes that contain too many code points to be EXACTFish, with
11435 * the test for MAX_FOLD_FROMS
11437 * First, special case the ASCII fold pairs, like 'B' and 'b'. We
11438 * do this because we have EXACTFAA at our disposal for the ASCII
11440 if (partial_cp_count == 2 && isASCII(lowest_cp)) {
11442 /* The only ASCII characters that participate in folds are
11444 assert(isALPHA(lowest_cp));
11445 if ( end[0] == start[0] /* First range is a single
11446 character, so 2nd exists */
11447 && isALPHA_FOLD_EQ(start[0], start[1]))
11449 /* Here, is part of an ASCII fold pair */
11451 if ( ASCII_FOLD_RESTRICTED
11452 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(lowest_cp))
11454 /* If the second clause just above was true, it means
11455 * we can't be under /i, or else the list would have
11456 * included more than this fold pair. Therefore we
11457 * have to exclude the possibility of whatever else it
11458 * is that folds to these, by using EXACTFAA */
11461 else if (HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp)) {
11463 /* Here, there's no simple fold that lowest_cp is part
11464 * of, but there is a multi-character one. If we are
11465 * not under /i, we want to exclude that possibility;
11466 * if under /i, we want to include it */
11467 op = (FOLD) ? EXACTFU : EXACTFAA;
11471 /* Here, the only possible fold lowest_cp participates in
11472 * is with start[1]. /i or not isn't relevant */
11476 value = toFOLD(lowest_cp);
11479 else if ( ! upper_latin1_only_utf8_matches
11480 || ( _invlist_len(upper_latin1_only_utf8_matches) == 2
11482 invlist_highest(upper_latin1_only_utf8_matches)]
11485 /* Here, the smallest character is non-ascii or there are more
11486 * than 2 code points matched by this node. Also, we either
11487 * don't have /d UTF-8 dependent matches, or if we do, they
11488 * look like they could be a single character that is the fold
11489 * of the lowest one is in the always-match list. This test
11490 * quickly excludes most of the false positives when there are
11491 * /d UTF-8 depdendent matches. These are like LATIN CAPITAL
11492 * LETTER A WITH GRAVE matching LATIN SMALL LETTER A WITH GRAVE
11493 * iff the target string is UTF-8. (We don't have to worry
11494 * above about exceeding the array bounds of PL_fold_latin1[]
11495 * because any code point in 'upper_latin1_only_utf8_matches'
11498 * EXACTFAA would apply only to pairs (hence exactly 2 code
11499 * points) in the ASCII range, so we can't use it here to
11500 * artificially restrict the fold domain, so we check if the
11501 * class does or does not match some EXACTFish node. Further,
11502 * if we aren't under /i, and and the folded-to character is
11503 * part of a multi-character fold, we can't do this
11504 * optimization, as the sequence around it could be that
11505 * multi-character fold, and we don't here know the context, so
11506 * we have to assume it is that multi-char fold, to prevent
11509 * To do the general case, we first find the fold of the lowest
11510 * code point (which may be higher than that lowest unfolded
11511 * one), then find everything that folds to it. (The data
11512 * structure we have only maps from the folded code points, so
11513 * we have to do the earlier step.) */
11516 U8 foldbuf[UTF8_MAXBYTES_CASE];
11517 UV folded = _to_uni_fold_flags(lowest_cp, foldbuf, &foldlen, 0);
11519 const U32 * remaining_folds;
11520 Size_t folds_to_this_cp_count = _inverse_folds(
11524 Size_t folds_count = folds_to_this_cp_count + 1;
11525 SV * fold_list = _new_invlist(folds_count);
11528 /* If there are UTF-8 dependent matches, create a temporary
11529 * list of what this node matches, including them. */
11530 SV * all_cp_list = NULL;
11531 SV ** use_this_list = &cp_list;
11533 if (upper_latin1_only_utf8_matches) {
11534 all_cp_list = _new_invlist(0);
11535 use_this_list = &all_cp_list;
11536 _invlist_union(cp_list,
11537 upper_latin1_only_utf8_matches,
11541 /* Having gotten everything that participates in the fold
11542 * containing the lowest code point, we turn that into an
11543 * inversion list, making sure everything is included. */
11544 fold_list = add_cp_to_invlist(fold_list, lowest_cp);
11545 fold_list = add_cp_to_invlist(fold_list, folded);
11546 if (folds_to_this_cp_count > 0) {
11547 fold_list = add_cp_to_invlist(fold_list, first_fold);
11548 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
11549 fold_list = add_cp_to_invlist(fold_list,
11550 remaining_folds[i]);
11554 /* If the fold list is identical to what's in this ANYOF node,
11555 * the node can be represented by an EXACTFish one instead */
11556 if (_invlistEQ(*use_this_list, fold_list,
11557 0 /* Don't complement */ )
11560 /* But, we have to be careful, as mentioned above. Just
11561 * the right sequence of characters could match this if it
11562 * is part of a multi-character fold. That IS what we want
11563 * if we are under /i. But it ISN'T what we want if not
11564 * under /i, as it could match when it shouldn't. So, when
11565 * we aren't under /i and this character participates in a
11566 * multi-char fold, we don't optimize into an EXACTFish
11567 * node. So, for each case below we have to check if we
11568 * are folding, and if not, if it is not part of a
11569 * multi-char fold. */
11570 if (lowest_cp > 255) { /* Highish code point */
11571 if (FOLD || ! _invlist_contains_cp(
11572 PL_InMultiCharFold, folded))
11576 : (ASCII_FOLD_RESTRICTED)
11581 } /* Below, the lowest code point < 256 */
11584 && DEPENDS_SEMANTICS)
11585 { /* An EXACTF node containing a single character 's',
11586 can be an EXACTFU if it doesn't get joined with an
11588 op = EXACTFU_S_EDGE;
11592 || ! HAS_NONLATIN1_FOLD_CLOSURE(lowest_cp))
11594 if (upper_latin1_only_utf8_matches) {
11597 /* We can't use the fold, as that only matches
11601 else if ( UNLIKELY(lowest_cp == MICRO_SIGN)
11603 { /* EXACTFUP is a special node for this character */
11604 op = (ASCII_FOLD_RESTRICTED)
11607 value = MICRO_SIGN;
11609 else if ( ASCII_FOLD_RESTRICTED
11610 && ! isASCII(lowest_cp))
11611 { /* For ASCII under /iaa, we can use EXACTFU below
11623 SvREFCNT_dec_NN(fold_list);
11624 SvREFCNT_dec(all_cp_list);
11631 /* Here, we have calculated what EXACTish node to use. Have to
11632 * convert to UTF-8 if not already there */
11635 SvREFCNT_dec(cp_list);;
11636 REQUIRE_UTF8(flagp);
11639 /* This is a kludge to the special casing issues with this
11640 * ligature under /aa. FB05 should fold to FB06, but the call
11641 * above to _to_uni_fold_flags() didn't find this, as it didn't
11642 * use the /aa restriction in order to not miss other folds
11643 * that would be affected. This is the only instance likely to
11644 * ever be a problem in all of Unicode. So special case it. */
11645 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
11646 && ASCII_FOLD_RESTRICTED)
11648 value = LATIN_SMALL_LIGATURE_ST;
11652 len = (UTF) ? UVCHR_SKIP(value) : 1;
11654 *ret = REGNODE_GUTS(pRExC_state, op, len);
11655 FILL_NODE(*ret, op);
11656 RExC_emit += NODE_STEP_REGNODE + STR_SZ(len);
11657 setSTR_LEN(REGNODE_p(*ret), len);
11659 *STRINGs(REGNODE_p(*ret)) = (U8) value;
11662 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(*ret)), value);
11669 if (! has_runtime_dependency) {
11671 /* See if this can be turned into an ANYOFM node. Think about the bit
11672 * patterns in two different bytes. In some positions, the bits in
11673 * each will be 1; and in other positions both will be 0; and in some
11674 * positions the bit will be 1 in one byte, and 0 in the other. Let
11675 * 'n' be the number of positions where the bits differ. We create a
11676 * mask which has exactly 'n' 0 bits, each in a position where the two
11677 * bytes differ. Now take the set of all bytes that when ANDed with
11678 * the mask yield the same result. That set has 2**n elements, and is
11679 * representable by just two 8 bit numbers: the result and the mask.
11680 * Importantly, matching the set can be vectorized by creating a word
11681 * full of the result bytes, and a word full of the mask bytes,
11682 * yielding a significant speed up. Here, see if this node matches
11683 * such a set. As a concrete example consider [01], and the byte
11684 * representing '0' which is 0x30 on ASCII machines. It has the bits
11685 * 0011 0000. Take the mask 1111 1110. If we AND 0x31 and 0x30 with
11686 * that mask we get 0x30. Any other bytes ANDed yield something else.
11687 * So [01], which is a common usage, is optimizable into ANYOFM, and
11688 * can benefit from the speed up. We can only do this on UTF-8
11689 * invariant bytes, because they have the same bit patterns under UTF-8
11691 PERL_UINT_FAST8_T inverted = 0;
11693 /* Highest possible UTF-8 invariant is 7F on ASCII platforms; FF on
11695 const PERL_UINT_FAST8_T max_permissible
11696 = nBIT_UMAX(7 + ONE_IF_EBCDIC_ZERO_IF_NOT);
11698 /* If doesn't fit the criteria for ANYOFM, invert and try again. If
11699 * that works we will instead later generate an NANYOFM, and invert
11700 * back when through */
11701 if (highest_cp > max_permissible) {
11702 _invlist_invert(cp_list);
11706 if (invlist_highest(cp_list) <= max_permissible) {
11707 UV this_start, this_end;
11708 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
11709 U8 bits_differing = 0;
11710 Size_t full_cp_count = 0;
11711 bool first_time = TRUE;
11713 /* Go through the bytes and find the bit positions that differ */
11714 invlist_iterinit(cp_list);
11715 while (invlist_iternext(cp_list, &this_start, &this_end)) {
11716 unsigned int i = this_start;
11719 if (! UVCHR_IS_INVARIANT(i)) {
11723 first_time = FALSE;
11724 lowest_cp = this_start;
11726 /* We have set up the code point to compare with. Don't
11727 * compare it with itself */
11731 /* Find the bit positions that differ from the lowest code
11732 * point in the node. Keep track of all such positions by
11734 for (; i <= this_end; i++) {
11735 if (! UVCHR_IS_INVARIANT(i)) {
11739 bits_differing |= i ^ lowest_cp;
11742 full_cp_count += this_end - this_start + 1;
11745 /* At the end of the loop, we count how many bits differ from the
11746 * bits in lowest code point, call the count 'd'. If the set we
11747 * found contains 2**d elements, it is the closure of all code
11748 * points that differ only in those bit positions. To convince
11749 * yourself of that, first note that the number in the closure must
11750 * be a power of 2, which we test for. The only way we could have
11751 * that count and it be some differing set, is if we got some code
11752 * points that don't differ from the lowest code point in any
11753 * position, but do differ from each other in some other position.
11754 * That means one code point has a 1 in that position, and another
11755 * has a 0. But that would mean that one of them differs from the
11756 * lowest code point in that position, which possibility we've
11757 * already excluded. */
11758 if ( (inverted || full_cp_count > 1)
11759 && full_cp_count == 1U << PL_bitcount[bits_differing])
11763 op = ANYOFM + inverted;;
11765 /* We need to make the bits that differ be 0's */
11766 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
11768 /* The argument is the lowest code point */
11769 *ret = reg1node(pRExC_state, op, lowest_cp);
11770 FLAGS(REGNODE_p(*ret)) = ANYOFM_mask;
11774 invlist_iterfinish(cp_list);
11778 _invlist_invert(cp_list);
11785 /* XXX We could create an ANYOFR_LOW node here if we saved above if all
11786 * were invariants, it wasn't inverted, and there is a single range.
11787 * This would be faster than some of the posix nodes we create below
11788 * like /\d/a, but would be twice the size. Without having actually
11789 * measured the gain, khw doesn't think the tradeoff is really worth it
11793 if (! (*anyof_flags & ANYOF_LOCALE_FLAGS)) {
11794 PERL_UINT_FAST8_T type;
11795 SV * intersection = NULL;
11796 SV* d_invlist = NULL;
11798 /* See if this matches any of the POSIX classes. The POSIXA and POSIXD
11799 * ones are about the same speed as ANYOF ops, but take less room; the
11800 * ones that have above-Latin1 code point matches are somewhat faster
11803 for (type = POSIXA; type >= POSIXD; type--) {
11806 if (type == POSIXL) { /* But not /l posix classes */
11810 for (posix_class = 0;
11811 posix_class <= HIGHEST_REGCOMP_DOT_H_SYNC_;
11814 SV** our_code_points = &cp_list;
11815 SV** official_code_points;
11818 if (type == POSIXA) {
11819 official_code_points = &PL_Posix_ptrs[posix_class];
11822 official_code_points = &PL_XPosix_ptrs[posix_class];
11825 /* Skip non-existent classes of this type. e.g. \v only has an
11826 * entry in PL_XPosix_ptrs */
11827 if (! *official_code_points) {
11831 /* Try both the regular class, and its inversion */
11832 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
11833 bool this_inverted = *invert ^ try_inverted;
11835 if (type != POSIXD) {
11837 /* This class that isn't /d can't match if we have /d
11839 if (has_runtime_dependency
11840 & HAS_D_RUNTIME_DEPENDENCY)
11845 else /* is /d */ if (! this_inverted) {
11847 /* /d classes don't match anything non-ASCII below 256
11848 * unconditionally (which cp_list contains) */
11849 _invlist_intersection(cp_list, PL_UpperLatin1,
11851 if (_invlist_len(intersection) != 0) {
11855 SvREFCNT_dec(d_invlist);
11856 d_invlist = invlist_clone(cp_list, NULL);
11858 /* But under UTF-8 it turns into using /u rules. Add
11859 * the things it matches under these conditions so that
11860 * we check below that these are identical to what the
11861 * tested class should match */
11862 if (upper_latin1_only_utf8_matches) {
11865 upper_latin1_only_utf8_matches,
11868 our_code_points = &d_invlist;
11870 else { /* POSIXD, inverted. If this doesn't have this
11871 flag set, it isn't /d. */
11872 if (! ( *anyof_flags
11873 & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
11878 our_code_points = &cp_list;
11881 /* Here, have weeded out some things. We want to see if
11882 * the list of characters this node contains
11883 * ('*our_code_points') precisely matches those of the
11884 * class we are currently checking against
11885 * ('*official_code_points'). */
11886 if (_invlistEQ(*our_code_points,
11887 *official_code_points,
11890 /* Here, they precisely match. Optimize this ANYOF
11891 * node into its equivalent POSIX one of the correct
11892 * type, possibly inverted.
11894 * Some of these nodes match a single range of
11895 * characters (or [:alpha:] matches two parallel ranges
11896 * on ASCII platforms). The array lookup at execution
11897 * time could be replaced by a range check for such
11898 * nodes. But regnodes are a finite resource, and the
11899 * possible performance boost isn't large, so this
11900 * hasn't been done. An attempt to use just one node
11901 * (and its inverse) to encompass all such cases was
11902 * made in d62feba66bf43f35d092bb026694f927e9f94d38.
11903 * But the shifting/masking it used ended up being
11904 * slower than the array look up, so it was reverted */
11905 op = (try_inverted)
11906 ? type + NPOSIXA - POSIXA
11908 *ret = reg_node(pRExC_state, op);
11909 FLAGS(REGNODE_p(*ret)) = posix_class;
11910 SvREFCNT_dec(d_invlist);
11911 SvREFCNT_dec(intersection);
11917 SvREFCNT_dec(d_invlist);
11918 SvREFCNT_dec(intersection);
11921 /* If it is a single contiguous range, ANYOFR is an efficient regnode, both
11922 * in size and speed. Currently, a 20 bit range base (smallest code point
11923 * in the range), and a 12 bit maximum delta are packed into a 32 bit word.
11924 * This allows for using it on all of the Unicode code points except for
11925 * the highest plane, which is only for private use code points. khw
11926 * doubts that a bigger delta is likely in real world applications */
11928 && ! has_runtime_dependency
11929 && *anyof_flags == 0
11930 && start[0] < (1 << ANYOFR_BASE_BITS)
11931 && end[0] - start[0]
11932 < ((1U << (sizeof(ARG1u_LOC(NULL))
11933 * CHARBITS - ANYOFR_BASE_BITS))))
11936 U8 low_utf8[UTF8_MAXBYTES+1];
11937 U8 high_utf8[UTF8_MAXBYTES+1];
11940 *ret = reg1node(pRExC_state, op,
11941 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
11943 /* Place the lowest UTF-8 start byte in the flags field, so as to allow
11944 * efficient ruling out at run time of many possible inputs. */
11945 (void) uvchr_to_utf8(low_utf8, start[0]);
11946 (void) uvchr_to_utf8(high_utf8, end[0]);
11948 /* If all code points share the same first byte, this can be an
11949 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
11950 * quickly rule out many inputs at run-time without having to compute
11951 * the code point from UTF-8. For EBCDIC, we use I8, as not doing that
11952 * transformation would not rule out nearly so many things */
11953 if (low_utf8[0] == high_utf8[0]) {
11955 OP(REGNODE_p(*ret)) = op;
11956 ANYOF_FLAGS(REGNODE_p(*ret)) = low_utf8[0];
11959 ANYOF_FLAGS(REGNODE_p(*ret)) = NATIVE_UTF8_TO_I8(low_utf8[0]);
11965 /* If didn't find an optimization and there is no need for a bitmap,
11966 * of the lowest code points, optimize to indicate that */
11967 if ( lowest_cp >= NUM_ANYOF_CODE_POINTS
11969 && ! upper_latin1_only_utf8_matches
11970 && *anyof_flags == 0)
11972 U8 low_utf8[UTF8_MAXBYTES+1];
11973 UV highest_cp = invlist_highest(cp_list);
11975 /* Currently the maximum allowed code point by the system is IV_MAX.
11976 * Higher ones are reserved for future internal use. This particular
11977 * regnode can be used for higher ones, but we can't calculate the code
11978 * point of those. IV_MAX suffices though, as it will be a large first
11980 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(lowest_cp, IV_MAX))
11983 /* We store the lowest possible first byte of the UTF-8 representation,
11984 * using the flags field. This allows for quick ruling out of some
11985 * inputs without having to convert from UTF-8 to code point. For
11986 * EBCDIC, we use I8, as not doing that transformation would not rule
11987 * out nearly so many things */
11988 *anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
11992 /* If the first UTF-8 start byte for the highest code point in the
11993 * range is suitably small, we may be able to get an upper bound as
11995 if (highest_cp <= IV_MAX) {
11996 U8 high_utf8[UTF8_MAXBYTES+1];
11997 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp) - high_utf8;
11999 /* If the lowest and highest are the same, we can get an exact
12000 * first byte instead of a just minimum or even a sequence of exact
12001 * leading bytes. We signal these with different regnodes */
12002 if (low_utf8[0] == high_utf8[0]) {
12003 Size_t len = find_first_differing_byte_pos(low_utf8,
12005 MIN(low_len, high_len));
12008 /* No need to convert to I8 for EBCDIC as this is an exact
12010 *anyof_flags = low_utf8[0];
12012 if (high_len == 2) {
12013 /* If the elements matched all have a 2-byte UTF-8
12014 * representation, with the first byte being the same,
12015 * we can use a compact, fast regnode. capable of
12016 * matching any combination of continuation byte
12019 * (A similar regnode could be created for the Latin1
12020 * range; the complication being that it could match
12021 * non-UTF8 targets. The internal bitmap would serve
12022 * both cases; with some extra code in regexec.c) */
12024 *ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12025 FILL_NODE(*ret, op);
12026 FIRST_BYTE((struct regnode_bbm *) REGNODE_p(*ret)) = low_utf8[0],
12028 /* The 64 bit (or 32 on EBCCDIC) map can be looked up
12029 * directly based on the continuation byte, without
12030 * needing to convert to code point */
12031 populate_bitmap_from_invlist(
12034 /* The base code point is from the start byte */
12035 TWO_BYTE_UTF8_TO_NATIVE(low_utf8[0],
12036 UTF_CONTINUATION_MARK | 0),
12038 ((struct regnode_bbm *) REGNODE_p(*ret))->bitmap,
12039 REGNODE_BBM_BITMAP_LEN);
12040 RExC_emit += NODE_STEP_REGNODE + REGNODE_ARG_LEN(op);
12049 *ret = REGNODE_GUTS(pRExC_state, op,
12050 REGNODE_ARG_LEN(op) + STR_SZ(len));
12051 FILL_NODE(*ret, op);
12052 STR_LEN_U8((struct regnode_anyofhs *) REGNODE_p(*ret))
12054 Copy(low_utf8, /* Add the common bytes */
12055 ((struct regnode_anyofhs *) REGNODE_p(*ret))->string,
12057 RExC_emit = REGNODE_OFFSET(REGNODE_AFTER_varies(REGNODE_p(*ret)));
12058 set_ANYOF_arg(pRExC_state, REGNODE_p(*ret), cp_list,
12059 NULL, only_utf8_locale_list);
12063 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE) {
12065 /* Here, the high byte is not the same as the low, but is small
12066 * enough that its reasonable to have a loose upper bound,
12067 * which is packed in with the strict lower bound. See
12068 * comments at the definition of MAX_ANYOF_HRx_BYTE. On EBCDIC
12069 * platforms, I8 is used. On ASCII platforms I8 is the same
12070 * thing as UTF-8 */
12073 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - *anyof_flags;
12074 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
12077 if (range_diff <= max_range_diff / 8) {
12080 else if (range_diff <= max_range_diff / 4) {
12083 else if (range_diff <= max_range_diff / 2) {
12086 *anyof_flags = (*anyof_flags - 0xC0) << 2 | bits;
12096 *ret = reg1node(pRExC_state, op, 0);
12101 *ret = reg_node(pRExC_state, op);
12106 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
12109 Perl_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
12110 regnode* const node,
12112 SV* const runtime_defns,
12113 SV* const only_utf8_locale_list)
12115 /* Sets the arg field of an ANYOF-type node 'node', using information about
12116 * the node passed-in. If only the bitmap is needed to determine what
12117 * matches, the arg is set appropriately to either
12118 * 1) ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE
12119 * 2) ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE
12121 * Otherwise, it sets the argument to the count returned by reg_add_data(),
12122 * having allocated and stored an array, av, as follows:
12123 * av[0] stores the inversion list defining this class as far as known at
12124 * this time, or PL_sv_undef if nothing definite is now known.
12125 * av[1] stores the inversion list of code points that match only if the
12126 * current locale is UTF-8, or if none, PL_sv_undef if there is an
12127 * av[2], or no entry otherwise.
12128 * av[2] stores the list of user-defined properties whose subroutine
12129 * definitions aren't known at this time, or no entry if none. */
12133 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
12135 /* If this is set, the final disposition won't be known until runtime, so
12136 * we can't do any of the compile time optimizations */
12137 if (! runtime_defns) {
12139 /* On plain ANYOF nodes without the possibility of a runtime locale
12140 * making a difference, maybe there's no information to be gleaned
12141 * except for what's in the bitmap */
12142 if (REGNODE_TYPE(OP(node)) == ANYOF && ! only_utf8_locale_list) {
12144 /* There are two such cases:
12145 * 1) there is no list of code points matched outside the bitmap
12148 ARG1u_SET(node, ANYOF_MATCHES_NONE_OUTSIDE_BITMAP_VALUE);
12152 /* 2) the list indicates everything outside the bitmap matches */
12153 if ( invlist_highest(cp_list) == UV_MAX
12154 && invlist_highest_range_start(cp_list)
12155 <= NUM_ANYOF_CODE_POINTS)
12157 ARG1u_SET(node, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
12161 /* In all other cases there are things outside the bitmap that we
12162 * may need to check at runtime. */
12165 /* Here, we have resolved all the possible run-time matches, and they
12166 * are stored in one or both of two possible lists. (While some match
12167 * only under certain runtime circumstances, we know all the possible
12168 * ones for each such circumstance.)
12170 * It may very well be that the pattern being compiled contains an
12171 * identical class, already encountered. Reusing that class here saves
12172 * space. Look through all classes so far encountered. */
12173 U32 existing_items = RExC_rxi->data ? RExC_rxi->data->count : 0;
12174 for (unsigned int i = 0; i < existing_items; i++) {
12176 /* Only look at auxiliary data of this type */
12177 if (RExC_rxi->data->what[i] != 's') {
12181 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[i]);
12182 AV * const av = MUTABLE_AV(SvRV(rv));
12184 /* If the already encountered class has data that won't be known
12185 * until runtime (stored in the final element of the array), we
12187 if (av_top_index(av) > ONLY_LOCALE_MATCHES_INDEX) {
12191 SV ** stored_cp_list_ptr = av_fetch(av, INVLIST_INDEX,
12192 false /* no lvalue */);
12194 /* The new and the existing one both have to have or both not
12195 * have this element, for this one to duplicate that one */
12196 if (cBOOL(cp_list) != cBOOL(stored_cp_list_ptr)) {
12200 /* If the inversion lists aren't equivalent, can't share */
12201 if (cp_list && ! _invlistEQ(cp_list,
12202 *stored_cp_list_ptr,
12203 FALSE /* don't complement */))
12208 /* Similarly for the other list */
12209 SV ** stored_only_utf8_locale_list_ptr = av_fetch(
12211 ONLY_LOCALE_MATCHES_INDEX,
12212 false /* no lvalue */);
12213 if ( cBOOL(only_utf8_locale_list)
12214 != cBOOL(stored_only_utf8_locale_list_ptr))
12219 if (only_utf8_locale_list && ! _invlistEQ(
12220 only_utf8_locale_list,
12221 *stored_only_utf8_locale_list_ptr,
12222 FALSE /* don't complement */))
12227 /* Here, the existence and contents of both compile-time lists
12228 * are identical between the new and existing data. Re-use the
12230 ARG1u_SET(node, i);
12232 } /* end of loop through existing classes */
12235 /* Here, we need to create a new auxiliary data element; either because
12236 * this doesn't duplicate an existing one, or we can't tell at this time if
12237 * it eventually will */
12239 AV * const av = newAV();
12243 av_store_simple(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
12246 /* (Note that if any of this changes, the size calculations in
12247 * S_optimize_regclass() might need to be updated.) */
12249 if (only_utf8_locale_list) {
12250 av_store_simple(av, ONLY_LOCALE_MATCHES_INDEX,
12251 SvREFCNT_inc_NN(only_utf8_locale_list));
12254 if (runtime_defns) {
12255 av_store_simple(av, DEFERRED_USER_DEFINED_INDEX,
12256 SvREFCNT_inc_NN(runtime_defns));
12259 rv = newRV_noinc(MUTABLE_SV(av));
12260 n = reg_add_data(pRExC_state, STR_WITH_LEN("s"));
12261 RExC_rxi->data->data[n] = (void*)rv;
12262 ARG1u_SET(node, n);
12267 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12268 Perl_get_regclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12270 Perl_get_re_gclass_aux_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
12274 /* For internal core use only.
12275 * Returns the inversion list for the input 'node' in the regex 'prog'.
12276 * If <doinit> is 'true', will attempt to create the inversion list if not
12277 * already done. If it is created, it will add to the normal inversion
12278 * list any that comes from user-defined properties. It croaks if this
12279 * is called before such a list is ready to be generated, that is when a
12280 * user-defined property has been declared, buyt still not yet defined.
12281 * If <listsvp> is non-null, will return the printable contents of the
12282 * property definition. This can be used to get debugging information
12283 * even before the inversion list exists, by calling this function with
12284 * 'doinit' set to false, in which case the components that will be used
12285 * to eventually create the inversion list are returned (in a printable
12287 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
12288 * store an inversion list of code points that should match only if the
12289 * execution-time locale is a UTF-8 one.
12290 * If <output_invlist> is not NULL, it is where this routine is to store an
12291 * inversion list of the code points that would be instead returned in
12292 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
12293 * when this parameter is used, is just the non-code point data that
12294 * will go into creating the inversion list. This currently should be just
12295 * user-defined properties whose definitions were not known at compile
12296 * time. Using this parameter allows for easier manipulation of the
12297 * inversion list's data by the caller. It is illegal to call this
12298 * function with this parameter set, but not <listsvp>
12300 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
12301 * that, in spite of this function's name, the inversion list it returns
12302 * may include the bitmap data as well */
12304 SV *si = NULL; /* Input initialization string */
12305 SV* invlist = NULL;
12307 RXi_GET_DECL_NULL(prog, progi);
12308 const struct reg_data * const data = prog ? progi->data : NULL;
12310 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
12311 PERL_ARGS_ASSERT_GET_REGCLASS_AUX_DATA;
12313 PERL_ARGS_ASSERT_GET_RE_GCLASS_AUX_DATA;
12315 assert(! output_invlist || listsvp);
12317 if (data && data->count) {
12318 const U32 n = ARG1u(node);
12320 if (data->what[n] == 's') {
12321 SV * const rv = MUTABLE_SV(data->data[n]);
12322 AV * const av = MUTABLE_AV(SvRV(rv));
12323 SV **const ary = AvARRAY(av);
12325 invlist = ary[INVLIST_INDEX];
12327 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
12328 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
12331 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
12332 si = ary[DEFERRED_USER_DEFINED_INDEX];
12335 if (doinit && (si || invlist)) {
12338 SV * msg = newSVpvs_flags("", SVs_TEMP);
12340 SV * prop_definition = handle_user_defined_property(
12341 "", 0, FALSE, /* There is no \p{}, \P{} */
12342 SvPVX_const(si)[1] - '0', /* /i or not has been
12343 stored here for just
12345 TRUE, /* run time */
12346 FALSE, /* This call must find the defn */
12347 si, /* The property definition */
12350 0 /* base level call */
12354 assert(prop_definition == NULL);
12356 Perl_croak(aTHX_ "%" UTF8f,
12357 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
12361 _invlist_union(invlist, prop_definition, &invlist);
12362 SvREFCNT_dec_NN(prop_definition);
12365 invlist = prop_definition;
12368 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
12369 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
12371 ary[INVLIST_INDEX] = invlist;
12372 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
12373 ? ONLY_LOCALE_MATCHES_INDEX
12381 /* If requested, return a printable version of what this ANYOF node matches
12384 SV* matches_string = NULL;
12386 /* This function can be called at compile-time, before everything gets
12387 * resolved, in which case we return the currently best available
12388 * information, which is the string that will eventually be used to do
12389 * that resolving, 'si' */
12391 /* Here, we only have 'si' (and possibly some passed-in data in
12392 * 'invlist', which is handled below) If the caller only wants
12393 * 'si', use that. */
12394 if (! output_invlist) {
12395 matches_string = newSVsv(si);
12398 /* But if the caller wants an inversion list of the node, we
12399 * need to parse 'si' and place as much as possible in the
12400 * desired output inversion list, making 'matches_string' only
12401 * contain the currently unresolvable things */
12402 const char *si_string = SvPVX(si);
12403 STRLEN remaining = SvCUR(si);
12407 /* Ignore everything before and including the first new-line */
12408 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
12409 assert (si_string != NULL);
12411 remaining = SvPVX(si) + SvCUR(si) - si_string;
12413 while (remaining > 0) {
12415 /* The data consists of just strings defining user-defined
12416 * property names, but in prior incarnations, and perhaps
12417 * somehow from pluggable regex engines, it could still
12418 * hold hex code point definitions, all of which should be
12419 * legal (or it wouldn't have gotten this far). Each
12420 * component of a range would be separated by a tab, and
12421 * each range by a new-line. If these are found, instead
12422 * add them to the inversion list */
12423 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
12424 |PERL_SCAN_SILENT_NON_PORTABLE;
12425 STRLEN len = remaining;
12426 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
12428 /* If the hex decode routine found something, it should go
12429 * up to the next \n */
12430 if ( *(si_string + len) == '\n') {
12431 if (count) { /* 2nd code point on line */
12432 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
12435 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
12438 goto prepare_for_next_iteration;
12441 /* If the hex decode was instead for the lower range limit,
12442 * save it, and go parse the upper range limit */
12443 if (*(si_string + len) == '\t') {
12444 assert(count == 0);
12448 prepare_for_next_iteration:
12449 si_string += len + 1;
12450 remaining -= len + 1;
12454 /* Here, didn't find a legal hex number. Just add the text
12455 * from here up to the next \n, omitting any trailing
12459 len = strcspn(si_string,
12460 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
12462 if (matches_string) {
12463 sv_catpvn(matches_string, si_string, len);
12466 matches_string = newSVpvn(si_string, len);
12468 sv_catpvs(matches_string, " ");
12472 && UCHARAT(si_string)
12473 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
12478 if (remaining && UCHARAT(si_string) == '\n') {
12482 } /* end of loop through the text */
12484 assert(matches_string);
12485 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
12486 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
12488 } /* end of has an 'si' */
12491 /* Add the stuff that's already known */
12494 /* Again, if the caller doesn't want the output inversion list, put
12495 * everything in 'matches-string' */
12496 if (! output_invlist) {
12497 if ( ! matches_string) {
12498 matches_string = newSVpvs("\n");
12500 sv_catsv(matches_string, invlist_contents(invlist,
12501 TRUE /* traditional style */
12504 else if (! *output_invlist) {
12505 *output_invlist = invlist_clone(invlist, NULL);
12508 _invlist_union(*output_invlist, invlist, output_invlist);
12512 *listsvp = matches_string;
12518 /* reg_skipcomment()
12520 Absorbs an /x style # comment from the input stream,
12521 returning a pointer to the first character beyond the comment, or if the
12522 comment terminates the pattern without anything following it, this returns
12523 one past the final character of the pattern (in other words, RExC_end) and
12524 sets the REG_RUN_ON_COMMENT_SEEN flag.
12526 Note it's the callers responsibility to ensure that we are
12527 actually in /x mode
12531 PERL_STATIC_INLINE char*
12532 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
12534 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12538 while (p < RExC_end) {
12539 if (*(++p) == '\n') {
12544 /* we ran off the end of the pattern without ending the comment, so we have
12545 * to add an \n when wrapping */
12546 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12551 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
12553 const bool force_to_xmod
12556 /* If the text at the current parse position '*p' is a '(?#...)' comment,
12557 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
12558 * is /x whitespace, advance '*p' so that on exit it points to the first
12559 * byte past all such white space and comments */
12561 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
12563 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
12565 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
12568 if (RExC_end - (*p) >= 3
12570 && *(*p + 1) == '?'
12571 && *(*p + 2) == '#')
12573 while (*(*p) != ')') {
12574 if ((*p) == RExC_end)
12575 FAIL("Sequence (?#... not terminated");
12583 const char * save_p = *p;
12584 while ((*p) < RExC_end) {
12586 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
12589 else if (*(*p) == '#') {
12590 (*p) = reg_skipcomment(pRExC_state, (*p));
12596 if (*p != save_p) {
12609 Advances the parse position by one byte, unless that byte is the beginning
12610 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
12611 those two cases, the parse position is advanced beyond all such comments and
12614 This is the UTF, (?#...), and /x friendly way of saying RExC_parse_inc_by(1).
12618 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12620 PERL_ARGS_ASSERT_NEXTCHAR;
12622 if (RExC_parse < RExC_end) {
12624 || UTF8_IS_INVARIANT(*RExC_parse)
12625 || UTF8_IS_START(*RExC_parse));
12627 RExC_parse_inc_safe();
12629 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12630 FALSE /* Don't force /x */ );
12635 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
12637 /* 'size' is the delta number of smallest regnode equivalents to add or
12638 * subtract from the current memory allocated to the regex engine being
12641 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
12646 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
12647 /* +1 for REG_MAGIC */
12650 if ( RExC_rxi == NULL )
12651 FAIL("Regexp out of space");
12652 RXi_SET(RExC_rx, RExC_rxi);
12654 RExC_emit_start = RExC_rxi->program;
12656 Zero(REGNODE_p(RExC_emit), size, regnode);
12660 STATIC regnode_offset
12661 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size)
12663 /* Allocate a regnode that is (1 + extra_size) times as big as the
12664 * smallest regnode worth of space, and also aligns and increments
12665 * RExC_size appropriately.
12667 * It returns the regnode's offset into the regex engine program */
12669 const regnode_offset ret = RExC_emit;
12671 PERL_ARGS_ASSERT_REGNODE_GUTS;
12673 SIZE_ALIGN(RExC_size);
12674 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
12675 NODE_ALIGN_FILL(REGNODE_p(ret));
12681 STATIC regnode_offset
12682 S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) {
12683 PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG;
12684 assert(extra_size >= REGNODE_ARG_LEN(op) || REGNODE_TYPE(op) == ANYOF);
12685 return S_regnode_guts(aTHX_ pRExC_state, extra_size);
12693 - reg_node - emit a node
12695 STATIC regnode_offset /* Location. */
12696 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12698 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12699 regnode_offset ptr = ret;
12701 PERL_ARGS_ASSERT_REG_NODE;
12703 assert(REGNODE_ARG_LEN(op) == 0);
12705 FILL_ADVANCE_NODE(ptr, op);
12711 - reg1node - emit a node with an argument
12713 STATIC regnode_offset /* Location. */
12714 S_reg1node(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12716 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12717 regnode_offset ptr = ret;
12719 PERL_ARGS_ASSERT_REG1NODE;
12721 /* ANYOF are special cased to allow non-length 1 args */
12722 assert(REGNODE_ARG_LEN(op) == 1);
12724 FILL_ADVANCE_NODE_ARG1u(ptr, op, arg);
12730 - regpnode - emit a temporary node with a SV* argument
12732 STATIC regnode_offset /* Location. */
12733 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
12735 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12736 regnode_offset ptr = ret;
12738 PERL_ARGS_ASSERT_REGPNODE;
12740 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
12745 STATIC regnode_offset
12746 S_reg2node(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
12748 /* emit a node with U32 and I32 arguments */
12750 const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, REGNODE_ARG_LEN(op));
12751 regnode_offset ptr = ret;
12753 PERL_ARGS_ASSERT_REG2NODE;
12755 assert(REGNODE_ARG_LEN(op) == 2);
12757 FILL_ADVANCE_NODE_2ui_ARG(ptr, op, arg1, arg2);
12763 - reginsert - insert an operator in front of already-emitted operand
12765 * That means that on exit 'operand' is the offset of the newly inserted
12766 * operator, and the original operand has been relocated.
12768 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
12769 * set up NEXT_OFF() of the inserted node if needed. Something like this:
12771 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
12772 * NEXT_OFF(REGNODE_p(orig_emit)) = REGNODE_ARG_LEN(OPFAIL) + NODE_STEP_REGNODE;
12774 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
12777 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
12778 const regnode_offset operand, const U32 depth)
12783 const int offset = REGNODE_ARG_LEN((U8)op);
12784 const int size = NODE_STEP_REGNODE + offset;
12785 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12787 PERL_ARGS_ASSERT_REGINSERT;
12788 PERL_UNUSED_CONTEXT;
12789 PERL_UNUSED_ARG(depth);
12790 DEBUG_PARSE_FMT("inst"," - %s", REGNODE_NAME(op));
12791 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
12792 studying. If this is wrong then we need to adjust RExC_recurse
12793 below like we do with RExC_open_parens/RExC_close_parens. */
12794 change_engine_size(pRExC_state, (Ptrdiff_t) size);
12795 src = REGNODE_p(RExC_emit);
12797 dst = REGNODE_p(RExC_emit);
12799 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
12800 * and [perl #133871] shows this can lead to problems, so skip this
12801 * realignment of parens until a later pass when they are reliable */
12802 if (! IN_PARENS_PASS && RExC_open_parens) {
12804 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
12805 /* remember that RExC_npar is rex->nparens + 1,
12806 * iow it is 1 more than the number of parens seen in
12807 * the pattern so far. */
12808 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12809 /* note, RExC_open_parens[0] is the start of the
12810 * regex, it can't move. RExC_close_parens[0] is the end
12811 * of the regex, it *can* move. */
12812 if ( paren && RExC_open_parens[paren] >= operand ) {
12813 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
12814 RExC_open_parens[paren] += size;
12816 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12818 if ( RExC_close_parens[paren] >= operand ) {
12819 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
12820 RExC_close_parens[paren] += size;
12822 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12827 RExC_end_op += size;
12829 while (src > REGNODE_p(operand)) {
12830 StructCopy(--src, --dst, regnode);
12833 place = REGNODE_p(operand); /* Op node, where operand used to be. */
12834 src = place + 1; /* NOT REGNODE_AFTER! */
12836 FILL_NODE(operand, op);
12838 /* Zero out any arguments in the new node */
12839 Zero(src, offset, regnode);
12843 - regtail - set the next-pointer at the end of a node chain of p to val. If
12844 that value won't fit in the space available, instead returns FALSE.
12845 (Except asserts if we can't fit in the largest space the regex
12846 engine is designed for.)
12847 - SEE ALSO: regtail_study
12850 S_regtail(pTHX_ RExC_state_t * pRExC_state,
12851 const regnode_offset p,
12852 const regnode_offset val,
12855 regnode_offset scan;
12856 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12858 PERL_ARGS_ASSERT_REGTAIL;
12860 PERL_UNUSED_ARG(depth);
12863 /* The final node in the chain is the first one with a nonzero next pointer
12865 scan = (regnode_offset) p;
12867 regnode * const temp = regnext(REGNODE_p(scan));
12869 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12870 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12871 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
12872 SvPV_nolen_const(RExC_mysv), scan,
12873 (temp == NULL ? "->" : ""),
12874 (temp == NULL ? REGNODE_NAME(OP(REGNODE_p(val))) : "")
12879 scan = REGNODE_OFFSET(temp);
12882 /* Populate this node's next pointer */
12883 assert(val >= scan);
12884 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12885 assert((UV) (val - scan) <= U32_MAX);
12886 ARG1u_SET(REGNODE_p(scan), val - scan);
12889 if (val - scan > U16_MAX) {
12890 /* Populate this with something that won't loop and will likely
12891 * lead to a crash if the caller ignores the failure return, and
12892 * execution continues */
12893 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12896 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12904 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12905 - Look for optimizable sequences at the same time.
12906 - currently only looks for EXACT chains.
12908 This is experimental code. The idea is to use this routine to perform
12909 in place optimizations on branches and groups as they are constructed,
12910 with the long term intention of removing optimization from study_chunk so
12911 that it is purely analytical.
12913 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12914 to control which is which.
12916 This used to return a value that was ignored. It was a problem that it is
12917 #ifdef'd to be another function that didn't return a value. khw has changed it
12918 so both currently return a pass/fail return.
12921 /* TODO: All four parms should be const */
12924 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
12925 const regnode_offset val, U32 depth)
12927 regnode_offset scan;
12929 #ifdef EXPERIMENTAL_INPLACESCAN
12932 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12934 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12937 /* Find last node. */
12941 regnode * const temp = regnext(REGNODE_p(scan));
12942 #ifdef EXPERIMENTAL_INPLACESCAN
12943 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12944 bool unfolded_multi_char; /* Unexamined in this routine */
12945 if (join_exact(pRExC_state, scan, &min,
12946 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
12947 return TRUE; /* Was return EXACT */
12951 if (REGNODE_TYPE(OP(REGNODE_p(scan))) == EXACT) {
12952 if (exact == PSEUDO )
12953 exact= OP(REGNODE_p(scan));
12954 else if (exact != OP(REGNODE_p(scan)) )
12957 else if (OP(REGNODE_p(scan)) != NOTHING) {
12962 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12963 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
12964 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
12965 SvPV_nolen_const(RExC_mysv),
12967 REGNODE_NAME(exact));
12971 scan = REGNODE_OFFSET(temp);
12974 DEBUG_PARSE_MSG("");
12975 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
12976 Perl_re_printf( aTHX_
12977 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
12978 SvPV_nolen_const(RExC_mysv),
12983 if (REGNODE_OFF_BY_ARG(OP(REGNODE_p(scan)))) {
12984 assert((UV) (val - scan) <= U32_MAX);
12985 ARG1u_SET(REGNODE_p(scan), val - scan);
12988 if (val - scan > U16_MAX) {
12989 /* Populate this with something that won't loop and will likely
12990 * lead to a crash if the caller ignores the failure return, and
12991 * execution continues */
12992 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
12995 NEXT_OFF(REGNODE_p(scan)) = val - scan;
12998 return TRUE; /* Was 'return exact' */
13003 Perl_get_ANYOFM_contents(pTHX_ const regnode * n) {
13005 /* Returns an inversion list of all the code points matched by the
13006 * ANYOFM/NANYOFM node 'n' */
13008 SV * cp_list = _new_invlist(-1);
13009 const U8 lowest = (U8) ARG1u(n);
13012 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
13014 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
13016 /* Starting with the lowest code point, any code point that ANDed with the
13017 * mask yields the lowest code point is in the set */
13018 for (i = lowest; i <= 0xFF; i++) {
13019 if ((i & FLAGS(n)) == ARG1u(n)) {
13020 cp_list = add_cp_to_invlist(cp_list, i);
13023 /* We know how many code points (a power of two) that are in the
13024 * set. No use looking once we've got that number */
13025 if (count >= needed) break;
13029 if (OP(n) == NANYOFM) {
13030 _invlist_invert(cp_list);
13036 Perl_get_ANYOFHbbm_contents(pTHX_ const regnode * n) {
13037 PERL_ARGS_ASSERT_GET_ANYOFHBBM_CONTENTS;
13039 SV * cp_list = NULL;
13040 populate_invlist_from_bitmap(
13041 ((struct regnode_bbm *) n)->bitmap,
13042 REGNODE_BBM_BITMAP_LEN * CHARBITS,
13045 /* The base cp is from the start byte plus a zero continuation */
13046 TWO_BYTE_UTF8_TO_NATIVE(FIRST_BYTE((struct regnode_bbm *) n),
13047 UTF_CONTINUATION_MARK | 0));
13054 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13055 { /* Assume that RE_INTUIT is set */
13056 /* Returns an SV containing a string that must appear in the target for it
13057 * to match, or NULL if nothing is known that must match.
13059 * CAUTION: the SV can be freed during execution of the regex engine */
13061 struct regexp *const prog = ReANY(r);
13062 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13064 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13065 PERL_UNUSED_CONTEXT;
13069 if (prog->maxlen > 0 && (prog->check_utf8 || prog->check_substr)) {
13070 const char * const s = SvPV_nolen_const(RX_UTF8(r)
13071 ? prog->check_utf8 : prog->check_substr);
13073 if (!PL_colorset) reginitcolors();
13074 Perl_re_printf( aTHX_
13075 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13077 RX_UTF8(r) ? "utf8 " : "",
13078 PL_colors[5], PL_colors[0],
13081 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
13085 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
13086 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
13092 handles refcounting and freeing the perl core regexp structure. When
13093 it is necessary to actually free the structure the first thing it
13094 does is call the 'free' method of the regexp_engine associated to
13095 the regexp, allowing the handling of the void *pprivate; member
13096 first. (This routine is not overridable by extensions, which is why
13097 the extensions free is called first.)
13099 See regdupe and regdupe_internal if you change anything here.
13101 #ifndef PERL_IN_XSUB_RE
13103 Perl_pregfree(pTHX_ REGEXP *r)
13109 Perl_pregfree2(pTHX_ REGEXP *rx)
13111 struct regexp *const r = ReANY(rx);
13112 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13114 PERL_ARGS_ASSERT_PREGFREE2;
13119 if (r->mother_re) {
13120 ReREFCNT_dec(r->mother_re);
13122 CALLREGFREE_PVT(rx); /* free the private data */
13123 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13127 for (i = 0; i < 2; i++) {
13128 SvREFCNT_dec(r->substrs->data[i].substr);
13129 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
13131 Safefree(r->substrs);
13133 RX_MATCH_COPY_FREE(rx);
13134 #ifdef PERL_ANY_COW
13135 SvREFCNT_dec(r->saved_copy);
13137 Safefree(RXp_OFFSp(r));
13138 if (r->logical_to_parno) {
13139 Safefree(r->logical_to_parno);
13140 Safefree(r->parno_to_logical);
13141 Safefree(r->parno_to_logical_next);
13144 SvREFCNT_dec(r->qr_anoncv);
13145 if (r->recurse_locinput)
13146 Safefree(r->recurse_locinput);
13152 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
13153 except that dsv will be created if NULL.
13155 This function is used in two main ways. First to implement
13156 $r = qr/....; $s = $$r;
13158 Secondly, it is used as a hacky workaround to the structural issue of
13160 being stored in the regexp structure which is in turn stored in
13161 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13162 could be PL_curpm in multiple contexts, and could require multiple
13163 result sets being associated with the pattern simultaneously, such
13164 as when doing a recursive match with (??{$qr})
13166 The solution is to make a lightweight copy of the regexp structure
13167 when a qr// is returned from the code executed by (??{$qr}) this
13168 lightweight copy doesn't actually own any of its data except for
13169 the starp/end and the actual regexp structure itself.
13175 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
13177 struct regexp *drx;
13178 struct regexp *const srx = ReANY(ssv);
13179 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
13181 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13184 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
13186 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
13188 /* our only valid caller, sv_setsv_flags(), should have done
13189 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
13190 assert(!SvOOK(dsv));
13191 assert(!SvIsCOW(dsv));
13192 assert(!SvROK(dsv));
13194 if (SvPVX_const(dsv)) {
13196 Safefree(SvPVX(dsv));
13201 SvOK_off((SV *)dsv);
13204 /* For PVLVs, the head (sv_any) points to an XPVLV, while
13205 * the LV's xpvlenu_rx will point to a regexp body, which
13206 * we allocate here */
13207 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
13208 assert(!SvPVX(dsv));
13209 /* We "steal" the body from the newly allocated SV temp, changing
13210 * the pointer in its HEAD to NULL. We then change its type to
13211 * SVt_NULL so that when we immediately release its only reference,
13212 * no memory deallocation happens.
13214 * The body will eventually be freed (from the PVLV) either in
13215 * Perl_sv_force_normal_flags() (if the PVLV is "downgraded" and
13216 * the regexp body needs to be removed)
13217 * or in Perl_sv_clear() (if the PVLV still holds the pointer until
13218 * the PVLV itself is deallocated). */
13219 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
13220 temp->sv_any = NULL;
13221 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
13222 SvREFCNT_dec_NN(temp);
13223 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
13224 ing below will not set it. */
13225 SvCUR_set(dsv, SvCUR(ssv));
13228 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
13229 sv_force_normal(sv) is called. */
13233 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
13234 SvPV_set(dsv, RX_WRAPPED(ssv));
13235 /* We share the same string buffer as the original regexp, on which we
13236 hold a reference count, incremented when mother_re is set below.
13237 The string pointer is copied here, being part of the regexp struct.
13239 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
13240 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13244 if (RXp_OFFSp(srx)) {
13245 const I32 npar = srx->nparens+1;
13246 NewCopy(RXp_OFFSp(srx), RXp_OFFSp(drx), npar, regexp_paren_pair);
13248 if (srx->substrs) {
13250 Newx(drx->substrs, 1, struct reg_substr_data);
13251 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
13253 for (i = 0; i < 2; i++) {
13254 SvREFCNT_inc_void(drx->substrs->data[i].substr);
13255 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
13258 /* check_substr and check_utf8, if non-NULL, point to either their
13259 anchored or float namesakes, and don't hold a second reference. */
13261 if (srx->logical_to_parno) {
13262 NewCopy(srx->logical_to_parno,
13263 drx->logical_to_parno,
13264 srx->nparens+1, I32);
13265 NewCopy(srx->parno_to_logical,
13266 drx->parno_to_logical,
13267 srx->nparens+1, I32);
13268 NewCopy(srx->parno_to_logical_next,
13269 drx->parno_to_logical_next,
13270 srx->nparens+1, I32);
13272 drx->logical_to_parno = NULL;
13273 drx->parno_to_logical = NULL;
13274 drx->parno_to_logical_next = NULL;
13276 drx->logical_nparens = srx->logical_nparens;
13278 RX_MATCH_COPIED_off(dsv);
13279 #ifdef PERL_ANY_COW
13280 RXp_SAVED_COPY(drx) = NULL;
13282 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
13283 SvREFCNT_inc_void(drx->qr_anoncv);
13284 if (srx->recurse_locinput)
13285 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
13292 /* regfree_internal()
13294 Free the private data in a regexp. This is overloadable by
13295 extensions. Perl takes care of the regexp structure in pregfree(),
13296 this covers the *pprivate pointer which technically perl doesn't
13297 know about, however of course we have to handle the
13298 regexp_internal structure when no extension is in use.
13300 Note this is called before freeing anything in the regexp
13305 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13307 struct regexp *const r = ReANY(rx);
13308 RXi_GET_DECL(r, ri);
13309 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13311 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13321 SV *dsv= sv_newmortal();
13322 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13323 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
13324 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
13325 PL_colors[4], PL_colors[5], s);
13329 if (ri->code_blocks)
13330 S_free_codeblocks(aTHX_ ri->code_blocks);
13333 int n = ri->data->count;
13336 /* If you add a ->what type here, update the comment in regcomp.h */
13337 switch (ri->data->what[n]) {
13343 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13346 Safefree(ri->data->data[n]);
13352 { /* Aho Corasick add-on structure for a trie node.
13353 Used in stclass optimization only */
13355 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13357 refcount = --aho->refcount;
13360 PerlMemShared_free(aho->states);
13361 PerlMemShared_free(aho->fail);
13362 /* do this last!!!! */
13363 PerlMemShared_free(ri->data->data[n]);
13364 /* we should only ever get called once, so
13365 * assert as much, and also guard the free
13366 * which /might/ happen twice. At the least
13367 * it will make code anlyzers happy and it
13368 * doesn't cost much. - Yves */
13369 assert(ri->regstclass);
13370 if (ri->regstclass) {
13371 PerlMemShared_free(ri->regstclass);
13372 ri->regstclass = 0;
13379 /* trie structure. */
13381 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13383 refcount = --trie->refcount;
13386 PerlMemShared_free(trie->charmap);
13387 PerlMemShared_free(trie->states);
13388 PerlMemShared_free(trie->trans);
13390 PerlMemShared_free(trie->bitmap);
13392 PerlMemShared_free(trie->jump);
13393 if (trie->j_before_paren)
13394 PerlMemShared_free(trie->j_before_paren);
13395 if (trie->j_after_paren)
13396 PerlMemShared_free(trie->j_after_paren);
13397 PerlMemShared_free(trie->wordinfo);
13398 /* do this last!!!! */
13399 PerlMemShared_free(ri->data->data[n]);
13404 /* NO-OP a '%' data contains a null pointer, so that reg_add_data
13405 * always returns non-zero, this should only ever happen in the
13410 Perl_croak(aTHX_ "panic: regfree data code '%c'",
13411 ri->data->what[n]);
13414 Safefree(ri->data->what);
13415 Safefree(ri->data);
13421 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
13424 =for apidoc re_dup_guts
13425 Duplicate a regexp.
13427 This routine is expected to clone a given regexp structure. It is only
13428 compiled under USE_ITHREADS.
13430 After all of the core data stored in struct regexp is duplicated
13431 the C<regexp_engine.dupe> method is used to copy any private data
13432 stored in the *pprivate pointer. This allows extensions to handle
13433 any duplication they need to do.
13437 See pregfree() and regfree_internal() if you change anything here.
13439 #if defined(USE_ITHREADS)
13440 #ifndef PERL_IN_XSUB_RE
13442 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13445 const struct regexp *r = ReANY(sstr);
13446 struct regexp *ret = ReANY(dstr);
13448 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13450 npar = r->nparens+1;
13451 NewCopy(RXp_OFFSp(r), RXp_OFFSp(ret), npar, regexp_paren_pair);
13453 if (ret->substrs) {
13454 /* Do it this way to avoid reading from *r after the StructCopy().
13455 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13456 cache, it doesn't matter. */
13458 const bool anchored = r->check_substr
13459 ? r->check_substr == r->substrs->data[0].substr
13460 : r->check_utf8 == r->substrs->data[0].utf8_substr;
13461 Newx(ret->substrs, 1, struct reg_substr_data);
13462 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13464 for (i = 0; i < 2; i++) {
13465 ret->substrs->data[i].substr =
13466 sv_dup_inc(ret->substrs->data[i].substr, param);
13467 ret->substrs->data[i].utf8_substr =
13468 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
13471 /* check_substr and check_utf8, if non-NULL, point to either their
13472 anchored or float namesakes, and don't hold a second reference. */
13474 if (ret->check_substr) {
13476 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
13478 ret->check_substr = ret->substrs->data[0].substr;
13479 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13481 assert(r->check_substr == r->substrs->data[1].substr);
13482 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
13484 ret->check_substr = ret->substrs->data[1].substr;
13485 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13487 } else if (ret->check_utf8) {
13489 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
13491 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
13496 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13497 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13498 if (r->recurse_locinput)
13499 Newx(ret->recurse_locinput, r->nparens + 1, char *);
13502 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
13504 if (RX_MATCH_COPIED(dstr))
13505 RXp_SUBBEG(ret) = SAVEPVN(RXp_SUBBEG(ret), RXp_SUBLEN(ret));
13507 RXp_SUBBEG(ret) = NULL;
13508 #ifdef PERL_ANY_COW
13509 RXp_SAVED_COPY(ret) = NULL;
13512 if (r->logical_to_parno) {
13513 /* we use total_parens for all three just for symmetry */
13514 ret->logical_to_parno = (I32*)SAVEPVN((char*)(r->logical_to_parno), (1+r->nparens) * sizeof(I32));
13515 ret->parno_to_logical = (I32*)SAVEPVN((char*)(r->parno_to_logical), (1+r->nparens) * sizeof(I32));
13516 ret->parno_to_logical_next = (I32*)SAVEPVN((char*)(r->parno_to_logical_next), (1+r->nparens) * sizeof(I32));
13518 ret->logical_to_parno = NULL;
13519 ret->parno_to_logical = NULL;
13520 ret->parno_to_logical_next = NULL;
13523 ret->logical_nparens = r->logical_nparens;
13525 /* Whether mother_re be set or no, we need to copy the string. We
13526 cannot refrain from copying it when the storage points directly to
13527 our mother regexp, because that's
13528 1: a buffer in a different thread
13529 2: something we no longer hold a reference on
13530 so we need to copy it locally. */
13531 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
13532 /* set malloced length to a non-zero value so it will be freed
13533 * (otherwise in combination with SVf_FAKE it looks like an alien
13534 * buffer). It doesn't have to be the actual malloced size, since it
13535 * should never be grown */
13536 SvLEN_set(dstr, SvCUR(sstr)+1);
13537 ret->mother_re = NULL;
13539 #endif /* PERL_IN_XSUB_RE */
13544 This is the internal complement to regdupe() which is used to copy
13545 the structure pointed to by the *pprivate pointer in the regexp.
13546 This is the core version of the extension overridable cloning hook.
13547 The regexp structure being duplicated will be copied by perl prior
13548 to this and will be provided as the regexp *r argument, however
13549 with the /old/ structures pprivate pointer value. Thus this routine
13550 may override any copying normally done by perl.
13552 It returns a pointer to the new regexp_internal structure.
13556 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13558 struct regexp *const r = ReANY(rx);
13559 regexp_internal *reti;
13561 RXi_GET_DECL(r, ri);
13563 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13567 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
13568 char, regexp_internal);
13569 Copy(ri->program, reti->program, len+1, regnode);
13572 if (ri->code_blocks) {
13574 Newx(reti->code_blocks, 1, struct reg_code_blocks);
13575 Newx(reti->code_blocks->cb, ri->code_blocks->count,
13576 struct reg_code_block);
13577 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
13578 ri->code_blocks->count, struct reg_code_block);
13579 for (n = 0; n < ri->code_blocks->count; n++)
13580 reti->code_blocks->cb[n].src_regex = (REGEXP*)
13581 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
13582 reti->code_blocks->count = ri->code_blocks->count;
13583 reti->code_blocks->refcnt = 1;
13586 reti->code_blocks = NULL;
13588 reti->regstclass = NULL;
13591 struct reg_data *d;
13592 const int count = ri->data->count;
13595 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13596 char, struct reg_data);
13597 Newx(d->what, count, U8);
13600 for (i = 0; i < count; i++) {
13601 d->what[i] = ri->data->what[i];
13602 switch (d->what[i]) {
13603 /* see also regcomp.h and regfree_internal() */
13604 case 'a': /* actually an AV, but the dup function is identical.
13605 values seem to be "plain sv's" generally. */
13606 case 'r': /* a compiled regex (but still just another SV) */
13607 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
13608 this use case should go away, the code could have used
13609 'a' instead - see S_set_ANYOF_arg() for array contents. */
13610 case 'S': /* actually an SV, but the dup function is identical. */
13611 case 'u': /* actually an HV, but the dup function is identical.
13612 values are "plain sv's" */
13613 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13616 /* Synthetic Start Class - "Fake" charclass we generate to optimize
13617 * patterns which could start with several different things. Pre-TRIE
13618 * this was more important than it is now, however this still helps
13619 * in some places, for instance /x?a+/ might produce a SSC equivalent
13620 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
13623 /* This is cheating. */
13624 Newx(d->data[i], 1, regnode_ssc);
13625 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
13626 reti->regstclass = (regnode*)d->data[i];
13629 /* AHO-CORASICK fail table */
13630 /* Trie stclasses are readonly and can thus be shared
13631 * without duplication. We free the stclass in pregfree
13632 * when the corresponding reg_ac_data struct is freed.
13634 reti->regstclass= ri->regstclass;
13637 /* TRIE transition table */
13639 ((reg_trie_data*)ri->data->data[i])->refcount++;
13642 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
13643 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
13644 is not from another regexp */
13645 d->data[i] = ri->data->data[i];
13648 /* this is a placeholder type, it exists purely so that
13649 * reg_add_data always returns a non-zero value, this type of
13650 * entry should ONLY be present in the 0 slot of the array */
13652 d->data[i]= ri->data->data[i];
13655 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
13656 ri->data->what[i]);
13665 if (ri->regstclass && !reti->regstclass) {
13666 /* Assume that the regstclass is a regnode which is inside of the
13667 * program which we have to copy over */
13668 regnode *node= ri->regstclass;
13669 assert(node >= ri->program && (node - ri->program) < len);
13670 reti->regstclass = reti->program + (node - ri->program);
13674 reti->name_list_idx = ri->name_list_idx;
13676 SetProgLen(reti, len);
13678 return (void*)reti;
13681 #endif /* USE_ITHREADS */
13684 S_re_croak(pTHX_ bool utf8, const char* pat,...)
13687 STRLEN len = strlen(pat);
13690 const char *message;
13692 PERL_ARGS_ASSERT_RE_CROAK;
13696 Copy(pat, buf, len , char);
13698 buf[len + 1] = '\0';
13699 va_start(args, pat);
13700 msv = vmess(buf, &args);
13702 message = SvPV_const(msv, len);
13705 Copy(message, buf, len , char);
13706 /* len-1 to avoid \n */
13707 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
13710 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13712 #ifndef PERL_IN_XSUB_RE
13714 Perl_save_re_context(pTHX)
13719 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13722 const REGEXP * const rx = PM_GETRE(PL_curpm);
13724 nparens = RX_NPARENS(rx);
13727 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
13728 * that PL_curpm will be null, but that utf8.pm and the modules it
13729 * loads will only use $1..$3.
13730 * The t/porting/re_context.t test file checks this assumption.
13735 for (i = 1; i <= nparens; i++) {
13736 char digits[TYPE_CHARS(long)];
13737 const STRLEN len = my_snprintf(digits, sizeof(digits),
13739 GV *const *const gvp
13740 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13743 GV * const gv = *gvp;
13744 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13751 #ifndef PERL_IN_XSUB_RE
13753 # include "uni_keywords.h"
13756 Perl_init_uniprops(pTHX)
13760 char * dump_len_string;
13762 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
13763 if ( ! dump_len_string
13764 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
13766 PL_dump_re_max_len = 60; /* A reasonable default */
13770 PL_user_def_props = newHV();
13772 # ifdef USE_ITHREADS
13774 HvSHAREKEYS_off(PL_user_def_props);
13775 PL_user_def_props_aTHX = aTHX;
13779 /* Set up the inversion list interpreter-level variables */
13781 PL_XPosix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13782 PL_XPosix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
13783 PL_XPosix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
13784 PL_XPosix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
13785 PL_XPosix_ptrs[CC_CASED_] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
13786 PL_XPosix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
13787 PL_XPosix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
13788 PL_XPosix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
13789 PL_XPosix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
13790 PL_XPosix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
13791 PL_XPosix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
13792 PL_XPosix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
13793 PL_XPosix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
13794 PL_XPosix_ptrs[CC_VERTSPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
13795 PL_XPosix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
13796 PL_XPosix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
13798 PL_Posix_ptrs[CC_ASCII_] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
13799 PL_Posix_ptrs[CC_ALPHANUMERIC_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
13800 PL_Posix_ptrs[CC_ALPHA_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
13801 PL_Posix_ptrs[CC_BLANK_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
13802 PL_Posix_ptrs[CC_CASED_] = PL_Posix_ptrs[CC_ALPHA_];
13803 PL_Posix_ptrs[CC_CNTRL_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
13804 PL_Posix_ptrs[CC_DIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
13805 PL_Posix_ptrs[CC_GRAPH_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
13806 PL_Posix_ptrs[CC_LOWER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
13807 PL_Posix_ptrs[CC_PRINT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
13808 PL_Posix_ptrs[CC_PUNCT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
13809 PL_Posix_ptrs[CC_SPACE_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
13810 PL_Posix_ptrs[CC_UPPER_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
13811 PL_Posix_ptrs[CC_VERTSPACE_] = NULL;
13812 PL_Posix_ptrs[CC_WORDCHAR_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
13813 PL_Posix_ptrs[CC_XDIGIT_] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
13815 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
13816 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
13817 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
13818 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
13819 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
13821 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
13822 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
13823 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
13824 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
13826 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
13828 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
13829 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
13831 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
13832 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
13834 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
13835 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13836 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
13837 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
13838 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
13839 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
13840 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
13841 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
13842 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
13843 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
13844 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
13845 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
13846 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
13847 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
13850 /* The below are used only by deprecated functions. They could be removed */
13851 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
13852 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
13853 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
13857 /* These four functions are compiled only in regcomp.c, where they have access
13858 * to the data they return. They are a way for re_comp.c to get access to that
13859 * data without having to compile the whole data structures. */
13862 Perl_do_uniprop_match(const char * const key, const U16 key_len)
13864 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
13866 return match_uniprop((U8 *) key, key_len);
13870 Perl_get_prop_definition(pTHX_ const int table_index)
13872 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
13874 /* Create and return the inversion list */
13875 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
13878 const char * const *
13879 Perl_get_prop_values(const int table_index)
13881 PERL_ARGS_ASSERT_GET_PROP_VALUES;
13883 return UNI_prop_value_ptrs[table_index];
13887 Perl_get_deprecated_property_msg(const Size_t warning_offset)
13889 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
13891 return deprecated_property_msgs[warning_offset];
13896 This code was mainly added for backcompat to give a warning for non-portable
13897 code points in user-defined properties. But experiments showed that the
13898 warning in earlier perls were only omitted on overflow, which should be an
13899 error, so there really isnt a backcompat issue, and actually adding the
13900 warning when none was present before might cause breakage, for little gain. So
13901 khw left this code in, but not enabled. Tests were never added.
13904 Ei |const char *|get_extended_utf8_msg|const UV cp
13906 PERL_STATIC_INLINE const char *
13907 S_get_extended_utf8_msg(pTHX_ const UV cp)
13909 U8 dummy[UTF8_MAXBYTES + 1];
13913 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
13916 msg = hv_fetchs(msgs, "text", 0);
13919 (void) sv_2mortal((SV *) msgs);
13921 return SvPVX(*msg);
13925 #endif /* end of ! PERL_IN_XSUB_RE */
13928 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
13929 const bool ignore_case)
13931 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
13932 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
13933 * because nothing outside of ASCII will match. Use /m because the input
13934 * string may be a bunch of lines strung together.
13936 * Also sets up the debugging info */
13938 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
13940 SV * subpattern_sv = newSVpvn_flags(subpattern, len, SVs_TEMP);
13941 REGEXP * subpattern_re;
13942 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13944 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
13949 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
13951 /* Like in op.c, we copy the compile time pm flags to the rx ones */
13952 rx_flags = flags & RXf_PMf_COMPILETIME;
13954 #ifndef PERL_IN_XSUB_RE
13955 /* Use the core engine if this file is regcomp.c. That means no
13956 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
13957 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13958 &PL_core_reg_engine,
13962 if (isDEBUG_WILDCARD) {
13963 /* Use the special debugging engine if this file is re_comp.c and wants
13964 * to output the wildcard matching. This uses whatever
13965 * 'use re "Debug ..." is in effect */
13966 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13972 /* Use the special wildcard engine if this file is re_comp.c and
13973 * doesn't want to output the wildcard matching. This uses whatever
13974 * 'use re "Debug ..." is in effect for compilation, but this engine
13975 * structure has been set up so that it uses the core engine for
13976 * execution, so no execution debugging as a result of re.pm will be
13978 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
13982 /* XXX The above has the effect that any user-supplied regex engine
13983 * won't be called for matching wildcards. That might be good, or bad.
13984 * It could be changed in several ways. The reason it is done the
13985 * current way is to avoid having to save and restore
13986 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
13987 * could be used. Another suggestion is to keep the authoritative
13988 * value of the debug flags in a thread-local variable and add set/get
13989 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
13990 * Still another is to pass a flag, say in the engine's intflags that
13991 * would be checked each time before doing the debug output */
13995 assert(subpattern_re); /* Should have died if didn't compile successfully */
13996 return subpattern_re;
14000 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
14001 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
14004 DECLARE_AND_GET_RE_DEBUG_FLAGS;
14006 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
14010 /* The compilation has set things up so that if the program doesn't want to
14011 * see the wildcard matching procedure, it will get the core execution
14012 * engine, which is subject only to -Dr. So we have to turn that off
14013 * around this procedure */
14014 if (! isDEBUG_WILDCARD) {
14015 /* Note! Casts away 'volatile' */
14017 PL_debug &= ~ DEBUG_r_FLAG;
14020 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
14028 S_handle_user_defined_property(pTHX_
14030 /* Parses the contents of a user-defined property definition; returning the
14031 * expanded definition if possible. If so, the return is an inversion
14034 * If there are subroutines that are part of the expansion and which aren't
14035 * known at the time of the call to this function, this returns what
14036 * parse_uniprop_string() returned for the first one encountered.
14038 * If an error was found, NULL is returned, and 'msg' gets a suitable
14039 * message appended to it. (Appending allows the back trace of how we got
14040 * to the faulty definition to be displayed through nested calls of
14041 * user-defined subs.)
14043 * The caller IS responsible for freeing any returned SV.
14045 * The syntax of the contents is pretty much described in perlunicode.pod,
14046 * but we also allow comments on each line */
14048 const char * name, /* Name of property */
14049 const STRLEN name_len, /* The name's length in bytes */
14050 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14051 const bool to_fold, /* ? Is this under /i */
14052 const bool runtime, /* ? Are we in compile- or run-time */
14053 const bool deferrable, /* Is it ok for this property's full definition
14054 to be deferred until later? */
14055 SV* contents, /* The property's definition */
14056 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
14057 getting called unless this is thought to be
14058 a user-defined property */
14059 SV * msg, /* Any error or warning msg(s) are appended to
14061 const STRLEN level) /* Recursion level of this call */
14064 const char * string = SvPV_const(contents, len);
14065 const char * const e = string + len;
14066 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
14067 const STRLEN msgs_length_on_entry = SvCUR(msg);
14069 const char * s0 = string; /* Points to first byte in the current line
14070 being parsed in 'string' */
14071 const char overflow_msg[] = "Code point too large in \"";
14072 SV* running_definition = NULL;
14074 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
14076 *user_defined_ptr = TRUE;
14078 /* Look at each line */
14080 const char * s; /* Current byte */
14081 char op = '+'; /* Default operation is 'union' */
14082 IV min = 0; /* range begin code point */
14083 IV max = -1; /* and range end */
14084 SV* this_definition;
14086 /* Skip comment lines */
14088 s0 = strchr(s0, '\n');
14096 /* For backcompat, allow an empty first line */
14102 /* First character in the line may optionally be the operation */
14111 /* If the line is one or two hex digits separated by blank space, its
14112 * a range; otherwise it is either another user-defined property or an
14117 if (! isXDIGIT(*s)) {
14118 goto check_if_property;
14121 do { /* Each new hex digit will add 4 bits. */
14122 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
14123 s = strchr(s, '\n');
14127 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14128 sv_catpv(msg, overflow_msg);
14129 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14130 UTF8fARG(is_contents_utf8, s - s0, s0));
14131 sv_catpvs(msg, "\"");
14132 goto return_failure;
14135 /* Accumulate this digit into the value */
14136 min = (min << 4) + READ_XDIGIT(s);
14137 } while (isXDIGIT(*s));
14139 while (isBLANK(*s)) { s++; }
14141 /* We allow comments at the end of the line */
14143 s = strchr(s, '\n');
14149 else if (s < e && *s != '\n') {
14150 if (! isXDIGIT(*s)) {
14151 goto check_if_property;
14154 /* Look for the high point of the range */
14157 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
14158 s = strchr(s, '\n');
14162 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14163 sv_catpv(msg, overflow_msg);
14164 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14165 UTF8fARG(is_contents_utf8, s - s0, s0));
14166 sv_catpvs(msg, "\"");
14167 goto return_failure;
14170 max = (max << 4) + READ_XDIGIT(s);
14171 } while (isXDIGIT(*s));
14173 while (isBLANK(*s)) { s++; }
14176 s = strchr(s, '\n');
14181 else if (s < e && *s != '\n') {
14182 goto check_if_property;
14186 if (max == -1) { /* The line only had one entry */
14189 else if (max < min) {
14190 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14191 sv_catpvs(msg, "Illegal range in \"");
14192 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14193 UTF8fARG(is_contents_utf8, s - s0, s0));
14194 sv_catpvs(msg, "\"");
14195 goto return_failure;
14198 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
14200 if ( UNICODE_IS_PERL_EXTENDED(min)
14201 || UNICODE_IS_PERL_EXTENDED(max))
14203 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
14205 /* If both code points are non-portable, warn only on the lower
14207 sv_catpv(msg, get_extended_utf8_msg(
14208 (UNICODE_IS_PERL_EXTENDED(min))
14210 sv_catpvs(msg, " in \"");
14211 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
14212 UTF8fARG(is_contents_utf8, s - s0, s0));
14213 sv_catpvs(msg, "\"");
14218 /* Here, this line contains a legal range */
14219 this_definition = sv_2mortal(_new_invlist(2));
14220 this_definition = _add_range_to_invlist(this_definition, min, max);
14225 /* Here it isn't a legal range line. See if it is a legal property
14226 * line. First find the end of the meat of the line */
14227 s = strpbrk(s, "#\n");
14232 /* Ignore trailing blanks in keeping with the requirements of
14233 * parse_uniprop_string() */
14235 while (s > s0 && isBLANK_A(*s)) {
14240 this_definition = parse_uniprop_string(s0, s - s0,
14241 is_utf8, to_fold, runtime,
14244 user_defined_ptr, msg,
14246 ? level /* Don't increase level
14247 if input is empty */
14250 if (this_definition == NULL) {
14251 goto return_failure; /* 'msg' should have had the reason
14252 appended to it by the above call */
14255 if (! is_invlist(this_definition)) { /* Unknown at this time */
14256 return newSVsv(this_definition);
14260 s = strchr(s, '\n');
14270 _invlist_union(running_definition, this_definition,
14271 &running_definition);
14274 _invlist_subtract(running_definition, this_definition,
14275 &running_definition);
14278 _invlist_intersection(running_definition, this_definition,
14279 &running_definition);
14282 _invlist_union_complement_2nd(running_definition,
14283 this_definition, &running_definition);
14286 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
14287 __FILE__, __LINE__, op);
14291 /* Position past the '\n' */
14293 } /* End of loop through the lines of 'contents' */
14295 /* Here, we processed all the lines in 'contents' without error. If we
14296 * didn't add any warnings, simply return success */
14297 if (msgs_length_on_entry == SvCUR(msg)) {
14299 /* If the expansion was empty, the answer isn't nothing: its an empty
14300 * inversion list */
14301 if (running_definition == NULL) {
14302 running_definition = _new_invlist(1);
14305 return running_definition;
14308 /* Otherwise, add some explanatory text, but we will return success */
14312 running_definition = NULL;
14316 if (name_len > 0) {
14317 sv_catpvs(msg, " in expansion of ");
14318 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
14321 return running_definition;
14324 /* As explained below, certain operations need to take place in the first
14325 * thread created. These macros switch contexts */
14326 # ifdef USE_ITHREADS
14327 # define DECLARATION_FOR_GLOBAL_CONTEXT \
14328 PerlInterpreter * save_aTHX = aTHX;
14329 # define SWITCH_TO_GLOBAL_CONTEXT \
14330 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
14331 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
14332 # define CUR_CONTEXT aTHX
14333 # define ORIGINAL_CONTEXT save_aTHX
14335 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
14336 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
14337 # define RESTORE_CONTEXT NOOP
14338 # define CUR_CONTEXT NULL
14339 # define ORIGINAL_CONTEXT NULL
14343 S_delete_recursion_entry(pTHX_ void *key)
14345 /* Deletes the entry used to detect recursion when expanding user-defined
14346 * properties. This is a function so it can be set up to be called even if
14347 * the program unexpectedly quits */
14349 SV ** current_entry;
14350 const STRLEN key_len = strlen((const char *) key);
14351 DECLARATION_FOR_GLOBAL_CONTEXT;
14353 SWITCH_TO_GLOBAL_CONTEXT;
14355 /* If the entry is one of these types, it is a permanent entry, and not the
14356 * one used to detect recursions. This function should delete only the
14357 * recursion entry */
14358 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
14360 && ! is_invlist(*current_entry)
14361 && ! SvPOK(*current_entry))
14363 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
14371 S_get_fq_name(pTHX_
14372 const char * const name, /* The first non-blank in the \p{}, \P{} */
14373 const Size_t name_len, /* Its length in bytes, not including any trailing space */
14374 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14375 const bool has_colon_colon
14378 /* Returns a mortal SV containing the fully qualified version of the input
14383 fq_name = newSVpvs_flags("", SVs_TEMP);
14385 /* Use the current package if it wasn't included in our input */
14386 if (! has_colon_colon) {
14387 const HV * pkg = (IN_PERL_COMPILETIME)
14389 : CopSTASH(PL_curcop);
14390 const char* pkgname = HvNAME(pkg);
14392 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14393 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
14394 sv_catpvs(fq_name, "::");
14397 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
14398 UTF8fARG(is_utf8, name_len, name));
14403 S_parse_uniprop_string(pTHX_
14405 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
14406 * now. If so, the return is an inversion list.
14408 * If the property is user-defined, it is a subroutine, which in turn
14409 * may call other subroutines. This function will call the whole nest of
14410 * them to get the definition they return; if some aren't known at the time
14411 * of the call to this function, the fully qualified name of the highest
14412 * level sub is returned. It is an error to call this function at runtime
14413 * without every sub defined.
14415 * If an error was found, NULL is returned, and 'msg' gets a suitable
14416 * message appended to it. (Appending allows the back trace of how we got
14417 * to the faulty definition to be displayed through nested calls of
14418 * user-defined subs.)
14420 * The caller should NOT try to free any returned inversion list.
14422 * Other parameters will be set on return as described below */
14424 const char * const name, /* The first non-blank in the \p{}, \P{} */
14425 Size_t name_len, /* Its length in bytes, not including any
14427 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
14428 const bool to_fold, /* ? Is this under /i */
14429 const bool runtime, /* TRUE if this is being called at run time */
14430 const bool deferrable, /* TRUE if it's ok for the definition to not be
14431 known at this call */
14432 AV ** strings, /* To return string property values, like named
14434 bool *user_defined_ptr, /* Upon return from this function it will be
14435 set to TRUE if any component is a
14436 user-defined property */
14437 SV * msg, /* Any error or warning msg(s) are appended to
14439 const STRLEN level) /* Recursion level of this call */
14441 char* lookup_name; /* normalized name for lookup in our tables */
14442 unsigned lookup_len; /* Its length */
14443 enum { Not_Strict = 0, /* Some properties have stricter name */
14444 Strict, /* normalization rules, which we decide */
14445 As_Is /* upon based on parsing */
14446 } stricter = Not_Strict;
14448 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
14449 * (though it requires extra effort to download them from Unicode and
14450 * compile perl to know about them) */
14451 bool is_nv_type = FALSE;
14453 unsigned int i, j = 0;
14454 int equals_pos = -1; /* Where the '=' is found, or negative if none */
14455 int slash_pos = -1; /* Where the '/' is found, or negative if none */
14456 int table_index = 0; /* The entry number for this property in the table
14457 of all Unicode property names */
14458 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
14459 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
14460 the normalized name in certain situations */
14461 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
14462 part of a package name */
14463 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
14464 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
14465 property rather than a Unicode
14467 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
14468 if an error. If it is an inversion list,
14469 it is the definition. Otherwise it is a
14470 string containing the fully qualified sub
14472 SV * fq_name = NULL; /* For user-defined properties, the fully
14474 bool invert_return = FALSE; /* ? Do we need to complement the result before
14476 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
14477 explicit utf8:: package that we strip
14479 /* The expansion of properties that could be either user-defined or
14480 * official unicode ones is deferred until runtime, including a marker for
14481 * those that might be in the latter category. This boolean indicates if
14482 * we've seen that marker. If not, what we're parsing can't be such an
14483 * official Unicode property whose expansion was deferred */
14484 bool could_be_deferred_official = FALSE;
14486 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
14488 /* The input will be normalized into 'lookup_name' */
14489 Newx(lookup_name, name_len, char);
14490 SAVEFREEPV(lookup_name);
14492 /* Parse the input. */
14493 for (i = 0; i < name_len; i++) {
14494 char cur = name[i];
14496 /* Most of the characters in the input will be of this ilk, being parts
14498 if (isIDCONT_A(cur)) {
14500 /* Case differences are ignored. Our lookup routine assumes
14501 * everything is lowercase, so normalize to that */
14502 if (isUPPER_A(cur)) {
14503 lookup_name[j++] = toLOWER_A(cur);
14507 if (cur == '_') { /* Don't include these in the normalized name */
14511 lookup_name[j++] = cur;
14513 /* The first character in a user-defined name must be of this type.
14515 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
14516 could_be_user_defined = FALSE;
14522 /* Here, the character is not something typically in a name, But these
14523 * two types of characters (and the '_' above) can be freely ignored in
14524 * most situations. Later it may turn out we shouldn't have ignored
14525 * them, and we have to reparse, but we don't have enough information
14526 * yet to make that decision */
14527 if (cur == '-' || isSPACE_A(cur)) {
14528 could_be_user_defined = FALSE;
14532 /* An equals sign or single colon mark the end of the first part of
14533 * the property name */
14535 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
14537 lookup_name[j++] = '='; /* Treat the colon as an '=' */
14538 equals_pos = j; /* Note where it occurred in the input */
14539 could_be_user_defined = FALSE;
14543 /* If this looks like it is a marker we inserted at compile time,
14544 * set a flag and otherwise ignore it. If it isn't in the final
14545 * position, keep it as it would have been user input. */
14546 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
14548 && could_be_user_defined
14549 && i == name_len - 1)
14552 could_be_deferred_official = TRUE;
14556 /* Otherwise, this character is part of the name. */
14557 lookup_name[j++] = cur;
14559 /* Here it isn't a single colon, so if it is a colon, it must be a
14563 /* A double colon should be a package qualifier. We note its
14564 * position and continue. Note that one could have
14565 * pkg1::pkg2::...::foo
14566 * so that the position at the end of the loop will be just after
14567 * the final qualifier */
14570 non_pkg_begin = i + 1;
14571 lookup_name[j++] = ':';
14572 lun_non_pkg_begin = j;
14574 else { /* Only word chars (and '::') can be in a user-defined name */
14575 could_be_user_defined = FALSE;
14577 } /* End of parsing through the lhs of the property name (or all of it if
14580 /* If there is a single package name 'utf8::', it is ambiguous. It could
14581 * be for a user-defined property, or it could be a Unicode property, as
14582 * all of them are considered to be for that package. For the purposes of
14583 * parsing the rest of the property, strip it off */
14584 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
14585 lookup_name += STRLENs("utf8::");
14586 j -= STRLENs("utf8::");
14587 equals_pos -= STRLENs("utf8::");
14588 stripped_utf8_pkg = TRUE;
14591 /* Here, we are either done with the whole property name, if it was simple;
14592 * or are positioned just after the '=' if it is compound. */
14594 if (equals_pos >= 0) {
14595 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
14597 /* Space immediately after the '=' is ignored */
14599 for (; i < name_len; i++) {
14600 if (! isSPACE_A(name[i])) {
14605 /* Most punctuation after the equals indicates a subpattern, like
14607 if ( isPUNCT_A(name[i])
14612 /* A backslash means the real delimiter is the next character,
14613 * but it must be punctuation */
14614 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
14616 bool special_property = memEQs(lookup_name, j - 1, "name")
14617 || memEQs(lookup_name, j - 1, "na");
14618 if (! special_property) {
14619 /* Find the property. The table includes the equals sign, so
14620 * we use 'j' as-is */
14621 table_index = do_uniprop_match(lookup_name, j);
14623 if (special_property || table_index) {
14624 REGEXP * subpattern_re;
14625 char open = name[i++];
14627 const char * pos_in_brackets;
14628 const char * const * prop_values;
14631 /* Backslash => delimiter is the character following. We
14632 * already checked that it is punctuation */
14633 if (open == '\\') {
14638 /* This data structure is constructed so that the matching
14639 * closing bracket is 3 past its matching opening. The second
14640 * set of closing is so that if the opening is something like
14641 * ']', the closing will be that as well. Something similar is
14642 * done in toke.c */
14643 pos_in_brackets = memCHRs("([<)]>)]>", open);
14644 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
14647 || name[name_len-1] != close
14648 || (escaped && name[name_len-2] != '\\')
14649 /* Also make sure that there are enough characters.
14650 * e.g., '\\\' would show up incorrectly as legal even
14651 * though it is too short */
14652 || (SSize_t) (name_len - i - 1 - escaped) < 0)
14654 sv_catpvs(msg, "Unicode property wildcard not terminated");
14655 goto append_name_to_msg;
14658 Perl_ck_warner_d(aTHX_
14659 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
14660 "The Unicode property wildcards feature is experimental");
14662 if (special_property) {
14663 const char * error_msg;
14664 const char * revised_name = name + i;
14665 Size_t revised_name_len = name_len - (i + 1 + escaped);
14667 /* Currently, the only 'special_property' is name, which we
14668 * lookup in _charnames.pm */
14670 if (! load_charnames(newSVpvs("placeholder"),
14671 revised_name, revised_name_len,
14674 sv_catpv(msg, error_msg);
14675 goto append_name_to_msg;
14678 /* Farm this out to a function just to make the current
14679 * function less unwieldy */
14680 if (handle_names_wildcard(revised_name, revised_name_len,
14684 return prop_definition;
14690 prop_values = get_prop_values(table_index);
14692 /* Now create and compile the wildcard subpattern. Use /i
14693 * because the property values are supposed to match with case
14695 subpattern_re = compile_wildcard(name + i,
14696 name_len - i - 1 - escaped,
14700 /* For each legal property value, see if the supplied pattern
14702 while (*prop_values) {
14703 const char * const entry = *prop_values;
14704 const Size_t len = strlen(entry);
14705 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
14707 if (execute_wildcard(subpattern_re,
14709 (char *) entry + len,
14713 { /* Here, matched. Add to the returned list */
14714 Size_t total_len = j + len;
14715 SV * sub_invlist = NULL;
14716 char * this_string;
14718 /* We know this is a legal \p{property=value}. Call
14719 * the function to return the list of code points that
14721 Newxz(this_string, total_len + 1, char);
14722 Copy(lookup_name, this_string, j, char);
14723 my_strlcat(this_string, entry, total_len + 1);
14724 SAVEFREEPV(this_string);
14725 sub_invlist = parse_uniprop_string(this_string,
14735 _invlist_union(prop_definition, sub_invlist,
14739 prop_values++; /* Next iteration, look at next propvalue */
14740 } /* End of looking through property values; (the data
14741 structure is terminated by a NULL ptr) */
14743 SvREFCNT_dec_NN(subpattern_re);
14745 if (prop_definition) {
14746 return prop_definition;
14749 sv_catpvs(msg, "No Unicode property value wildcard matches:");
14750 goto append_name_to_msg;
14753 /* Here's how khw thinks we should proceed to handle the properties
14754 * not yet done: Bidi Mirroring Glyph can map to ""
14755 Bidi Paired Bracket can map to ""
14756 Case Folding (both full and simple)
14757 Shouldn't /i be good enough for Full
14758 Decomposition Mapping
14759 Equivalent Unified Ideograph can map to ""
14760 Lowercase Mapping (both full and simple)
14761 NFKC Case Fold can map to ""
14762 Titlecase Mapping (both full and simple)
14763 Uppercase Mapping (both full and simple)
14764 * Handle these the same way Name is done, using say, _wild.pm, but
14765 * having both loose and full, like in charclass_invlists.h.
14766 * Perhaps move block and script to that as they are somewhat large
14767 * in charclass_invlists.h.
14768 * For properties where the default is the code point itself, such
14769 * as any of the case changing mappings, the string would otherwise
14770 * consist of all Unicode code points in UTF-8 strung together.
14771 * This would be impractical. So instead, examine their compiled
14772 * pattern, looking at the ssc. If none, reject the pattern as an
14773 * error. Otherwise run the pattern against every code point in
14774 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
14775 * And it might be good to create an API to return the ssc.
14776 * Or handle them like the algorithmic names are done
14778 } /* End of is a wildcard subppattern */
14780 /* \p{name=...} is handled specially. Instead of using the normal
14781 * mechanism involving charclass_invlists.h, it uses _charnames.pm
14782 * which has the necessary (huge) data accessible to it, and which
14783 * doesn't get loaded unless necessary. The legal syntax for names is
14784 * somewhat different than other properties due both to the vagaries of
14785 * a few outlier official names, and the fact that only a few ASCII
14786 * characters are permitted in them */
14787 if ( memEQs(lookup_name, j - 1, "name")
14788 || memEQs(lookup_name, j - 1, "na"))
14793 const char * error_msg;
14795 SV * character_name;
14796 STRLEN character_len;
14801 /* Since the RHS (after skipping initial space) is passed unchanged
14802 * to charnames, and there are different criteria for what are
14803 * legal characters in the name, just parse it here. A character
14804 * name must begin with an ASCII alphabetic */
14805 if (! isALPHA(name[i])) {
14808 lookup_name[j++] = name[i];
14810 for (++i; i < name_len; i++) {
14811 /* Official names can only be in the ASCII range, and only
14812 * certain characters */
14813 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
14816 lookup_name[j++] = name[i];
14819 /* Finished parsing, save the name into an SV */
14820 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
14822 /* Make sure _charnames is loaded. (The parameters give context
14823 * for any errors generated */
14824 table = load_charnames(character_name, name, name_len, &error_msg);
14825 if (table == NULL) {
14826 sv_catpv(msg, error_msg);
14827 goto append_name_to_msg;
14830 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
14831 if (! lookup_loose) {
14833 "panic: Can't find '_charnames::_loose_regcomp_lookup");
14836 PUSHSTACKi(PERLSI_REGCOMP);
14842 XPUSHs(character_name);
14844 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
14849 SvREFCNT_inc_simple_void_NN(character);
14856 if (! SvOK(character)) {
14860 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
14861 if (character_len == SvCUR(character)) {
14862 prop_definition = add_cp_to_invlist(NULL, cp);
14867 /* First of the remaining characters in the string. */
14868 char * remaining = SvPVX(character) + character_len;
14870 if (strings == NULL) {
14871 goto failed; /* XXX Perhaps a specific msg instead, like
14872 'not available here' */
14875 if (*strings == NULL) {
14876 *strings = newAV();
14879 this_string = newAV();
14880 av_push_simple(this_string, newSVuv(cp));
14883 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
14884 av_push_simple(this_string, newSVuv(cp));
14885 remaining += character_len;
14886 } while (remaining < SvEND(character));
14888 av_push_simple(*strings, (SV *) this_string);
14891 return prop_definition;
14894 /* Certain properties whose values are numeric need special handling.
14895 * They may optionally be prefixed by 'is'. Ignore that prefix for the
14896 * purposes of checking if this is one of those properties */
14897 if (memBEGINPs(lookup_name, j, "is")) {
14901 /* Then check if it is one of these specially-handled properties. The
14902 * possibilities are hard-coded because easier this way, and the list
14903 * is unlikely to change.
14905 * All numeric value type properties are of this ilk, and are also
14906 * special in a different way later on. So find those first. There
14907 * are several numeric value type properties in the Unihan DB (which is
14908 * unlikely to be compiled with perl, but we handle it here in case it
14909 * does get compiled). They all end with 'numeric'. The interiors
14910 * aren't checked for the precise property. This would stop working if
14911 * a cjk property were to be created that ended with 'numeric' and
14912 * wasn't a numeric type */
14913 is_nv_type = memEQs(lookup_name + lookup_offset,
14914 j - 1 - lookup_offset, "numericvalue")
14915 || memEQs(lookup_name + lookup_offset,
14916 j - 1 - lookup_offset, "nv")
14917 || ( memENDPs(lookup_name + lookup_offset,
14918 j - 1 - lookup_offset, "numeric")
14919 && ( memBEGINPs(lookup_name + lookup_offset,
14920 j - 1 - lookup_offset, "cjk")
14921 || memBEGINPs(lookup_name + lookup_offset,
14922 j - 1 - lookup_offset, "k")));
14924 || memEQs(lookup_name + lookup_offset,
14925 j - 1 - lookup_offset, "canonicalcombiningclass")
14926 || memEQs(lookup_name + lookup_offset,
14927 j - 1 - lookup_offset, "ccc")
14928 || memEQs(lookup_name + lookup_offset,
14929 j - 1 - lookup_offset, "age")
14930 || memEQs(lookup_name + lookup_offset,
14931 j - 1 - lookup_offset, "in")
14932 || memEQs(lookup_name + lookup_offset,
14933 j - 1 - lookup_offset, "presentin"))
14937 /* Since the stuff after the '=' is a number, we can't throw away
14938 * '-' willy-nilly, as those could be a minus sign. Other stricter
14939 * rules also apply. However, these properties all can have the
14940 * rhs not be a number, in which case they contain at least one
14941 * alphabetic. In those cases, the stricter rules don't apply.
14942 * But the numeric type properties can have the alphas [Ee] to
14943 * signify an exponent, and it is still a number with stricter
14944 * rules. So look for an alpha that signifies not-strict */
14946 for (k = i; k < name_len; k++) {
14947 if ( isALPHA_A(name[k])
14948 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
14950 stricter = Not_Strict;
14958 /* A number may have a leading '+' or '-'. The latter is retained
14960 if (name[i] == '+') {
14963 else if (name[i] == '-') {
14964 lookup_name[j++] = '-';
14968 /* Skip leading zeros including single underscores separating the
14969 * zeros, or between the final leading zero and the first other
14971 for (; i < name_len - 1; i++) {
14972 if ( name[i] != '0'
14973 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
14979 /* Turn nv=-0 into nv=0. These should be equivalent, but vary by
14980 * underling libc implementation. */
14981 if ( i == name_len - 1
14982 && name[name_len-1] == '0'
14983 && lookup_name[j-1] == '-')
14989 else { /* No '=' */
14991 /* Only a few properties without an '=' should be parsed with stricter
14992 * rules. The list is unlikely to change. */
14993 if ( memBEGINPs(lookup_name, j, "perl")
14994 && memNEs(lookup_name + 4, j - 4, "space")
14995 && memNEs(lookup_name + 4, j - 4, "word"))
14999 /* We set the inputs back to 0 and the code below will reparse,
15005 /* Here, we have either finished the property, or are positioned to parse
15006 * the remainder, and we know if stricter rules apply. Finish out, if not
15008 for (; i < name_len; i++) {
15009 char cur = name[i];
15011 /* In all instances, case differences are ignored, and we normalize to
15013 if (isUPPER_A(cur)) {
15014 lookup_name[j++] = toLOWER(cur);
15018 /* An underscore is skipped, but not under strict rules unless it
15019 * separates two digits */
15022 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
15023 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
15025 lookup_name[j++] = '_';
15030 /* Hyphens are skipped except under strict */
15031 if (cur == '-' && ! stricter) {
15035 /* XXX Bug in documentation. It says white space skipped adjacent to
15036 * non-word char. Maybe we should, but shouldn't skip it next to a dot
15038 if (isSPACE_A(cur) && ! stricter) {
15042 lookup_name[j++] = cur;
15044 /* Unless this is a non-trailing slash, we are done with it */
15045 if (i >= name_len - 1 || cur != '/') {
15051 /* A slash in the 'numeric value' property indicates that what follows
15052 * is a denominator. It can have a leading '+' and '0's that should be
15053 * skipped. But we have never allowed a negative denominator, so treat
15054 * a minus like every other character. (No need to rule out a second
15055 * '/', as that won't match anything anyway */
15058 if (i < name_len && name[i] == '+') {
15062 /* Skip leading zeros including underscores separating digits */
15063 for (; i < name_len - 1; i++) {
15064 if ( name[i] != '0'
15065 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
15071 /* Store the first real character in the denominator */
15072 if (i < name_len) {
15073 lookup_name[j++] = name[i];
15078 /* Here are completely done parsing the input 'name', and 'lookup_name'
15079 * contains a copy, normalized.
15081 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
15082 * different from without the underscores. */
15083 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
15084 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
15085 && UNLIKELY(name[name_len-1] == '_'))
15087 lookup_name[j++] = '&';
15090 /* If the original input began with 'In' or 'Is', it could be a subroutine
15091 * call to a user-defined property instead of a Unicode property name. */
15092 if ( name_len - non_pkg_begin > 2
15093 && name[non_pkg_begin+0] == 'I'
15094 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
15096 /* Names that start with In have different characteristics than those
15097 * that start with Is */
15098 if (name[non_pkg_begin+1] == 's') {
15099 starts_with_Is = TRUE;
15103 could_be_user_defined = FALSE;
15106 if (could_be_user_defined) {
15109 /* If the user defined property returns the empty string, it could
15110 * easily be because the pattern is being compiled before the data it
15111 * actually needs to compile is available. This could be argued to be
15112 * a bug in the perl code, but this is a change of behavior for Perl,
15113 * so we handle it. This means that intentionally returning nothing
15114 * will not be resolved until runtime */
15115 bool empty_return = FALSE;
15117 /* Here, the name could be for a user defined property, which are
15118 * implemented as subs. */
15119 user_sub = get_cvn_flags(name, name_len, 0);
15122 /* Here, the property name could be a user-defined one, but there
15123 * is no subroutine to handle it (as of now). Defer handling it
15124 * until runtime. Otherwise, a block defined by Unicode in a later
15125 * release would get the synonym InFoo added for it, and existing
15126 * code that used that name would suddenly break if it referred to
15127 * the property before the sub was declared. See [perl #134146] */
15129 goto definition_deferred;
15132 /* Here, we are at runtime, and didn't find the user property. It
15133 * could be an official property, but only if no package was
15134 * specified, or just the utf8:: package. */
15135 if (could_be_deferred_official) {
15136 lookup_name += lun_non_pkg_begin;
15137 j -= lun_non_pkg_begin;
15139 else if (! stripped_utf8_pkg) {
15140 goto unknown_user_defined;
15143 /* Drop down to look up in the official properties */
15146 const char insecure[] = "Insecure user-defined property";
15148 /* Here, there is a sub by the correct name. Normally we call it
15149 * to get the property definition */
15151 SV * user_sub_sv = MUTABLE_SV(user_sub);
15152 SV * error; /* Any error returned by calling 'user_sub' */
15153 SV * key; /* The key into the hash of user defined sub names
15156 SV ** saved_user_prop_ptr; /* Hash entry for this property */
15158 /* How many times to retry when another thread is in the middle of
15159 * expanding the same definition we want */
15160 PERL_INT_FAST8_T retry_countdown = 10;
15162 DECLARATION_FOR_GLOBAL_CONTEXT;
15164 /* If we get here, we know this property is user-defined */
15165 *user_defined_ptr = TRUE;
15167 /* We refuse to call a potentially tainted subroutine; returning an
15170 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15171 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15172 goto append_name_to_msg;
15175 /* In principal, we only call each subroutine property definition
15176 * once during the life of the program. This guarantees that the
15177 * property definition never changes. The results of the single
15178 * sub call are stored in a hash, which is used instead for future
15179 * references to this property. The property definition is thus
15180 * immutable. But, to allow the user to have a /i-dependent
15181 * definition, we call the sub once for non-/i, and once for /i,
15182 * should the need arise, passing the /i status as a parameter.
15184 * We start by constructing the hash key name, consisting of the
15185 * fully qualified subroutine name, preceded by the /i status, so
15186 * that there is a key for /i and a different key for non-/i */
15187 key = newSVpvn_flags(((to_fold) ? "1" : "0"), 1, SVs_TEMP);
15188 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15189 non_pkg_begin != 0);
15190 sv_catsv(key, fq_name);
15192 /* We only call the sub once throughout the life of the program
15193 * (with the /i, non-/i exception noted above). That means the
15194 * hash must be global and accessible to all threads. It is
15195 * created at program start-up, before any threads are created, so
15196 * is accessible to all children. But this creates some
15199 * 1) The keys can't be shared, or else problems arise; sharing is
15200 * turned off at hash creation time
15201 * 2) All SVs in it are there for the remainder of the life of the
15202 * program, and must be created in the same interpreter context
15203 * as the hash, or else they will be freed from the wrong pool
15204 * at global destruction time. This is handled by switching to
15205 * the hash's context to create each SV going into it, and then
15206 * immediately switching back
15207 * 3) All accesses to the hash must be controlled by a mutex, to
15208 * prevent two threads from getting an unstable state should
15209 * they simultaneously be accessing it. The code below is
15210 * crafted so that the mutex is locked whenever there is an
15211 * access and unlocked only when the next stable state is
15214 * The hash stores either the definition of the property if it was
15215 * valid, or, if invalid, the error message that was raised. We
15216 * use the type of SV to distinguish.
15218 * There's also the need to guard against the definition expansion
15219 * from infinitely recursing. This is handled by storing the aTHX
15220 * of the expanding thread during the expansion. Again the SV type
15221 * is used to distinguish this from the other two cases. If we
15222 * come to here and the hash entry for this property is our aTHX,
15223 * it means we have recursed, and the code assumes that we would
15224 * infinitely recurse, so instead stops and raises an error.
15225 * (Any recursion has always been treated as infinite recursion in
15228 * If instead, the entry is for a different aTHX, it means that
15229 * that thread has gotten here first, and hasn't finished expanding
15230 * the definition yet. We just have to wait until it is done. We
15231 * sleep and retry a few times, returning an error if the other
15232 * thread doesn't complete. */
15235 USER_PROP_MUTEX_LOCK;
15237 /* If we have an entry for this key, the subroutine has already
15238 * been called once with this /i status. */
15239 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
15240 SvPVX(key), SvCUR(key), 0);
15241 if (saved_user_prop_ptr) {
15243 /* If the saved result is an inversion list, it is the valid
15244 * definition of this property */
15245 if (is_invlist(*saved_user_prop_ptr)) {
15246 prop_definition = *saved_user_prop_ptr;
15248 /* The SV in the hash won't be removed until global
15249 * destruction, so it is stable and we can unlock */
15250 USER_PROP_MUTEX_UNLOCK;
15252 /* The caller shouldn't try to free this SV */
15253 return prop_definition;
15256 /* Otherwise, if it is a string, it is the error message
15257 * that was returned when we first tried to evaluate this
15258 * property. Fail, and append the message */
15259 if (SvPOK(*saved_user_prop_ptr)) {
15260 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15261 sv_catsv(msg, *saved_user_prop_ptr);
15263 /* The SV in the hash won't be removed until global
15264 * destruction, so it is stable and we can unlock */
15265 USER_PROP_MUTEX_UNLOCK;
15270 assert(SvIOK(*saved_user_prop_ptr));
15272 /* Here, we have an unstable entry in the hash. Either another
15273 * thread is in the middle of expanding the property's
15274 * definition, or we are ourselves recursing. We use the aTHX
15275 * in it to distinguish */
15276 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
15278 /* Here, it's another thread doing the expanding. We've
15279 * looked as much as we are going to at the contents of the
15280 * hash entry. It's safe to unlock. */
15281 USER_PROP_MUTEX_UNLOCK;
15283 /* Retry a few times */
15284 if (retry_countdown-- > 0) {
15289 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15290 sv_catpvs(msg, "Timeout waiting for another thread to "
15292 goto append_name_to_msg;
15295 /* Here, we are recursing; don't dig any deeper */
15296 USER_PROP_MUTEX_UNLOCK;
15298 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15300 "Infinite recursion in user-defined property");
15301 goto append_name_to_msg;
15304 /* Here, this thread has exclusive control, and there is no entry
15305 * for this property in the hash. So we have the go ahead to
15306 * expand the definition ourselves. */
15308 PUSHSTACKi(PERLSI_REGCOMP);
15311 /* Create a temporary placeholder in the hash to detect recursion
15313 SWITCH_TO_GLOBAL_CONTEXT;
15314 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
15315 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
15318 /* Now that we have a placeholder, we can let other threads
15320 USER_PROP_MUTEX_UNLOCK;
15322 /* Make sure the placeholder always gets destroyed */
15323 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
15328 /* Call the user's function, with the /i status as a parameter.
15329 * Note that we have gone to a lot of trouble to keep this call
15330 * from being within the locked mutex region. */
15331 XPUSHs(boolSV(to_fold));
15334 /* The following block was taken from swash_init(). Presumably
15335 * they apply to here as well, though we no longer use a swash --
15339 /* We might get here via a subroutine signature which uses a utf8
15340 * parameter name, at which point PL_subname will have been set
15341 * but not yet used. */
15342 save_item(PL_subname);
15344 /* G_SCALAR guarantees a single return value */
15345 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
15350 if (TAINT_get || SvTRUE(error)) {
15351 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15352 if (SvTRUE(error)) {
15353 sv_catpvs(msg, "Error \"");
15354 sv_catsv(msg, error);
15355 sv_catpvs(msg, "\"");
15358 if (SvTRUE(error)) sv_catpvs(msg, "; ");
15359 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
15362 if (name_len > 0) {
15363 sv_catpvs(msg, " in expansion of ");
15364 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
15370 prop_definition = NULL;
15373 SV * contents = POPs;
15375 /* The contents is supposed to be the expansion of the property
15376 * definition. If the definition is deferrable, and we got an
15377 * empty string back, set a flag to later defer it (after clean
15380 && (! SvPOK(contents) || SvCUR(contents) == 0))
15382 empty_return = TRUE;
15384 else { /* Otherwise, call a function to check for valid syntax,
15387 prop_definition = handle_user_defined_property(
15389 is_utf8, to_fold, runtime,
15391 contents, user_defined_ptr,
15397 /* Here, we have the results of the expansion. Delete the
15398 * placeholder, and if the definition is now known, replace it with
15399 * that definition. We need exclusive access to the hash, and we
15400 * can't let anyone else in, between when we delete the placeholder
15401 * and add the permanent entry */
15402 USER_PROP_MUTEX_LOCK;
15404 S_delete_recursion_entry(aTHX_ SvPVX(key));
15406 if ( ! empty_return
15407 && (! prop_definition || is_invlist(prop_definition)))
15409 /* If we got success we use the inversion list defining the
15410 * property; otherwise use the error message */
15411 SWITCH_TO_GLOBAL_CONTEXT;
15412 (void) hv_store_ent(PL_user_def_props,
15415 ? newSVsv(prop_definition)
15421 /* All done, and the hash now has a permanent entry for this
15422 * property. Give up exclusive control */
15423 USER_PROP_MUTEX_UNLOCK;
15429 if (empty_return) {
15430 goto definition_deferred;
15433 if (prop_definition) {
15435 /* If the definition is for something not known at this time,
15436 * we toss it, and go return the main property name, as that's
15437 * the one the user will be aware of */
15438 if (! is_invlist(prop_definition)) {
15439 SvREFCNT_dec_NN(prop_definition);
15440 goto definition_deferred;
15443 sv_2mortal(prop_definition);
15447 return prop_definition;
15449 } /* End of calling the subroutine for the user-defined property */
15450 } /* End of it could be a user-defined property */
15452 /* Here it wasn't a user-defined property that is known at this time. See
15453 * if it is a Unicode property */
15455 lookup_len = j; /* This is a more mnemonic name than 'j' */
15457 /* Get the index into our pointer table of the inversion list corresponding
15458 * to the property */
15459 table_index = do_uniprop_match(lookup_name, lookup_len);
15461 /* If it didn't find the property ... */
15462 if (table_index == 0) {
15464 /* Try again stripping off any initial 'Is'. This is because we
15465 * promise that an initial Is is optional. The same isn't true of
15466 * names that start with 'In'. Those can match only blocks, and the
15467 * lookup table already has those accounted for. The lookup table also
15468 * has already accounted for Perl extensions (without and = sign)
15469 * starting with 'i's'. */
15470 if (starts_with_Is && equals_pos >= 0) {
15476 table_index = do_uniprop_match(lookup_name, lookup_len);
15479 if (table_index == 0) {
15482 /* Here, we didn't find it. If not a numeric type property, and
15483 * can't be a user-defined one, it isn't a legal property */
15484 if (! is_nv_type) {
15485 if (! could_be_user_defined) {
15489 /* Here, the property name is legal as a user-defined one. At
15490 * compile time, it might just be that the subroutine for that
15491 * property hasn't been encountered yet, but at runtime, it's
15492 * an error to try to use an undefined one */
15493 if (! deferrable) {
15494 goto unknown_user_defined;;
15497 goto definition_deferred;
15498 } /* End of isn't a numeric type property */
15500 /* The numeric type properties need more work to decide. What we
15501 * do is make sure we have the number in canonical form and look
15504 if (slash_pos < 0) { /* No slash */
15506 /* When it isn't a rational, take the input, convert it to a
15507 * NV, then create a canonical string representation of that
15511 SSize_t value_len = lookup_len - equals_pos;
15513 /* Get the value */
15514 if ( value_len <= 0
15515 || my_atof3(lookup_name + equals_pos, &value,
15517 != lookup_name + lookup_len)
15522 /* If the value is an integer, the canonical value is integral
15524 if (Perl_ceil(value) == value) {
15525 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
15526 equals_pos, lookup_name, value);
15528 else { /* Otherwise, it is %e with a known precision */
15531 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
15532 equals_pos, lookup_name,
15533 PL_E_FORMAT_PRECISION, value);
15535 /* The exponent generated is expecting two digits, whereas
15536 * %e on some systems will generate three. Remove leading
15537 * zeros in excess of 2 from the exponent. We start
15538 * looking for them after the '=' */
15539 exp_ptr = strchr(canonical + equals_pos, 'e');
15541 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
15542 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
15544 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
15546 if (excess_exponent_len > 0) {
15547 SSize_t leading_zeros = strspn(cur_ptr, "0");
15548 SSize_t excess_leading_zeros
15549 = MIN(leading_zeros, excess_exponent_len);
15550 if (excess_leading_zeros > 0) {
15551 Move(cur_ptr + excess_leading_zeros,
15553 strlen(cur_ptr) - excess_leading_zeros
15554 + 1, /* Copy the NUL as well */
15561 else { /* Has a slash. Create a rational in canonical form */
15562 UV numerator, denominator, gcd, trial;
15563 const char * end_ptr;
15564 const char * sign = "";
15566 /* We can't just find the numerator, denominator, and do the
15567 * division, then use the method above, because that is
15568 * inexact. And the input could be a rational that is within
15569 * epsilon (given our precision) of a valid rational, and would
15570 * then incorrectly compare valid.
15572 * We're only interested in the part after the '=' */
15573 const char * this_lookup_name = lookup_name + equals_pos;
15574 lookup_len -= equals_pos;
15575 slash_pos -= equals_pos;
15577 /* Handle any leading minus */
15578 if (this_lookup_name[0] == '-') {
15580 this_lookup_name++;
15585 /* Convert the numerator to numeric */
15586 end_ptr = this_lookup_name + slash_pos;
15587 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
15591 /* It better have included all characters before the slash */
15592 if (*end_ptr != '/') {
15596 /* Set to look at just the denominator */
15597 this_lookup_name += slash_pos;
15598 lookup_len -= slash_pos;
15599 end_ptr = this_lookup_name + lookup_len;
15601 /* Convert the denominator to numeric */
15602 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
15606 /* It better be the rest of the characters, and don't divide by
15608 if ( end_ptr != this_lookup_name + lookup_len
15609 || denominator == 0)
15614 /* Get the greatest common denominator using
15615 http://en.wikipedia.org/wiki/Euclidean_algorithm */
15617 trial = denominator;
15618 while (trial != 0) {
15620 trial = gcd % trial;
15624 /* If already in lowest possible terms, we have already tried
15625 * looking this up */
15630 /* Reduce the rational, which should put it in canonical form
15633 denominator /= gcd;
15635 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
15636 equals_pos, lookup_name, sign, numerator, denominator);
15639 /* Here, we have the number in canonical form. Try that */
15640 table_index = do_uniprop_match(canonical, strlen(canonical));
15641 if (table_index == 0) {
15644 } /* End of still didn't find the property in our table */
15645 } /* End of didn't find the property in our table */
15647 /* Here, we have a non-zero return, which is an index into a table of ptrs.
15648 * A negative return signifies that the real index is the absolute value,
15649 * but the result needs to be inverted */
15650 if (table_index < 0) {
15651 invert_return = TRUE;
15652 table_index = -table_index;
15655 /* Out-of band indices indicate a deprecated property. The proper index is
15656 * modulo it with the table size. And dividing by the table size yields
15657 * an offset into a table constructed by regen/mk_invlists.pl to contain
15658 * the corresponding warning message */
15659 if (table_index > MAX_UNI_KEYWORD_INDEX) {
15660 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
15661 table_index %= MAX_UNI_KEYWORD_INDEX;
15662 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__UNICODE_PROPERTY_NAME),
15663 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
15664 (int) name_len, name,
15665 get_deprecated_property_msg(warning_offset));
15668 /* In a few properties, a different property is used under /i. These are
15669 * unlikely to change, so are hard-coded here. */
15671 if ( table_index == UNI_XPOSIXUPPER
15672 || table_index == UNI_XPOSIXLOWER
15673 || table_index == UNI_TITLE)
15675 table_index = UNI_CASED;
15677 else if ( table_index == UNI_UPPERCASELETTER
15678 || table_index == UNI_LOWERCASELETTER
15679 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
15680 || table_index == UNI_TITLECASELETTER
15683 table_index = UNI_CASEDLETTER;
15685 else if ( table_index == UNI_POSIXUPPER
15686 || table_index == UNI_POSIXLOWER)
15688 table_index = UNI_POSIXALPHA;
15692 /* Create and return the inversion list */
15693 prop_definition = get_prop_definition(table_index);
15694 sv_2mortal(prop_definition);
15696 /* See if there is a private use override to add to this definition */
15698 COPHH * hinthash = (IN_PERL_COMPILETIME)
15699 ? CopHINTHASH_get(&PL_compiling)
15700 : CopHINTHASH_get(PL_curcop);
15701 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
15703 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
15705 /* See if there is an element in the hints hash for this table */
15706 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
15707 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
15711 SV * pu_definition;
15713 SV * expanded_prop_definition =
15714 sv_2mortal(invlist_clone(prop_definition, NULL));
15716 /* If so, it's definition is the string from here to the next
15717 * \a character. And its format is the same as a user-defined
15719 pos += SvCUR(pu_lookup);
15720 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
15721 pu_invlist = handle_user_defined_property(lookup_name,
15724 0, /* Not folded */
15732 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15733 sv_catpvs(msg, "Insecure private-use override");
15734 goto append_name_to_msg;
15737 /* For now, as a safety measure, make sure that it doesn't
15738 * override non-private use code points */
15739 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
15741 /* Add it to the list to be returned */
15742 _invlist_union(prop_definition, pu_invlist,
15743 &expanded_prop_definition);
15744 prop_definition = expanded_prop_definition;
15745 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
15750 if (invert_return) {
15751 _invlist_invert(prop_definition);
15753 return prop_definition;
15755 unknown_user_defined:
15756 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15757 sv_catpvs(msg, "Unknown user-defined property name");
15758 goto append_name_to_msg;
15761 if (non_pkg_begin != 0) {
15762 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15763 sv_catpvs(msg, "Illegal user-defined property name");
15766 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
15767 sv_catpvs(msg, "Can't find Unicode property definition");
15771 append_name_to_msg:
15773 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
15774 const char * suffix = (runtime && level == 0) ? "}" : "\"";
15776 sv_catpv(msg, prefix);
15777 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
15778 sv_catpv(msg, suffix);
15783 definition_deferred:
15786 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
15788 /* Here it could yet to be defined, so defer evaluation of this until
15789 * its needed at runtime. We need the fully qualified property name to
15790 * avoid ambiguity */
15792 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
15796 /* If it didn't come with a package, or the package is utf8::, this
15797 * actually could be an official Unicode property whose inclusion we
15798 * are deferring until runtime to make sure that it isn't overridden by
15799 * a user-defined property of the same name (which we haven't
15800 * encountered yet). Add a marker to indicate this possibility, for
15801 * use at such time when we first need the definition during pattern
15802 * matching execution */
15803 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
15804 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
15807 /* We also need a trailing newline */
15808 sv_catpvs(fq_name, "\n");
15810 *user_defined_ptr = TRUE;
15816 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
15817 const STRLEN wname_len, /* Its length */
15818 SV ** prop_definition,
15821 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
15822 * any matches, adding them to prop_definition */
15826 CV * get_names_info; /* entry to charnames.pm to get info we need */
15827 SV * names_string; /* Contains all character names, except algo */
15828 SV * algorithmic_names; /* Contains info about algorithmically
15829 generated character names */
15830 REGEXP * subpattern_re; /* The user's pattern to match with */
15831 struct regexp * prog; /* The compiled pattern */
15832 char * all_names_start; /* lib/unicore/Name.pl string of every
15833 (non-algorithmic) character name */
15834 char * cur_pos; /* We match, effectively using /gc; this is
15835 where we are now */
15836 bool found_matches = FALSE; /* Did any name match so far? */
15837 SV * empty; /* For matching zero length names */
15838 SV * must_sv; /* Contains the substring, if any, that must be
15839 in a name for the subpattern to match */
15840 const char * must; /* The PV of 'must' */
15841 STRLEN must_len; /* And its length */
15842 SV * syllable_name = NULL; /* For Hangul syllables */
15843 const char hangul_prefix[] = "HANGUL SYLLABLE ";
15844 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
15846 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
15847 * syllable name, and these are immutable and guaranteed by the Unicode
15848 * standard to never be extended */
15849 const STRLEN syl_max_len = hangul_prefix_len + 7;
15853 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
15855 /* Make sure _charnames is loaded. (The parameters give context
15856 * for any errors generated */
15857 get_names_info = get_cv("_charnames::_get_names_info", 0);
15858 if (! get_names_info) {
15859 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
15862 /* Get the charnames data */
15863 PUSHSTACKi(PERLSI_REGCOMP);
15871 /* Special _charnames entry point that returns the info this routine
15873 call_sv(MUTABLE_SV(get_names_info), G_LIST);
15877 /* Data structure for names which end in their very own code points */
15878 algorithmic_names = POPs;
15879 SvREFCNT_inc_simple_void_NN(algorithmic_names);
15881 /* The lib/unicore/Name.pl string */
15882 names_string = POPs;
15883 SvREFCNT_inc_simple_void_NN(names_string);
15890 if ( ! SvROK(names_string)
15891 || ! SvROK(algorithmic_names))
15892 { /* Perhaps should panic instead XXX */
15893 SvREFCNT_dec(names_string);
15894 SvREFCNT_dec(algorithmic_names);
15898 names_string = sv_2mortal(SvRV(names_string));
15899 all_names_start = SvPVX(names_string);
15900 cur_pos = all_names_start;
15902 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
15904 /* Compile the subpattern consisting of the name being looked for */
15905 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
15907 must_sv = re_intuit_string(subpattern_re);
15909 /* regexec.c can free the re_intuit_string() return. GH #17734 */
15910 must_sv = sv_2mortal(newSVsv(must_sv));
15911 must = SvPV(must_sv, must_len);
15918 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
15919 * This works because the NUL causes the function to return early, thus
15920 * showing that there are characters in it other than the acceptable ones,
15921 * which is our desired result.) */
15923 prog = ReANY(subpattern_re);
15925 /* If only nothing is matched, skip to where empty names are looked for */
15926 if (prog->maxlen == 0) {
15930 /* And match against the string of all names /gc. Don't even try if it
15931 * must match a character not found in any name. */
15932 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
15934 while (execute_wildcard(subpattern_re,
15936 SvEND(names_string),
15937 all_names_start, 0,
15940 { /* Here, matched. */
15942 /* Note the string entries look like
15943 * 00001\nSTART OF HEADING\n\n
15944 * so we could match anywhere in that string. We have to rule out
15945 * matching a code point line */
15946 char * this_name_start = all_names_start
15947 + RX_OFFS_START(subpattern_re,0);
15948 char * this_name_end = all_names_start
15949 + RX_OFFS_END(subpattern_re,0);
15952 UV cp = 0; /* Silences some compilers */
15953 AV * this_string = NULL;
15954 bool is_multi = FALSE;
15956 /* If matched nothing, advance to next possible match */
15957 if (this_name_start == this_name_end) {
15958 cur_pos = (char *) memchr(this_name_end + 1, '\n',
15959 SvEND(names_string) - this_name_end);
15960 if (cur_pos == NULL) {
15965 /* Position the next match to start beyond the current returned
15967 cur_pos = (char *) memchr(this_name_end, '\n',
15968 SvEND(names_string) - this_name_end);
15971 /* Back up to the \n just before the beginning of the character. */
15972 cp_end = (char *) my_memrchr(all_names_start,
15974 this_name_start - all_names_start);
15976 /* If we didn't find a \n, it means it matched somewhere in the
15977 * initial '00000' in the string, so isn't a real match */
15978 if (cp_end == NULL) {
15982 this_name_start = cp_end + 1; /* The name starts just after */
15983 cp_end--; /* the \n, and the code point */
15984 /* ends just before it */
15986 /* All code points are 5 digits long */
15987 cp_start = cp_end - 4;
15989 /* This shouldn't happen, as we found a \n, and the first \n is
15990 * further along than what we subtracted */
15991 assert(cp_start >= all_names_start);
15993 if (cp_start == all_names_start) {
15994 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
15998 /* If the character is a blank, we either have a named sequence, or
15999 * something is wrong */
16000 if (*(cp_start - 1) == ' ') {
16001 cp_start = (char *) my_memrchr(all_names_start,
16003 cp_start - all_names_start);
16007 assert(cp_start != NULL && cp_start >= all_names_start + 2);
16009 /* Except for the first line in the string, the sequence before the
16010 * code point is \n\n. If that isn't the case here, we didn't
16011 * match the name of a character. (We could have matched a named
16012 * sequence, not currently handled */
16013 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
16017 /* We matched! Add this to the list */
16018 found_matches = TRUE;
16020 /* Loop through all the code points in the sequence */
16021 while (cp_start < cp_end) {
16023 /* Calculate this code point from its 5 digits */
16024 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
16025 + (XDIGIT_VALUE(cp_start[1]) << 12)
16026 + (XDIGIT_VALUE(cp_start[2]) << 8)
16027 + (XDIGIT_VALUE(cp_start[3]) << 4)
16028 + XDIGIT_VALUE(cp_start[4]);
16030 cp_start += 6; /* Go past any blank */
16032 if (cp_start < cp_end || is_multi) {
16033 if (this_string == NULL) {
16034 this_string = newAV();
16038 av_push_simple(this_string, newSVuv(cp));
16042 if (is_multi) { /* Was more than one code point */
16043 if (*strings == NULL) {
16044 *strings = newAV();
16047 av_push_simple(*strings, (SV *) this_string);
16049 else { /* Only a single code point */
16050 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
16052 } /* End of loop through the non-algorithmic names string */
16055 /* There are also character names not in 'names_string'. These are
16056 * algorithmically generatable. Try this pattern on each possible one.
16057 * (khw originally planned to leave this out given the large number of
16058 * matches attempted; but the speed turned out to be quite acceptable
16060 * There are plenty of opportunities to optimize to skip many of the tests.
16061 * beyond the rudimentary ones already here */
16063 /* First see if the subpattern matches any of the algorithmic generatable
16064 * Hangul syllable names.
16066 * We know none of these syllable names will match if the input pattern
16067 * requires more bytes than any syllable has, or if the input pattern only
16068 * matches an empty name, or if the pattern has something it must match and
16069 * one of the characters in that isn't in any Hangul syllable. */
16070 if ( prog->minlen <= (SSize_t) syl_max_len
16071 && prog->maxlen > 0
16072 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
16074 /* These constants, names, values, and algorithm are adapted from the
16075 * Unicode standard, version 5.1, section 3.12, and should never
16077 const char * JamoL[] = {
16078 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
16079 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
16081 const int LCount = C_ARRAY_LENGTH(JamoL);
16083 const char * JamoV[] = {
16084 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
16085 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
16088 const int VCount = C_ARRAY_LENGTH(JamoV);
16090 const char * JamoT[] = {
16091 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
16092 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
16093 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
16095 const int TCount = C_ARRAY_LENGTH(JamoT);
16099 /* This is the initial Hangul syllable code point; each time through the
16100 * inner loop, it maps to the next higher code point. For more info,
16101 * see the Hangul syllable section of the Unicode standard. */
16104 syllable_name = sv_2mortal(newSV(syl_max_len));
16105 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
16107 for (L = 0; L < LCount; L++) {
16108 for (V = 0; V < VCount; V++) {
16109 for (T = 0; T < TCount; T++) {
16111 /* Truncate back to the prefix, which is unvarying */
16112 SvCUR_set(syllable_name, hangul_prefix_len);
16114 sv_catpv(syllable_name, JamoL[L]);
16115 sv_catpv(syllable_name, JamoV[V]);
16116 sv_catpv(syllable_name, JamoT[T]);
16118 if (execute_wildcard(subpattern_re,
16119 SvPVX(syllable_name),
16120 SvEND(syllable_name),
16121 SvPVX(syllable_name), 0,
16125 *prop_definition = add_cp_to_invlist(*prop_definition,
16127 found_matches = TRUE;
16136 /* The rest of the algorithmically generatable names are of the form
16137 * "PREFIX-code_point". The prefixes and the code point limits of each
16138 * were returned to us in the array 'algorithmic_names' from data in
16139 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
16140 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
16143 /* Each element of the array is a hash, giving the details for the
16144 * series of names it covers. There is the base name of the characters
16145 * in the series, and the low and high code points in the series. And,
16146 * for optimization purposes a string containing all the legal
16147 * characters that could possibly be in a name in this series. */
16148 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
16149 SV * prefix = * hv_fetchs(this_series, "name", 0);
16150 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
16151 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
16152 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
16154 /* Pre-allocate an SV with enough space */
16155 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
16157 if (high >= 0x10000) {
16158 sv_catpvs(algo_name, "0");
16161 /* This series can be skipped entirely if the pattern requires
16162 * something longer than any name in the series, or can only match an
16163 * empty name, or contains a character not found in any name in the
16165 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
16166 && prog->maxlen > 0
16167 && (strspn(must, legal) == must_len))
16169 for (j = low; j <= high; j++) { /* For each code point in the series */
16171 /* Get its name, and see if it matches the subpattern */
16172 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
16175 if (execute_wildcard(subpattern_re,
16178 SvPVX(algo_name), 0,
16182 *prop_definition = add_cp_to_invlist(*prop_definition, j);
16183 found_matches = TRUE;
16190 /* Finally, see if the subpattern matches an empty string */
16191 empty = newSVpvs("");
16192 if (execute_wildcard(subpattern_re,
16199 /* Many code points have empty names. Currently these are the \p{GC=C}
16200 * ones, minus CC and CF */
16202 SV * empty_names_ref = get_prop_definition(UNI_C);
16203 SV * empty_names = invlist_clone(empty_names_ref, NULL);
16205 SV * subtract = get_prop_definition(UNI_CC);
16207 _invlist_subtract(empty_names, subtract, &empty_names);
16208 SvREFCNT_dec_NN(empty_names_ref);
16209 SvREFCNT_dec_NN(subtract);
16211 subtract = get_prop_definition(UNI_CF);
16212 _invlist_subtract(empty_names, subtract, &empty_names);
16213 SvREFCNT_dec_NN(subtract);
16215 _invlist_union(*prop_definition, empty_names, prop_definition);
16216 found_matches = TRUE;
16217 SvREFCNT_dec_NN(empty_names);
16219 SvREFCNT_dec_NN(empty);
16222 /* If we ever were to accept aliases for, say private use names, we would
16223 * need to do something fancier to find empty names. The code below works
16224 * (at the time it was written), and is slower than the above */
16225 const char empties_pat[] = "^.";
16226 if (strNE(name, empties_pat)) {
16227 SV * empty = newSVpvs("");
16228 if (execute_wildcard(subpattern_re,
16235 SV * empties = NULL;
16237 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
16239 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
16240 SvREFCNT_dec_NN(empties);
16242 found_matches = TRUE;
16244 SvREFCNT_dec_NN(empty);
16248 SvREFCNT_dec_NN(subpattern_re);
16249 return found_matches;
16253 * ex: set ts=8 sts=4 sw=4 et: