* precedence is structured in regular expressions. Serious changes in
* regular-expression syntax might require a total rethink.
*/
+
+/* Note on debug output:
+ *
+ * This is set up so that -Dr turns on debugging like all other flags that are
+ * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
+ * all regular expressions encountered in a program, and gives a huge amount of
+ * output for all but the shortest programs.
+ *
+ * The ability to output pattern debugging information lexically, and with much
+ * finer grained control was added, with 'use re qw(Debug ....);' available even
+ * in non-DEBUGGING builds. This is accomplished by copying the contents of
+ * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
+ * Those files are compiled and linked into the perl executable, and they are
+ * compiled essentially as if DEBUGGING were enabled, and controlled by calls
+ * to re.pm.
+ *
+ * That would normally mean linking errors when two functions of the same name
+ * are attempted to be placed into the same executable. That is solved in one
+ * of four ways:
+ * 1) Static functions aren't known outside the file they are in, so for the
+ * many functions of that type in this file, it just isn't a problem.
+ * 2) Most externally known functions are enclosed in
+ * #ifndef PERL_IN_XSUB_RE
+ * ...
+ * #endif
+ * blocks, so there is only one defintion for them in the whole
+ * executable, the one in regcomp.c (or regexec.c). The implication of
+ * that is any debugging info that comes from them is controlled only by
+ * -Dr. Further, any static function they call will also be the version
+ * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
+ * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
+ * have different names, so that what gets loaded in the executable is
+ * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
+ * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
+ * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
+ * versions and their callees are under control of re.pm. The catch is
+ * that references to all these go through the regexp_engine structure,
+ * which is initialized in regcomp.h to the Perl_foo versions, and
+ * substituted out in lexical scopes where 'use re' is in effect to the
+ * 'my_foo' ones. That structure is public API, so it would be a hard
+ * sell to add any additional members.
+ * 4) For functions in regcomp.c and re_comp.c that are called only from,
+ * respectively, regexec.c and re_exec.c, they can have two different
+ * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
+ * embed.fnc.
+ *
+ * The bottom line is that if you add code to one of the public functions
+ * listed in ext/re/re_top.h, debugging automagically works. But if you write
+ * a new function that needs to do debugging or there is a chain of calls from
+ * it that need to do debugging, all functions in the chain should use options
+ * 2) or 4) above.
+ *
+ * A function may have to be split so that debugging stuff is static, but it
+ * calls out to some other function that only gets compiled in regcomp.c to
+ * access data that we don't want to duplicate.
+ */
+
#include "EXTERN.h"
#define PERL_IN_REGCOMP_C
#include "perl.h"
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
EXTERN_C const struct regexp_engine my_reg_engine;
+EXTERN_C const struct regexp_engine wild_reg_engine;
#else
# include "regcomp.h"
#endif
regnode *next_regnode; /* next node to process when last is reached */
U32 prev_recursed_depth;
I32 stopparen; /* what stopparen do we use */
+ bool in_gosub; /* this or an outer frame is for GOSUB */
struct scan_frame *this_prev_frame; /* this previous frame */
struct scan_frame *prev_frame; /* previous frame */
regnode_offset emit; /* Code-emit pointer */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
- U32 seen;
SSize_t size; /* Number of regnode equivalents in
pattern */
Size_t sets_depth; /* Counts recursion depth of already-
compiled regex set patterns */
+ U32 seen;
+
+ I32 parens_buf_size; /* #slots malloced open/close_parens */
+ regnode_offset *open_parens; /* offsets to open parens */
+ regnode_offset *close_parens; /* offsets to close parens */
+ HV *paren_names; /* Paren names */
/* position beyond 'precomp' of the warning message furthest away from
* 'precomp'. During the parse, no warnings are raised for any problems
I32 nestroot; /* root parens we are in - used by
accept */
I32 seen_zerolen;
- regnode_offset *open_parens; /* offsets to open parens */
- regnode_offset *close_parens; /* offsets to close parens */
- I32 parens_buf_size; /* #slots malloced open/close_parens */
regnode *end_op; /* END node in program */
I32 utf8; /* whether the pattern is utf8 or not */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
I32 uni_semantics; /* If a d charset modifier should use unicode
rules, even if the pattern is not in
utf8 */
- HV *paren_names; /* Paren names */
- regnode **recurse; /* Recurse regops */
I32 recurse_count; /* Number of recurse regops we have generated */
+ regnode **recurse; /* Recurse regops */
U8 *study_chunk_recursed; /* bitmap of which subs we have moved
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 override_recoding;
I32 recode_x_to_native;
I32 in_multi_char_class;
+ int code_index; /* next code_blocks[] slot */
struct reg_code_blocks *code_blocks;/* positions of literal (?{})
within pattern */
- int code_index; /* next code_blocks[] slot */
SSize_t maxlen; /* mininum possible number of chars in string to match */
scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
AV *warn_text;
HV *unlexed_names;
-#ifdef ADD_TO_REGEXEC
- char *starttry; /* -Dr: where regtry was called. */
-#define RExC_starttry (pRExC_state->starttry)
-#endif
SV *runtime_code_qr; /* qr with the runtime code blocks */
#ifdef DEBUGGING
const char *lastparse;
I32 lastnum;
- AV *paren_name_list; /* idx -> name */
U32 study_chunk_recursed_count;
+ AV *paren_name_list; /* idx -> name */
SV *mysv1;
SV *mysv2;
RExC_naughty += RExC_naughty / (exp) + (add)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
-#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
- ((*s) == '{' && regcurly(s)))
+#define ISMULT2(s) (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
/*
* Flags to be passed up and down.
*/
-#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to not match null strings, could match
non-null ones. */
-
-/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
- * character. (There needs to be a case: in the switch statement in regexec.c
- * for any node marked SIMPLE.) Note that this is not the same thing as
- * REGNODE_SIMPLE */
-#define SIMPLE 0x02
-#define SPSTART 0x04 /* Starts with * or + */
+#define SIMPLE 0x02 /* Exactly one character wide */
+ /* (or LNBREAK as a special case) */
#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
#define TRYAGAIN 0x10 /* Weeded out a declaration. */
#define RESTART_PARSE 0x20 /* Need to redo the parse */
#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
+/* What is infinity for optimization purposes */
+#define OPTIMIZE_INFTY SSize_t_MAX
+
/* About scan_data_t.
During optimisation we recurse through the regexp program performing
- max_offset
Only used for floating strings. This is the rightmost point that
- the string can appear at. If set to SSize_t_MAX it indicates that the
+ the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
string can occur infinitely far to the right.
For fixed strings, it is equal to min_offset.
* Like Simple_vFAIL(), but accepts two arguments.
*/
#define Simple_vFAIL2(m,a1) STMT_START { \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
* Like Simple_vFAIL(), but accepts three arguments.
*/
#define Simple_vFAIL3(m, a1, a2) STMT_START { \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
* Like Simple_vFAIL(), but accepts four arguments.
*/
#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
+ S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
#define vFAIL2utf8f(m, a1) STMT_START { \
PREPARE_TO_DIE; \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
+ S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
#define vFAIL3utf8f(m, a1, a2) STMT_START { \
PREPARE_TO_DIE; \
- S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
U32 depth, int is_inf)
{
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_OPTIMISE_MORE_r({
if (!data)
S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
regnode *scan, U32 depth, U32 flags)
{
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_OPTIMISE_r({
regnode *Next;
const STRLEN l = CHR_SVLEN(data->last_found);
SV * const longest_sv = data->substrs[data->cur_is_floating].str;
const STRLEN old_l = CHR_SVLEN(longest_sv);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_SCAN_COMMIT;
if (!i) /* fixed */
data->substrs[0].max_offset = data->substrs[0].min_offset;
else { /* float */
- data->substrs[1].max_offset = (l
+ data->substrs[1].max_offset =
+ (is_inf)
+ ? OPTIMIZE_INFTY
+ : (l
? data->last_start_max
- : (data->pos_delta > SSize_t_MAX - data->pos_min
- ? SSize_t_MAX
+ /* temporary underflow guard for 5.32 */
+ : data->pos_delta < 0 ? OPTIMIZE_INFTY
+ : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
+ ? OPTIMIZE_INFTY
: data->pos_min + data->pos_delta));
- if (is_inf
- || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
- data->substrs[1].max_offset = SSize_t_MAX;
}
- if (data->flags & SF_BEFORE_EOL)
- data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
- else
- data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
data->substrs[i].minlenp = minlenp;
data->substrs[i].lookbehind = 0;
}
* returned list must, and will, contain every code point that is a
* possibility. */
- dVAR;
SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
);
}
-PERL_STATIC_INLINE void
+STATIC void
S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
{
PERL_ARGS_ASSERT_SSC_UNION;
&ssc->invlist);
}
-PERL_STATIC_INLINE void
+STATIC void
S_ssc_intersection(pTHX_ regnode_ssc *ssc,
SV* const invlist,
const bool invert2nd)
&ssc->invlist);
}
-PERL_STATIC_INLINE void
+STATIC void
S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
{
PERL_ARGS_ASSERT_SSC_ADD_RANGE;
ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
}
-PERL_STATIC_INLINE void
+STATIC void
S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
{
/* AND just the single code point 'cp' into the SSC 'ssc' */
SvREFCNT_dec_NN(cp_list);
}
-PERL_STATIC_INLINE void
+STATIC void
S_ssc_clear_locale(regnode_ssc *ssc)
{
/* Set the SSC 'ssc' to not match any locale things */
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
U16 word;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE;
U32 state;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
U16 charid;
SV *sv=sv_newmortal();
int colwidth= widecharmap ? 6 : 4;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
STRLEN trie_charcount=0;
#endif
SV *re_trie_maxbuff;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_MAKE_TRIE;
#ifndef DEBUGGING
reg_ac_data *aho;
const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
regnode *stclass;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
PERL_UNUSED_CONTEXT;
* character folded sequences. Since a single character can fold into
* such a sequence, the minimum match length for this node is less than
* the number of characters in the node. This routine returns in
- * *min_subtract how many characters to subtract from the the actual
+ * *min_subtract how many characters to subtract from the actual
* length of the string to get a real minimum match length; it is 0 if
* there are no multi-char foldeds. This delta is used by the caller to
* adjust the min length of the match, and the delta between min and max,
U32 stopnow = 0;
#ifdef DEBUGGING
regnode *stop = scan;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
#else
PERL_UNUSED_ARG(depth);
#endif
s++;
}
}
- else {
+ else if (OP(scan) != EXACTFAA_NO_TRIE) {
/* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
* folds that are all Latin1. As explained in the comments
/* EXACTF nodes need to know that the minimum length
* changed so that a sharp s in the string can match this
* ss in the pattern, but they remain EXACTF nodes, as they
- * won't match this unless the target string is is UTF-8,
+ * won't match this unless the target string is in UTF-8,
* which we don't know until runtime. EXACTFL nodes can't
* transform into EXACTFU nodes */
if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
} while (f);
}
+/* Follow the next-chain of the current node and optimize away
+ all the NOTHINGs from it.
+ */
+STATIC void
+S_rck_elide_nothing(pTHX_ regnode *node)
+{
+ PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
+
+ if (OP(node) != CURLYX) {
+ const int max = (reg_off_by_arg[OP(node)]
+ ? I32_MAX
+ /* I32 may be smaller than U16 on CRAYs! */
+ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
+ int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
+ int noff;
+ regnode *n = node;
+
+ /* Skip NOTHING and LONGJMP. */
+ while (
+ (n = regnext(n))
+ && (
+ (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
+ || ((OP(n) == LONGJMP) && (noff = ARG(n)))
+ )
+ && off + noff < max
+ ) {
+ off += noff;
+ }
+ if (reg_off_by_arg[OP(node)])
+ ARG(node) = off;
+ else
+ NEXT_OFF(node) = off;
+ }
+ return;
+}
+
/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
I32 stopparen,
U32 recursed_depth,
regnode_ssc *and_withp,
- U32 flags, U32 depth)
+ U32 flags, U32 depth, bool was_mutate_ok)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
/* recursed: which subroutines have we recursed into */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
- dVAR;
SSize_t final_minlen;
/* There must be at least this number of characters to match */
SSize_t min = 0;
scan_data_t data_fake;
SV *re_trie_maxbuff = NULL;
regnode *first_non_open = scan;
- SSize_t stopmin = SSize_t_MAX;
+ SSize_t stopmin = OPTIMIZE_INFTY;
scan_frame *frame = NULL;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_STUDY_CHUNK;
RExC_study_started= 1;
node length to get a real minimum (because
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
+ /* avoid mutating ops if we are anywhere within the recursed or
+ * enframed handling for a GOSUB: the outermost level will handle it.
+ */
+ bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep", data, depth, is_inf);
DEBUG_PEEP("Peep", scan, depth, flags);
* parsing code, as each (?:..) is handled by a different invocation of
* reg() -- Yves
*/
- if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT
- && OP(scan) != LEXACT_REQ8)
+ if (PL_regkind[OP(scan)] == EXACT
+ && OP(scan) != LEXACT
+ && OP(scan) != LEXACT_REQ8
+ && mutate_ok
+ ) {
join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
0, NULL, depth + 1);
+ }
/* Follow the next-chain of the current node and optimize
- away all the NOTHINGs from it. */
- if (OP(scan) != CURLYX) {
- const int max = (reg_off_by_arg[OP(scan)]
- ? I32_MAX
- /* I32 may be smaller than U16 on CRAYs! */
- : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
- int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
- int noff;
- regnode *n = scan;
-
- /* Skip NOTHING and LONGJMP. */
- while ( (n = regnext(n))
- && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
- || ((OP(n) == LONGJMP) && (noff = ARG(n))))
- && off + noff < max)
- off += noff;
- if (reg_off_by_arg[OP(scan)])
- ARG(scan) = off;
- else
- NEXT_OFF(scan) = off;
- }
+ away all the NOTHINGs from it.
+ */
+ rck_elide_nothing(scan);
/* The principal pseudo-switch. Cannot be a switch, since we look into
* several different things. */
/* DEFINEP study_chunk() recursion */
(void)study_chunk(pRExC_state, &scan, &minlen,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1, mutate_ok);
scan = next;
} else
/* NOTE - There is similar code to this block below for
* handling TRIE nodes on a re-study. If you change stuff here
* check there too. */
- SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
+ SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
regnode_ssc accum;
regnode * const startbranch=scan;
/* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (min1 > minnext)
min1 = minnext;
- if (deltanext == SSize_t_MAX) {
+ if (deltanext == OPTIMIZE_INFTY) {
is_inf = is_inf_internal = 1;
- max1 = SSize_t_MAX;
+ max1 = OPTIMIZE_INFTY;
} else if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
scan = next;
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
- if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
- data->pos_delta = SSize_t_MAX;
+ if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
+ data->pos_delta = OPTIMIZE_INFTY;
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
data->cur_is_floating = 1;
}
min += min1;
- if (delta == SSize_t_MAX
- || SSize_t_MAX - delta - (max1 - min1) < 0)
- delta = SSize_t_MAX;
+ if (delta == OPTIMIZE_INFTY
+ || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
+ delta = OPTIMIZE_INFTY;
else
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
}
}
- if (PERL_ENABLE_TRIE_OPTIMISATION &&
- OP( startbranch ) == BRANCH )
- {
+ if (PERL_ENABLE_TRIE_OPTIMISATION
+ && OP(startbranch) == BRANCH
+ && mutate_ok
+ ) {
/* demq.
Assuming this was/is a branch we are dealing with: 'scan'
* might result in a minlen of 1 and not of 4,
* but this doesn't make us mismatch, just try a bit
* harder than we should.
- * */
+ *
+ * However we must assume this GOSUB is infinite, to
+ * avoid wrongly applying other optimizations in the
+ * enclosing scope - see GH 18096, for example.
+ */
+ is_inf = is_inf_internal = 1;
scan= regnext(scan);
continue;
}
newframe->stopparen = stopparen;
newframe->prev_recursed_depth = recursed_depth;
newframe->this_prev_frame= frame;
+ newframe->in_gosub = (
+ (frame && frame->in_gosub) || OP(scan) == GOSUB
+ );
DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
DEBUG_PEEP("fnew", scan, depth, flags);
offset, later match for variable offset. */
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
- data->last_start_max = is_inf
- ? SSize_t_MAX : data->pos_min + data->pos_delta;
+ data->last_start_max =
+ is_inf ? OPTIMIZE_INFTY
+ : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
+ ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), bytelen);
if (UTF)
&& isALPHA_A(*s)
&& ( OP(scan) == EXACTFAA
|| ( OP(scan) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s))))
- {
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
+ && mutate_ok
+ ) {
U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
OP(scan) = ANYOFM;
}
if (flags & SCF_DO_SUBSTR)
data->pos_min++;
+ /* This will bypass the formal 'min += minnext * mincount'
+ * calculation in the do_curly path, so assumes min width
+ * of the PLUS payload is exactly one. */
min++;
/* FALLTHROUGH */
case STAR:
/* This temporary node can now be turned into EXACTFU, and
* must, as regexec.c doesn't handle it */
- if (OP(next) == EXACTFU_S_EDGE) {
+ if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
OP(next) = EXACTFU;
}
&& isALPHA_A(* STRING(next))
&& ( OP(next) == EXACTFAA
|| ( OP(next) == EXACTFU
- && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
- {
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
+ && mutate_ok
+ ) {
/* These differ in just one bit */
U8 mask = ~ ('A' ^ 'a');
(mincount == 0
? (f & ~SCF_DO_SUBSTR)
: f)
- ,depth+1);
+ , depth+1, mutate_ok);
if (flags & SCF_DO_STCLASS)
data->start_class = oclass;
RExC_precomp)));
}
+ if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
+ || min >= SSize_t_MAX - minnext * mincount )
+ {
+ FAIL("Regexp out of space");
+ }
+
min += minnext * mincount;
- is_inf_internal |= deltanext == SSize_t_MAX
+ is_inf_internal |= deltanext == OPTIMIZE_INFTY
|| (maxcount == REG_INFTY && minnext + deltanext > 0);
is_inf |= is_inf_internal;
if (is_inf) {
- delta = SSize_t_MAX;
+ delta = OPTIMIZE_INFTY;
} else {
delta += (minnext + deltanext) * maxcount
- minnext * mincount;
if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
&& !(data->flags & SF_HAS_EVAL)
- && !deltanext && minnext == 1 ) {
+ && !deltanext && minnext == 1
+ && mutate_ok
+ ) {
/* Try to optimize to CURLYN. */
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
regnode * const nxt1 = nxt;
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext /* atom is fixed width */
&& minnext != 0 /* CURLYM can't handle zero width */
-
/* Nor characters whose fold at run-time may be
* multi-character */
&& ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
+ && mutate_ok
) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
/* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, stopparen, recursed_depth, NULL, 0,
- depth+1);
+ depth+1, mutate_ok);
}
else
oscan->flags = 0;
data->last_start_min += minnext * (mincount - 1);
data->last_start_max =
is_inf
- ? SSize_t_MAX
+ ? OPTIMIZE_INFTY
: data->last_start_max +
(maxcount - 1) * (minnext + data->pos_delta);
}
data->pos_min += minnext * (mincount - counted);
#if 0
Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
- " SSize_t_MAX=%" UVuf " minnext=%" UVuf
+ " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
" maxcount=%" UVuf " mincount=%" UVuf "\n",
- (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+ (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
(UV)mincount);
-if (deltanext != SSize_t_MAX)
+if (deltanext != OPTIMIZE_INFTY)
Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
+ - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
#endif
- if (deltanext == SSize_t_MAX
- || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
- data->pos_delta = SSize_t_MAX;
+ if (deltanext == OPTIMIZE_INFTY
+ || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
+ data->pos_delta = OPTIMIZE_INFTY;
else
data->pos_delta += - counted * deltanext +
(minnext + deltanext) * maxcount - minnext * mincount;
data->last_end = data->pos_min;
data->last_start_min = data->pos_min - last_chrs;
data->last_start_max = is_inf
- ? SSize_t_MAX
+ ? OPTIMIZE_INFTY
: data->pos_min + data->pos_delta - last_chrs;
}
data->cur_is_floating = 1; /* float */
if (data && (fl & SF_HAS_EVAL))
data->flags |= SF_HAS_EVAL;
optimize_curly_tail:
- if (OP(oscan) != CURLYX) {
- while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
- && NEXT_OFF(next))
- NEXT_OFF(oscan) += NEXT_OFF(next);
- }
+ rck_elide_nothing(oscan);
continue;
default:
flags &= ~SCF_DO_STCLASS;
}
min++;
- if (delta != SSize_t_MAX)
+ if (delta != OPTIMIZE_INFTY)
delta++; /* Because of the 2 char string cr-lf */
if (flags & SCF_DO_SUBSTR) {
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
data->pos_min += 1;
- if (data->pos_delta != SSize_t_MAX) {
+ if (data->pos_delta != OPTIMIZE_INFTY) {
data->pos_delta += 1;
}
data->cur_is_floating = 1; /* float */
(regnode_charclass *) scan);
break;
- case NANYOFM:
+ case NANYOFM: /* NANYOFM already contains the inversion of the
+ input ANYOF data, so, unlike things like
+ NPOSIXA, don't change 'invert' to TRUE */
+ /* FALLTHROUGH */
case ANYOFM:
{
SV* cp_list = get_ANYOFM_contents(scan);
/* recurse study_chunk() for lookahead body */
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
last, &data_fake, stopparen,
- recursed_depth, NULL, f, depth+1);
+ recursed_depth, NULL, f, depth+1,
+ mutate_ok);
if (scan->flags) {
if ( deltanext < 0
|| deltanext > (I32) U8_MAX
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
- f, depth+1);
+ f, depth+1, mutate_ok);
if (scan->flags) {
assert(0); /* This code has never been tested since this
is normally not compiled */
regnode *trie_node= scan;
regnode *tail= regnext(scan);
reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
- SSize_t max1 = 0, min1 = SSize_t_MAX;
+ SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
regnode_ssc accum;
if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
/* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
- stopparen, recursed_depth, NULL, f, depth+1);
+ stopparen, recursed_depth, NULL, f, depth+1,
+ mutate_ok);
}
if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
nextbranch= regnext((regnode*)nextbranch);
if (min1 > (SSize_t)(minnext + trie->minlen))
min1 = minnext + trie->minlen;
- if (deltanext == SSize_t_MAX) {
+ if (deltanext == OPTIMIZE_INFTY) {
is_inf = is_inf_internal = 1;
- max1 = SSize_t_MAX;
+ max1 = OPTIMIZE_INFTY;
} else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
max1 = minnext + deltanext + trie->maxlen;
data->cur_is_floating = 1; /* float */
}
min += min1;
- if (delta != SSize_t_MAX) {
- if (SSize_t_MAX - (max1 - min1) >= delta)
+ if (delta != OPTIMIZE_INFTY) {
+ if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
delta += max1 - min1;
else
- delta = SSize_t_MAX;
+ delta = OPTIMIZE_INFTY;
}
if (flags & SCF_DO_STCLASS_OR) {
ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
*scanp = scan;
- *deltap = is_inf_internal ? SSize_t_MAX : delta;
+ *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
if (flags & SCF_DO_SUBSTR && is_inf)
- data->pos_delta = SSize_t_MAX - data->pos_min;
+ data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
if (is_par > (I32)U8_MAX)
is_par = 0;
if (is_par && pars==1 && data) {
final_minlen = min < stopmin
? min : stopmin;
+
if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
- if (final_minlen > SSize_t_MAX - delta)
- RExC_maxlen = SSize_t_MAX;
+ if (final_minlen > OPTIMIZE_INFTY - delta)
+ RExC_maxlen = OPTIMIZE_INFTY;
else if (RExC_maxlen < final_minlen + delta)
RExC_maxlen = final_minlen + delta;
}
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
regexp_engine const *eng = current_re_engine();
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_PREGCOMP;
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
- PERL_ARGS_ASSERT_RE_COMPILE;
- return re_op_compile_wrapper(pattern, rx_flags, 0);
-}
-
-REGEXP *
-S_re_op_compile_wrapper(pTHX_ SV * const pattern, U32 rx_flags, const U32 pm_flags)
-{
SV *pat = pattern; /* defeat constness! */
- PERL_ARGS_ASSERT_RE_OP_COMPILE_WRAPPER;
+ PERL_ARGS_ASSERT_RE_COMPILE;
return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
#ifdef PERL_IN_XSUB_RE
#else
&PL_core_reg_engine,
#endif
- NULL, NULL, rx_flags, pm_flags);
+ NULL, NULL, rx_flags, 0);
}
-
static void
S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
{
int n=0;
STRLEN s = 0;
bool do_end = 0;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
DEBUG_PARSE_r(Perl_re_printf( aTHX_
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
/* we make the assumption here that each op in the list of
* op_siblings maps to one SV pushed onto the stack,
* except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
+ * an OP_CONST.
* This allows us to match up the list of SVs against the
* list of OPs to find the next code block.
*
pRExC_state->code_blocks->count -= n;
n = 0;
}
- else {
+ else {
/* ... or failing that, try "" overload */
while (SvAMAGIC(msv)
&& (sv = AMG_CALLunary(msv, string_amg))
{
SV *qr;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
if (pRExC_state->runtime_code_qr) {
/* this is the second time we've been called; this should
PERL_ARGS_ASSERT_SET_REGEX_PV;
/* make sure PL_bitcount bounds not exceeded */
- assert(sizeof(STD_PAT_MODS) <= 8);
+ STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(Rx);
* length of the pattern. Patches welcome to improve that guess. That amount
* of space is malloc'd and then immediately freed, and then clawed back node
* by node. This design is to minimze, to the extent possible, memory churn
- * when doing the the reallocs.
+ * when doing the reallocs.
*
* A separate parentheses counting pass may be needed in some cases.
* (Previously the sizing pass did this.) Patches welcome to reduce the number
OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
{
- dVAR;
REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
STRLEN plen;
char *exp;
int restudied = 0;
RExC_state_t copyRExC_state;
#endif
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_RE_OP_COMPILE;
DEBUG_OFFSETS_r(if (RExC_offsets) {
const STRLEN len = RExC_offsets[0];
STRLEN i;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
Perl_re_printf( aTHX_
"Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
for (i = 1; i <= len; i++) {
&data, -1, 0, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
| (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
/* Don't offset infinity */
- if (data.substrs[i].max_offset < SSize_t_MAX)
+ if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
}
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
? SCF_TRIE_DOING_RESTUDY
: 0),
- 0);
+ 0, TRUE);
CHECK_RESTUDY_GOTO_butfirst(NOOP);
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
{
struct regexp *const rx = ReANY(r);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
} else if (flags & RXapif_ONE) {
ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
av = MUTABLE_AV(SvRV(ret));
- length = av_tindex(av);
+ length = av_count(av);
SvREFCNT_dec_NN(ret);
- return newSViv(length + 1);
+ return newSViv(length);
} else {
Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
(int)flags);
Perl_re_printf( aTHX_ "%16s",""); \
\
if (RExC_lastnum!=RExC_emit) \
- Perl_re_printf( aTHX_ "|%4d", RExC_emit); \
+ Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
else \
Perl_re_printf( aTHX_ "|%4s",""); \
Perl_re_printf( aTHX_ "|%*s%-4s", \
* one of them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the union's array */
- bool cp_in_set; /* is it in the the input list's set or not */
+ bool cp_in_set; /* is it in the input list's set or not */
/* We need to take one or the other of the two inputs for the union.
* Since we are merging two sorted lists, we take the smaller of the
STATIC SV*
S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
{
- dVAR;
const U8 * s = (U8*)STRING(node);
SSize_t bytelen = STR_LEN(node);
UV uc;
fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
/* The only code points that aren't folded in a UTF EXACTFish
- * node are are the problematic ones in EXACTFL nodes */
+ * node are the problematic ones in EXACTFL nodes */
if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
/* We need to check for the possibility that this EXACTFL
* node begins with a multi-char fold. Therefore we fold
#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
#endif
-PERL_STATIC_INLINE regnode_offset
+STATIC regnode_offset
S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
I32 *flagp,
char * parse_start,
char* name_start = RExC_parse;
U32 num = 0;
SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
char * parse_start = RExC_parse; /* MJD */
char * const oregcomp_parse = RExC_parse;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
vFAIL("Too many nested open parens");
}
- *flagp = 0; /* Tentatively. */
+ *flagp = 0; /* Initialize. */
if (RExC_in_lookbehind) {
RExC_in_lookbehind++;
}
if ( arg_required && !start_arg ) {
vFAIL3("Verb pattern '%.*s' has a mandatory argument",
- verb_len, start_verb);
+ (int) verb_len, start_verb);
}
if (internal_argval == -1) {
ret = reganode(pRExC_state, op, 0);
bool is_logical = 0;
const char * const seqstart = RExC_parse;
const char * endptr;
+ const char non_existent_group_msg[]
+ = "Reference to nonexistent group";
+ const char impossible_group[] = "Invalid reference to group";
+
if (has_intervening_patws) {
RExC_parse++;
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
- RExC_parse-seqstart, seqstart);
+ (int) (RExC_parse - seqstart), seqstart);
NOT_REACHED; /*NOTREACHED*/
case '<': /* (?<...) */
/* If you want to support (?<*...), first reconcile with GH #17363 */
|| *RExC_parse != paren)
{
vFAIL2("Sequence (?%c... not terminated",
- paren=='>' ? '<' : paren);
+ paren=='>' ? '<' : (char) paren);
}
{
HE *he_str;
) {
num = (I32)unum;
RExC_parse = (char*)endptr;
- } else
- num = I32_MAX;
+ }
+ else { /* Overflow, or something like that. Position
+ beyond all digits for the message */
+ while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
+ RExC_parse++;
+ }
+ vFAIL(impossible_group);
+ }
if (is_neg) {
- /* Some limit for num? */
+ /* -num is always representable on 1 and 2's complement
+ * machines */
num = -num;
}
}
vFAIL("Expecting close bracket");
gen_recurse_regop:
- if ( paren == '-' ) {
+ if (paren == '-' || paren == '+') {
+
+ /* Don't overflow */
+ if (UNLIKELY(I32_MAX - RExC_npar < num)) {
+ RExC_parse++;
+ vFAIL(impossible_group);
+ }
+
/*
Diagram of capture buffer numbering.
Top line is the normal capture buffer numbers
Bottom line is the negative indexing as from
the X (the (?-2))
- + 1 2 3 4 5 X 6 7
+ 1 2 3 4 5 X Y 6 7
+ /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
/(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
- - 5 4 3 2 1 X x x
+ - 5 4 3 2 1 X Y x x
+ Resolve to absolute group. Recall that RExC_npar is +1 of
+ the actual parenthesis group number. For lookahead, we
+ have to compensate for that. Using the above example, when
+ we get to Y in the parse, num is 2 and RExC_npar is 6. We
+ want 7 for +2, and 4 for -2.
*/
- num = RExC_npar + num;
- if (num < 1) {
+ if ( paren == '+' ) {
+ num--;
+ }
- /* It might be a forward reference; we can't fail until
- * we know, by completing the parse to get all the
- * groups, and then reparsing */
- if (ALL_PARENS_COUNTED) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
- }
- else {
- REQUIRE_PARENS_PASS;
- }
+ num += RExC_npar;
+
+ if (paren == '-' && num < 1) {
+ RExC_parse++;
+ vFAIL(non_existent_group_msg);
}
- } else if ( paren == '+' ) {
- num = RExC_npar + num - 1;
}
- /* We keep track how many GOSUB items we have produced.
- To start off the ARG2L() of the GOSUB holds its "id",
- which is used later in conjunction with RExC_recurse
- to calculate the offset we need to jump for the GOSUB,
- which it will store in the final representation.
- We have to defer the actual calculation until much later
- as the regop may move.
- */
- ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
if (num >= RExC_npar) {
/* It might be a forward reference; we can't fail until we
if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
RExC_parse++;
- vFAIL("Reference to nonexistent group");
+ vFAIL(non_existent_group_msg);
}
}
else {
REQUIRE_PARENS_PASS;
}
}
+
+ /* We keep track how many GOSUB items we have produced.
+ To start off the ARG2L() of the GOSUB holds its "id",
+ which is used later in conjunction with RExC_recurse
+ to calculate the offset we need to jump for the GOSUB,
+ which it will store in the final representation.
+ We have to defer the actual calculation until much later
+ as the regop may move.
+ */
+ ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
RExC_recurse_count++;
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
if (RExC_open_parens && !RExC_open_parens[parno])
{
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting open paren #%" IVdf " to %d\n",
+ "%*s%*s Setting open paren #%" IVdf " to %zu\n",
22, "| |", (int)(depth * 2 + 1), "",
(IV)parno, ret));
RExC_open_parens[parno]= ret;
}
else if (paren != '?') /* Not Conditional */
ret = br;
- *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
+ *flagp |= flags & (HASWIDTH | POSTPONED);
lastbr = br;
while (*RExC_parse == '|') {
if (RExC_use_BRANCHJ) {
REQUIRE_BRANCHJ(flagp, 0);
}
lastbr = br;
- *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
+ *flagp |= flags & (HASWIDTH | POSTPONED);
}
if (have_branch || paren != ':') {
ender = reganode(pRExC_state, CLOSE, parno);
if ( RExC_close_parens ) {
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting close paren #%" IVdf " to %d\n",
+ "%*s%*s Setting close paren #%" IVdf " to %zu\n",
22, "| |", (int)(depth * 2 + 1), "",
(IV)parno, ender));
RExC_close_parens[parno]= ender;
RExC_end_op = REGNODE_p(ender);
if (RExC_close_parens) {
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting close paren #0 (END) to %d\n",
+ "%*s%*s Setting close paren #0 (END) to %zu\n",
22, "| |", (int)(depth * 2 + 1), "",
ender));
regnode_offset chain = 0;
regnode_offset latest;
I32 flags = 0, c = 0;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGBRANCH;
}
}
- *flagp = WORST; /* Tentatively. */
+ *flagp = 0; /* Initialize. */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force to /x */ );
else if (ret == 0)
ret = latest;
*flagp |= flags&(HASWIDTH|POSTPONED);
- if (chain == 0) /* First piece. */
- *flagp |= flags&SPSTART;
- else {
+ if (chain != 0) {
/* FIXME adding one for every branch after the first is probably
* excessive now we have TRIE support. (hv) */
MARK_NAUGHTY(1);
/* Save the original in case we change the emitted regop to a FAIL. */
const regnode_offset orig_emit = RExC_emit;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGPIECE;
FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
}
+ if (! ISMULT2(RExC_parse)) {
+ *flagp = flags;
+ return(ret);
+ }
+
+ /* Here we know the input is a legal quantifier, including {m,n} */
+
op = *RExC_parse;
- if (op == '{' && regcurly(RExC_parse)) {
- maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
- parse_start = RExC_parse; /* MJD */
+ parse_start = RExC_parse;
#endif
- next = RExC_parse + 1;
- while (isDIGIT(*next) || *next == ',') {
- if (*next == ',') {
- if (maxpos)
- break;
- else
- maxpos = next;
- }
- next++;
- }
- if (*next == '}') { /* got one */
- const char* endptr;
- if (!maxpos)
- maxpos = next;
- RExC_parse++;
- if (isDIGIT(*RExC_parse)) {
- endptr = RExC_end;
- if (!grok_atoUV(RExC_parse, &uv, &endptr))
- vFAIL("Invalid quantifier in {,}");
- if (uv >= REG_INFTY)
- vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
- min = (I32)uv;
- } else {
- min = 0;
- }
- if (*maxpos == ',')
- maxpos++;
- else
- maxpos = RExC_parse;
- if (isDIGIT(*maxpos)) {
- endptr = RExC_end;
- if (!grok_atoUV(maxpos, &uv, &endptr))
- vFAIL("Invalid quantifier in {,}");
- if (uv >= REG_INFTY)
- vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
- max = (I32)uv;
- } else {
- max = REG_INFTY; /* meaning "infinity" */
- }
- RExC_parse = next;
- nextchar(pRExC_state);
- if (max < min) { /* If can't match, warn and optimize to fail
- unconditionally */
- reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
- ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
- NEXT_OFF(REGNODE_p(orig_emit)) =
- regarglen[OPFAIL] + NODE_STEP_REGNODE;
- return ret;
- }
- else if (min == max && *RExC_parse == '?')
- {
- ckWARN2reg(RExC_parse + 1,
- "Useless use of greediness modifier '%c'",
- *RExC_parse);
- }
- do_curly:
- if ((flags&SIMPLE)) {
- if (min == 0 && max == REG_INFTY) {
-
- /* Going from 0..inf is currently forbidden in wildcard
- * subpatterns. The only reason is to make it harder to
- * write patterns that take a long long time to halt, and
- * because the use of this construct isn't necessary in
- * matching Unicode property values */
- if (RExC_pm_flags & PMf_WILDCARD) {
- RExC_parse++;
- /* diag_listed_as: Use of %s is not allowed in Unicode
- property wildcard subpatterns in regex; marked by
- <-- HERE in m/%s/ */
- vFAIL("Use of quantifier '*' is not allowed in"
- " Unicode property wildcard subpatterns");
- /* Note, don't need to worry about {0,}, as a '}' isn't
- * legal at all in wildcards, so wouldn't get this far
- * */
- }
- reginsert(pRExC_state, STAR, ret, depth+1);
- MARK_NAUGHTY(4);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- goto nest_check;
- }
- if (min == 1 && max == REG_INFTY) {
- reginsert(pRExC_state, PLUS, ret, depth+1);
- MARK_NAUGHTY(3);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- goto nest_check;
- }
- MARK_NAUGHTY_EXP(2, 2);
- reginsert(pRExC_state, CURLY, ret, depth+1);
- Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
- }
- else {
- const regnode_offset w = reg_node(pRExC_state, WHILEM);
+ if (op != '{') {
+ nextchar(pRExC_state);
- FLAGS(REGNODE_p(w)) = 0;
- if (! REGTAIL(pRExC_state, ret, w)) {
- REQUIRE_BRANCHJ(flagp, 0);
- }
- if (RExC_use_BRANCHJ) {
- reginsert(pRExC_state, LONGJMP, ret, depth+1);
- reginsert(pRExC_state, NOTHING, ret, depth+1);
- NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
- }
- reginsert(pRExC_state, CURLYX, ret, depth+1);
- /* MJD hk */
- Set_Node_Offset(REGNODE_p(ret), parse_start+1);
- Set_Node_Length(REGNODE_p(ret),
- op == '{' ? (RExC_parse - parse_start) : 1);
+ *flagp = HASWIDTH;
- if (RExC_use_BRANCHJ)
- NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
- LONGJMP. */
- if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
- NOTHING)))
- {
- REQUIRE_BRANCHJ(flagp, 0);
- }
- RExC_whilem_seen++;
- MARK_NAUGHTY_EXP(1, 4); /* compound interest */
- }
- FLAGS(REGNODE_p(ret)) = 0;
-
- if (min > 0)
- *flagp = WORST;
- if (max > 0)
- *flagp |= HASWIDTH;
- ARG1_SET(REGNODE_p(ret), (U16)min);
- ARG2_SET(REGNODE_p(ret), (U16)max);
- if (max == REG_INFTY)
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
-
- goto nest_check;
- }
+ if (op == '*') {
+ min = 0;
+ }
+ else if (op == '+') {
+ min = 1;
+ }
+ else if (op == '?') {
+ min = 0; max = 1;
+ }
}
+ else { /* is '{' */
+ const char* endptr;
- if (!ISMULT1(op)) {
- *flagp = flags;
- return(ret);
- }
+ maxpos = NULL;
+ next = RExC_parse + 1;
+ while (isDIGIT(*next) || *next == ',') {
+ if (*next == ',') {
+ if (maxpos)
+ break;
+ else
+ maxpos = next;
+ }
+ next++;
+ }
-#if 0 /* Now runtime fix should be reliable. */
+ assert(*next == '}');
- /* if this is reinstated, don't forget to put this back into perldiag:
+ if (!maxpos)
+ maxpos = next;
+ RExC_parse++;
+ if (isDIGIT(*RExC_parse)) {
+ endptr = RExC_end;
+ if (!grok_atoUV(RExC_parse, &uv, &endptr))
+ vFAIL("Invalid quantifier in {,}");
+ if (uv >= REG_INFTY)
+ vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+ min = (I32)uv;
+ } else {
+ min = 0;
+ }
+ if (*maxpos == ',')
+ maxpos++;
+ else
+ maxpos = RExC_parse;
+ if (isDIGIT(*maxpos)) {
+ endptr = RExC_end;
+ if (!grok_atoUV(maxpos, &uv, &endptr))
+ vFAIL("Invalid quantifier in {,}");
+ if (uv >= REG_INFTY)
+ vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+ max = (I32)uv;
+ } else {
+ max = REG_INFTY; /* meaning "infinity" */
+ }
+ RExC_parse = next;
+ nextchar(pRExC_state);
+ if (max < min) { /* If can't match, warn and optimize to fail
+ unconditionally */
+ reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
+ ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+ NEXT_OFF(REGNODE_p(orig_emit)) =
+ regarglen[OPFAIL] + NODE_STEP_REGNODE;
+ return ret;
+ }
+ else if (min == max && *RExC_parse == '?')
+ {
+ ckWARN2reg(RExC_parse + 1,
+ "Useless use of greediness modifier '%c'",
+ *RExC_parse);
+ }
+ }
- =item Regexp *+ operand could be empty at {#} in regex m/%s/
+ if (! (flags & (HASWIDTH|POSTPONED))) {
+ if (max > REG_INFTY/3) {
+ if (origparse[0] == '\\' && origparse[1] == 'K') {
+ vFAIL2utf8f(
+ "%" UTF8f " is forbidden - matches null string"
+ " many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ } else {
+ ckWARN2reg(RExC_parse,
+ "%" UTF8f " matches null string many times",
+ UTF8fARG(UTF, (RExC_parse >= origparse
+ ? RExC_parse - origparse
+ : 0),
+ origparse));
+ }
+ }
+ }
- (F) The part of the regexp subject to either the * or + quantifier
- could match an empty string. The {#} shows in the regular
- expression about where the problem was discovered.
+ if ((flags&SIMPLE)) {
+ if (min == 0 && max == REG_INFTY) {
- */
+ /* Going from 0..inf is currently forbidden in wildcard
+ * subpatterns. The only reason is to make it harder to
+ * write patterns that take a long long time to halt, and
+ * because the use of this construct isn't necessary in
+ * matching Unicode property values */
+ if (RExC_pm_flags & PMf_WILDCARD) {
+ RExC_parse++;
+ /* diag_listed_as: Use of %s is not allowed in Unicode
+ property wildcard subpatterns in regex; marked by
+ <-- HERE in m/%s/ */
+ vFAIL("Use of quantifier '*' is not allowed in"
+ " Unicode property wildcard subpatterns");
+ /* Note, don't need to worry about {0,}, as a '}' isn't
+ * legal at all in wildcards, so wouldn't get this far
+ * */
+ }
+ reginsert(pRExC_state, STAR, ret, depth+1);
+ MARK_NAUGHTY(4);
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+ goto nest_check;
+ }
+ if (min == 1 && max == REG_INFTY) {
+ reginsert(pRExC_state, PLUS, ret, depth+1);
+ MARK_NAUGHTY(3);
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+ goto nest_check;
+ }
+ MARK_NAUGHTY_EXP(2, 2);
+ reginsert(pRExC_state, CURLY, ret, depth+1);
+ Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
+ Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
+ }
+ else {
+ const regnode_offset w = reg_node(pRExC_state, WHILEM);
- if (!(flags&HASWIDTH) && op != '?')
- vFAIL("Regexp *+ operand could be empty");
-#endif
+ FLAGS(REGNODE_p(w)) = 0;
+ if (! REGTAIL(pRExC_state, ret, w)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ if (RExC_use_BRANCHJ) {
+ reginsert(pRExC_state, LONGJMP, ret, depth+1);
+ reginsert(pRExC_state, NOTHING, ret, depth+1);
+ NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
+ }
+ reginsert(pRExC_state, CURLYX, ret, depth+1);
+ /* MJD hk */
+ Set_Node_Offset(REGNODE_p(ret), parse_start+1);
+ Set_Node_Length(REGNODE_p(ret),
+ op == '{' ? (RExC_parse - parse_start) : 1);
+
+ if (RExC_use_BRANCHJ)
+ NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
+ LONGJMP. */
+ if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
+ NOTHING)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
+ RExC_whilem_seen++;
+ MARK_NAUGHTY_EXP(1, 4); /* compound interest */
+ }
+ FLAGS(REGNODE_p(ret)) = 0;
-#ifdef RE_TRACK_PATTERN_OFFSETS
- parse_start = RExC_parse;
-#endif
- nextchar(pRExC_state);
+ if (min > 0)
+ *flagp = 0;
+ if (max > 0)
+ *flagp |= HASWIDTH;
+ ARG1_SET(REGNODE_p(ret), (U16)min);
+ ARG2_SET(REGNODE_p(ret), (U16)max);
+ if (max == REG_INFTY)
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
+ goto nest_check;
- if (op == '*') {
- min = 0;
- goto do_curly;
- }
- else if (op == '+') {
- min = 1;
- goto do_curly;
- }
- else if (op == '?') {
- min = 0; max = 1;
- goto do_curly;
- }
nest_check:
- if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
- if (origparse[0] == '\\' && origparse[1] == 'K') {
- vFAIL2utf8f(
- "%" UTF8f " is forbidden - matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
- /* NOT-REACHED */
- } else {
- ckWARN2reg(RExC_parse,
- "%" UTF8f " matches null string many times",
- UTF8fARG(UTF, (RExC_parse >= origparse
- ? RExC_parse - origparse
- : 0),
- origparse));
- }
- }
if (*RExC_parse == '?') {
- nextchar(pRExC_state);
- reginsert(pRExC_state, MINMOD, ret, depth+1);
+ nextchar(pRExC_state);
+ reginsert(pRExC_state, MINMOD, ret, depth+1);
if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
REQUIRE_BRANCHJ(flagp, 0);
}
}
if (ISMULT2(RExC_parse)) {
- RExC_parse++;
- vFAIL("Nested quantifiers");
+ RExC_parse++;
+ vFAIL("Nested quantifiers");
}
return(ret);
char *save_start;
I32 flags;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_GROK_BSLASH_N;
- GET_RE_DEBUG_FLAGS;
-
assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
assert(! (node_p && cp_count)); /* At most 1 should be set */
FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
(UV) flags);
}
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+ *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
nextchar(pRExC_state);
}
-PERL_STATIC_INLINE U8
+STATIC U8
S_compute_EXACTish(RExC_state_t *pRExC_state)
{
U8 op;
STATIC regnode_offset
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode_offset ret = 0;
I32 flags = 0;
char *parse_start;
U8 op;
int invert = 0;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
- *flagp = WORST; /* Tentatively. */
+ *flagp = 0; /* Initialize. */
DEBUG_PARSE("atom");
FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
(UV) flags);
}
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+ *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
break;
case '|':
case ')':
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
+ /* Under wildcards, this is changed to match \n; should be
+ * invisible to the user, as they have to compile under /m */
if (RExC_pm_flags & PMf_WILDCARD) {
ret = reg_node(pRExC_state, MBOL);
}
/* SBOL is shared with /^/ so we set the flags so we can tell
* /\A/ from /^/ in split. */
FLAGS(REGNODE_p(ret)) = 1;
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
if (RExC_pm_flags & PMf_WILDCARD) {
}
ret = reg_node(pRExC_state, GPOS);
RExC_seen |= REG_GPOS_SEEN;
- *flagp |= SIMPLE;
goto finish_meta_pat;
case 'K':
if (!RExC_in_lookbehind && !RExC_in_lookahead) {
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, KEEPS);
- *flagp |= SIMPLE;
/* XXX:dmq : disabling in-place substitution seems to
* be necessary here to avoid cases of memory corruption, as
* with: C<$_="x" x 80; s/x\K/y/> -- rgs
}
case 'Z':
if (RExC_pm_flags & PMf_WILDCARD) {
+ /* See comment under \A above */
ret = reg_node(pRExC_state, MEOL);
}
else {
ret = reg_node(pRExC_state, SEOL);
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'z':
if (RExC_pm_flags & PMf_WILDCARD) {
+ /* See comment under \A above */
ret = reg_node(pRExC_state, MEOL);
}
else {
ret = reg_node(pRExC_state, EOS);
+ *flagp |= SIMPLE; /* Wrong, but too late to fix for 5.32 */
}
- *flagp |= SIMPLE;
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'C':
ret = reg_node(pRExC_state, op);
FLAGS(REGNODE_p(ret)) = flags;
- *flagp |= SIMPLE;
-
goto finish_meta_pat;
}
num > 9
/* any numeric escape < RExC_npar is a backref */
&& num >= RExC_npar
- /* cannot be an octal escape if it starts with 8 */
- && *RExC_parse != '8'
- /* cannot be an octal escape if it starts with 9 */
- && *RExC_parse != '9'
+ /* cannot be an octal escape if it starts with [89] */
+ && ! inRANGE(*RExC_parse, '8', '9')
) {
/* Probably not meant to be a backref, instead likely
* to be an octal character escape, e.g. \35 or \777.
* have to map that back to the original */
if (need_to_fold_loc) {
upper_fill = loc_correspondence[s - s_start];
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
-
if (upper_fill == 0) {
FAIL2("panic: loc_correspondence[%d] is 0",
(int) (s - s_start));
}
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
}
else {
upper_fill = s - s0;
}
goto reparse;
}
- else if (need_to_fold_loc) {
- Safefree(locfold_buf);
- Safefree(loc_correspondence);
- }
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
* whole thing */
len = old_s - s0;
}
+
+ if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
} /* End of verifying node ends with an appropriate char */
/* We need to start the next node at the character that didn't fit
* sets up the bitmap and any flags, removing those code points from the
* inversion list, setting it to NULL should it become completely empty */
- dVAR;
PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
assert(PL_regkind[OP(node)] == ANYOF);
? end
: NUM_ANYOF_CODE_POINTS - 1;
for (i = start; i <= (int) high; i++) {
- if (! ANYOF_BITMAP_TEST(node, i)) {
- ANYOF_BITMAP_SET(node, i);
- }
+ ANYOF_BITMAP_SET(node, i);
}
}
invlist_iterfinish(*invlist_ptr);
if ( posix_warnings
&& RExC_warn_text
- && av_top_index(RExC_warn_text) > -1)
+ && av_count(RExC_warn_text) > 0)
{
*posix_warnings = RExC_warn_text;
}
char *save_end, *save_parse; /* Temporaries */
const bool in_locale = LOC; /* we turn off /l during processing */
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+ PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
DEBUG_PARSE("xcls");
/* If more than a single node returned, the nested
* parens evaluated to more than just a (?[...]),
* which isn't legal */
- || node != 1) {
+ || RExC_emit != orig_emit
+ + NODE_STEP_REGNODE
+ + regarglen[REGEX_SET])
+ {
vFAIL("Expecting interpolated extended charclass");
}
resultant_invlist = (SV *) ARGp(REGNODE_p(node));
goto regclass_failed;
}
+ assert(current);
+
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
goto regclass_failed;
}
- if (! current) {
- break;
- }
+ assert(current);
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (RExC_sets_depth) { /* If within a recursive call, return in a special
regnode */
RExC_parse++;
- node = regpnode(pRExC_state, REGEX_SET, (void *) final);
+ node = regpnode(pRExC_state, REGEX_SET, final);
}
else {
*
* There is a line below that uses the same white space criteria but is outside
* this macro. Both here and there must use the same definition */
-#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
STMT_START { \
if (do_skip) { \
- while (isBLANK_A(UCHARAT(p))) \
+ while (p < stop_p && isBLANK_A(UCHARAT(p))) \
{ \
p++; \
} \
* UTF-8
*/
- dVAR;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
what gets folded */
U32 has_runtime_dependency = 0; /* OR of the above flags */
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGCLASS;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
+ assert(! (ret_invlist && allow_mutiple_chars));
/* If wants an inversion list returned, we can't optimize to something
* else. */
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
assert(RExC_parse <= RExC_end);
invert = TRUE;
allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
output_posix_warnings(pRExC_state, posix_warnings);
}
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
+
if (RExC_parse >= stop_ptr) {
break;
}
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
-
if (UCHARAT(RExC_parse) == ']') {
break;
}
property wildcard subpatterns in regex; marked by <--
HERE in m/%s/ */
vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
- " wildcard subpatterns", value, *(RExC_parse - 1));
+ " wildcard subpatterns", (char) value, *(RExC_parse - 1));
}
/* \p means they want Unicode semantics */
/* If set TRUE, the property is user-defined as opposed to
* official Unicode */
bool user_defined = FALSE;
+ AV * strings = NULL;
SV * prop_definition = parse_uniprop_string(
name, n, UTF, FOLD,
* this call */
! cBOOL(ret_invlist),
+ &strings,
&user_defined,
msg,
0 /* Base level */
SvCUR(msg), SvPVX(msg)));
}
- if (! is_invlist(prop_definition)) {
+ assert(prop_definition || strings);
+
+ if (strings) {
+ if (ret_invlist) {
+ if (! prop_definition) {
+ RExC_parse = e + 1;
+ vFAIL("Unicode string properties are not implemented in (?[...])");
+ }
+ else {
+ ckWARNreg(e + 1,
+ "Using just the single character results"
+ " returned by \\p{} in (?[...])");
+ }
+ }
+ else if (! RExC_in_multi_char_class) {
+ if (invert ^ (value == 'P')) {
+ RExC_parse = e + 1;
+ vFAIL("Inverting a character class which contains"
+ " a multi-character sequence is illegal");
+ }
+
+ /* For each multi-character string ... */
+ while (av_count(strings) > 0) {
+ /* ... Each entry is itself an array of code
+ * points. */
+ AV * this_string = (AV *) av_shift( strings);
+ STRLEN cp_count = av_count(this_string);
+ SV * final = newSV(cp_count * 4);
+ SvPVCLEAR(final);
+
+ /* Create another string of sequences of \x{...} */
+ while (av_count(this_string) > 0) {
+ SV * character = av_shift(this_string);
+ UV cp = SvUV(character);
+
+ if (cp > 255) {
+ REQUIRE_UTF8(flagp);
+ }
+ Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
+ cp);
+ SvREFCNT_dec_NN(character);
+ }
+ SvREFCNT_dec_NN(this_string);
+
+ /* And add that to the list of such things */
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ final,
+ cp_count);
+ }
+ }
+ SvREFCNT_dec_NN(strings);
+ }
+
+ if (! prop_definition) { /* If we got only a string,
+ this iteration didn't really
+ find a character */
+ element_count--;
+ }
+ else if (! is_invlist(prop_definition)) {
/* Here, the definition isn't known, so we have gotten
* returned a string that will be evaluated if and when
}
} /* end of namedclass \blah */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
char* next_char_ptr = RExC_parse + 1;
/* Get the next real char after the '-' */
- SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
* printable should have each end point be a portable value
* for it (preferably like 'A', but we don't warn if it is
* a (portable) Unicode name or code point), and the range
- * must be be all digits or all letters of the same case.
+ * must be all digits or all letters of the same case.
* Otherwise, the range is non-portable and unclear as to
* what it contains */
if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
}
#endif
- /* Look at the longest folds first */
+ /* Look at the longest strings first */
for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
cp_count > 0;
cp_count--)
}
/* If the character class contains anything else besides these
- * multi-character folds, have to include it in recursive parsing */
+ * multi-character strings, have to include it in recursive parsing */
if (element_count) {
- sv_catpvs(substitute_parse, "|[");
+ bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
+
+ sv_catpvs(substitute_parse, "|");
+ if (has_l_bracket) { /* Add an [ if the original had one */
+ sv_catpvs(substitute_parse, "[");
+ }
constructed_prefix_len = SvCUR(substitute_parse);
sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
- /* Put in a closing ']' only if not going off the end, as otherwise
- * we are adding something that really isn't there */
- if (RExC_parse < RExC_end) {
+ /* Put in a closing ']' to match any opening one, but not if going
+ * off the end, as otherwise we are adding something that really
+ * isn't there */
+ if (has_l_bracket && RExC_parse < RExC_end) {
sv_catpvs(substitute_parse, "]");
}
}
ret = reg(pRExC_state, 1, ®_flags, depth+1);
- *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
+ *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
/* And restore so can parse the rest of the pattern */
RExC_parse = save_parse;
if (ret_invlist) {
*ret_invlist = cp_list;
- return RExC_emit;
+ return (cp_list) ? RExC_emit : 0;
}
if (anyof_flags & ANYOF_LOCALE_FLAGS) {
* points) in the ASCII range, so we can't use it here to
* artificially restrict the fold domain, so we check if
* the class does or does not match some EXACTFish node.
- * Further, if we aren't under /i, and and the folded-to
+ * Further, if we aren't under /i, and the folded-to
* character is part of a multi-character fold, we can't do
* this optimization, as the sequence around it could be
* that multi-character fold, and we don't here know the
av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
}
+ /* (Note that if any of this changes, the size calculations in
+ * S_optimize_regclass() might need to be updated.) */
+
if (only_utf8_locale_list) {
av_store(av, ONLY_LOCALE_MATCHES_INDEX,
SvREFCNT_inc_NN(only_utf8_locale_list));
}
}
-#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
SV *
-Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
- const regnode* node,
- bool doinit,
- SV** listsvp,
- SV** only_utf8_locale_ptr,
- SV** output_invlist)
+
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
+#else
+Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
+#endif
{
/* For internal core use only.
RXi_GET_DECL(prog, progi);
const struct reg_data * const data = prog ? progi->data : NULL;
- PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+ PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
+#else
+ PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
+#endif
assert(! output_invlist || listsvp);
if (data && data->count) {
return invlist;
}
-#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/* reg_skipcomment()
const regnode_offset ret = RExC_emit;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGNODE_GUTS;
}
/*
-- regpnode - emit a temporary node with a void* argument
+- regpnode - emit a temporary node with a SV* argument
*/
STATIC regnode_offset /* Location. */
-S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
+S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
{
- const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode");
+ const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REGPNODE;
regnode *place;
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGINSERT;
PERL_UNUSED_CONTEXT;
const U32 depth)
{
regnode_offset scan;
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGTAIL;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
- /* Find last node. */
+ /* The final node in the chain is the first one with a nonzero next pointer
+ * */
scan = (regnode_offset) p;
for (;;) {
regnode * const temp = regnext(REGNODE_p(scan));
DEBUG_PARSE_r({
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
- Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
+ Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
SvPV_nolen_const(RExC_mysv), scan,
(temp == NULL ? "->" : ""),
(temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
scan = REGNODE_OFFSET(temp);
}
+ /* Populate this node's next pointer */
assert(val >= scan);
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
assert((UV) (val - scan) <= U32_MAX);
#ifdef EXPERIMENTAL_INPLACESCAN
I32 min = 0;
#endif
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGTAIL_STUDY;
DEBUG_PARSE_r({
DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
- Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
+ Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
SvPV_nolen_const(RExC_mysv),
scan,
PL_reg_name[exact]);
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r, ri);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGDUMP;
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
{
#ifdef DEBUGGING
- dVAR;
int k;
RXi_GET_DECL(prog, progi);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGPROP;
ANYOFRbase(o) + ANYOFRdelta(o));
}
else {
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+ (void) get_regclass_nonbitmap_data(prog, o, FALSE,
+ &unresolved,
+ &only_utf8_locale_invlist,
+ &nonbitmap_invlist);
+#else
+ (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
&unresolved,
&only_utf8_locale_invlist,
&nonbitmap_invlist);
+#endif
}
/* The non-bitmap data may contain stuff that could fit in the
: (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
: lowest;
- Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
- if (lowest != highest) {
- Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+#ifndef EBCDIC
+ if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
+#endif
+ {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
+ if (lowest != highest) {
+ Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
+ }
+ Perl_sv_catpvf(aTHX_ sv, ")");
}
- Perl_sv_catpvf(aTHX_ sv, ")");
}
SvREFCNT_dec(unresolved);
SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
+ /* Returns an SV containing a string that must appear in the target for it
+ * to match, or NULL if nothing is known that must match.
+ *
+ * CAUTION: the SV can be freed during execution of the regex engine */
+
struct regexp *const prog = ReANY(r);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_RE_INTUIT_STRING;
PERL_UNUSED_CONTEXT;
DEBUG_COMPILE_r(
{
- const char * const s = SvPV_nolen_const(RX_UTF8(r)
+ if (prog->maxlen > 0) {
+ const char * const s = SvPV_nolen_const(RX_UTF8(r)
? prog->check_utf8 : prog->check_substr);
- if (!PL_colorset) reginitcolors();
- Perl_re_printf( aTHX_
+ if (!PL_colorset) reginitcolors();
+ Perl_re_printf( aTHX_
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
RX_UTF8(r) ? "utf8 " : "",
s,
PL_colors[1],
(strlen(s) > PL_dump_re_max_len ? "..." : ""));
+ }
} );
/* use UTF8 check substring if regexp pattern itself is in UTF8 */
Perl_pregfree2(pTHX_ REGEXP *rx)
{
struct regexp *const r = ReANY(rx);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_PREGFREE2;
{
struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r, ri);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_REGFREE_INTERNAL;
U32 refcount;
reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
#ifdef USE_ITHREADS
- dVAR;
#endif
OP_REFCNT_LOCK;
refcount = --aho->refcount;
U32 refcount;
reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
#ifdef USE_ITHREADS
- dVAR;
#endif
OP_REFCNT_LOCK;
refcount = --trie->refcount;
#define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
/*
- re_dup_guts - duplicate a regexp.
+=for apidoc_section REGEXP Functions
+=for apidoc re_dup_guts
+Duplicate a regexp.
- This routine is expected to clone a given regexp structure. It is only
- compiled under USE_ITHREADS.
+This routine is expected to clone a given regexp structure. It is only
+compiled under USE_ITHREADS.
- After all of the core data stored in struct regexp is duplicated
- the regexp_engine.dupe method is used to copy any private data
- stored in the *pprivate pointer. This allows extensions to handle
- any duplication it needs to do.
+After all of the core data stored in struct regexp is duplicated
+the regexp_engine.dupe method is used to copy any private data
+stored in the *pprivate pointer. This allows extensions to handle
+any duplication they need to do.
+
+=cut
See pregfree() and regfree_internal() if you change anything here.
*/
void
Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
{
- dVAR;
I32 npar;
const struct regexp *r = ReANY(sstr);
struct regexp *ret = ReANY(dstr);
void *
Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
{
- dVAR;
struct regexp *const r = ReANY(rx);
regexp_internal *reti;
int len;
#endif
STATIC void
-S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
+S_re_croak(pTHX_ bool utf8, const char* pat,...)
{
va_list args;
- STRLEN l1 = strlen(pat1);
- STRLEN l2 = strlen(pat2);
+ STRLEN len = strlen(pat);
char buf[512];
SV *msv;
const char *message;
- PERL_ARGS_ASSERT_RE_CROAK2;
-
- if (l1 > 510)
- l1 = 510;
- if (l1 + l2 > 510)
- l2 = 510 - l1;
- Copy(pat1, buf, l1 , char);
- Copy(pat2, buf + l1, l2 , char);
- buf[l1 + l2] = '\n';
- buf[l1 + l2 + 1] = '\0';
- va_start(args, pat2);
+ PERL_ARGS_ASSERT_RE_CROAK;
+
+ if (len > 510)
+ len = 510;
+ Copy(pat, buf, len , char);
+ buf[len] = '\n';
+ buf[len + 1] = '\0';
+ va_start(args, pat);
msv = vmess(buf, &args);
va_end(args);
- message = SvPV_const(msv, l1);
- if (l1 > 512)
- l1 = 512;
- Copy(message, buf, l1 , char);
- /* l1-1 to avoid \n */
- Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
+ message = SvPV_const(msv, len);
+ if (len > 512)
+ len = 512;
+ Copy(message, buf, len , char);
+ /* len-1 to avoid \n */
+ Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
}
/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
UV this_end;
const char * format;
- if (end - start < min_range_count) {
-
- /* Output chars individually when they occur in short ranges */
+ if ( end - start < min_range_count
+ && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
+ {
+ /* Output a range of 1 or 2 chars individually, or longer ranges
+ * when printable */
for (; start <= end; start++) {
put_code_point(sv, start);
}
* output would have been only the inversion indicator '^', NULL is instead
* returned. */
- dVAR;
SV * output;
PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
* cases where it can't try inverting, as what actually matches isn't known
* until runtime, and hence the inversion isn't either. */
- dVAR;
bool inverting_allowed = ! force_as_is_display;
int i;
int inverted_bias, as_is_bias;
- /* We will apply our bias to whichever of the the results doesn't have
+ /* We will apply our bias to whichever of the results doesn't have
* the '^' */
if (invert) {
invert = FALSE;
const regnode *optstart= NULL;
RXi_GET_DECL(r, ri);
- GET_RE_DEBUG_FLAGS_DECL;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_DUMPUNTIL;
void
Perl_init_uniprops(pTHX)
{
- dVAR;
# ifdef DEBUGGING
char * dump_len_string;
# endif
}
+/* These four functions are compiled only in regcomp.c, where they have access
+ * to the data they return. They are a way for re_comp.c to get access to that
+ * data without having to compile the whole data structures. */
+
+I16
+Perl_do_uniprop_match(const char * const key, const U16 key_len)
+{
+ PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
+
+ return match_uniprop((U8 *) key, key_len);
+}
+
+SV *
+Perl_get_prop_definition(pTHX_ const int table_index)
+{
+ PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
+
+ /* Create and return the inversion list */
+ return _new_invlist_C_array(uni_prop_ptrs[table_index]);
+}
+
+const char * const *
+Perl_get_prop_values(const int table_index)
+{
+ PERL_ARGS_ASSERT_GET_PROP_VALUES;
+
+ return UNI_prop_value_ptrs[table_index];
+}
+
+const char *
+Perl_get_deprecated_property_msg(const Size_t warning_offset)
+{
+ PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
+
+ return deprecated_property_msgs[warning_offset];
+}
+
# if 0
This code was mainly added for backcompat to give a warning for non-portable
}
# endif
+#endif /* end of ! PERL_IN_XSUB_RE */
STATIC REGEXP *
-S_compile_wildcard(pTHX_ const char * name, const STRLEN len,
+S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
const bool ignore_case)
{
+ /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
+ * possibly with /i if the 'ignore_case' parameter is true. Use /aa
+ * because nothing outside of ASCII will match. Use /m because the input
+ * string may be a bunch of lines strung together.
+ *
+ * Also sets up the debugging info */
+
U32 flags = PMf_MULTILINE|PMf_WILDCARD;
+ U32 rx_flags;
+ SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
REGEXP * subpattern_re;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_COMPILE_WILDCARD;
}
set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
- subpattern_re = re_op_compile_wrapper(sv_2mortal(newSVpvn(name, len)),
- /* Like in op.c, we copy the compile
- * time pm flags to the rx ones */
- (flags & RXf_PMf_COMPILETIME), flags);
+ /* Like in op.c, we copy the compile time pm flags to the rx ones */
+ rx_flags = flags & RXf_PMf_COMPILETIME;
+
+#ifndef PERL_IN_XSUB_RE
+ /* Use the core engine if this file is regcomp.c. That means no
+ * 'use re "Debug ..." is in effect, so the core engine is sufficient */
+ subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
+ &PL_core_reg_engine,
+ NULL, NULL,
+ rx_flags, flags);
+#else
+ if (isDEBUG_WILDCARD) {
+ /* Use the special debugging engine if this file is re_comp.c and wants
+ * to output the wildcard matching. This uses whatever
+ * 'use re "Debug ..." is in effect */
+ subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
+ &my_reg_engine,
+ NULL, NULL,
+ rx_flags, flags);
+ }
+ else {
+ /* Use the special wildcard engine if this file is re_comp.c and
+ * doesn't want to output the wildcard matching. This uses whatever
+ * 'use re "Debug ..." is in effect for compilation, but this engine
+ * structure has been set up so that it uses the core engine for
+ * execution, so no execution debugging as a result of re.pm will be
+ * displayed. */
+ subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
+ &wild_reg_engine,
+ NULL, NULL,
+ rx_flags, flags);
+ /* XXX The above has the effect that any user-supplied regex engine
+ * won't be called for matching wildcards. That might be good, or bad.
+ * It could be changed in several ways. The reason it is done the
+ * current way is to avoid having to save and restore
+ * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
+ * could be used. Another suggestion is to keep the authoritative
+ * value of the debug flags in a thread-local variable and add set/get
+ * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
+ * Still another is to pass a flag, say in the engine's intflags that
+ * would be checked each time before doing the debug output */
+ }
+#endif
assert(subpattern_re); /* Should have died if didn't compile successfully */
return subpattern_re;
char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
{
I32 result;
+ DECLARE_AND_GET_RE_DEBUG_FLAGS;
PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
- result = pregexec(prog, stringarg, strend, strbeg, minend, screamer, nosave);
+ ENTER;
+
+ /* The compilation has set things up so that if the program doesn't want to
+ * see the wildcard matching procedure, it will get the core execution
+ * engine, which is subject only to -Dr. So we have to turn that off
+ * around this procedure */
+ if (! isDEBUG_WILDCARD) {
+ /* Note! Casts away 'volatile' */
+ SAVEI32(PL_debug);
+ PL_debug &= ~ DEBUG_r_FLAG;
+ }
+
+ result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
+ NULL, nosave);
+ LEAVE;
return result;
}
SV *
-Perl_handle_user_defined_property(pTHX_
+S_handle_user_defined_property(pTHX_
/* Parses the contents of a user-defined property definition; returning the
* expanded definition if possible. If so, the return is an inversion
this_definition = parse_uniprop_string(s0, s - s0,
is_utf8, to_fold, runtime,
deferrable,
+ NULL,
user_defined_ptr, msg,
(name_len == 0)
? level /* Don't increase level
# define CUR_CONTEXT aTHX
# define ORIGINAL_CONTEXT save_aTHX
# else
-# define DECLARATION_FOR_GLOBAL_CONTEXT
+# define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
# define SWITCH_TO_GLOBAL_CONTEXT NOOP
# define RESTORE_CONTEXT NOOP
# define CUR_CONTEXT NULL
* properties. This is a function so it can be set up to be called even if
* the program unexpectedly quits */
- dVAR;
SV ** current_entry;
const STRLEN key_len = strlen((const char *) key);
DECLARATION_FOR_GLOBAL_CONTEXT;
return fq_name;
}
-SV *
-Perl_parse_uniprop_string(pTHX_
+STATIC SV *
+S_parse_uniprop_string(pTHX_
/* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
* now. If so, the return is an inversion list.
const bool runtime, /* TRUE if this is being called at run time */
const bool deferrable, /* TRUE if it's ok for the definition to not be
known at this call */
+ AV ** strings, /* To return string property values, like named
+ sequences */
bool *user_defined_ptr, /* Upon return from this function it will be
set to TRUE if any component is a
user-defined property */
this */
const STRLEN level) /* Recursion level of this call */
{
- dVAR;
char* lookup_name; /* normalized name for lookup in our tables */
unsigned lookup_len; /* Its length */
enum { Not_Strict = 0, /* Some properties have stricter name */
* but it must be punctuation */
&& (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
{
- /* Find the property. The table includes the equals sign, so we
- * use 'j' as-is */
- table_index = match_uniprop((U8 *) lookup_name, j);
- if (table_index) {
- const char * const * prop_values
- = UNI_prop_value_ptrs[table_index];
+ bool special_property = memEQs(lookup_name, j - 1, "name")
+ || memEQs(lookup_name, j - 1, "na");
+ if (! special_property) {
+ /* Find the property. The table includes the equals sign, so
+ * we use 'j' as-is */
+ table_index = do_uniprop_match(lookup_name, j);
+ }
+ if (special_property || table_index) {
REGEXP * subpattern_re;
char open = name[i++];
char close;
const char * pos_in_brackets;
+ const char * const * prop_values;
bool escaped = 0;
/* Backslash => delimitter is the character following. We
packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
"The Unicode property wildcards feature is experimental");
- /* Now create and compile the wildcard subpattern. Use /iaa
- * because nothing outside of ASCII will match, and it the
- * property values should all match /i. Note that when the
- * pattern fails to compile, our added text to the user's
- * pattern will be displayed to the user, which is not so
- * desirable. */
+ if (special_property) {
+ const char * error_msg;
+ const char * revised_name = name + i;
+ Size_t revised_name_len = name_len - (i + 1 + escaped);
+
+ /* Currently, the only 'special_property' is name, which we
+ * lookup in _charnames.pm */
+
+ if (! load_charnames(newSVpvs("placeholder"),
+ revised_name, revised_name_len,
+ &error_msg))
+ {
+ sv_catpv(msg, error_msg);
+ goto append_name_to_msg;
+ }
+
+ /* Farm this out to a function just to make the current
+ * function less unwieldy */
+ if (handle_names_wildcard(revised_name, revised_name_len,
+ &prop_definition,
+ strings))
+ {
+ return prop_definition;
+ }
+
+ goto failed;
+ }
+
+ prop_values = get_prop_values(table_index);
+
+ /* Now create and compile the wildcard subpattern. Use /i
+ * because the property values are supposed to match with case
+ * ignored. */
subpattern_re = compile_wildcard(name + i,
name_len - i - 1 - escaped,
TRUE /* /i */
to_fold,
runtime,
deferrable,
+ NULL,
user_defined_ptr,
msg,
level + 1);
}
/* Here's how khw thinks we should proceed to handle the properties
- * not yet done: Bidi Mirroring Glyph
- Bidi Paired Bracket
+ * not yet done: Bidi Mirroring Glyph can map to ""
+ Bidi Paired Bracket can map to ""
Case Folding (both full and simple)
+ Shouldn't /i be good enough for Full
Decomposition Mapping
- Equivalent Unified Ideograph
- Name
- Name Alias
+ Equivalent Unified Ideograph can map to ""
Lowercase Mapping (both full and simple)
- NFKC Case Fold
+ NFKC Case Fold can map to ""
Titlecase Mapping (both full and simple)
Uppercase Mapping (both full and simple)
- * Move the part that looks at the property values into a perl
- * script, like utf8_heavy.pl was done. This makes things somewhat
- * easier, but most importantly, it avoids always adding all these
- * strings to the memory usage when the feature is little-used.
- *
- * The property values would all be concatenated into a single
- * string per property with each value on a separate line, and the
- * code point it's for on alternating lines. Then we match the
- * user's input pattern m//mg, without having to worry about their
- * uses of '^' and '$'. Only the values that aren't the default
- * would be in the strings. Code points would be in UTF-8. The
- * search pattern that we would construct would look like
- * (?: \n (code-point_re) \n (?aam: user-re ) \n )
- * And so $1 would contain the code point that matched the user-re.
+ * Handle these the same way Name is done, using say, _wild.pm, but
+ * having both loose and full, like in charclass_invlists.h.
+ * Perhaps move block and script to that as they are somewhat large
+ * in charclass_invlists.h.
* For properties where the default is the code point itself, such
* as any of the case changing mappings, the string would otherwise
* consist of all Unicode code points in UTF-8 strung together.
* error. Otherwise run the pattern against every code point in
* the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
* And it might be good to create an API to return the ssc.
- *
- * For the name properties, a new function could be created in
- * charnames which essentially does the same thing as above,
- * sharing Name.pl with the other charname functions. Don't know
- * about loose name matching, or algorithmically determined names.
- * Decomposition.pl similarly.
- *
- * It might be that a new pattern modifier would have to be
- * created, like /t for resTricTed, which changed the behavior of
- * some constructs in their subpattern, like \A. */
+ * Or handle them like the algorithmic names are done
+ */
} /* End of is a wildcard subppattern */
/* \p{name=...} is handled specially. Instead of using the normal
goto append_name_to_msg;
}
- lookup_loose = get_cv("_charnames::_loose_regcomp_lookup", 0);
+ lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
if (! lookup_loose) {
Perl_croak(aTHX_
"panic: Can't find '_charnames::_loose_regcomp_lookup");
}
- PUSHSTACKi(PERLSI_OVERLOAD);
+ PUSHSTACKi(PERLSI_REGCOMP);
ENTER ;
SAVETMPS;
save_re_context();
}
cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
- if (character_len < SvCUR(character)) {
- goto failed;
+ if (character_len == SvCUR(character)) {
+ prop_definition = add_cp_to_invlist(NULL, cp);
+ }
+ else {
+ AV * this_string;
+
+ /* First of the remaining characters in the string. */
+ char * remaining = SvPVX(character) + character_len;
+
+ if (strings == NULL) {
+ goto failed; /* XXX Perhaps a specific msg instead, like
+ 'not available here' */
+ }
+
+ if (*strings == NULL) {
+ *strings = newAV();
+ }
+
+ this_string = newAV();
+ av_push(this_string, newSVuv(cp));
+
+ do {
+ cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
+ av_push(this_string, newSVuv(cp));
+ remaining += character_len;
+ } while (remaining < SvEND(character));
+
+ av_push(*strings, (SV *) this_string);
}
- prop_definition = add_cp_to_invlist(NULL, cp);
return prop_definition;
}
* for this property in the hash. So we have the go ahead to
* expand the definition ourselves. */
- PUSHSTACKi(PERLSI_MAGIC);
+ PUSHSTACKi(PERLSI_REGCOMP);
ENTER;
/* Create a temporary placeholder in the hash to detect recursion
/* Get the index into our pointer table of the inversion list corresponding
* to the property */
- table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+ table_index = do_uniprop_match(lookup_name, lookup_len);
/* If it didn't find the property ... */
if (table_index == 0) {
/* Try again stripping off any initial 'Is'. This is because we
* promise that an initial Is is optional. The same isn't true of
* names that start with 'In'. Those can match only blocks, and the
- * lookup table already has those accounted for. */
- if (starts_with_Is) {
+ * lookup table already has those accounted for. The lookup table also
+ * has already accounted for Perl extensions (without and = sign)
+ * starting with 'i's'. */
+ if (starts_with_Is && equals_pos >= 0) {
lookup_name += 2;
lookup_len -= 2;
equals_pos -= 2;
slash_pos -= 2;
- table_index = match_uniprop((U8 *) lookup_name, lookup_len);
+ table_index = do_uniprop_match(lookup_name, lookup_len);
}
if (table_index == 0) {
}
/* Here, we have the number in canonical form. Try that */
- table_index = match_uniprop((U8 *) canonical, strlen(canonical));
+ table_index = do_uniprop_match(canonical, strlen(canonical));
if (table_index == 0) {
goto failed;
}
table_index %= MAX_UNI_KEYWORD_INDEX;
Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
"Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
- (int) name_len, name, deprecated_property_msgs[warning_offset]);
+ (int) name_len, name,
+ get_deprecated_property_msg(warning_offset));
}
/* In a few properties, a different property is used under /i. These are
}
/* Create and return the inversion list */
- prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
+ prop_definition = get_prop_definition(table_index);
sv_2mortal(prop_definition);
-
/* See if there is a private use override to add to this definition */
{
COPHH * hinthash = (IN_PERL_COMPILETIME)
}
}
-#endif /* end of ! PERL_IN_XSUB_RE */
+STATIC bool
+S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
+ const STRLEN wname_len, /* Its length */
+ SV ** prop_definition,
+ AV ** strings)
+{
+ /* Deal with Name property wildcard subpatterns; returns TRUE if there were
+ * any matches, adding them to prop_definition */
+
+ dSP;
+
+ CV * get_names_info; /* entry to charnames.pm to get info we need */
+ SV * names_string; /* Contains all character names, except algo */
+ SV * algorithmic_names; /* Contains info about algorithmically
+ generated character names */
+ REGEXP * subpattern_re; /* The user's pattern to match with */
+ struct regexp * prog; /* The compiled pattern */
+ char * all_names_start; /* lib/unicore/Name.pl string of every
+ (non-algorithmic) character name */
+ char * cur_pos; /* We match, effectively using /gc; this is
+ where we are now */
+ bool found_matches = FALSE; /* Did any name match so far? */
+ SV * empty; /* For matching zero length names */
+ SV * must_sv; /* Contains the substring, if any, that must be
+ in a name for the subpattern to match */
+ const char * must; /* The PV of 'must' */
+ STRLEN must_len; /* And its length */
+ SV * syllable_name = NULL; /* For Hangul syllables */
+ const char hangul_prefix[] = "HANGUL SYLLABLE ";
+ const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
+
+ /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
+ * syllable name, and these are immutable and guaranteed by the Unicode
+ * standard to never be extended */
+ const STRLEN syl_max_len = hangul_prefix_len + 7;
+
+ IV i;
+
+ PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
+
+ /* Make sure _charnames is loaded. (The parameters give context
+ * for any errors generated */
+ get_names_info = get_cv("_charnames::_get_names_info", 0);
+ if (! get_names_info) {
+ Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
+ }
+
+ /* Get the charnames data */
+ PUSHSTACKi(PERLSI_REGCOMP);
+ ENTER ;
+ SAVETMPS;
+ save_re_context();
+
+ PUSHMARK(SP) ;
+ PUTBACK;
+
+ /* Special _charnames entry point that returns the info this routine
+ * requires */
+ call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
+
+ SPAGAIN ;
+
+ /* Data structure for names which end in their very own code points */
+ algorithmic_names = POPs;
+ SvREFCNT_inc_simple_void_NN(algorithmic_names);
+
+ /* The lib/unicore/Name.pl string */
+ names_string = POPs;
+ SvREFCNT_inc_simple_void_NN(names_string);
+
+ PUTBACK ;
+ FREETMPS ;
+ LEAVE ;
+ POPSTACK;
+
+ if ( ! SvROK(names_string)
+ || ! SvROK(algorithmic_names))
+ { /* Perhaps should panic instead XXX */
+ SvREFCNT_dec(names_string);
+ SvREFCNT_dec(algorithmic_names);
+ return FALSE;
+ }
+
+ names_string = sv_2mortal(SvRV(names_string));
+ all_names_start = SvPVX(names_string);
+ cur_pos = all_names_start;
+
+ algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
+
+ /* Compile the subpattern consisting of the name being looked for */
+ subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
+
+ must_sv = re_intuit_string(subpattern_re);
+ if (must_sv) {
+ /* regexec.c can free the re_intuit_string() return. GH #17734 */
+ must_sv = sv_2mortal(newSVsv(must_sv));
+ must = SvPV(must_sv, must_len);
+ }
+ else {
+ must = "";
+ must_len = 0;
+ }
+
+ /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
+ * This works because the NUL causes the function to return early, thus
+ * showing that there are characters in it other than the acceptable ones,
+ * which is our desired result.) */
+
+ prog = ReANY(subpattern_re);
+
+ /* If only nothing is matched, skip to where empty names are looked for */
+ if (prog->maxlen == 0) {
+ goto check_empty;
+ }
+
+ /* And match against the string of all names /gc. Don't even try if it
+ * must match a character not found in any name. */
+ if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
+ {
+ while (execute_wildcard(subpattern_re,
+ cur_pos,
+ SvEND(names_string),
+ all_names_start, 0,
+ names_string,
+ 0))
+ { /* Here, matched. */
+
+ /* Note the string entries look like
+ * 00001\nSTART OF HEADING\n\n
+ * so we could match anywhere in that string. We have to rule out
+ * matching a code point line */
+ char * this_name_start = all_names_start
+ + RX_OFFS(subpattern_re)->start;
+ char * this_name_end = all_names_start
+ + RX_OFFS(subpattern_re)->end;
+ char * cp_start;
+ char * cp_end;
+ UV cp = 0; /* Silences some compilers */
+ AV * this_string = NULL;
+ bool is_multi = FALSE;
+
+ /* If matched nothing, advance to next possible match */
+ if (this_name_start == this_name_end) {
+ cur_pos = (char *) memchr(this_name_end + 1, '\n',
+ SvEND(names_string) - this_name_end);
+ if (cur_pos == NULL) {
+ break;
+ }
+ }
+ else {
+ /* Position the next match to start beyond the current returned
+ * entry */
+ cur_pos = (char *) memchr(this_name_end, '\n',
+ SvEND(names_string) - this_name_end);
+ }
+
+ /* Back up to the \n just before the beginning of the character. */
+ cp_end = (char *) my_memrchr(all_names_start,
+ '\n',
+ this_name_start - all_names_start);
+
+ /* If we didn't find a \n, it means it matched somewhere in the
+ * initial '00000' in the string, so isn't a real match */
+ if (cp_end == NULL) {
+ continue;
+ }
+
+ this_name_start = cp_end + 1; /* The name starts just after */
+ cp_end--; /* the \n, and the code point */
+ /* ends just before it */
+
+ /* All code points are 5 digits long */
+ cp_start = cp_end - 4;
+
+ /* This shouldn't happen, as we found a \n, and the first \n is
+ * further along than what we subtracted */
+ assert(cp_start >= all_names_start);
+
+ if (cp_start == all_names_start) {
+ *prop_definition = add_cp_to_invlist(*prop_definition, 0);
+ continue;
+ }
+
+ /* If the character is a blank, we either have a named sequence, or
+ * something is wrong */
+ if (*(cp_start - 1) == ' ') {
+ cp_start = (char *) my_memrchr(all_names_start,
+ '\n',
+ cp_start - all_names_start);
+ cp_start++;
+ }
+
+ assert(cp_start != NULL && cp_start >= all_names_start + 2);
+
+ /* Except for the first line in the string, the sequence before the
+ * code point is \n\n. If that isn't the case here, we didn't
+ * match the name of a character. (We could have matched a named
+ * sequence, not currently handled */
+ if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
+ continue;
+ }
+
+ /* We matched! Add this to the list */
+ found_matches = TRUE;
+
+ /* Loop through all the code points in the sequence */
+ while (cp_start < cp_end) {
+
+ /* Calculate this code point from its 5 digits */
+ cp = (XDIGIT_VALUE(cp_start[0]) << 16)
+ + (XDIGIT_VALUE(cp_start[1]) << 12)
+ + (XDIGIT_VALUE(cp_start[2]) << 8)
+ + (XDIGIT_VALUE(cp_start[3]) << 4)
+ + XDIGIT_VALUE(cp_start[4]);
+
+ cp_start += 6; /* Go past any blank */
+
+ if (cp_start < cp_end || is_multi) {
+ if (this_string == NULL) {
+ this_string = newAV();
+ }
+
+ is_multi = TRUE;
+ av_push(this_string, newSVuv(cp));
+ }
+ }
+
+ if (is_multi) { /* Was more than one code point */
+ if (*strings == NULL) {
+ *strings = newAV();
+ }
+
+ av_push(*strings, (SV *) this_string);
+ }
+ else { /* Only a single code point */
+ *prop_definition = add_cp_to_invlist(*prop_definition, cp);
+ }
+ } /* End of loop through the non-algorithmic names string */
+ }
+
+ /* There are also character names not in 'names_string'. These are
+ * algorithmically generatable. Try this pattern on each possible one.
+ * (khw originally planned to leave this out given the large number of
+ * matches attempted; but the speed turned out to be quite acceptable
+ *
+ * There are plenty of opportunities to optimize to skip many of the tests.
+ * beyond the rudimentary ones already here */
+
+ /* First see if the subpattern matches any of the algorithmic generatable
+ * Hangul syllable names.
+ *
+ * We know none of these syllable names will match if the input pattern
+ * requires more bytes than any syllable has, or if the input pattern only
+ * matches an empty name, or if the pattern has something it must match and
+ * one of the characters in that isn't in any Hangul syllable. */
+ if ( prog->minlen <= (SSize_t) syl_max_len
+ && prog->maxlen > 0
+ && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
+ {
+ /* These constants, names, values, and algorithm are adapted from the
+ * Unicode standard, version 5.1, section 3.12, and should never
+ * change. */
+ const char * JamoL[] = {
+ "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+ "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
+ };
+ const int LCount = C_ARRAY_LENGTH(JamoL);
+
+ const char * JamoV[] = {
+ "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
+ "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
+ "I"
+ };
+ const int VCount = C_ARRAY_LENGTH(JamoV);
+
+ const char * JamoT[] = {
+ "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
+ "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
+ "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
+ };
+ const int TCount = C_ARRAY_LENGTH(JamoT);
+
+ int L, V, T;
+
+ /* This is the initial Hangul syllable code point; each time through the
+ * inner loop, it maps to the next higher code point. For more info,
+ * see the Hangul syllable section of the Unicode standard. */
+ int cp = 0xAC00;
+
+ syllable_name = sv_2mortal(newSV(syl_max_len));
+ sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
+
+ for (L = 0; L < LCount; L++) {
+ for (V = 0; V < VCount; V++) {
+ for (T = 0; T < TCount; T++) {
+
+ /* Truncate back to the prefix, which is unvarying */
+ SvCUR_set(syllable_name, hangul_prefix_len);
+
+ sv_catpv(syllable_name, JamoL[L]);
+ sv_catpv(syllable_name, JamoV[V]);
+ sv_catpv(syllable_name, JamoT[T]);
+
+ if (execute_wildcard(subpattern_re,
+ SvPVX(syllable_name),
+ SvEND(syllable_name),
+ SvPVX(syllable_name), 0,
+ syllable_name,
+ 0))
+ {
+ *prop_definition = add_cp_to_invlist(*prop_definition,
+ cp);
+ found_matches = TRUE;
+ }
+
+ cp++;
+ }
+ }
+ }
+ }
+
+ /* The rest of the algorithmically generatable names are of the form
+ * "PREFIX-code_point". The prefixes and the code point limits of each
+ * were returned to us in the array 'algorithmic_names' from data in
+ * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
+ for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
+ IV j;
+
+ /* Each element of the array is a hash, giving the details for the
+ * series of names it covers. There is the base name of the characters
+ * in the series, and the low and high code points in the series. And,
+ * for optimization purposes a string containing all the legal
+ * characters that could possibly be in a name in this series. */
+ HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
+ SV * prefix = * hv_fetchs(this_series, "name", 0);
+ IV low = SvIV(* hv_fetchs(this_series, "low", 0));
+ IV high = SvIV(* hv_fetchs(this_series, "high", 0));
+ char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
+
+ /* Pre-allocate an SV with enough space */
+ SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
+ SvPVX(prefix)));
+ if (high >= 0x10000) {
+ sv_catpvs(algo_name, "0");
+ }
+
+ /* This series can be skipped entirely if the pattern requires
+ * something longer than any name in the series, or can only match an
+ * empty name, or contains a character not found in any name in the
+ * series */
+ if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
+ && prog->maxlen > 0
+ && (strspn(must, legal) == must_len))
+ {
+ for (j = low; j <= high; j++) { /* For each code point in the series */
+
+ /* Get its name, and see if it matches the subpattern */
+ Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
+ (unsigned) j);
+
+ if (execute_wildcard(subpattern_re,
+ SvPVX(algo_name),
+ SvEND(algo_name),
+ SvPVX(algo_name), 0,
+ algo_name,
+ 0))
+ {
+ *prop_definition = add_cp_to_invlist(*prop_definition, j);
+ found_matches = TRUE;
+ }
+ }
+ }
+ }
+
+ check_empty:
+ /* Finally, see if the subpattern matches an empty string */
+ empty = newSVpvs("");
+ if (execute_wildcard(subpattern_re,
+ SvPVX(empty),
+ SvEND(empty),
+ SvPVX(empty), 0,
+ empty,
+ 0))
+ {
+ /* Many code points have empty names. Currently these are the \p{GC=C}
+ * ones, minus CC and CF */
+
+ SV * empty_names_ref = get_prop_definition(UNI_C);
+ SV * empty_names = invlist_clone(empty_names_ref, NULL);
+
+ SV * subtract = get_prop_definition(UNI_CC);
+
+ _invlist_subtract(empty_names, subtract, &empty_names);
+ SvREFCNT_dec_NN(empty_names_ref);
+ SvREFCNT_dec_NN(subtract);
+
+ subtract = get_prop_definition(UNI_CF);
+ _invlist_subtract(empty_names, subtract, &empty_names);
+ SvREFCNT_dec_NN(subtract);
+
+ _invlist_union(*prop_definition, empty_names, prop_definition);
+ found_matches = TRUE;
+ SvREFCNT_dec_NN(empty_names);
+ }
+ SvREFCNT_dec_NN(empty);
+
+#if 0
+ /* If we ever were to accept aliases for, say private use names, we would
+ * need to do something fancier to find empty names. The code below works
+ * (at the time it was written), and is slower than the above */
+ const char empties_pat[] = "^.";
+ if (strNE(name, empties_pat)) {
+ SV * empty = newSVpvs("");
+ if (execute_wildcard(subpattern_re,
+ SvPVX(empty),
+ SvEND(empty),
+ SvPVX(empty), 0,
+ empty,
+ 0))
+ {
+ SV * empties = NULL;
+
+ (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
+
+ _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
+ SvREFCNT_dec_NN(empties);
+
+ found_matches = TRUE;
+ }
+ SvREFCNT_dec_NN(empty);
+ }
+#endif
+
+ SvREFCNT_dec_NN(subpattern_re);
+ return found_matches;
+}
/*
* ex: set ts=8 sts=4 sw=4 et: