#define RExC_mysv2 (pRExC_state->mysv2)
#endif
+ bool seen_unfolded_sharp_s;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+
+/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
+ * EXACTF node, hence was parsed under /di rules. If later in the parse,
+ * something forces the pattern into using /ui rules, the sharp s should be
+ * folded into the sequence 'ss', which takes up more space than previously
+ * calculated. This means that the sizing pass needs to be restarted. (The
+ * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
+ * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
+ * so there is no need to resize [perl #125990]. */
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
#ifdef RE_TRACK_PATTERN_OFFSETS
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
others */
} \
} STMT_END
+/* Change from /d into /u rules, and restart the parse if we've already seen
+ * something whose size would increase as a result, by setting *flagp and
+ * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
+ * we've change to /u during the parse. */
+#define REQUIRE_UNI_RULES(flagp, restart_retval) \
+ STMT_START { \
+ if (DEPENDS_SEMANTICS) { \
+ assert(PASS1); \
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
+ RExC_uni_semantics = 1; \
+ if (RExC_seen_unfolded_sharp_s) { \
+ *flagp |= RESTART_PASS1; \
+ return restart_retval; \
+ } \
+ } \
+ } STMT_END
+
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
#define namedclass_to_classnum(class) ((int) ((class) / 2))
} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
+#define vFAIL2utf8f(m, a1) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
REPORT_LOCATION_ARGS(offset)); \
} STMT_END
+#define vFAIL3utf8f(m, a1, a2) STMT_START { \
+ const IV offset = RExC_parse - RExC_precomp; \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
/* These have asserts in them because of [perl #122671] Many warnings in
* regcomp.c can occur twice. If they get output in pass1 and later in that
* pass, the pattern has to be converted to UTF-8 and the pass restarted, they
else {
anded_flags = ANYOF_FLAGS(and_with)
&( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
}
}
if (OP(or_with) != ANYOFD) {
ored_flags
|= ANYOF_FLAGS(or_with)
- & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
}
}
* by the time we reach here */
assert(! (ANYOF_FLAGS(ssc)
& ~( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+
RExC_uni_semantics = 0;
+ RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_contains_i = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
});
redo_first_pass:
- /* we jump here if we upgrade the pattern to utf8 and have to
- * recompile */
+ /* we jump here if we have to recompile, e.g., from upgrading the pattern
+ * to utf8 */
if ((pm_flags & PMf_USE_RE_EVAL)
/* this second condition covers the non-regex literal case,
if (rx_flags & PMf_FOLD) {
RExC_contains_i = 1;
}
- if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ if ( initial_charset == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 ||RExC_uni_semantics))
+ {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
RExC_pm_flags = pm_flags;
if (runtime_code) {
- if (TAINTING_get && TAINT_get)
+ assert(TAINTING_get || !TAINT_get);
+ if (TAINT_get)
Perl_croak(aTHX_ "Eval-group in insecure regular expression");
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
pRExC_state->num_code_blocks);
}
+ else {
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Need to redo pass 1\n"));
+ }
+
goto redo_first_pass;
}
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
#ifdef PERL_ARGS_ASSERT__INVLISTEQ
bool
-S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
+Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
/* Return a boolean as to if the two passed in inversion lists are
* identical. The final argument, if TRUE, says to take the complement of
++RExC_parse;
}
- if (PASS2) {
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
- }
+ vFAIL("Sequence (?... not terminated");
}
/*
STRLEN verb_len = 0;
char *start_arg = NULL;
unsigned char op = 0;
- int argok = 1;
- int internal_argval = 0; /* internal_argval is only useful if
- !argok */
+ int arg_required = 0;
+ int internal_argval = -1; /* if >-1 we are not allowed an argument*/
if (has_intervening_patws) {
RExC_parse++;
case 'F': /* (*FAIL) */
if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
op = OPFAIL;
- argok = 0;
}
break;
case ':': /* (*:NAME) */
case 'M': /* (*MARK:NAME) */
if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
op = MARKPOINT;
- argok = -1;
+ arg_required = 1;
}
break;
case 'P': /* (*PRUNE) */
"Unknown verb pattern '%"UTF8f"'",
UTF8fARG(UTF, verb_len, start_verb));
}
- if ( argok ) {
- if ( start_arg && internal_argval ) {
- vFAIL3("Verb pattern '%.*s' may not have an argument",
- verb_len, start_verb);
- } else if ( argok < 0 && !start_arg ) {
- vFAIL3("Verb pattern '%.*s' has a mandatory argument",
- verb_len, start_verb);
- } else {
- ret = reganode(pRExC_state, op, internal_argval);
- if ( ! internal_argval && ! SIZE_ONLY ) {
- if (start_arg) {
- SV *sv = newSVpvn( start_arg,
- RExC_parse - start_arg);
- ARG(ret) = add_data( pRExC_state,
- STR_WITH_LEN("S"));
- RExC_rxi->data->data[ARG(ret)]=(void*)sv;
- ret->flags = 0;
- } else {
- ret->flags = 1;
- }
- }
- }
- if (!internal_argval)
- RExC_seen |= REG_VERBARG_SEEN;
- } else if ( start_arg ) {
- vFAIL3("Verb pattern '%.*s' may not have an argument",
- verb_len, start_verb);
- } else {
- ret = reg_node(pRExC_state, op);
- }
+ if ( arg_required && !start_arg ) {
+ vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+ verb_len, start_verb);
+ }
+ if (internal_argval == -1) {
+ ret = reganode(pRExC_state, op, 0);
+ } else {
+ ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
+ }
+ RExC_seen |= REG_VERBARG_SEEN;
+ if ( ! SIZE_ONLY ) {
+ if (start_arg) {
+ SV *sv = newSVpvn( start_arg,
+ RExC_parse - start_arg);
+ ARG(ret) = add_data( pRExC_state,
+ STR_WITH_LEN("S"));
+ RExC_rxi->data->data[ARG(ret)]=(void*)sv;
+ ret->flags = 1;
+ } else {
+ ret->flags = 0;
+ }
+ if ( internal_argval != -1 )
+ ARG2L_SET(ret, internal_argval);
+ }
nextchar(pRExC_state);
return ret;
}
RExC_parse++;
paren = *RExC_parse++;
- ret = NULL; /* For look-ahead/behind. */
+ ret = NULL; /* For lookahead/behind. */
switch (paren) {
case 'P': /* (?P...) variants for those used to PCRE/Python */
case '!': /* (?!...) */
RExC_seen_zerolen++;
/* check if we're really just a "FAIL" assertion */
- --RExC_parse;
- nextchar(pRExC_state);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
if (*RExC_parse == ')') {
- ret=reg_node(pRExC_state, OPFAIL);
+ ret=reganode(pRExC_state, OPFAIL, 0);
nextchar(pRExC_state);
return ret;
}
int is_define= 0;
const int DEFINE_len = sizeof("DEFINE") - 1;
if (RExC_parse[0] == '?') { /* (?(?...)) */
- if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
- || RExC_parse[1] == '<'
- || RExC_parse[1] == '{') { /* Lookahead or eval. */
+ if (
+ RExC_parse[1] == '=' ||
+ RExC_parse[1] == '!' ||
+ RExC_parse[1] == '<' ||
+ RExC_parse[1] == '{'
+ ) { /* Lookahead or eval. */
I32 flag;
regnode *tail;
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
/* (?(1)...) */
char c;
- char *tmp;
UV uv;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
parno = (I32)uv;
RExC_parse = (char*)endptr;
}
- /* XXX else what? */
+ else {
+ vFAIL("panic: grok_atoUV returned FALSE");
+ }
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
- if (*(tmp = nextchar(pRExC_state)) != ')') {
- /* nextchar also skips comments, so undo its work
- * and skip over the the next character.
- */
- RExC_parse = tmp;
+ if (UCHARAT(RExC_parse) != ')') {
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
vFAIL("Switch condition not recognized");
}
+ nextchar(pRExC_state);
insert_if:
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
br = regbranch(pRExC_state, &flags, 1,depth+1);
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state,
LONGJMP, 0));
- c = *nextchar(pRExC_state);
+ c = UCHARAT(RExC_parse);
+ nextchar(pRExC_state);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
if (c == '|') {
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
- c = *nextchar(pRExC_state);
+ c = UCHARAT(RExC_parse);
+ nextchar(pRExC_state);
}
else
lastbr = NULL;
/* Check for proper termination. */
if (paren) {
- /* restore original flags, but keep (?p) */
+ /* restore original flags, but keep (?p) and, if we've changed from /d
+ * rules to /u, keep the /u */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
- if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+ if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+ }
+ if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
}
+ nextchar(pRExC_state);
}
else if (!paren && RExC_parse < RExC_end) {
if (*RExC_parse == ')') {
*flagp = WORST; /* Tentatively. */
- RExC_parse--;
- nextchar(pRExC_state);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
flags &= ~TRYAGAIN;
latest = regpiece(pRExC_state, &flags,depth+1);
* enough space for all the things we are about to throw
* away, but we can shrink it by the ammount we are about
* to re-use here */
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+ RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
}
else {
ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
RExC_emit = orig_emit;
}
- ret = reg_node(pRExC_state, OPFAIL);
+ ret = reganode(pRExC_state, OPFAIL, 0);
return ret;
}
else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
"Useless use of greediness modifier '%c'",
*RExC_parse);
}
- /* Absorb the modifier, so later code doesn't see nor use
- * it */
- nextchar(pRExC_state);
}
do_curly:
* sequence. *node_p * will be set to a generated node returned by this
* function calling S_reg().
*
- * The final possibility, which happens only when the fourth one would
- * otherwise be in effect, is that one of those code points requires the
- * pattern to be recompiled as UTF-8. The function returns FALSE, and sets
- * the RESTART_PASS1 and NEED_UTF8 flags in *flagp. When this happens, the
- * caller needs to desist from continuing parsing, and return this information
- * to its caller. This is not set for when there is only one code point, as
- * this can be called as part of an ANYOF node, and they can store
- * above-Latin1 code points without the pattern having to be in UTF-8.
- * XXX
+ * The final possibility is that it is premature to be calling this function;
+ * that pass1 needs to be restarted. This can happen when this changes from
+ * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
+ * latter occurs only when the fourth possibility would otherwise be in
+ * effect, and is because one of those code points requires the pattern to be
+ * recompiled as UTF-8. The function returns FALSE, and sets the
+ * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
+ * happens, the caller needs to desist from continuing parsing, and return
+ * this information to its caller. This is not set for when there is only one
+ * code point, as this can be called as part of an ANYOF node, and they can
+ * store above-Latin1 code points without the pattern having to be in UTF-8.
*
* For non-single-quoted regexes, the tokenizer has resolved character and
* sequence names inside \N{...} into their Unicode values, normalizing the
* parser. But if the single-quoted regex is something like '\N{U+41}', that
* is legal and handled here. The code point is Unicode, and has to be
* translated into the native character set for non-ASCII platforms.
- * the tokenizer passes the \N sequence through unchanged; this code will not
- * attempt to determine this nor expand those, instead raising a syntax error.
*/
char * endbrace; /* points to '}' following the name */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- char* p; /* Temporary */
+ char* p = RExC_parse; /* Temporary */
GET_RE_DEBUG_FLAGS_DECL;
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meanings do not, so use a temporary until we find
* out which we are being called with */
- p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regpatws(pRExC_state, RExC_parse,
- TRUE) /* means recognize comments */
- : RExC_parse;
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The latter is assumed when the {...} following the \N is a legal
- * quantifier, or there is no a '{' at all */
+ * quantifier, or there is no '{' at all */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
if (cp_count) {
if (! node_p) {
return FALSE;
}
- RExC_parse--; /* Need to back off so nextchar() doesn't skip the
- current char */
- nextchar(pRExC_state);
+
*node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
MARK_NAUGHTY(1);
vFAIL("\\N{NAME} must be resolved by the lexer");
}
- RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
+ semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
if (cp_count) {
* it returns U+FFFD (Replacement character) and sets *encp to NULL.
*/
STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
{
STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+ SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
const STRLEN newlen = SvCUR(sv);
UV uv = UNICODE_REPLACEMENT;
{
regnode *ret = NULL;
I32 flags = 0;
- char *parse_start = RExC_parse;
+ char *parse_start;
U8 op;
int invert = 0;
U8 arg;
PERL_ARGS_ASSERT_REGATOM;
tryagain:
+ parse_start = RExC_parse;
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
NOT_REACHED; /*NOTREACHED*/
}
RExC_parse = endbrace;
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
OP(ret) = BOUNDU;
break;
case 'p':
case 'P':
- {
-#ifdef DEBUGGING
- char* parse_start = RExC_parse - 2;
-#endif
+ RExC_parse--;
- RExC_parse--;
-
- ret = regclass(pRExC_state, flagp,depth+1,
- TRUE, /* means just parse this element */
- FALSE, /* don't allow multi-char folds */
- FALSE, /* don't silence non-portable warnings.
- It would be a bug if these returned
- non-portables */
- (bool) RExC_strict,
- TRUE, /* Allow an optimized regnode result */
- NULL);
- /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
- * multi-char folds are allowed. */
- if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
- (UV) *flagp);
+ ret = regclass(pRExC_state, flagp,depth+1,
+ TRUE, /* means just parse this element */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. It
+ would be a bug if these returned
+ non-portables */
+ (bool) RExC_strict,
+ TRUE, /* Allow an optimized regnode result */
+ NULL);
+ if (*flagp & RESTART_PASS1)
+ return NULL;
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+ * multi-char folds are allowed. */
+ if (!ret)
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
- RExC_parse--;
+ RExC_parse--;
- Set_Node_Offset(ret, parse_start + 2);
- Set_Node_Cur_Length(ret, parse_start);
- nextchar(pRExC_state);
- }
+ Set_Node_Offset(ret, parse_start);
+ Set_Node_Cur_Length(ret, parse_start - 2);
+ nextchar(pRExC_state);
break;
case 'N':
/* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
if (*flagp & RESTART_PASS1)
return NULL;
- RExC_parse--;
+
+ /* Here, evaluates to a single code point. Go get that */
+ RExC_parse = parse_start;
goto defchar;
case 'k': /* Handle \k<NAME> and \k'NAME' */
* octal character escape, e.g. \35 or \777.
* The above logic should make it obvious why using
* octal escapes in patterns is problematic. - Yves */
+ RExC_parse = parse_start;
goto defchar;
}
}
* as an octal escape. It may or may not be a valid backref
* escape. For instance \88888888 is unlikely to be a valid
* backref. */
- {
-#ifdef RE_TRACK_PATTERN_OFFSETS
- char * const parse_start = RExC_parse - 1; /* MJD */
-#endif
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
- if (hasbrace) {
- if (*RExC_parse != '}')
- vFAIL("Unterminated \\g{...} pattern");
- RExC_parse++;
- }
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
- }
- RExC_sawback = 1;
- ret = reganode(pRExC_state,
- ((! FOLD)
- ? REF
- : (ASCII_FOLD_RESTRICTED)
- ? REFFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? REFFU
- : (LOC)
- ? REFFL
- : REFF),
- num);
- *flagp |= HASWIDTH;
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
+ if (hasbrace) {
+ if (*RExC_parse != '}')
+ vFAIL("Unterminated \\g{...} pattern");
+ RExC_parse++;
+ }
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens)
+ vFAIL("Reference to nonexistent group");
+ }
+ RExC_sawback = 1;
+ ret = reganode(pRExC_state,
+ ((! FOLD)
+ ? REF
+ : (ASCII_FOLD_RESTRICTED)
+ ? REFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? REFFU
+ : (LOC)
+ ? REFFL
+ : REFF),
+ num);
+ *flagp |= HASWIDTH;
- /* override incorrect value set in reganode MJD */
- Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret, parse_start);
- RExC_parse--;
- nextchar(pRExC_state);
- }
+ /* override incorrect value set in reganode MJD */
+ Set_Node_Offset(ret, parse_start);
+ Set_Node_Cur_Length(ret, parse_start-1);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
}
break;
case '\0':
default:
/* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
- parse_start--;
+ RExC_parse = parse_start;
goto defchar;
- }
+ } /* end of switch on a \foo sequence */
break;
case '#':
- if (RExC_flags & RXf_PMf_EXTENDED) {
+
+ /* '#' comments should have been spaced over before this function was
+ * called */
+ assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
+ /*
+ if (RExC_flags & RXf_PMf_EXTENDED) {
RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
if (RExC_parse < RExC_end)
goto tryagain;
}
+ */
+
/* FALLTHROUGH */
default:
+ defchar: {
- parse_start = RExC_parse - 1;
-
- RExC_parse++;
+ /* Here, we have determined that the next thing is probably a
+ * literal character. RExC_parse points to the first byte of its
+ * definition. (It still may be an escape sequence that evaluates
+ * to a single character) */
- defchar: {
STRLEN len = 0;
UV ender = 0;
char *p;
reparse:
- /* We do the EXACTFish to EXACT node only if folding. (And we
- * don't need to figure this out until pass 2) */
+ /* We look for the EXACTFish to EXACT node optimizaton only if
+ * folding. (And we don't need to figure this out until pass 2) */
maybe_exact = FOLD && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (p = RExC_parse - 1;
+
+ assert( ! UTF /* Is at the beginning of a character */
+ || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
+ || UTF8_IS_START(UCHARAT(RExC_parse)));
+
+ for (p = RExC_parse;
len < upper_parse && p < RExC_end;
len++)
{
oldp = p;
- if (RExC_flags & RXf_PMf_EXTENDED)
- p = regpatws(pRExC_state, p,
- TRUE); /* means recognize comments */
+ /* White space has already been ignored */
+ assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
+ || ! is_PATWS_safe((p), RExC_end, UTF));
+
switch ((U8)*p) {
case '^':
case '$':
) {
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
/* Here, it wasn't a single code point. Go close
* up this EXACTish node. The switch() prior to
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- ender = reg_recode((const char)(U8)ender, &enc);
+ ender = reg_recode((U8)ender, &enc);
if (!enc && PASS2)
ckWARNreg(p, "Invalid escape in the specified encoding");
REQUIRE_UTF8(flagp);
/* FALLTHROUGH */
default:
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
- /* Include any { following the alpha to emphasize
+ /* Include any left brace following the alpha to emphasize
* that it could be part of an escape at some point
* in the future */
int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
case '{':
/* Currently we don't warn when the lbrace is at the start
* of a construct. This catches it in the middle of a
- * literal string, or when its the first thing after
+ * literal string, or when it's the first thing after
* something like "\b" */
if (! SIZE_ONLY
&& (len || (p > RExC_start && isALPHA_A(*(p -1)))))
/*FALLTHROUGH*/
default: /* A literal character */
normal_default:
- if (UTF8_IS_START(*p) && UTF) {
+ if (! UTF8_IS_INVARIANT(*p) && UTF) {
STRLEN numlen;
ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
&numlen, UTF8_ALLOW_DEFAULT);
} /* End of switch on the literal */
/* Here, have looked at the literal character and <ender>
- * contains its ordinal, <p> points to the character after it
- */
-
- if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regpatws(pRExC_state, p,
- TRUE); /* means recognize comments */
+ * contains its ordinal, <p> points to the character after it.
+ * We need to check if the next non-ignored thing is a
+ * quantifier. Move <p> to after anything that should be
+ * ignored, which, as a side effect, positions <p> for the next
+ * loop iteration */
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
* the node, close the node with just them, and set up to do
* this character again next time through, when it will be the
* only thing in its new node */
- if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+ if ((next_is_quantifier = ( LIKELY(p < RExC_end)
+ && UNLIKELY(ISMULT2(p))))
+ && LIKELY(len))
{
p = oldp;
goto loopdone;
}
+ /* Ready to add 'ender' to the node */
+
if (! FOLD) { /* The simple case, just append the literal */
/* In the sizing pass, we need only the size of the
goto not_fold_common;
}
else /* A regular FOLD code point */
- if (! ( UTF
+ if (! ( UTF
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- /* See comments for join_exact() as to why we fold this
- * non-UTF at compile time */
- || (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S)
+ /* See comments for join_exact() as to why we fold
+ * this non-UTF at compile time */
+ || ( node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S)
#endif
)) {
/* Here, are folding and are not UTF-8 encoded; therefore
/* See if the character's fold differs between /d and
* /u. This includes the multi-char fold SHARP S to
* 'ss' */
- if (maybe_exactfu
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ RExC_seen_unfolded_sharp_s = 1;
+ maybe_exactfu = FALSE;
+ }
+ else if (maybe_exactfu
&& (PL_fold[ender] != PL_fold_latin1[ender]
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- || ender == LATIN_SMALL_LETTER_SHARP_S
- || (len > 0
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
+ || ( len > 0
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))
#endif
)) {
maybe_exactfu = FALSE;
RExC_parse = p - 1;
Set_Node_Cur_Length(ret, parse_start);
- nextchar(pRExC_state);
+ RExC_parse = p;
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
return(ret);
}
-STATIC char *
-S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
-{
- /* Returns the next non-pattern-white space, non-comment character (the
- * latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. See also reg_skipcomment */
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGPATWS;
-
- while (p < e) {
- STRLEN len;
- if ((len = is_PATWS_safe(p, e, UTF))) {
- p += len;
- }
- else if (recognize_comment && *p == '#') {
- p = reg_skipcomment(pRExC_state, p);
- }
- else
- break;
- }
- return p;
-}
STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
}
- else if (end >= NUM_ANYOF_CODE_POINTS) {
- ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
- }
/* Quit if are above what we should change */
if (start >= NUM_ANYOF_CODE_POINTS) {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
- RExC_uni_semantics = 1; /* The use of this operator implies /u. This
- is required so that the compile time values
- are valid in all runtime cases */
+ REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
+ This is required so that the compile
+ time values are valid in all runtime
+ cases */
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
while (RExC_parse < RExC_end) {
SV* current = NULL;
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
+
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
* default: case next time and keep on incrementing until
* we find one of the invariants we do handle. */
RExC_parse++;
+ if (*RExC_parse == 'c') {
+ /* Skip the \cX notation for control characters */
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ }
break;
case '[':
{
}
goto no_close;
}
- RExC_parse++;
+
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
no_close:
* a stack. Each entry on the stack is a single character representing one
* of the operators; or else a pointer to an operand inversion list. */
-#define IS_OPERAND(a) (! SvIOK(a))
+#define IS_OPERATOR(a) SvIOK(a)
+#define IS_OPERAND(a) (! IS_OPERATOR(a))
/* The stack is kept in Łukasiewicz order. (That's pronounced similar
* to luke-a-shave-itch (or -itz), but people who didn't want to bother
operand */
SV* only_to_avoid_leaks;
- /* Skip white space */
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE /* means recognize comments */ );
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
/* If the top entry on the stack is an operator, it had
* better be a '!', otherwise the entry below the top
* operand should be an operator */
- if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
- || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
- || top_index - fence < 1
- || ! (stacked_ptr = av_fetch(stack,
- top_index - 1,
- FALSE))
- || IS_OPERAND(*stacked_ptr))
+ if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
+ || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
+ || ( IS_OPERAND(*top_ptr)
+ && ( top_index - fence < 1
+ || ! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || ! IS_OPERATOR(*stacked_ptr))))
{
RExC_parse++;
vFAIL("Unexpected '(' with no preceding operator");
/* Having gotten rid of the fence, we pop the operand at the
* stack top and process it as a newly encountered operand */
current = av_pop(stack);
- assert(IS_OPERAND(current));
- goto handle_operand;
+ if (IS_OPERAND(current)) {
+ goto handle_operand;
+ }
+
+ RExC_parse++;
+ goto bad_syntax;
case '&':
case '|':
/* Here, the new operator has equal or lower precedence than
* what's already there. This means the operation already
* there should be performed now, before the new one. */
+
rhs = av_pop(stack);
+ if (! IS_OPERAND(rhs)) {
+
+ /* This can happen when a ! is not followed by an operand,
+ * like in /(?[\t &!])/ */
+ goto bad_syntax;
+ }
+
lhs = av_pop(stack);
- assert(IS_OPERAND(rhs));
- assert(IS_OPERAND(lhs));
+ if (! IS_OPERAND(lhs)) {
+
+ /* This can happen when there is an empty (), like in
+ * /(?[[0]+()+])/ */
+ goto bad_syntax;
+ }
switch (stacked_operator) {
case '&':
av_push(stack, rhs);
goto redo_curchar;
- case '!': /* Highest priority, right associative, so just push
- onto stack */
- av_push(stack, newSVuv(curchar));
+ case '!': /* Highest priority, right associative */
+
+ /* If what's already at the top of the stack is another '!",
+ * they just cancel each other out */
+ if ( (top_ptr = av_fetch(stack, top_index, FALSE))
+ && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+ {
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ }
+ else { /* Otherwise, since it's right associative, just push
+ onto the stack */
+ av_push(stack, newSVuv(curchar));
+ }
break;
default:
* be an operator */
top_ptr = av_fetch(stack, top_index, FALSE);
assert(top_ptr);
- if (! IS_OPERAND(*top_ptr)) {
+ if (IS_OPERATOR(*top_ptr)) {
/* The only permissible operator at the top of the stack is
* '!', which is applied immediately to this operand. */
|| SvTYPE(final) != SVt_INVLIST
|| av_tindex(stack) >= 0) /* More left on stack */
{
+ bad_syntax:
SvREFCNT_dec(final);
vFAIL("Incomplete expression within '(?[ ])'");
}
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
}
+#undef IS_OPERATOR
#undef IS_OPERAND
STATIC void
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
(SvCUR(listsv) != initial_listsv_len)
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class. This generates the
+ * code to skip them.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro. Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+ STMT_START { \
+ if (do_skip) { \
+ while ( p < RExC_end \
+ && isBLANK_A(UCHARAT(p))) \
+ { \
+ p++; \
+ } \
+ } \
+ } STMT_END
+
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
about too large
characters */
const bool strict,
- const bool optimizable, /* ? Allow a non-ANYOF return
+ bool optimizable, /* ? Allow a non-ANYOF return
node */
SV** ret_invlist /* Return an inversion list, not a node */
)
bool has_user_defined_property = FALSE;
/* inversion list of code points this node matches only when the target
- * string is in UTF-8. (Because is under /d) */
- SV* depends_list = NULL;
+ * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
+ * /d) */
+ SV* has_upper_latin1_only_utf8_matches = NULL;
/* Inversion list of code points this node matches regardless of things
* like locale, folding, utf8ness of the target string */
ret = reganode(pRExC_state,
(LOC)
? ANYOFL
- : (DEPENDS_SEMANTICS)
- ? ANYOFD
- : ANYOF,
+ : ANYOF,
0);
if (SIZE_ONLY) {
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
MARK_NAUGHTY(1);
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
break;
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == ']') {
break;
* skipped, it means that that white space is wanted literally, and
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
- if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+ if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
if (*flagp & NEED_UTF8)
FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
if (cp_count < 0) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
- e = strchr(RExC_parse++, '}');
- if (!e)
+ e = strchr(RExC_parse, '}');
+ if (!e) {
+ RExC_parse++;
vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(*RExC_parse))
- RExC_parse++;
+ }
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+
+ if (UCHARAT(RExC_parse) == '^') {
+
+ /* toggle. (The rhs xor gets the single bit that
+ * differs between P and p; the other xor inverts just
+ * that bit) */
+ value ^= 'P' ^ 'p';
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+ }
+
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
+
n = e - RExC_parse;
while (isSPACE(*(RExC_parse + n - 1)))
n--;
- }
- else {
+ } /* The \p isn't immediately followed by a '{' */
+ else if (! isALPHA(*RExC_parse)) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL2("Character following \\%c must be '{' or a "
+ "single-character Unicode property name",
+ (U8) value);
+ }
+ else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
SV* invlist;
char* name;
+ char* base_name; /* name after any packages are stripped */
+ const char * const colon_colon = "::";
- if (UCHARAT(RExC_parse) == '^') {
- RExC_parse++;
- n--;
- /* toggle. (The rhs xor gets the single bit that
- * differs between P and p; the other xor inverts just
- * that bit) */
- value ^= 'P' ^ 'p';
-
- while (isSPACE(*RExC_parse)) {
- RExC_parse++;
- n--;
- }
- }
/* Try to get the definition of the property into
* <invlist>. If /i is in effect, the effective property
* will have its name be <__NAME_i>. The design is
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (swash) {
+ if (swash) { /* Return any left-overs */
SvREFCNT_dec_NN(swash);
}
swash = _core_swash_init("utf8", name, &PL_sv_undef,
HV* curpkg = (IN_PERL_COMPILETIME)
? PL_curstash
: CopSTASH(PL_curcop);
- if (swash) {
+ UV final_n = n;
+ bool has_pkg;
+
+ if (swash) { /* Got a swash but no inversion list.
+ Something is likely wrong that will
+ be sorted-out later */
SvREFCNT_dec_NN(swash);
swash = NULL;
}
- /* Here didn't find it. It could be a user-defined
- * property that will be available at run-time. If we
- * accept only compile-time properties, is an error;
- * otherwise add it to the list for run-time look up */
- if (ret_invlist) {
+ /* Here didn't find it. It could be a an error (like a
+ * typo) in specifying a Unicode property, or it could
+ * be a user-defined property that will be available at
+ * run-time. The names of these must begin with 'In'
+ * or 'Is' (after any packages are stripped off). So
+ * if not one of those, or if we accept only
+ * compile-time properties, is an error; otherwise add
+ * it to the list for run-time look up. */
+ if ((base_name = rninstr(name, name + n,
+ colon_colon, colon_colon + 2)))
+ { /* Has ::. We know this must be a user-defined
+ property */
+ base_name += 2;
+ final_n -= base_name - name;
+ has_pkg = TRUE;
+ }
+ else {
+ base_name = name;
+ has_pkg = FALSE;
+ }
+
+ if ( final_n < 3
+ || base_name[0] != 'I'
+ || (base_name[1] != 's' && base_name[1] != 'n')
+ || ret_invlist)
+ {
+ const char * const msg
+ = (has_pkg)
+ ? "Illegal user-defined property name"
+ : "Can't find Unicode property definition";
RExC_parse = e + 1;
- vFAIL2utf8f(
- "Property '%"UTF8f"' is unknown",
- UTF8fARG(UTF, n, name));
+
+ /* diag_listed_as: Can't find Unicode property definition "%s" */
+ vFAIL3utf8f("%s \"%"UTF8f"\"",
+ msg, UTF8fARG(UTF, n, name));
}
/* If the property name doesn't already have a package
* name, add the current one to it so that it can be
* referred to outside it. [perl #121777] */
- if (curpkg && ! instr(name, "::")) {
+ if (! has_pkg && curpkg) {
char* pkgname = HvNAME(curpkg);
if (strNE(pkgname, "main")) {
char* full_name = Perl_form(aTHX_
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
has_user_defined_property = TRUE;
+ optimizable = FALSE; /* Will have to leave this an
+ ANYOF node */
- /* We don't know yet, so have to assume that the
- * property could match something in the Latin1 range,
- * hence something that isn't utf8. Note that this
- * would cause things in <depends_list> to match
- * inappropriately, except that any \p{}, including
- * this one forces Unicode semantics, which means there
- * is no <depends_list> */
- ANYOF_FLAGS(ret)
- |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
+ /* We don't know yet what this matches, so have to flag
+ * it */
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
named */
/* \p means they want Unicode semantics */
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
break;
case 'n': value = '\n'; break;
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- value = reg_recode((const char)(U8)value, &enc);
+ value = reg_recode((U8)value, &enc);
if (!enc) {
if (strict) {
vFAIL("Invalid escape in the specified encoding");
}
ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
ANYOF_POSIXL_ZERO(ret);
+
+ /* We can't change this into some other type of node
+ * (unless this is the only element, in which case there
+ * are nodes that mean exactly this) as has runtime
+ * dependencies */
+ optimizable = FALSE;
}
/* Coverity thinks it is possible for this to be negative; both
}
} /* end of namedclass \blah */
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
&& *RExC_parse == '-')
{
char* next_char_ptr = RExC_parse + 1;
- if (skip_white) { /* Get the next real char after the '-' */
- next_char_ptr = regpatws(pRExC_state,
- RExC_parse + 1,
- FALSE); /* means don't recognize
- comments */
- }
+
+ /* Get the next real char after the '-' */
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
if (value > 255) {
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
/* Ready to process either the single value, or the completed range.
/* Our calculated list will be for Unicode rules. For locale
* matching, we have to keep a separate list that is consulted at
* runtime only when the locale indicates Unicode rules. For
- * non-locale, we just use to the general list */
+ * non-locale, we just use the general list */
if (LOC) {
use_list = &only_utf8_locale_list;
}
PL_fold_latin1[j]);
}
else {
- depends_list =
- add_cp_to_invlist(depends_list,
- PL_fold_latin1[j]);
+ has_upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ has_upper_latin1_only_utf8_matches,
+ PL_fold_latin1[j]);
}
}
else {
/* Similarly folds involving non-ascii Latin1
* characters under /d are added to their list */
- depends_list = add_cp_to_invlist(depends_list,
- c);
+ has_upper_latin1_only_utf8_matches
+ = add_cp_to_invlist(
+ has_upper_latin1_only_utf8_matches,
+ c);
}
}
}
cp_list = posixes;
}
- if (depends_list) {
- _invlist_union(depends_list, nonascii_but_latin1_properties,
- &depends_list);
+ if (has_upper_latin1_only_utf8_matches) {
+ _invlist_union(has_upper_latin1_only_utf8_matches,
+ nonascii_but_latin1_properties,
+ &has_upper_latin1_only_utf8_matches);
SvREFCNT_dec_NN(nonascii_but_latin1_properties);
}
else {
- depends_list = nonascii_but_latin1_properties;
+ has_upper_latin1_only_utf8_matches
+ = nonascii_but_latin1_properties;
}
}
}
* class that isn't a Unicode property, and which matches above Unicode, \W
* or [\x{110000}] for example.
* (Note that in this case, unlike the Posix one above, there is no
- * <depends_list>, because having a Unicode property forces Unicode
- * semantics */
+ * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
+ * forces Unicode semantics */
if (properties) {
if (cp_list) {
if (warn_super) {
ANYOF_FLAGS(ret)
- |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+
+ /* Because an ANYOF node is the only one that warns, this node
+ * can't be optimized into something else */
+ optimizable = FALSE;
}
}
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
if (only_utf8_locale_list) {
- ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD
+ |ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES;
}
- else if (cp_list) { /* Look to see if there a 0-255 code point is in
- the list */
+ else if (cp_list) { /* Look to see if a 0-255 code point is in list */
UV start, end;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
}
}
+#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
+ ( DEPENDS_SEMANTICS \
+ && ANYOF_FLAGS(ret) \
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+
+ /* See if we can simplify things under /d */
+ if ( has_upper_latin1_only_utf8_matches
+ || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ {
+ if (has_upper_latin1_only_utf8_matches) {
+ if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+
+ /* Here, we have two, almost opposite, constraints in effect
+ * for upper latin1 characters. The macro means they all match
+ * when the target string ISN'T in UTF-8.
+ * 'has_upper_latin1_only_utf8_matches' contains the chars that
+ * match only if the target string IS UTF-8. Therefore the
+ * ones in 'has_upper_latin1_only_utf8_matches' match
+ * regardless of UTF-8, so can be added to the regular list,
+ * and 'has_upper_latin1_only_utf8_matches' cleared */
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+ else if (cp_list) {
+
+ /* Here, 'cp_list' gives chars that always match, and
+ * 'has_upper_latin1_only_utf8_matches' gives chars that were
+ * specified to match only if the target string is in UTF-8.
+ * It may be that these overlap, so we can subtract the
+ * unconditionally matching from the conditional ones, to make
+ * the conditional list as small as possible, perhaps even
+ * clearing it, in which case more optimizations are possible
+ * later */
+ _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ cp_list,
+ &has_upper_latin1_only_utf8_matches);
+ if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+ }
+ }
+
+ /* Similarly, if the unconditional matches include every upper latin1
+ * character, we can clear that flag to permit later optimizations */
+ if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+ _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
+ if (_invlist_len(only_non_utf8_list) == 0) {
+ ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
+ SvREFCNT_dec_NN(only_non_utf8_list);
+ only_non_utf8_list = NULL;;
+ }
+
+ /* If we haven't gotten rid of all conditional matching, we change the
+ * regnode type to indicate that */
+ if ( has_upper_latin1_only_utf8_matches
+ || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ {
+ OP(ret) = ANYOFD;
+ optimizable = FALSE;
+ }
+ }
+#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
if (cp_list
&& invert
+ && OP(ret) != ANYOFD
&& ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
- && ! depends_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
_invlist_invert(cp_list);
* adjacent such nodes. And if the class is equivalent to things like /./,
* expensive run-time swashes can be avoided. Now that we have more
* complete information, we can find things necessarily missed by the
- * earlier code. I (khw) am not sure how much to look for here. It would
- * be easy, but perhaps too slow, to check any candidates against all the
- * node types they could possibly match using _invlistEQ(). */
+ * earlier code. */
- if ( optimizable
- && cp_list
- && ! invert
- && ! depends_list
- && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-
- /* We don't optimize if we are supposed to make sure all non-Unicode
- * code points raise a warning, as only ANYOF nodes have this check.
- * */
- && ! ((ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
- && OP(ret) != ANYOFD
- && ALWAYS_WARN_SUPER))
- {
+ if (optimizable && cp_list && ! invert) {
UV start, end;
U8 op = END; /* The optimzation node-type */
+ int posix_class = -1; /* Illegal value */
const char * cur_parse= RExC_parse;
invlist_iterinit(cp_list);
if (! invlist_iternext(cp_list, &start, &end)) {
/* Here, the list is empty. This happens, for example, when a
- * Unicode property is the only thing in the character class, and
- * it doesn't match anything. (perluniprops.pod notes such
- * properties) */
+ * Unicode property that doesn't match anything is the only element
+ * in the character class (perluniprops.pod notes such properties).
+ * */
op = OPFAIL;
*flagp |= HASWIDTH|SIMPLE;
}
}
}
}
- }
+ } /* End of first range contains just a single code point */
else if (start == 0) {
if (end == UV_MAX) {
op = SANY;
}
invlist_iterfinish(cp_list);
+ if (op == END) {
+
+ /* Here, didn't find an optimization. See if this matches any of
+ * the POSIX classes. These run slightly faster for above-Unicode
+ * code points, so don't bother with POSIXA ones nor the 2 that
+ * have no above-Unicode matches */
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
+ {
+ int try_inverted;
+ if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+ continue;
+ }
+ for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+
+ /* Check if matches normal or inverted */
+ if (_invlistEQ(cp_list,
+ PL_XPosix_ptrs[posix_class],
+ try_inverted))
+ {
+ op = (try_inverted)
+ ? NPOSIXU
+ : POSIXU;
+ *flagp |= HASWIDTH|SIMPLE;
+ goto found_posix;
+ }
+ }
+ }
+ found_posix: ;
+ }
if (op != END) {
RExC_parse = (char *)orig_parse;
RExC_emit = (regnode *)orig_emit;
- ret = reg_node(pRExC_state, op);
+ if (regarglen[op]) {
+ ret = reganode(pRExC_state, op, 0);
+ } else {
+ ret = reg_node(pRExC_state, op);
+ }
RExC_parse = (char *)cur_parse;
TRUE /* downgradable to EXACT */
);
}
+ else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
+ FLAGS(ret) = posix_class;
+ }
SvREFCNT_dec_NN(cp_list);
return ret;
/* Here, the bitmap has been populated with all the Latin1 code points that
* always match. Can now add to the overall list those that match only
- * when the target string is UTF-8 (<depends_list>). */
- if (depends_list) {
+ * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
+ * */
+ if (has_upper_latin1_only_utf8_matches) {
if (cp_list) {
- _invlist_union(cp_list, depends_list, &cp_list);
- SvREFCNT_dec_NN(depends_list);
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
}
else {
- cp_list = depends_list;
+ cp_list = has_upper_latin1_only_utf8_matches;
}
- ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
/* If there is a swash and more than one element, we can't use the swash in
if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
assert(! (ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
+ & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
}
else {
AV * const av = newAV();
SV *rv;
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
if (data && data->count) {
const U32 n = ARG(node);
si = *ary; /* ary[0] = the string to initialize the swash with */
- /* Elements 3 and 4 are either both present or both absent. [3] is
- * any inversion list generated at compile time; [4] indicates if
- * that inversion list has any user-defined properties in it. */
if (av_tindex(av) >= 2) {
if (only_utf8_locale_ptr
&& ary[2]
*only_utf8_locale_ptr = NULL;
}
+ /* Elements 3 and 4 are either both present or both absent. [3]
+ * is any inversion list generated at compile time; [4]
+ * indicates if that inversion list has any user-defined
+ * properties in it. */
if (av_tindex(av) >= 3) {
invlist = ary[3];
if (SvUV(ary[4])) {
return p;
}
-/* nextchar()
-
- Advances the parse position, and optionally absorbs
- "whitespace" from the inputstream.
-
- Without /x "whitespace" means (?#...) style comments only,
- with /x this means (?#...) and # comments and whitespace proper.
-
- Returns the RExC_parse point from BEFORE the scan occurs.
+STATIC void
+S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
+ char ** p,
+ const bool force_to_xmod
+ )
+{
+ /* If the text at the current parse position '*p' is a '(?#...)' comment,
+ * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
+ * is /x whitespace, advance '*p' so that on exit it points to the first
+ * byte past all such white space and comments */
- This is the /x friendly way of saying RExC_parse++.
-*/
+ const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
-STATIC char*
-S_nextchar(pTHX_ RExC_state_t *pRExC_state)
-{
- char* const retval = RExC_parse++;
+ PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
- PERL_ARGS_ASSERT_NEXTCHAR;
+ assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
for (;;) {
- if (RExC_end - RExC_parse >= 3
- && *RExC_parse == '('
- && RExC_parse[1] == '?'
- && RExC_parse[2] == '#')
+ if (RExC_end - (*p) >= 3
+ && *(*p) == '('
+ && *(*p + 1) == '?'
+ && *(*p + 2) == '#')
{
- while (*RExC_parse != ')') {
- if (RExC_parse == RExC_end)
+ while (*(*p) != ')') {
+ if ((*p) == RExC_end)
FAIL("Sequence (?#... not terminated");
- RExC_parse++;
+ (*p)++;
}
- RExC_parse++;
+ (*p)++;
continue;
}
- if (RExC_flags & RXf_PMf_EXTENDED) {
- char * p = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
- if (p != RExC_parse) {
- RExC_parse = p;
+
+ if (use_xmod) {
+ const char * save_p = *p;
+ while ((*p) < RExC_end) {
+ STRLEN len;
+ if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
+ (*p) += len;
+ }
+ else if (*(*p) == '#') {
+ (*p) = reg_skipcomment(pRExC_state, (*p));
+ }
+ else {
+ break;
+ }
+ }
+ if (*p != save_p) {
continue;
}
}
- return retval;
+
+ break;
}
+
+ return;
+}
+
+/* nextchar()
+
+ Advances the parse position by one byte, unless that byte is the beginning
+ of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
+ those two cases, the parse position is advanced beyond all such comments and
+ white space.
+
+ This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
+*/
+
+STATIC void
+S_nextchar(pTHX_ RExC_state_t *pRExC_state)
+{
+ PERL_ARGS_ASSERT_NEXTCHAR;
+
+ assert( ! UTF
+ || UTF8_IS_INVARIANT(*RExC_parse)
+ || UTF8_IS_START(*RExC_parse));
+
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't assume /x */ );
}
STATIC regnode *
|| k == GROUPP || OP(o)==ACCEPT)
{
AV *name_list= NULL;
- Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
+ U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
+ Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
} else if ( pRExC_state ) {
}
if (name_list) {
if ( k != REF || (OP(o) < NREF)) {
- SV **name= av_fetch(name_list, ARG(o), 0 );
+ SV **name= av_fetch(name_list, parno, 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
else {
- SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
+ SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
I32 *nums=(I32*)SvPVX(sv_dat);
SV **name= av_fetch(name_list, nums[0], 0 );
I32 n;
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
}
- else if (k == VERB) {
- if (!o->flags)
- Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
- SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
- } else if (k == LOGICAL)
+ else if (k == LOGICAL)
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
- SV* bitmap_invlist; /* Will hold what the bit map contains */
+ SV* bitmap_invlist = NULL; /* Will hold what the bit map contains */
if (OP(o) == ANYOFL) {
}
}
- if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
- |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
- |ANYOF_LOC_FOLD)))
+ if ( ARG(o) != ANYOF_ONLY_HAS_BITMAP
+ || (flags
+ & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
+ |ANYOF_LOC_FOLD)))
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
if (*s == '\n') {
const char * const t = ++s;
- if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
- sv_catpvs(sv, "{outside bitmap}");
- }
- else {
- sv_catpvs(sv, "{utf8}");
+ if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
+ if (OP(o) == ANYOFD) {
+ sv_catpvs(sv, "{utf8}");
+ }
+ else {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
}
if (byte_output) {
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)
Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+ /* add on the verb argument if there is one */
+ if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
+ Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
+ SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ }
#else
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(sv);
this_end = (end < NUM_ANYOF_CODE_POINTS)
? end
: NUM_ANYOF_CODE_POINTS - 1;
+#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
: "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+#else
+ format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+#endif
GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
GCC_DIAG_RESTORE;
int i;
UV start, end;
unsigned int punct_count = 0;
- SV* invlist = NULL;
- SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
+ SV* invlist;
bool allow_literals = TRUE;
+ bool inverted_for_output = FALSE;
PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
- invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
-
/* Worst case is exactly every-other code point is in the list */
- *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+ invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
/* Convert the bit map to an inversion list, keeping track of how many
* ASCII puncts are set, including an extra amount for the backslashed
* ones. */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (BITMAP_TEST(bitmap, i)) {
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+ invlist = add_cp_to_invlist(invlist, i);
if (isPUNCT_A(i)) {
punct_count++;
if isBACKSLASHED_PUNCT(i) {
}
/* Nothing to output */
- if (_invlist_len(*invlist_ptr) == 0) {
- SvREFCNT_dec(invlist);
+ if (_invlist_len(invlist) == 0) {
+ SvREFCNT_dec_NN(invlist);
return FALSE;
}
* literals, but if a range (nearly) spans all of them, it's best to output
* it as a single range. This code will use a single range if all but 2
* printables are in it */
- invlist_iterinit(*invlist_ptr);
- while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
/* If range starts beyond final printable, it doesn't have any in it */
if (start > MAX_PRINT_A) {
break;
}
}
- invlist_iterfinish(*invlist_ptr);
+ invlist_iterfinish(invlist);
/* The legibility of the output depends mostly on how many punctuation
* characters are output. There are 32 possible ASCII ones, and some have
/* Add everything remaining to the list, so when we invert it just
* below, it will be excluded */
- _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
- _invlist_invert(*invlist_ptr);
+ _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
+ _invlist_invert(invlist);
+ inverted_for_output = TRUE;
}
/* Here we have figured things out. Output each range */
- invlist_iterinit(*invlist_ptr);
- while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ invlist_iterinit(invlist);
+ while (invlist_iternext(invlist, &start, &end)) {
if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
put_range(sv, start, end, allow_literals);
}
- invlist_iterfinish(*invlist_ptr);
+ invlist_iterfinish(invlist);
+
+ if (bitmap_invlist) {
+
+ /* Here, wants the inversion list returned. If we inverted it, we have
+ * to restore it to the original */
+ if (inverted_for_output) {
+ _invlist_invert(invlist);
+ _invlist_intersection(invlist, PL_InBitmap, &invlist);
+ }
+
+ *bitmap_invlist = invlist;
+ }
+ else {
+ SvREFCNT_dec_NN(invlist);
+ }
return TRUE;
}