#endif
#include "dquote_static.c"
-#include "charclass_invlists.h"
#include "inline_invlist.c"
#include "unicode_constants.h"
I32 contains_locale;
I32 contains_i;
I32 override_recoding;
+#ifdef EBCDIC
+ I32 recode_x_to_native;
+#endif
I32 in_multi_char_class;
struct reg_code_block *code_blocks; /* positions of literal (?{})
within pattern */
#define RExC_contains_locale (pRExC_state->contains_locale)
#define RExC_contains_i (pRExC_state->contains_i)
#define RExC_override_recoding (pRExC_state->override_recoding)
+#ifdef EBCDIC
+# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#endif
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
UTF8fARG(UTF, offset, RExC_precomp), \
UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
+/* Used to point after bad bytes for an error message, but avoid skipping
+ * past a nul byte. */
+#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
* arg. Show regex, up to a maximum length. If it's too long, chop and add
( flags & SCF_IN_DEFINE )
||
(
- (is_inf_internal || is_inf || data->flags & SF_IS_INF)
+ (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
&&
( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
)
}
return final_minlen;
}
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
STATIC U32
DEBUG_r(if (!PL_colorset) reginitcolors());
-#ifndef PERL_IN_XSUB_RE
/* Initialize these here instead of as-needed, as is quick and avoids
* having to test them each time otherwise */
if (! PL_AboveLatin1) {
PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
NUM_ANYOF_CODE_POINTS - 1);
}
-#endif
pRExC_state->code_blocks = NULL;
pRExC_state->num_code_blocks = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_extralen = 0;
RExC_override_recoding = 0;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 0;
+#endif
RExC_in_multi_char_class = 0;
/* First pass: determine size, legality. */
Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
(unsigned long) flags);
}
- NOT_REACHED; /* NOT REACHED */
+ NOT_REACHED; /* NOTREACHED */
}
return NULL;
}
/*NOTREACHED*/
default:
fail_modifiers:
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
nextchar(pRExC_state);
return ret;
}
- RExC_parse++;
+ --RExC_parse;
+ RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
RExC_parse-seqstart, seqstart);
if (RExC_parse == RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
goto gen_recurse_regop;
- /* NOT REACHED */
+ /* NOTREACHED */
case '+':
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
RExC_parse++;
vFAIL("Illegal pattern");
}
goto parse_recursion;
- /* NOT REACHED*/
+ /* NOTREACHED*/
case '-': /* (?-1) */
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
RExC_parse--; /* rewind to let it be handled later */
parse_recursion:
{
bool is_neg = FALSE;
+ UV unum;
parse_start = RExC_parse - 1; /* MJD */
if (*RExC_parse == '-') {
RExC_parse++;
is_neg = TRUE;
}
- num = grok_atou(RExC_parse, &endptr);
- if (endptr)
- RExC_parse = (char*)endptr;
+ if (grok_atoUV(RExC_parse, &unum, &endptr)
+ && unum <= I32_MAX
+ ) {
+ num = (I32)unum;
+ RExC_parse = (char*)endptr;
+ } else
+ num = I32_MAX;
if (is_neg) {
/* Some limit for num? */
num = -num;
nextchar(pRExC_state);
return ret;
- /* NOT REACHED */
+ /* NOTREACHED */
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
- RExC_parse++;
+ RExC_parse += SKIP_IF_CHAR(RExC_parse);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%"UTF8f"...) not recognized",
RExC_parse++;
parno = 0;
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
- parno = grok_atou(RExC_parse, &endptr);
- if (endptr)
+ UV uv;
+ if (grok_atoUV(RExC_parse, &uv, &endptr)
+ && uv <= I32_MAX
+ ) {
+ parno = (I32)uv;
RExC_parse = (char*)endptr;
+ }
+ /* else "Switch condition not recognized" below */
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
/* (?(1)...) */
char c;
char *tmp;
- parno = grok_atou(RExC_parse, &endptr);
- if (endptr)
- RExC_parse = (char*)endptr;
+ UV uv;
+ if (grok_atoUV(RExC_parse, &uv, &endptr)
+ && uv <= I32_MAX
+ ) {
+ parno = (I32)uv;
+ RExC_parse = (char*)endptr;
+ }
+ /* XXX else what? */
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
parse_flags:
parse_lparen_question_flags(pRExC_state);
if (UCHARAT(RExC_parse) != ':') {
- nextchar(pRExC_state);
+ if (*RExC_parse)
+ nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
}
Set_Node_Offset(ret, RExC_parse); /* MJD */
is_open = 1;
} else {
+ /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
+ paren = ':';
ret = NULL;
}
}
char *parse_start;
#endif
const char *maxpos = NULL;
+ UV uv;
/* Save the original in case we change the emitted regop to a FAIL. */
regnode * const orig_emit = RExC_emit;
if (!maxpos)
maxpos = next;
RExC_parse++;
- min = grok_atou(RExC_parse, &endptr);
+ if (isDIGIT(*RExC_parse)) {
+ if (!grok_atoUV(RExC_parse, &uv, &endptr))
+ vFAIL("Invalid quantifier in {,}");
+ if (uv >= REG_INFTY)
+ vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+ min = (I32)uv;
+ } else {
+ min = 0;
+ }
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
- max = grok_atou(maxpos, &endptr);
- if (!max && *maxpos != '0')
+ if (isDIGIT(*maxpos)) {
+ if (!grok_atoUV(maxpos, &uv, &endptr))
+ vFAIL("Invalid quantifier in {,}");
+ if (uv >= REG_INFTY)
+ vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+ max = (I32)uv;
+ } else {
max = REG_INFTY; /* meaning "infinity" */
- else if (max >= REG_INFTY)
- vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
+ }
RExC_parse = next;
nextchar(pRExC_state);
if (max < min) { /* If can't match, warn and optimize to fail
return(ret);
}
-STATIC STRLEN
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
- UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
+ regnode ** node_p,
+ UV * code_point_p,
+ int * cp_count,
+ I32 * flagp,
+ const U32 depth
)
{
-
- /* This is expected to be called by a parser routine that has recognized '\N'
- and needs to handle the rest. RExC_parse is expected to point at the first
- char following the N at the time of the call. On successful return,
- RExC_parse has been updated to point to just after the sequence identified
- by this routine, <*flagp> has been updated, and the non-NULL input pointers
- have been set appropriately.
-
- The typical case for this is \N{some character name}. This is usually
- called while parsing the input, filling in or ready to fill in an EXACTish
- node, and the code point for the character should be returned, so that it
- can be added to the node, and parsing continued with the next input
- character. But it may be that instead of a single character the \N{}
- expands to more than one, a named sequence. In this case any following
- quantifier applies to the whole sequence, and it is easier, given the code
- structure that calls this, to handle it from a different area of the code.
- For this reason, the input parameters can be set so that it returns valid
- only on one or the other of these cases.
-
- Another possibility is for the input to be an empty \N{}, which for
- backwards compatibility we accept, but generate a NOTHING node which should
- later get optimized out. This is handled from the area of code which can
- handle a named sequence, so if called with the parameters for the other, it
- fails.
-
- Still another possibility is for the \N to mean [^\n], and not a single
- character or explicit sequence at all. This is determined by context.
- Again, this is handled from the area of code which can handle a named
- sequence, so if called with the parameters for the other, it also fails.
-
- And the final possibility is for the \N to be called from within a bracketed
- character class. In this case the [^\n] meaning makes no sense, and so is
- an error. Other anomalous situations are left to the calling code to handle.
-
- For non-single-quoted regexes, the tokenizer has attempted to decide which
- of the above applies, and in the case of a named sequence, has converted it
- into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
- where c1... are the characters in the sequence. For single-quoted regexes,
- the tokenizer passes the \N sequence through unchanged; this code will not
- attempt to determine this nor expand those, instead raising a syntax error.
- The net effect is that if the beginning of the passed-in pattern isn't '{U+'
- or there is no '}', it signals that this \N occurrence means to match a
- non-newline. (This mostly was done because of [perl #56444].)
-
- The API is somewhat convoluted due to historical and the above reasons.
-
- The function raises an error (via vFAIL), and doesn't return for various
- syntax errors. For other failures, it returns (STRLEN) -1. For successes,
- it returns a count of how many characters were accounted for by it. (This
- can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
- points in the sequence. It sets <node_p>, <valuep>, and/or
- <substitute_parse> on success.
-
- If <valuep> is non-null, it means the caller can accept an input sequence
- consisting of just a single code point; <*valuep> is set to the value of the
- only or first code point in the input.
-
- If <substitute_parse> is non-null, it means the caller can accept an input
- sequence consisting of one or more code points; <*substitute_parse> is a
- newly created mortal SV* in this case, containing \x{} escapes representing
- those code points.
-
- Both <valuep> and <substitute_parse> can be non-NULL.
-
- If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
- that the caller can accept any legal sequence other than a single code
- point. To wit, <*node_p> is set as follows:
- 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
- 2) \N{}: points to a new NOTHING node; return is 0
- 3) otherwise: points to a new EXACT node containing the resolved
- string; return is the number of code points in the
- string. This will never be 1.
- Note that failure is returned for single code point sequences if <valuep> is
- null and <node_p> is not.
- */
-
- char * endbrace; /* '}' following the name */
- char* p;
+ /* This routine teases apart the various meanings of \N and returns
+ * accordingly. The input parameters constrain which meaning(s) is/are valid
+ * in the current context.
+ *
+ * Exactly one of <node_p> and <code_point_p> must be non-NULL.
+ *
+ * If <code_point_p> is not NULL, the context is expecting the result to be a
+ * single code point. If this \N instance turns out to a single code point,
+ * the function returns TRUE and sets *code_point_p to that code point.
+ *
+ * If <node_p> is not NULL, the context is expecting the result to be one of
+ * the things representable by a regnode. If this \N instance turns out to be
+ * one such, the function generates the regnode, returns TRUE and sets *node_p
+ * to point to that regnode.
+ *
+ * If this instance of \N isn't legal in any context, this function will
+ * generate a fatal error and not return.
+ *
+ * On input, RExC_parse should point to the first char following the \N at the
+ * time of the call. On successful return, RExC_parse will have been updated
+ * to point to just after the sequence identified by this routine. Also
+ * *flagp has been updated as needed.
+ *
+ * When there is some problem with the current context and this \N instance,
+ * the function returns FALSE, without advancing RExC_parse, nor setting
+ * *node_p, nor *code_point_p, nor *flagp.
+ *
+ * If <cp_count> is not NULL, the caller wants to know the length (in code
+ * points) that this \N sequence matches. This is set even if the function
+ * returns FALSE, as detailed below.
+ *
+ * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+ *
+ * Probably the most common case is for the \N to specify a single code point.
+ * *cp_count will be set to 1, and *code_point_p will be set to that code
+ * point.
+ *
+ * Another possibility is for the input to be an empty \N{}, which for
+ * backwards compatibility we accept. *cp_count will be set to 0. *node_p
+ * will be set to a generated NOTHING node.
+ *
+ * Still another possibility is for the \N to mean [^\n]. *cp_count will be
+ * set to 0. *node_p will be set to a generated REG_ANY node.
+ *
+ * The fourth possibility is that \N resolves to a sequence of more than one
+ * code points. *cp_count will be set to the number of code points in the
+ * sequence. *node_p * will be set to a generated node returned by this
+ * function calling S_reg().
+ *
+ * The final possibility, which happens only when the fourth one would
+ * otherwise be in effect, is that one of those code points requires the
+ * pattern to be recompiled as UTF-8. The function returns FALSE, and sets
+ * the RESTART_UTF8 flag in *flagp. When this happens, the caller needs to
+ * desist from continuing parsing, and return this information to its caller.
+ * This is not set for when there is only one code point, as this can be
+ * called as part of an ANYOF node, and they can store above-Latin1 code
+ * points without the pattern having to be in UTF-8.
+ *
+ * For non-single-quoted regexes, the tokenizer has resolved character and
+ * sequence names inside \N{...} into their Unicode values, normalizing the
+ * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
+ * hex-represented code points in the sequence. This is done there because
+ * the names can vary based on what charnames pragma is in scope at the time,
+ * so we need a way to take a snapshot of what they resolve to at the time of
+ * the original parse. [perl #56444].
+ *
+ * That parsing is skipped for single-quoted regexes, so we may here get
+ * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
+ * parser. But if the single-quoted regex is something like '\N{U+41}', that
+ * is legal and handled here. The code point is Unicode, and has to be
+ * translated into the native character set for non-ASCII platforms.
+ * the tokenizer passes the \N sequence through unchanged; this code will not
+ * attempt to determine this nor expand those, instead raising a syntax error.
+ */
+
+ char * endbrace; /* points to '}' following the name */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- bool has_multiple_chars; /* true if the input stream contains a sequence of
- more than one character */
- bool in_char_class = substitute_parse != NULL;
- STRLEN count = 0; /* Number of characters in this sequence */
+ char* p; /* Temporary */
GET_RE_DEBUG_FLAGS_DECL;
GET_RE_DEBUG_FLAGS;
- assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
- assert(! (node_p && substitute_parse)); /* At most 1 should be set */
+ assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
+ assert(! (node_p && cp_count)); /* At most 1 should be set */
+
+ if (cp_count) { /* Initialize return for the most common case */
+ *cp_count = 1;
+ }
/* The [^\n] meaning of \N ignores spaces and comments under the /x
- * modifier. The other meaning does not, so use a temporary until we find
+ * modifier. The other meanings do not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
? regpatws(pRExC_state, RExC_parse,
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
- * [^\n]. The former is assumed when it can't be the latter. */
+ * [^\n]. The latter is assumed when the {...} following the \N is a legal
+ * quantifier, or there is no a '{' at all */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
+ if (cp_count) {
+ *cp_count = -1;
+ }
+
if (! node_p) {
- /* no bare \N allowed in a charclass */
- if (in_char_class) {
- vFAIL("\\N in a character class must be a named character: \\N{...}");
- }
- return (STRLEN) -1;
+ return FALSE;
}
RExC_parse--; /* Need to back off so nextchar() doesn't skip the
current char */
*flagp |= HASWIDTH|SIMPLE;
MARK_NAUGHTY(1);
Set_Node_Length(*node_p, 1); /* MJD */
- return 1;
+ return TRUE;
}
/* Here, we have decided it should be a named character or sequence */
RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
- if (node_p) {
- *node_p = reg_node(pRExC_state,NOTHING);
- }
- else if (! in_char_class) {
- return (STRLEN) -1;
+ if (cp_count) {
+ *cp_count = 0;
}
nextchar(pRExC_state);
- return 0;
+ if (! node_p) {
+ return FALSE;
+ }
+
+ *node_p = reg_node(pRExC_state,NOTHING);
+ return TRUE;
}
RExC_parse += 2; /* Skip past the 'U+' */
/* Code points are separated by dots. If none, there is only one code
* point, and is terminated by the brace */
- has_multiple_chars = (endchar < endbrace);
- /* We get the first code point if we want it, and either there is only one,
- * or we can accept both cases of one and there is more than one */
- if (valuep && (substitute_parse || ! has_multiple_chars)) {
- STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
- I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
+ if (endchar >= endbrace) {
+ STRLEN length_of_hex;
+ I32 grok_hex_flags;
+
+ /* Here, exactly one code point. If that isn't what is wanted, fail */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* Convert code point from hex */
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
/* No errors in the first pass (See [perl
* #122671].) We let the code below find the
* errors when there are multiple chars. */
- | ((SIZE_ONLY || has_multiple_chars)
+ | ((SIZE_ONLY)
? PERL_SCAN_SILENT_ILLDIGIT
: 0);
- *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
+ /* This routine is the one place where both single- and double-quotish
+ * \N{U+xxxx} are evaluated. The value is a Unicode code point which
+ * must be converted to native. */
+ *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
+ &length_of_hex,
+ &grok_hex_flags,
+ NULL));
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check. Don't do the check
* here when there are multiple chars; we do it below anyway. */
- if (! has_multiple_chars) {
- if (length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) {
- RExC_parse = endchar;
- }
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
}
-
- RExC_parse = endbrace + 1;
- return 1;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- }
- /* Here, we should have already handled the case where a single character
- * is expected and found. So it is a failure if we aren't expecting
- * multiple chars and got them; or didn't get them but wanted them. We
- * fail without advancing the parse, so that the caller can try again with
- * different acceptance criteria */
- if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
- RExC_parse = p;
- return (STRLEN) -1;
+ RExC_parse = endbrace + 1;
+ return TRUE;
}
-
- {
- /* What is done here is to convert this to a sub-pattern of the form
- * \x{char1}\x{char2}...
- * and then either return it in <*substitute_parse> if non-null; or
- * call reg recursively to parse it (enclosing in "(?: ... )" ). That
- * way, it retains its atomicness, while not having to worry about
- * special handling that some code points may have. toke.c has
- * converted the original Unicode values to native, so that we can just
- * pass on the hex values unchanged. We do have to set a flag to keep
- * recoding from happening in the recursion */
-
- SV * dummy = NULL;
+ else { /* Is a multiple character sequence */
+ SV * substitute_parse;
STRLEN len;
char *orig_end = RExC_end;
I32 flags;
- if (substitute_parse) {
- *substitute_parse = newSVpvs("");
+ /* Count the code points, if desired, in the sequence */
+ if (cp_count) {
+ *cp_count = 0;
+ while (RExC_parse < endbrace) {
+ /* Point to the beginning of the next character in the sequence. */
+ RExC_parse = endchar + 1;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ (*cp_count)++;
+ }
}
- else {
- substitute_parse = &dummy;
- *substitute_parse = newSVpvs("?:");
+
+ /* Fail if caller doesn't want to handle a multi-code-point sequence.
+ * But don't backup up the pointer if the caller want to know how many
+ * code points there are (they can then handle things) */
+ if (! node_p) {
+ if (! cp_count) {
+ RExC_parse = p;
+ }
+ return FALSE;
}
- *substitute_parse = sv_2mortal(*substitute_parse);
+
+ /* What is done here is to convert this to a sub-pattern of the form
+ * \x{char1}\x{char2}... and then call reg recursively to parse it
+ * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
+ * while not having to worry about special handling that some code
+ * points may have. */
+
+ substitute_parse = newSVpvs("?:");
while (RExC_parse < endbrace) {
/* Convert to notation the rest of the code understands */
- sv_catpv(*substitute_parse, "\\x{");
- sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
- sv_catpv(*substitute_parse, "}");
+ sv_catpv(substitute_parse, "\\x{");
+ sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+ sv_catpv(substitute_parse, "}");
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
endchar = RExC_parse + strcspn(RExC_parse, ".}");
- count++;
}
- if (! in_char_class) {
- sv_catpv(*substitute_parse, ")");
- }
+ sv_catpv(substitute_parse, ")");
- RExC_parse = SvPV(*substitute_parse, len);
+ RExC_parse = SvPV(substitute_parse, len);
/* Don't allow empty number */
- if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
+ if (len < (STRLEN) 8) {
RExC_parse = endbrace;
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
RExC_end = RExC_parse + len;
- /* The values are Unicode, and therefore not subject to recoding */
+ /* The values are Unicode, and therefore not subject to recoding, but
+ * have to be converted to native on a non-Unicode (meaning non-ASCII)
+ * platform. */
RExC_override_recoding = 1;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 1;
+#endif
if (node_p) {
if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
if (flags & RESTART_UTF8) {
*flagp = RESTART_UTF8;
- return (STRLEN) -1;
+ return FALSE;
}
FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
(UV) flags);
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
}
+ /* Restore the saved values */
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 0;
+#endif
+ SvREFCNT_dec_NN(substitute_parse);
nextchar(pRExC_state);
- }
- return count;
+ return TRUE;
+ }
}
S_backref_value(char *p)
{
const char* endptr;
- UV val = grok_atou(p, &endptr);
- if (endptr == p || endptr == NULL || val > I32_MAX)
- return I32_MAX;
- return (I32)val;
+ UV val;
+ if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
+ return (I32)val;
+ return I32_MAX;
}
invert = 1;
/* FALLTHROUGH */
case 'b':
+ {
+ regex_charset charset = get_regex_charset(RExC_flags);
+
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
- }
- else if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
+ op = BOUND + charset;
- if (invert) {
- op += NBOUND - BOUND;
+ if (op == BOUNDL) {
+ RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
*flagp |= SIMPLE;
- if ((U8) *(RExC_parse + 1) == '{') {
- /* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL3("Use \"\\%c\\{\" instead of \"\\%c{\"", *RExC_parse, *RExC_parse);
+ if (*(RExC_parse + 1) != '{') {
+ FLAGS(ret) = TRADITIONAL_BOUND;
+ if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDA;
+ }
+ }
+ else {
+ STRLEN length;
+ char name = *RExC_parse;
+ char * endbrace;
+ RExC_parse += 2;
+ endbrace = strchr(RExC_parse, '}');
+
+ if (! endbrace) {
+ vFAIL2("Missing right brace on \\%c{}", name);
+ }
+ /* XXX Need to decide whether to take spaces or not. Should be
+ * consistent with \p{}, but that currently is SPACE, which
+ * means vertical too, which seems wrong
+ * while (isBLANK(*RExC_parse)) {
+ RExC_parse++;
+ }*/
+ if (endbrace == RExC_parse) {
+ RExC_parse++; /* After the '}' */
+ vFAIL2("Empty \\%c{}", name);
+ }
+ length = endbrace - RExC_parse;
+ /*while (isBLANK(*(RExC_parse + length - 1))) {
+ length--;
+ }*/
+ switch (*RExC_parse) {
+ case 'g':
+ if (length != 1
+ && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = GCB_BOUND;
+ break;
+ case 's':
+ if (length != 2 || *(RExC_parse + 1) != 'b') {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = SB_BOUND;
+ break;
+ case 'w':
+ if (length != 2 || *(RExC_parse + 1) != 'b') {
+ goto bad_bound_type;
+ }
+ FLAGS(ret) = WB_BOUND;
+ break;
+ default:
+ bad_bound_type:
+ RExC_parse = endbrace;
+ vFAIL2utf8f(
+ "'%"UTF8f"' is an unknown bound type",
+ UTF8fARG(UTF, length, endbrace - length));
+ NOT_REACHED; /*NOTREACHED*/
+ }
+ RExC_parse = endbrace;
+ RExC_uni_semantics = 1;
+
+ if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ OP(ret) = BOUNDU;
+ length += 4;
+
+ /* Don't have to worry about UTF-8, in this message because
+ * to get here the contents of the \b must be ASCII */
+ ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
+ "Using /u for '%.*s' instead of /%s",
+ (unsigned) length,
+ endbrace - length + 1,
+ (charset == REGEX_ASCII_RESTRICTED_CHARSET)
+ ? ASCII_RESTRICT_PAT_MODS
+ : ASCII_MORE_RESTRICT_PAT_MODS);
+ }
}
+
+ if (PASS2 && invert) {
+ OP(ret) += NBOUND - BOUND;
+ }
goto finish_meta_pat;
+ }
case 'D':
invert = 1;
}
break;
case 'N':
- /* Handle \N and \N{NAME} with multiple code points here and not
- * below because it can be multicharacter. join_exact() will join
- * them up later on. Also this makes sure that things like
- * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
- * The options to the grok function call causes it to fail if the
- * sequence is just a single code point. We then go treat it as
- * just another character in the current EXACT node, and hence it
- * gets uniform treatment with all the other characters. The
- * special treatment for quantifiers is not needed for such single
- * character sequences */
+ /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
+ * \N{...} evaluates to a sequence of more than one code points).
+ * The function call below returns a regnode, which is our result.
+ * The parameters cause it to fail if the \N{} evaluates to a
+ * single code point; we handle those like any other literal. The
+ * reason that the multicharacter case is handled here and not as
+ * part of the EXACtish code is because of quantifiers. In
+ * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
+ * this way makes that Just Happen. dmq.
+ * join_exact() will join this up with adjacent EXACTish nodes
+ * later on, if appropriate. */
++RExC_parse;
- if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
- depth, FALSE))
- {
- if (*flagp & RESTART_UTF8)
- return NULL;
- RExC_parse--;
- goto defchar;
+ if (grok_bslash_N(pRExC_state,
+ &ret, /* Want a regnode returned */
+ NULL, /* Fail if evaluates to a single code
+ point */
+ NULL, /* Don't need a count of how many code
+ points */
+ flagp,
+ depth)
+ ) {
+ break;
}
- break;
+
+ if (*flagp & RESTART_UTF8)
+ return NULL;
+ RExC_parse--;
+ goto defchar;
+
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
{
}
else {
num = S_backref_value(RExC_parse);
- /* bare \NNN might be backref or octal - if it is larger than or equal
- * RExC_npar then it is assumed to be and octal escape.
- * Note RExC_npar is +1 from the actual number of parens*/
- if (num == I32_MAX || (num > 9 && num >= RExC_npar
- && *RExC_parse != '8' && *RExC_parse != '9'))
+ /* bare \NNN might be backref or octal - if it is larger
+ * than or equal RExC_npar then it is assumed to be an
+ * octal escape. Note RExC_npar is +1 from the actual
+ * number of parens. */
+ /* Note we do NOT check if num == I32_MAX here, as that is
+ * handled by the RExC_npar check */
+
+ if (
+ /* any numeric escape < 10 is always a backref */
+ num > 9
+ /* any numeric escape < RExC_npar is a backref */
+ && num >= RExC_npar
+ /* cannot be an octal escape if it starts with 8 */
+ && *RExC_parse != '8'
+ /* cannot be an octal escape it it starts with 9 */
+ && *RExC_parse != '9'
+ )
{
- /* Probably a character specified in octal, e.g. \35 */
+ /* Probably not a backref, instead likely to be an
+ * octal character escape, e.g. \35 or \777.
+ * The above logic should make it obvious why using
+ * octal escapes in patterns is problematic. - Yves */
goto defchar;
}
}
- /* at this point RExC_parse definitely points to a backref
- * number */
+ /* At this point RExC_parse points at a numeric escape like
+ * \12 or \88 or something similar, which we should NOT treat
+ * as an octal escape. It may or may not be a valid backref
+ * escape. For instance \88888888 is unlikely to be a valid
+ * backref. */
{
#ifdef RE_TRACK_PATTERN_OFFSETS
char * const parse_start = RExC_parse - 1; /* MJD */
p++;
break;
case 'N': /* Handle a single-code point named character. */
- /* The options cause it to fail if a multiple code
- * point sequence. Handle those in the switch() above
- * */
RExC_parse = p + 1;
- if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
- &ender,
- flagp,
- depth,
- FALSE
- )) {
+ if (! grok_bslash_N(pRExC_state,
+ NULL, /* Fail if evaluates to
+ anything other than a
+ single code point */
+ &ender, /* The returned single code
+ point */
+ NULL, /* Don't need a count of
+ how many code points */
+ flagp,
+ depth)
+ ) {
if (*flagp & RESTART_UTF8)
FAIL("panic: grok_bslash_N set RESTART_UTF8");
+
+ /* Here, it wasn't a single code point. Go close
+ * up this EXACTish node. The switch() prior to
+ * this switch handles the other cases */
RExC_parse = p = oldp;
goto loopdone;
}
}
ender = result;
- if (IN_ENCODING && ender < 0x100) {
- goto recode_encoding;
+ if (ender < 0x100) {
+#ifdef EBCDIC
+ if (RExC_recode_x_to_native) {
+ ender = LATIN1_TO_NATIVE(ender);
+ }
+ else
+#endif
+ if (IN_ENCODING) {
+ goto recode_encoding;
+ }
}
- if (ender > 0xff) {
+ else {
REQUIRE_UTF8;
}
break;
break;
case '8': case '9': /* must be a backreference */
--p;
+ /* we have an escape like \8 which cannot be an octal escape
+ * so we exit the loop, and let the outer loop handle this
+ * escape which may or may not be a legitimate backref. */
goto loopdone;
case '1': case '2': case '3':case '4':
case '5': case '6': case '7':
break;
case 'e':
if (memEQ(posixcc, "spac", 4)) /* space */
- namedclass = ANYOF_PSXSPC;
+ namedclass = ANYOF_SPACE;
break;
case 'h':
if (memEQ(posixcc, "grap", 4)) /* graph */
&& first_char == *(p - 1));
}
+STATIC unsigned int
+S_regex_set_precedence(const U8 my_operator) {
+
+ /* Returns the precedence in the (?[...]) construct of the input operator,
+ * specified by its character representation. The precedence follows
+ * general Perl rules, but it extends this so that ')' and ']' have (low)
+ * precedence even though they aren't really operators */
+
+ switch (my_operator) {
+ case '!':
+ return 5;
+ case '&':
+ return 4;
+ case '^':
+ case '|':
+ case '+':
+ case '-':
+ return 3;
+ case ')':
+ return 2;
+ case ']':
+ return 1;
+ }
+
+ NOT_REACHED; /* NOTREACHED */
+ return 0; /* Silence compiler warning */
+}
+
STATIC regnode *
S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
I32 *flagp, U32 depth,
{
/* Handle the (?[...]) construct to do set operations */
- U8 curchar;
- UV start, end; /* End points of code point ranges */
- SV* result_string;
- char *save_end, *save_parse;
- SV* final;
- STRLEN len;
- regnode* node;
- AV* stack;
- const bool save_fold = FOLD;
+ U8 curchar; /* Current character being parsed */
+ UV start, end; /* End points of code point ranges */
+ SV* final = NULL; /* The end result inversion list */
+ SV* result_string; /* 'final' stringified */
+ AV* stack; /* stack of operators and operands not yet
+ resolved */
+ AV* fence_stack = NULL; /* A stack containing the positions in
+ 'stack' of where the undealt-with left
+ parens would be if they were actually
+ put there */
+ IV fence = 0; /* Position of where most recent undealt-
+ with left paren in stack is; -1 if none.
+ */
+ STRLEN len; /* Temporary */
+ regnode* node; /* Temporary, and final regnode returned by
+ this function */
+ const bool save_fold = FOLD; /* Temporary */
+ char *save_end, *save_parse; /* Temporaries */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
- if (LOC) {
+ if (LOC) { /* XXX could make valid in UTF-8 locales */
vFAIL("(?[...]) not valid in locale");
}
- RExC_uni_semantics = 1;
+ RExC_uni_semantics = 1; /* The use of this operator implies /u. This
+ is required so that the compile time values
+ are valid in all runtime cases */
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
* upon an unescaped ']' that isn't one ending a regclass. To do both
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
- if (PASS2) {
- Perl_ck_warner_d(aTHX_
- packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
- "The regex_sets feature is experimental" REPORT_LOCATION,
- UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
- UTF8fARG(UTF,
- RExC_end - RExC_start - (RExC_parse - RExC_precomp),
- RExC_precomp + (RExC_parse - RExC_precomp)));
- }
- else {
+ if (SIZE_ONLY) {
UV depth = 0; /* how many nested (?[...]) constructs */
while (RExC_parse < RExC_end) {
TRUE, /* strict */
¤t
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
FAIL("Syntax error in (?[...])");
}
- /* Pass 2 only after this. Everything in this construct is a
- * metacharacter. Operands begin with either a '\' (for an escape
- * sequence), or a '[' for a bracketed character class. Any other
- * character should be an operator, or parenthesis for grouping. Both
- * types of operands are handled by calling regclass() to parse them. It
- * is called with a parameter to indicate to return the computed inversion
- * list. The parsing here is implemented via a stack. Each entry on the
- * stack is a single character representing one of the operators, or the
- * '('; or else a pointer to an operand inversion list. */
+ /* Pass 2 only after this. */
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+ "The regex_sets feature is experimental" REPORT_LOCATION,
+ UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+ UTF8fARG(UTF,
+ RExC_end - RExC_start - (RExC_parse - RExC_precomp),
+ RExC_precomp + (RExC_parse - RExC_precomp)));
+
+ /* Everything in this construct is a metacharacter. Operands begin with
+ * either a '\' (for an escape sequence), or a '[' for a bracketed
+ * character class. Any other character should be an operator, or
+ * parenthesis for grouping. Both types of operands are handled by calling
+ * regclass() to parse them. It is called with a parameter to indicate to
+ * return the computed inversion list. The parsing here is implemented via
+ * a stack. Each entry on the stack is a single character representing one
+ * of the operators; or else a pointer to an operand inversion list. */
#define IS_OPERAND(a) (! SvIOK(a))
- /* The stack starts empty. It is a syntax error if the first thing parsed
- * is a binary operator; everything else is pushed on the stack. When an
- * operand is parsed, the top of the stack is examined. If it is a binary
- * operator, the item before it should be an operand, and both are replaced
- * by the result of doing that operation on the new operand and the one on
- * the stack. Thus a sequence of binary operands is reduced to a single
- * one before the next one is parsed.
+ /* The stack is kept in Łukasiewicz order. (That's pronounced similar
+ * to luke-a-shave-itch (or -itz), but people who didn't want to bother
+ * with prounouncing it called it Reverse Polish instead, but now that YOU
+ * know how to prounounce it you can use the correct term, thus giving due
+ * credit to the person who invented it, and impressing your geek friends.
+ * Wikipedia says that the pronounciation of "Ł" has been changing so that
+ * it is now more like an English initial W (as in wonk) than an L.)
+ *
+ * This means that, for example, 'a | b & c' is stored on the stack as
+ *
+ * c [4]
+ * b [3]
+ * & [2]
+ * a [1]
+ * | [0]
+ *
+ * where the numbers in brackets give the stack [array] element number.
+ * In this implementation, parentheses are not stored on the stack.
+ * Instead a '(' creates a "fence" so that the part of the stack below the
+ * fence is invisible except to the corresponding ')' (this allows us to
+ * replace testing for parens, by using instead subtraction of the fence
+ * position). As new operands are processed they are pushed onto the stack
+ * (except as noted in the next paragraph). New operators of higher
+ * precedence than the current final one are inserted on the stack before
+ * the lhs operand (so that when the rhs is pushed next, everything will be
+ * in the correct positions shown above. When an operator of equal or
+ * lower precedence is encountered in parsing, all the stacked operations
+ * of equal or higher precedence are evaluated, leaving the result as the
+ * top entry on the stack. This makes higher precedence operations
+ * evaluate before lower precedence ones, and causes operations of equal
+ * precedence to left associate.
*
- * A unary operator may immediately follow a binary in the input, for
- * example
+ * The only unary operator '!' is immediately pushed onto the stack when
+ * encountered. When an operand is encountered, if the top of the stack is
+ * a '!", the complement is immediately performed, and the '!' popped. The
+ * resulting value is treated as a new operand, and the logic in the
+ * previous paragraph is executed. Thus in the expression
* [a] + ! [b]
- * When an operand is parsed and the top of the stack is a unary operator,
- * the operation is performed, and then the stack is rechecked to see if
- * this new operand is part of a binary operation; if so, it is handled as
- * above.
+ * the stack looks like
+ *
+ * !
+ * a
+ * +
+ *
+ * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
+ * becomes
+ *
+ * !b
+ * a
+ * +
+ *
+ * A ')' is treated as an operator with lower precedence than all the
+ * aforementioned ones, which causes all operations on the stack above the
+ * corresponding '(' to be evaluated down to a single resultant operand.
+ * Then the fence for the '(' is removed, and the operand goes through the
+ * algorithm above, without the fence.
*
- * A '(' is simply pushed on the stack; it is valid only if the stack is
- * empty, or the top element of the stack is an operator or another '('
- * (for which the parenthesized expression will become an operand). By the
- * time the corresponding ')' is parsed everything in between should have
- * been parsed and evaluated to a single operand (or else is a syntax
- * error), and is handled as a regular operand */
+ * A separate stack is kept of the fence positions, so that the position of
+ * the latest so-far unbalanced '(' is at the top of it.
+ *
+ * The ']' ending the construct is treated as the lowest operator of all,
+ * so that everything gets evaluated down to a single operand, which is the
+ * result */
sv_2mortal((SV *)(stack = newAV()));
+ sv_2mortal((SV *)(fence_stack = newAV()));
while (RExC_parse < RExC_end) {
- I32 top_index = av_tindex(stack);
- SV** top_ptr;
- SV* current = NULL;
+ I32 top_index; /* Index of top-most element in 'stack' */
+ SV** top_ptr; /* Pointer to top 'stack' element */
+ SV* current = NULL; /* To contain the current inversion list
+ operand */
+ SV* only_to_avoid_leaks;
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE /* means recognize comments */ );
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
- if ((curchar = UCHARAT(RExC_parse)) == ']') {
- break;
- }
+
+ curchar = UCHARAT(RExC_parse);
+
+redo_curchar:
+
+ top_index = av_tindex(stack);
switch (curchar) {
+ SV** stacked_ptr; /* Ptr to something already on 'stack' */
+ char stacked_operator; /* The topmost operator on the 'stack'. */
+ SV* lhs; /* Operand to the left of the operator */
+ SV* rhs; /* Operand to the right of the operator */
+ SV* fence_ptr; /* Pointer to top element of the fence
+ stack */
+
+ case '(':
- case '?':
- if (av_tindex(stack) >= 0 /* This makes sure that we can
- safely subtract 1 from
- RExC_parse in the next clause.
- If we have something on the
- stack, we have parsed something
- */
- && UCHARAT(RExC_parse - 1) == '('
- && RExC_parse < RExC_end)
+ if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
{
/* If is a '(?', could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
* interpolated expression evaluates to. We use the flags
* from the interpolated pattern. */
U32 save_flags = RExC_flags;
- const char * const save_parse = ++RExC_parse;
+ const char * save_parse;
+
+ RExC_parse += 2; /* Skip past the '(?' */
+ save_parse = RExC_parse;
+ /* Parse any flags for the '(?' */
parse_lparen_question_flags(pRExC_state);
if (RExC_parse == save_parse /* Makes sure there was at
- least one flag (or this
- embedding wasn't compiled)
- */
+ least one flag (or else
+ this embedding wasn't
+ compiled) */
|| RExC_parse >= RExC_end - 4
|| UCHARAT(RExC_parse) != ':'
|| UCHARAT(++RExC_parse) != '('
}
vFAIL("Expecting '(?flags:(?[...'");
}
+
+ /* Recurse, with the meat of the embedded expression */
RExC_parse++;
(void) handle_regex_sets(pRExC_state, ¤t, flagp,
depth+1, oregcomp_parse);
/* Here, 'current' contains the embedded expression's
* inversion list, and RExC_parse points to the trailing
- * ']'; the next character should be the ')' which will be
- * paired with the '(' that has been put on the stack, so
- * the whole embedded expression reduces to '(operand)' */
+ * ']'; the next character should be the ')' */
+ RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+
+ /* Then the ')' matching the original '(' handled by this
+ * case: statement */
RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
- /* FALLTHROUGH */
- default:
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unexpected character");
+ /* A regular '('. Look behind for illegal syntax */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had
+ * better be a '!', otherwise the entry below the top
+ * operand should be an operator */
+ if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
+ || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
+ || top_index - fence < 1
+ || ! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || IS_OPERAND(*stacked_ptr))
+ {
+ RExC_parse++;
+ vFAIL("Unexpected '(' with no preceding operator");
+ }
+ }
+
+ /* Stack the position of this undealt-with left paren */
+ fence = top_index + 1;
+ av_push(fence_stack, newSViv(fence));
+ break;
case '\\':
/* regclass() can only return RESTART_UTF8 if multi-char
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
- ¤t
- ))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ ¤t))
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
TRUE, /* strict */
¤t
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
goto handle_operand;
}
+ case ']':
+ if (top_index >= 1) {
+ goto join_operators;
+ }
+
+ /* Only a single operand on the stack: are done */
+ goto done;
+
+ case ')':
+ if (av_tindex(fence_stack) < 0) {
+ RExC_parse++;
+ vFAIL("Unexpected ')'");
+ }
+
+ /* If at least two thing on the stack, treat this as an
+ * operator */
+ if (top_index - fence >= 1) {
+ goto join_operators;
+ }
+
+ /* Here only a single thing on the fenced stack, and there is a
+ * fence. Get rid of it */
+ fence_ptr = av_pop(fence_stack);
+ assert(fence_ptr);
+ fence = SvIV(fence_ptr) - 1;
+ SvREFCNT_dec_NN(fence_ptr);
+ fence_ptr = NULL;
+
+ if (fence < 0) {
+ fence = 0;
+ }
+
+ /* Having gotten rid of the fence, we pop the operand at the
+ * stack top and process it as a newly encountered operand */
+ current = av_pop(stack);
+ assert(IS_OPERAND(current));
+ goto handle_operand;
+
case '&':
case '|':
case '+':
case '-':
case '^':
- if (top_index < 0
+
+ /* These binary operators should have a left operand already
+ * parsed */
+ if ( top_index - fence < 0
+ || top_index - fence == 1
|| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
|| ! IS_OPERAND(*top_ptr))
{
- RExC_parse++;
- vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+ goto unexpected_binary;
}
- av_push(stack, newSVuv(curchar));
- break;
- case '!':
- av_push(stack, newSVuv(curchar));
- break;
+ /* If only the one operand is on the part of the stack visible
+ * to us, we just place this operator in the proper position */
+ if (top_index - fence < 2) {
- case '(':
- if (top_index >= 0) {
- top_ptr = av_fetch(stack, top_index, FALSE);
- assert(top_ptr);
- if (IS_OPERAND(*top_ptr)) {
- RExC_parse++;
- vFAIL("Unexpected '(' with no preceding operator");
- }
+ /* Place the operator before the operand */
+
+ SV* lhs = av_pop(stack);
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
}
- av_push(stack, newSVuv(curchar));
- break;
- case ')':
- {
- SV* lparen;
- if (top_index < 1
- || ! (current = av_pop(stack))
- || ! IS_OPERAND(current)
- || ! (lparen = av_pop(stack))
- || IS_OPERAND(lparen)
- || SvUV(lparen) != '(')
+ /* But if there is something else on the stack, we need to
+ * process it before this new operator if and only if the
+ * stacked operation has equal or higher precedence than the
+ * new one */
+
+ join_operators:
+
+ /* The operator on the stack is supposed to be below both its
+ * operands */
+ if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
+ || IS_OPERAND(*stacked_ptr))
{
- SvREFCNT_dec(current);
+ /* But if not, it's legal and indicates we are completely
+ * done if and only if we're currently processing a ']',
+ * which should be the final thing in the expression */
+ if (curchar == ']') {
+ goto done;
+ }
+
+ unexpected_binary:
RExC_parse++;
- vFAIL("Unexpected ')'");
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
}
- top_index -= 2;
- SvREFCNT_dec_NN(lparen);
+ stacked_operator = (char) SvUV(*stacked_ptr);
- /* FALLTHROUGH */
- }
+ if (regex_set_precedence(curchar)
+ > regex_set_precedence(stacked_operator))
+ {
+ /* Here, the new operator has higher precedence than the
+ * stacked one. This means we need to add the new one to
+ * the stack to await its rhs operand (and maybe more
+ * stuff). We put it before the lhs operand, leaving
+ * untouched the stacked operator and everything below it
+ * */
+ lhs = av_pop(stack);
+ assert(IS_OPERAND(lhs));
+
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
+ }
- handle_operand:
+ /* Here, the new operator has equal or lower precedence than
+ * what's already there. This means the operation already
+ * there should be performed now, before the new one. */
+ rhs = av_pop(stack);
+ lhs = av_pop(stack);
- /* Here, we have an operand to process, in 'current' */
+ assert(IS_OPERAND(rhs));
+ assert(IS_OPERAND(lhs));
- if (top_index < 0) { /* Just push if stack is empty */
- av_push(stack, current);
- }
- else {
- SV* top = av_pop(stack);
- SV *prev = NULL;
- char current_operator;
-
- if (IS_OPERAND(top)) {
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec_NN(current);
- vFAIL("Operand with no preceding operator");
+ switch (stacked_operator) {
+ case '&':
+ _invlist_intersection(lhs, rhs, &rhs);
+ break;
+
+ case '|':
+ case '+':
+ _invlist_union(lhs, rhs, &rhs);
+ break;
+
+ case '-':
+ _invlist_subtract(lhs, rhs, &rhs);
+ break;
+
+ case '^': /* The union minus the intersection */
+ {
+ SV* i = NULL;
+ SV* u = NULL;
+ SV* element;
+
+ _invlist_union(lhs, rhs, &u);
+ _invlist_intersection(lhs, rhs, &i);
+ /* _invlist_subtract will overwrite rhs
+ without freeing what it already contains */
+ element = rhs;
+ _invlist_subtract(u, i, &rhs);
+ SvREFCNT_dec_NN(i);
+ SvREFCNT_dec_NN(u);
+ SvREFCNT_dec_NN(element);
+ break;
}
- current_operator = (char) SvUV(top);
- switch (current_operator) {
- case '(': /* Push the '(' back on followed by the new
- operand */
- av_push(stack, top);
- av_push(stack, current);
- SvREFCNT_inc(top); /* Counters the '_dec' done
- just after the 'break', so
- it doesn't get wrongly freed
- */
- break;
+ }
+ SvREFCNT_dec(lhs);
+
+ /* Here, the higher precedence operation has been done, and the
+ * result is in 'rhs'. We overwrite the stacked operator with
+ * the result. Then we redo this code to either push the new
+ * operator onto the stack or perform any higher precedence
+ * stacked operation */
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ av_push(stack, rhs);
+ goto redo_curchar;
+
+ case '!': /* Highest priority, right associative, so just push
+ onto stack */
+ av_push(stack, newSVuv(curchar));
+ break;
- case '!':
- _invlist_invert(current);
-
- /* Unlike binary operators, the top of the stack,
- * now that this unary one has been popped off, may
- * legally be an operator, and we now have operand
- * for it. */
- top_index--;
- SvREFCNT_dec_NN(top);
- goto handle_operand;
-
- case '&':
- prev = av_pop(stack);
- _invlist_intersection(prev,
- current,
- ¤t);
- av_push(stack, current);
- break;
+ default:
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unexpected character");
- case '|':
- case '+':
- prev = av_pop(stack);
- _invlist_union(prev, current, ¤t);
- av_push(stack, current);
- break;
+ handle_operand:
+
+ /* Here 'current' is the operand. If something is already on the
+ * stack, we have to check if it is a !. */
+ top_index = av_tindex(stack); /* Code above may have altered the
+ * stack in the time since we
+ * earlier set 'top_index'. */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had better
+ * be a '!', otherwise the entry below the top operand should
+ * be an operator */
+ top_ptr = av_fetch(stack, top_index, FALSE);
+ assert(top_ptr);
+ if (! IS_OPERAND(*top_ptr)) {
+
+ /* The only permissible operator at the top of the stack is
+ * '!', which is applied immediately to this operand. */
+ curchar = (char) SvUV(*top_ptr);
+ if (curchar != '!') {
+ SvREFCNT_dec(current);
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
+ }
- case '-':
- prev = av_pop(stack);;
- _invlist_subtract(prev, current, ¤t);
- av_push(stack, current);
- break;
+ _invlist_invert(current);
- case '^': /* The union minus the intersection */
- {
- SV* i = NULL;
- SV* u = NULL;
- SV* element;
-
- prev = av_pop(stack);
- _invlist_union(prev, current, &u);
- _invlist_intersection(prev, current, &i);
- /* _invlist_subtract will overwrite current
- without freeing what it already contains */
- element = current;
- _invlist_subtract(u, i, ¤t);
- av_push(stack, current);
- SvREFCNT_dec_NN(i);
- SvREFCNT_dec_NN(u);
- SvREFCNT_dec_NN(element);
- break;
- }
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ top_index = av_tindex(stack);
- default:
- Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+ /* And we redo with the inverted operand. This allows
+ * handling multiple ! in a row */
+ goto handle_operand;
+ }
+ /* Single operand is ok only for the non-binary ')'
+ * operator */
+ else if ((top_index - fence == 0 && curchar != ')')
+ || (top_index - fence > 0
+ && (! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || IS_OPERAND(*stacked_ptr))))
+ {
+ SvREFCNT_dec(current);
+ vFAIL("Operand with no preceding operator");
}
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec(prev);
}
- }
+
+ /* Here there was nothing on the stack or the top element was
+ * another operand. Just add this new one */
+ av_push(stack, current);
+
+ } /* End of switch on next parse token */
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ } /* End of loop parsing through the construct */
+
+ done:
+ if (av_tindex(fence_stack) >= 0) {
+ vFAIL("Unmatched (");
}
if (av_tindex(stack) < 0 /* Was empty */
|| ! IS_OPERAND(final)
|| av_tindex(stack) >= 0) /* More left on stack */
{
+ SvREFCNT_dec(final);
vFAIL("Incomplete expression within '(?[ ])'");
}
}
}
+ /* About to generate an ANYOF (or similar) node from the inversion list we
+ * have calculated */
save_parse = RExC_parse;
RExC_parse = SvPV(result_string, len);
save_end = RExC_end;
const char *s = RExC_parse;
const char c = *s++;
+ if (*s == '^') {
+ s++;
+ }
while (isWORDCHAR(*s))
s++;
if (*s && c == *s && s[1] == ']') {
case 'H': namedclass = ANYOF_NHORIZWS; break;
case 'N': /* Handle \N{NAME} in class */
{
- SV *as_text;
- STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
- flagp, depth, &as_text);
- if (*flagp & RESTART_UTF8)
- FAIL("panic: grok_bslash_N set RESTART_UTF8");
- if (cp_count != 1) { /* The typical case drops through */
- assert(cp_count != (STRLEN) -1);
- if (cp_count == 0) {
+ const char * const backslash_N_beg = RExC_parse - 2;
+ int cp_count;
+
+ if (! grok_bslash_N(pRExC_state,
+ NULL, /* No regnode */
+ &value, /* Yes single value */
+ &cp_count, /* Multiple code pt count */
+ flagp,
+ depth)
+ ) {
+
+ if (*flagp & RESTART_UTF8)
+ FAIL("panic: grok_bslash_N set RESTART_UTF8");
+
+ if (cp_count < 0) {
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ else if (cp_count == 0) {
if (strict) {
RExC_parse++; /* Position after the "}" */
vFAIL("Zero length \\N{}");
else if (PASS2) {
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
}
+ break; /* <value> contains the first code
+ point. Drop out of the switch to
+ process it */
}
else {
+ SV * multi_char_N = newSVpvn(backslash_N_beg,
+ RExC_parse - backslash_N_beg);
multi_char_matches
= add_multi_match(multi_char_matches,
- as_text,
+ multi_char_N,
cp_count);
}
- break; /* <value> contains the first code
- point. Drop out of the switch to
- process it */
}
} /* End of cp_count != 1 */
vFAIL2utf8f(
"Invalid [] range \"%"UTF8f"\"",
UTF8fARG(UTF, w, rangebegin));
- NOT_REACHED; /* NOT REACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
else {
* same element, neither should be a digit. */
if (index_start == index_final) {
assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
- || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
- - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
- == 10);
+ || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 10)
+ /* But actually Unicode did have one group of 11
+ * 'digits' in 5.2, so in case we are operating
+ * on that version, let that pass */
+ || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 11
+ && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 0x19D0)
+ );
}
else if ((index_start >= 0
&& ELEMENT_RANGE_MATCHES_INVLIST(index_start))
}
if (ret_invlist) {
+ assert(cp_list);
+
*ret_invlist = cp_list;
SvREFCNT_dec(swash);
|| _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
|| _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
|| _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
- || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
- || _CC_VERTSPACE != 16
+ || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
#error Need to adjust order of anyofs[]
#endif
"\\w",
"[:^blank:]",
"[:xdigit:]",
"[:^xdigit:]",
- "[:space:]",
- "[:^space:]",
"[:cntrl:]",
"[:^cntrl:]",
"[:ascii:]",
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
}
+ else if (k == BOUND || k == NBOUND) {
+ /* Must be synced with order of 'bound_type' in regcomp.h */
+ const char * const bounds[] = {
+ "", /* Traditional */
+ "{gcb}",
+ "{sb}",
+ "{wb}"
+ };
+ sv_catpv(sv, bounds[FLAGS(o)]);
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)
#endif /* DEBUGGING */
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/