} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
+#define vFAIL2utf8f(m, a1) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
REPORT_LOCATION_ARGS(offset)); \
} STMT_END
+#define vFAIL3utf8f(m, a1, a2) STMT_START { \
+ const IV offset = RExC_parse - RExC_precomp; \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
/* These have asserts in them because of [perl #122671] Many warnings in
* regcomp.c can occur twice. If they get output in pass1 and later in that
* pass, the pattern has to be converted to UTF-8 and the pass restarted, they
RExC_pm_flags = pm_flags;
if (runtime_code) {
- if (TAINTING_get && TAINT_get)
+ assert(TAINTING_get || !TAINT_get);
+ if (TAINT_get)
Perl_croak(aTHX_ "Eval-group in insecure regular expression");
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
++RExC_parse;
}
- if (PASS2) {
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
- }
+ vFAIL("Sequence (?... not terminated");
}
/*
* enough space for all the things we are about to throw
* away, but we can shrink it by the ammount we are about
* to re-use here */
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+ RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
}
else {
ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
"Useless use of greediness modifier '%c'",
*RExC_parse);
}
- /* Absorb the modifier, so later code doesn't see nor use it */
- nextchar(pRExC_state);
}
do_curly:
char * endbrace; /* points to '}' following the name */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- char* p; /* Temporary */
+ char* p = RExC_parse; /* Temporary */
GET_RE_DEBUG_FLAGS_DECL;
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meanings do not, so use a temporary until we find
* out which we are being called with */
- p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regpatws(pRExC_state, RExC_parse,
- TRUE) /* means recognize comments */
- : RExC_parse;
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The latter is assumed when the {...} following the \N is a legal
if (! node_p) {
return FALSE;
}
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't force to /x */ );
+
*node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
MARK_NAUGHTY(1);
* it returns U+FFFD (Replacement character) and sets *encp to NULL.
*/
STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
{
STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+ SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
const STRLEN newlen = SvCUR(sv);
UV uv = UNICODE_REPLACEMENT;
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- ender = reg_recode((const char)(U8)ender, &enc);
+ ender = reg_recode((U8)ender, &enc);
if (!enc && PASS2)
ckWARNreg(p, "Invalid escape in the specified encoding");
REQUIRE_UTF8(flagp);
* quantifier. Move <p> to after anything that should be
* ignored, which, as a side effect, positions <p> for the next
* loop iteration */
- if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regpatws(pRExC_state, p,
- TRUE); /* means recognize comments */
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
return(ret);
}
-STATIC char *
-S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
-{
- /* Returns the next non-pattern-white space, non-comment character (the
- * latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. See also reg_skipcomment */
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGPATWS;
-
- while (p < e) {
- STRLEN len;
- if ((len = is_PATWS_safe(p, e, UTF))) {
- p += len;
- }
- else if (recognize_comment && *p == '#') {
- p = reg_skipcomment(pRExC_state, p);
- }
- else
- break;
- }
- return p;
-}
STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
while (RExC_parse < RExC_end) {
SV* current = NULL;
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
+
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
* default: case next time and keep on incrementing until
* we find one of the invariants we do handle. */
RExC_parse++;
+ if (*RExC_parse == 'c') {
+ /* Skip the \cX notation for control characters */
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ }
break;
case '[':
{
operand */
SV* only_to_avoid_leaks;
- /* Skip white space */
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE /* means recognize comments */ );
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
/* Having gotten rid of the fence, we pop the operand at the
* stack top and process it as a newly encountered operand */
current = av_pop(stack);
- assert(IS_OPERAND(current));
- goto handle_operand;
+ if (IS_OPERAND(current)) {
+ goto handle_operand;
+ }
+
+ RExC_parse++;
+ goto bad_syntax;
case '&':
case '|':
/* Here, the new operator has equal or lower precedence than
* what's already there. This means the operation already
* there should be performed now, before the new one. */
+
rhs = av_pop(stack);
+ if (! IS_OPERAND(rhs)) {
+
+ /* This can happen when a ! is not followed by an operand,
+ * like in /(?[\t &!])/ */
+ goto bad_syntax;
+ }
+
lhs = av_pop(stack);
- assert(IS_OPERAND(rhs));
- assert(IS_OPERAND(lhs));
+ if (! IS_OPERAND(lhs)) {
+
+ /* This can happen when there is an empty (), like in
+ * /(?[[0]+()+])/ */
+ goto bad_syntax;
+ }
switch (stacked_operator) {
case '&':
av_push(stack, rhs);
goto redo_curchar;
- case '!': /* Highest priority, right associative, so just push
- onto stack */
- av_push(stack, newSVuv(curchar));
+ case '!': /* Highest priority, right associative */
+
+ /* If what's already at the top of the stack is another '!",
+ * they just cancel each other out */
+ if ( (top_ptr = av_fetch(stack, top_index, FALSE))
+ && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+ {
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ }
+ else { /* Otherwise, since it's right associative, just push
+ onto the stack */
+ av_push(stack, newSVuv(curchar));
+ }
break;
default:
|| SvTYPE(final) != SVt_INVLIST
|| av_tindex(stack) >= 0) /* More left on stack */
{
+ bad_syntax:
SvREFCNT_dec(final);
vFAIL("Incomplete expression within '(?[ ])'");
}
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
(SvCUR(listsv) != initial_listsv_len)
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class. This generates the
+ * code to skip them.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro. Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+ STMT_START { \
+ if (do_skip) { \
+ while ( p < RExC_end \
+ && isBLANK_A(UCHARAT(p))) \
+ { \
+ p++; \
+ } \
+ } \
+ } STMT_END
+
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
MARK_NAUGHTY(1);
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
break;
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == ']') {
break;
* skipped, it means that that white space is wanted literally, and
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
- if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+ if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
- e = strchr(RExC_parse++, '}');
- if (!e)
+ e = strchr(RExC_parse, '}');
+ if (!e) {
+ RExC_parse++;
vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(*RExC_parse))
- RExC_parse++;
+ }
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+
+ if (UCHARAT(RExC_parse) == '^') {
+
+ /* toggle. (The rhs xor gets the single bit that
+ * differs between P and p; the other xor inverts just
+ * that bit) */
+ value ^= 'P' ^ 'p';
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+ }
+
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
+
n = e - RExC_parse;
while (isSPACE(*(RExC_parse + n - 1)))
n--;
- }
- else {
+ } /* The \p isn't immediately followed by a '{' */
+ else if (! isALPHA(*RExC_parse)) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL2("Character following \\%c must be '{' or a "
+ "single-character Unicode property name",
+ (U8) value);
+ }
+ else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
SV* invlist;
char* name;
+ char* base_name; /* name after any packages are stripped */
+ const char * const colon_colon = "::";
- if (UCHARAT(RExC_parse) == '^') {
- RExC_parse++;
- n--;
- /* toggle. (The rhs xor gets the single bit that
- * differs between P and p; the other xor inverts just
- * that bit) */
- value ^= 'P' ^ 'p';
-
- while (isSPACE(*RExC_parse)) {
- RExC_parse++;
- n--;
- }
- }
/* Try to get the definition of the property into
* <invlist>. If /i is in effect, the effective property
* will have its name be <__NAME_i>. The design is
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (swash) {
+ if (swash) { /* Return any left-overs */
SvREFCNT_dec_NN(swash);
}
swash = _core_swash_init("utf8", name, &PL_sv_undef,
HV* curpkg = (IN_PERL_COMPILETIME)
? PL_curstash
: CopSTASH(PL_curcop);
- if (swash) {
+ UV final_n = n;
+ bool has_pkg;
+
+ if (swash) { /* Got a swash but no inversion list.
+ Something is likely wrong that will
+ be sorted-out later */
SvREFCNT_dec_NN(swash);
swash = NULL;
}
- /* Here didn't find it. It could be a user-defined
- * property that will be available at run-time. If we
- * accept only compile-time properties, is an error;
- * otherwise add it to the list for run-time look up */
- if (ret_invlist) {
+ /* Here didn't find it. It could be a an error (like a
+ * typo) in specifying a Unicode property, or it could
+ * be a user-defined property that will be available at
+ * run-time. The names of these must begin with 'In'
+ * or 'Is' (after any packages are stripped off). So
+ * if not one of those, or if we accept only
+ * compile-time properties, is an error; otherwise add
+ * it to the list for run-time look up. */
+ if ((base_name = rninstr(name, name + n,
+ colon_colon, colon_colon + 2)))
+ { /* Has ::. We know this must be a user-defined
+ property */
+ base_name += 2;
+ final_n -= base_name - name;
+ has_pkg = TRUE;
+ }
+ else {
+ base_name = name;
+ has_pkg = FALSE;
+ }
+
+ if ( final_n < 3
+ || base_name[0] != 'I'
+ || (base_name[1] != 's' && base_name[1] != 'n')
+ || ret_invlist)
+ {
+ const char * const msg
+ = (has_pkg)
+ ? "Illegal user-defined property name"
+ : "Can't find Unicode property definition";
RExC_parse = e + 1;
- vFAIL2utf8f(
- "Property '%"UTF8f"' is unknown",
- UTF8fARG(UTF, n, name));
+
+ /* diag_listed_as: Can't find Unicode property definition "%s" */
+ vFAIL3utf8f("%s \"%"UTF8f"\"",
+ msg, UTF8fARG(UTF, n, name));
}
/* If the property name doesn't already have a package
* name, add the current one to it so that it can be
* referred to outside it. [perl #121777] */
- if (curpkg && ! instr(name, "::")) {
+ if (! has_pkg && curpkg) {
char* pkgname = HvNAME(curpkg);
if (strNE(pkgname, "main")) {
char* full_name = Perl_form(aTHX_
ANYOF node */
/* We don't know yet, so have to assume that the
- * property could match something in the Latin1 range,
- * hence something that isn't utf8. Note that this
- * would cause things in <depends_list> to match
+ * property could match something in the upper Latin1
+ * range, hence something that isn't utf8. Note that
+ * this would cause things in <depends_list> to match
* inappropriately, except that any \p{}, including
* this one forces Unicode semantics, which means there
* is no <depends_list> */
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- value = reg_recode((const char)(U8)value, &enc);
+ value = reg_recode((U8)value, &enc);
if (!enc) {
if (strict) {
vFAIL("Invalid escape in the specified encoding");
}
} /* end of namedclass \blah */
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
&& *RExC_parse == '-')
{
char* next_char_ptr = RExC_parse + 1;
- if (skip_white) { /* Get the next real char after the '-' */
- next_char_ptr = regpatws(pRExC_state,
- RExC_parse + 1,
- FALSE); /* means don't recognize
- comments */
- }
+
+ /* Get the next real char after the '-' */
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
}
if (use_xmod) {
- char * new_p = regpatws(pRExC_state, *p,
- TRUE); /* means recognize comments */
- if (new_p != *p) {
- *p = new_p;
+ const char * save_p = *p;
+ while ((*p) < RExC_end) {
+ STRLEN len;
+ if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
+ (*p) += len;
+ }
+ else if (*(*p) == '#') {
+ (*p) = reg_skipcomment(pRExC_state, (*p));
+ }
+ else {
+ break;
+ }
+ }
+ if (*p != save_p) {
continue;
}
}