scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
- U32 strict;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
bool seen_unfolded_sharp_s;
+ bool strict;
};
#define RExC_flags (pRExC_state->flags)
}
/* If this can match all upper Latin1 code points, have to add them
- * as well */
- if (OP(node) == ANYOFD
+ * as well. But don't add them if inverting, as when that gets done below,
+ * it would exclude all these characters, including the ones it shouldn't
+ * that were added just above */
+ if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
&& (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
* (unassigned, private use, surrogates, controls and formats). This
* is a much large number. */
- const U32 max_match = (LOC)
- ? 127
- : (! UNI_SEMANTICS)
- ? 63
- : (invlist_highest(ssc->invlist) < 256)
- ? 127
- : ((NON_OTHER_COUNT + 1) / 2) - 1;
U32 count = 0; /* Running total of number of code points matched by
'ssc' */
UV start, end; /* Start and end points of current range in inversion
list */
+ const U32 max_code_points = (LOC)
+ ? 256
+ : (( ! UNI_SEMANTICS
+ || invlist_highest(ssc->invlist) < 256)
+ ? 128
+ : NON_OTHER_COUNT);
+ const U32 max_match = max_code_points / 2;
PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
invlist_iterinit(ssc->invlist);
while (invlist_iternext(ssc->invlist, &start, &end)) {
-
- /* /u is the only thing that we expect to match above 255; so if not /u
- * and even if there are matches above 255, ignore them. This catches
- * things like \d under /d which does match the digits above 255, but
- * since the pattern is /d, it is not likely to be expecting them */
- if (! UNI_SEMANTICS) {
- if (start > 255) {
- break;
- }
- end = MIN(end, 255);
+ if (start >= max_code_points) {
+ break;
}
+ end = MIN(end, max_code_points - 1);
count += end - start + 1;
- if (count > max_match) {
+ if (count >= max_match) {
invlist_iterfinish(ssc->invlist);
return FALSE;
}
/* Initialize these here instead of as-needed, as is quick and avoids
* having to test them each time otherwise */
if (! PL_AboveLatin1) {
+#ifdef DEBUGGING
+ char * dump_len_string;
+#endif
+
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
PL_InBitmap = _new_invlist(2);
PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
NUM_ANYOF_CODE_POINTS - 1);
+#ifdef DEBUGGING
+ dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+ if ( ! dump_len_string
+ || ! grok_atoUV(dump_len_string, &PL_dump_re_max_len, NULL))
+ {
+ PL_dump_re_max_len = 0;
+ }
+#endif
}
pRExC_state->code_blocks = NULL;
SvPV_shrink_to_cur((SV *) invlist);
}
+#endif /* ifndef PERL_IN_XSUB_RE */
+
PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV* const invlist)
{
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
-#endif /* ifndef PERL_IN_XSUB_RE */
-
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
if (*i == a) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
+ if (a != NULL) {
+ if (! (make_temp = cBOOL(SvTEMP(a)))) {
+ SvREFCNT_dec_NN(a);
+ }
}
}
else if (*i == b) {
: array[len - 1] - 1;
}
-#ifndef PERL_IN_XSUB_RE
SV *
-Perl__invlist_contents(pTHX_ SV* const invlist)
+S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
/* Get the contents of an inversion list into a string SV so that they can
- * be printed out. It uses the format traditionally done for debug tracing
- */
+ * be printed out. If 'traditional_style' is TRUE, it uses the format
+ * traditionally done for debug tracing; otherwise it uses a format
+ * suitable for just copying to the output, with blanks between ranges and
+ * a dash between range components */
UV start, end;
- SV* output = newSVpvs("\n");
+ SV* output;
+ const char intra_range_delimiter = (traditional_style ? '\t' : '-');
+ const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
- PERL_ARGS_ASSERT__INVLIST_CONTENTS;
+ if (traditional_style) {
+ output = newSVpvs("\n");
+ }
+ else {
+ output = newSVpvs("");
+ }
+
+ PERL_ARGS_ASSERT_INVLIST_CONTENTS;
assert(! invlist_is_iterating(invlist));
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
+ start, intra_range_delimiter,
+ inter_range_delimiter);
}
else if (end != start) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
- start, end);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
+ start,
+ intra_range_delimiter,
+ end, inter_range_delimiter);
}
else {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
+ start, inter_range_delimiter);
}
}
+ if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
+ SvCUR_set(output, SvCUR(output) - 1);
+ }
+
return output;
}
-#endif
#ifndef PERL_IN_XSUB_RE
void
UV * code_point_p,
int * cp_count,
I32 * flagp,
+ const bool strict,
const U32 depth
)
{
semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
+ if (strict) {
+ RExC_parse++; /* Position after the "}" */
+ vFAIL("Zero length \\N{}");
+ }
if (cp_count) {
*cp_count = 0;
}
NULL, /* Don't need a count of how many code
points */
flagp,
+ RExC_strict,
depth)
) {
break;
NULL, /* Don't need a count of
how many code points */
flagp,
+ RExC_strict,
depth)
) {
if (*flagp & NEED_UTF8)
&value, /* Yes single value */
&cp_count, /* Multiple code pt count */
flagp,
+ strict,
depth)
) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
}
else if (cp_count == 0) {
- if (strict) {
- RExC_parse++; /* Position after the "}" */
- vFAIL("Zero length \\N{}");
- }
- else if (PASS2) {
+ if (PASS2) {
ckWARNreg(RExC_parse,
"Ignoring zero length \\N{} in character class");
}
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (swash) { /* Return any left-overs */
- SvREFCNT_dec_NN(swash);
- }
+ SvREFCNT_dec(swash); /* Free any left-overs */
swash = _core_swash_init("utf8", name, &PL_sv_undef,
1, /* binary */
0, /* not tr/// */
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
rangebegin,
- cntrl_to_mnemonic((char) value)
+ cntrl_to_mnemonic((U8) value)
);
}
}
* fetching). We know to set the flag if we have a non-NULL list for UTF-8
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
+
+ /* Some things on the list might be unconditionally included because of
+ * other components. Remove them, and clean up the list if it goes to
+ * 0 elements */
+ if (only_utf8_locale_list && cp_list) {
+ _invlist_subtract(only_utf8_locale_list, cp_list,
+ &only_utf8_locale_list);
+
+ if (_invlist_len(only_utf8_locale_list) == 0) {
+ SvREFCNT_dec_NN(only_utf8_locale_list);
+ only_utf8_locale_list = NULL;
+ }
+ }
if (only_utf8_locale_list) {
ANYOF_FLAGS(ret)
|= ANYOFL_FOLD
* swash exists, by calling this function with 'doinit' set to false, in
* which case the components that will be used to eventually create the
* swash are returned (in a printable form).
+ * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
+ * store an inversion list of code points that should match only if the
+ * execution-time locale is a UTF-8 one.
* If <exclude_list> is not NULL, it is an inversion list of things to
* exclude from what's returned in <listsvp>.
+ *
* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
* that, in spite of this function's name, the swash it returns may include
* the bitmap data as well */
if (exclude_list) {
SV* clone = invlist_clone(invlist);
_invlist_subtract(clone, exclude_list, &clone);
- sv_catsv(matches_string, _invlist_contents(clone));
+ sv_catsv(matches_string, invlist_contents(clone, TRUE));
SvREFCNT_dec_NN(clone);
}
else {
- sv_catsv(matches_string, _invlist_contents(invlist));
+ sv_catsv(matches_string, invlist_contents(invlist, TRUE));
}
}
*listsvp = matches_string;
#endif /* DEBUGGING */
}
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
+ || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
+ || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
+ || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
+ || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
+ || _CC_VERTSPACE != 15
+# error Need to adjust order of anyofs[]
+# endif
+static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:cased:]",
+ "[:^cased:]",
+ "\\s",
+ "\\S",
+ "[:blank:]",
+ "[:^blank:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "\\v",
+ "\\V"
+};
+#endif
+
/*
- regprop - printable representation of opcode, with run time support
*/
{
#ifdef DEBUGGING
int k;
-
- /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
- static const char * const anyofs[] = {
-#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
- || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
- || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
- || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
- || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
- #error Need to adjust order of anyofs[]
-#endif
- "\\w",
- "\\W",
- "\\d",
- "\\D",
- "[:alpha:]",
- "[:^alpha:]",
- "[:lower:]",
- "[:^lower:]",
- "[:upper:]",
- "[:^upper:]",
- "[:punct:]",
- "[:^punct:]",
- "[:print:]",
- "[:^print:]",
- "[:alnum:]",
- "[:^alnum:]",
- "[:graph:]",
- "[:^graph:]",
- "[:cased:]",
- "[:^cased:]",
- "\\s",
- "\\S",
- "[:blank:]",
- "[:^blank:]",
- "[:xdigit:]",
- "[:^xdigit:]",
- "[:cntrl:]",
- "[:^cntrl:]",
- "[:ascii:]",
- "[:^ascii:]",
- "\\v",
- "\\V"
- };
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
(void) put_charclass_bitmap_innards(sv,
- (IS_ANYOF_TRIE(op))
+ ((IS_ANYOF_TRIE(op))
? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie),
+ : TRIE_BITMAP(trie)),
NULL);
sv_catpvs(sv, "]");
}
int do_sep = 0;
SV* bitmap_invlist = NULL; /* Will hold what the bit map contains */
-
if (OP(o) == ANYOFL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
- sv_catpvs(sv, "{utf8-loc}");
+ sv_catpvs(sv, "{utf8-locale-reqd}");
}
- else {
- sv_catpvs(sv, "{loc}");
+ if (flags & ANYOFL_FOLD) {
+ sv_catpvs(sv, "{i}");
}
}
- if (flags & ANYOFL_FOLD)
- sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
- /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
- * */
+ /* Output what the bitmap matches, and get what that is into
+ * 'bitmap_invlist' */
do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
&bitmap_invlist);
- /* output any special charclass tests (used entirely under use
- * locale) * */
+ /* Output any special charclass tests (used entirely under 'use
+ * locale'). */
if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
int i;
for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
- if (flags & ANYOF_INVERT)
- /*make sure the invert info is in each */
+ if (flags & ANYOF_INVERT) /*make sure the invert info is in each */
sv_catpvs(sv, "^");
}
bitmap_invlist);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
- char * const origs = s;
+ const char * const orig_s = s; /* Save the beginning of
+ 's', so can be freed */
+ const STRLEN dump_len = (PL_dump_re_max_len)
+ ? PL_dump_re_max_len
+ : 256;
+ /* Ignore anything before the first \n */
while (*s && *s != '\n')
s++;
+ /* The data are one range per line. A range is a single
+ * entity; or two, separated by \t. So can just convert \n
+ * to space and \t to '-' */
if (*s == '\n') {
const char * const t = ++s;
if (*s == '\n') {
/* Truncate very long output */
- if (s - origs > 256) {
+ if ((UV) (s - t) > dump_len) {
Perl_sv_catpvf(aTHX_ sv,
"%.*s...",
- (int) (s - origs - 1),
+ (int) (s - t),
t);
goto out_dump;
}
}
s++;
}
+
+ /* Here, it fits in the allocated space. Replace a
+ * final blank with a NUL */
if (s[-1] == ' ')
- s[-1] = 0;
+ s[-1] = '\0';
sv_catpv(sv, t);
}
out_dump:
- Safefree(origs);
+ Safefree(orig_s);
SvREFCNT_dec_NN(lv);
}
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
+ else if (isMNEMONIC_CNTRL(c)) {
+ Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+ }
else {
- const char * const mnemonic = cntrl_to_mnemonic((char) c);
- if (mnemonic) {
- Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
- }
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
}
}
S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end'. It assumes that only ASCII printables are displayable
- * as-is (though some of these will be escaped by put_code_point()). */
+ * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
+ * that have them, when they occur at the beginning or end of the range.
+ * It uses hex to output the remaining code points, unless 'allow_literals'
+ * is true, in which case the printable ASCII ones are output as-is (though
+ * some of these will be escaped by put_code_point()).
+ *
+ * NOTE: This is designed only for printing ranges of code points that fit
+ * inside an ANYOF bitmap. Higher code points are simply suppressed
+ */
const unsigned int min_range_count = 3;
if (end - start < min_range_count) {
- /* Individual chars in short ranges */
+ /* Output chars individually when they occur in short ranges */
for (; start <= end; start++) {
put_code_point(sv, start);
}
/* If permitted by the input options, and there is a possibility that
* this range contains a printable literal, look to see if there is
- * one. */
+ * one. */
if (allow_literals && start <= MAX_PRINT_A) {
- /* If the range begin isn't an ASCII printable, effectively split
- * the range into two parts:
+ /* If the character at the beginning of the range isn't an ASCII
+ * printable, effectively split the range into two parts:
* 1) the portion before the first such printable,
* 2) the rest
* and output them separately. */
temp_end = end + 1;
}
- /* Output the first part of the split range, the part that
- * doesn't have printables, with no looking for literals
- * (otherwise we would infinitely recurse) */
+ /* Output the first part of the split range: the part that
+ * doesn't have printables, with the parameter set to not look
+ * for literals (otherwise we would infinitely recurse) */
put_range(sv, start, temp_end - 1, FALSE);
/* The 2nd part of the range (if any) starts here. */
start = temp_end;
- /* We continue instead of dropping down because even if the 2nd
- * part is non-empty, it could be so short that we want to
- * output it specially, as tested for at the top of this loop.
- * */
+ /* We do a continue, instead of dropping down, because even if
+ * the 2nd part is non-empty, it could be so short that we want
+ * to output it as individual characters, as tested for at the
+ * top of this loop. */
continue;
}
temp_end--;
}
- /* And separately output the range that doesn't have mnemonics */
+ /* And separately output the interior range that doesn't start or
+ * end with mnemonics */
put_range(sv, start, temp_end, FALSE);
/* Then output the mnemonic trailing controls */
: NUM_ANYOF_CODE_POINTS - 1;
#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
- ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
+ ? "\\x%02"UVXf"-\\x%02"UVXf""
: "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
#else
- format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+ format = "\\x%02"UVXf"-\\x%02"UVXf"";
#endif
GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
* output anything, and bitmap_invlist, if not NULL, will point to an
- * inversion list of what is in the bit map */
+ * inversion list of what is in the bit map. It must be freed by the
+ * caller. */
int i;
UV start, end;
/* Generally, it is more readable if printable characters are output as
* literals, but if a range (nearly) spans all of them, it's best to output
* it as a single range. This code will use a single range if all but 2
- * printables are in it */
+ * ASCII printables are in it */
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
- /* If range starts beyond final printable, it doesn't have any in it */
+ /* If the range starts beyond the final printable, it doesn't have any
+ * in it */
if (start > MAX_PRINT_A) {
break;
}