#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
/* this is a chain of data about sub patterns we are processing that
need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
- U32 strict;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
bool seen_unfolded_sharp_s;
+ bool strict;
};
#define RExC_flags (pRExC_state->flags)
* returned list must, and will, contain every code point that is a
* possibility. */
- SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
/* Here, no compile-time swash, and there are things that won't be
* known until runtime -- we have to assume it could be anything */
+ invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
else if (ary[3] && ary[3] != &PL_sv_undef) {
}
}
+ if (! invlist) {
+ invlist = sv_2mortal(_new_invlist(0));
+ }
+
/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
* code points, and an inversion list for the others, but if there are code
* points that should match only conditionally on the target string being
}
/* If this can match all upper Latin1 code points, have to add them
- * as well */
- if (OP(node) == ANYOFD
+ * as well. But don't add them if inverting, as when that gets done below,
+ * it would exclude all these characters, including the ones it shouldn't
+ * that were added just above */
+ if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
&& (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
/* Initialize these here instead of as-needed, as is quick and avoids
* having to test them each time otherwise */
if (! PL_AboveLatin1) {
+#ifdef DEBUGGING
+ char * dump_len_string;
+#endif
+
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
PL_InBitmap = _new_invlist(2);
PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
NUM_ANYOF_CODE_POINTS - 1);
+#ifdef DEBUGGING
+ dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
+ if ( ! dump_len_string
+ || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
+ {
+ PL_dump_re_max_len = 0;
+ }
+#endif
}
pRExC_state->code_blocks = NULL;
#ifndef PERL_IN_XSUB_RE
+STATIC void
+S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
+{
+ /* Replaces the inversion list in 'src' with the one in 'dest'. It steals
+ * the list from 'src', so 'src' is made to have a NULL list. This is
+ * similar to what SvSetMagicSV() would do, if it were implemented on
+ * inversion lists, though this routine avoids a copy */
+
+ const UV src_len = _invlist_len(src);
+ const bool src_offset = *get_invlist_offset_addr(src);
+ const STRLEN src_byte_len = SvLEN(src);
+ char * array = SvPVX(src);
+
+ const int oldtainted = TAINT_get;
+
+ PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
+
+ assert(SvTYPE(src) == SVt_INVLIST);
+ assert(SvTYPE(dest) == SVt_INVLIST);
+ assert(! invlist_is_iterating(src));
+ assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
+
+ /* Make sure it ends in the right place with a NUL, as our inversion list
+ * manipulations aren't careful to keep this true, but sv_usepvn_flags()
+ * asserts it */
+ array[src_byte_len - 1] = '\0';
+
+ TAINT_NOT; /* Otherwise it breaks */
+ sv_usepvn_flags(dest,
+ (char *) array,
+ src_byte_len - 1,
+
+ /* This flag is documented to cause a copy to be avoided */
+ SV_HAS_TRAILING_NUL);
+ TAINT_set(oldtainted);
+ SvPV_set(src, 0);
+ SvLEN_set(src, 0);
+ SvCUR_set(src, 0);
+
+ /* Finish up copying over the other fields in an inversion list */
+ *get_invlist_offset_addr(dest) = src_offset;
+ invlist_set_len(dest, src_len, src_offset);
+ *get_invlist_previous_index_addr(dest) = 0;
+ invlist_iterfinish(dest);
+}
+
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV* invlist)
{
}
PERL_STATIC_INLINE void
-S_invlist_trim(SV* const invlist)
+S_invlist_trim(SV* invlist)
{
+ /* Free the not currently-being-used space in an inversion list */
+
+ /* But don't free up the space needed for 0 UV that is always at the
+ * beginning of the list, nor the trailing NUL */
+ const UV min_size = TO_INTERNAL_SIZE(1) + 1;
+
PERL_ARGS_ASSERT_INVLIST_TRIM;
assert(SvTYPE(invlist) == SVt_INVLIST);
- /* Change the length of the inversion list to how many entries it currently
- * has */
- SvPV_shrink_to_cur((SV *) invlist);
+ SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
+
+}
+
+PERL_STATIC_INLINE void
+S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
+{
+ PERL_ARGS_ASSERT_INVLIST_CLEAR;
+
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ invlist_set_len(invlist, 0, 0);
+ invlist_trim(invlist);
}
+#endif /* ifndef PERL_IN_XSUB_RE */
+
PERL_STATIC_INLINE bool
S_invlist_is_iterating(SV* const invlist)
{
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
-#endif /* ifndef PERL_IN_XSUB_RE */
-
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
/* Take the union of two inversion lists and point <output> to it. *output
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *output will be made correspondingly
- * mortal. The first list, <a>, may be NULL, in which case a copy of the
- * second list is returned. If <complement_b> is TRUE, the union is taken
- * of the complement (inversion) of <b> instead of b itself.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the union. The first list, <a>, may be NULL, in which case a copy of
+ * the second list is returned. If <complement_b> is TRUE, the union is
+ * taken of the complement (inversion) of <b> instead of b itself.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
}
}
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *output || b == *output) {
+ if (a != *output && b != *output) {
+ *output = u;
+ }
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the output's, and then free the output */
+
assert(! invlist_is_iterating(*output));
- if ((SvTEMP(*output))) {
- sv_2mortal(u);
+
+ if (! SvTEMP(*output)) {
+ SvREFCNT_dec_NN(*output);
+ *output = u;
}
else {
- SvREFCNT_dec_NN(*output);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
}
}
- *output = u;
-
return;
}
/* Take the intersection of two inversion lists and point <i> to it. *i
* SHOULD BE DEFINED upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise *i will be made correspondingly mortal.
- * The first list, <a>, may be NULL, in which case an empty list is
- * returned. If <complement_b> is TRUE, the result will be the
- * intersection of <a> and the complement (or inversion) of <b> instead of
- * <b> directly.
+ * temporary (mortal); otherwise just its contents will be modified to be
+ * the intersection. The first list, <a>, may be NULL, in which case an
+ * empty list is returned. If <complement_b> is TRUE, the result will be
+ * the intersection of <a> and the complement (or inversion) of <b> instead
+ * of <b> directly.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
if (*i == a) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
+ if (a != NULL) {
+ if (! (make_temp = cBOOL(SvTEMP(a)))) {
+ SvREFCNT_dec_NN(a);
+ }
}
}
else if (*i == b) {
}
}
- /* We may be removing a reference to one of the inputs. If so, the output
- * is made mortal if the input was. (Mortal SVs shouldn't have their ref
- * count decremented) */
- if (a == *i || b == *i) {
+ if (a != *i && b != *i) {
+ *i = r;
+ }
+ else {
+ /* Here, the output is to be the same as one of the input scalars,
+ * hence replacing it. The simple thing to do is to free the input
+ * scalar, making it instead be the output one. But experience has
+ * shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. So for that case, replace just the input's interior with
+ * the output's, and then free the output. A short-cut in this case
+ * is if the output is empty, we can just set the input to be empty */
+
assert(! invlist_is_iterating(*i));
- if (SvTEMP(*i)) {
- sv_2mortal(r);
+
+ if (! SvTEMP(*i)) {
+ SvREFCNT_dec_NN(*i);
+ *i = r;
}
else {
- SvREFCNT_dec_NN(*i);
+ if (len_r) {
+ invlist_replace_list_destroys_src(*i, r);
+ }
+ else {
+ invlist_clear(*i);
+ }
+ SvREFCNT_dec_NN(r);
}
}
- *i = r;
-
return;
}
: array[len - 1] - 1;
}
-#ifndef PERL_IN_XSUB_RE
-SV *
-Perl__invlist_contents(pTHX_ SV* const invlist)
+STATIC SV *
+S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
/* Get the contents of an inversion list into a string SV so that they can
- * be printed out. It uses the format traditionally done for debug tracing
- */
+ * be printed out. If 'traditional_style' is TRUE, it uses the format
+ * traditionally done for debug tracing; otherwise it uses a format
+ * suitable for just copying to the output, with blanks between ranges and
+ * a dash between range components */
UV start, end;
- SV* output = newSVpvs("\n");
+ SV* output;
+ const char intra_range_delimiter = (traditional_style ? '\t' : '-');
+ const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
+
+ if (traditional_style) {
+ output = newSVpvs("\n");
+ }
+ else {
+ output = newSVpvs("");
+ }
- PERL_ARGS_ASSERT__INVLIST_CONTENTS;
+ PERL_ARGS_ASSERT_INVLIST_CONTENTS;
assert(! invlist_is_iterating(invlist));
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c",
+ start, intra_range_delimiter,
+ inter_range_delimiter);
}
else if (end != start) {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
- start, end);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c",
+ start,
+ intra_range_delimiter,
+ end, inter_range_delimiter);
}
else {
- Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
+ Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c",
+ start, inter_range_delimiter);
}
}
+ if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
+ SvCUR_set(output, SvCUR(output) - 1);
+ }
+
return output;
}
-#endif
#ifndef PERL_IN_XSUB_RE
void
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL));
/* If it is a posix class, leave the parse pointer at the
* '[' to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL));
/* If it is a posix class, leave the parse pointer at the '['
* to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
bool optimizable, /* ? Allow a non-ANYOF return
node */
SV** ret_invlist, /* Return an inversion list, not a node */
- AV** posix_warnings
+ AV** return_posix_warnings
)
{
/* parse a bracketed class specification. Most of these will produce an
* like it could be such a construct is encountered, it is checked for
* being one, but not if we've already checked this area of the input.
* Only after this position is reached do we check again */
- char *dont_check_for_posix_end = RExC_parse - 1;
+ char *not_posix_region_end = RExC_parse - 1;
GET_RE_DEBUG_FLAGS_DECL;
allow_multi_folds = FALSE;
#endif
- if (posix_warnings == NULL) {
- posix_warnings = (AV **) -1;
+ if (return_posix_warnings == NULL) {
+ return_posix_warnings = (AV **) -1;
}
/* Assume we are going to generate an ANYOF node. */
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
char *class_end;
- int maybe_class = handle_possible_posix(pRExC_state, RExC_parse,
- &class_end, NULL);
+ int maybe_class = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ &class_end,
+ NULL);
if (maybe_class >= OOB_NAMEDCLASS) {
- dont_check_for_posix_end = class_end;
- if (PASS2 && posix_warnings == (AV **) -1) {
+ not_posix_region_end = class_end;
+ if (PASS2 && return_posix_warnings == (AV **) -1) {
SAVEFREESV(RExC_rx_sv);
ckWARN4reg(class_end,
"POSIX syntax [%c %c] belongs inside character classes%s",
value = UCHARAT(RExC_parse++);
if (value == '[') {
- namedclass = handle_possible_posix(pRExC_state, RExC_parse, &dont_check_for_posix_end, posix_warnings);
+ namedclass = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ ¬_posix_region_end,
+ return_posix_warnings);
if (namedclass > OOB_NAMEDCLASS) {
- RExC_parse = dont_check_for_posix_end;
+ RExC_parse = not_posix_region_end;
}
else {
namedclass = OOB_NAMEDCLASS;
}
}
- else if ( RExC_parse - 1 > dont_check_for_posix_end
+ else if ( RExC_parse - 1 > not_posix_region_end
&& MAYBE_POSIXCC(value))
{
- (void) handle_possible_posix(pRExC_state, RExC_parse - 1, /* -1 because parse has already been advanced */
- &dont_check_for_posix_end, posix_warnings);
+ (void) handle_possible_posix(
+ pRExC_state,
+ RExC_parse - 1, /* -1 because parse has already been
+ advanced */
+ ¬_posix_region_end,
+ return_posix_warnings);
}
else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
"\"%.*s\" is more clearly written simply as \"%s\"",
(int) (RExC_parse - rangebegin),
rangebegin,
- cntrl_to_mnemonic((char) value)
+ cntrl_to_mnemonic((U8) value)
);
}
}
* fetching). We know to set the flag if we have a non-NULL list for UTF-8
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
+
+ /* Some things on the list might be unconditionally included because of
+ * other components. Remove them, and clean up the list if it goes to
+ * 0 elements */
+ if (only_utf8_locale_list && cp_list) {
+ _invlist_subtract(only_utf8_locale_list, cp_list,
+ &only_utf8_locale_list);
+
+ if (_invlist_len(only_utf8_locale_list) == 0) {
+ SvREFCNT_dec_NN(only_utf8_locale_list);
+ only_utf8_locale_list = NULL;
+ }
+ }
if (only_utf8_locale_list) {
ANYOF_FLAGS(ret)
|= ANYOFL_FOLD
if ( has_upper_latin1_only_utf8_matches
|| MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
{
- if (has_upper_latin1_only_utf8_matches) {
- if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-
- /* Here, we have both the flag and inversion list. Any character in
- * 'has_upper_latin1_only_utf8_matches' matches when UTF-8 is
- * in effect, but it also matches when UTF-8 is not in effect
- * because of MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it
- * matches unconditionally, so can be added to the regular
- * list, and 'has_upper_latin1_only_utf8_matches' cleared */
- _invlist_union(cp_list,
- has_upper_latin1_only_utf8_matches,
- &cp_list);
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
- }
- else if (cp_list) {
-
- /* Here, 'cp_list' gives chars that always match, and
- * 'has_upper_latin1_only_utf8_matches' gives chars that were
- * specified to match only if the target string is in UTF-8.
- * It may be that these overlap, so we can subtract the
- * unconditionally matching from the conditional ones, to make
- * the conditional list as small as possible, perhaps even
- * clearing it, in which case more optimizations are possible
- * later */
- _invlist_subtract(has_upper_latin1_only_utf8_matches,
- cp_list,
- &has_upper_latin1_only_utf8_matches);
- if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ /* But not if we are inverting, as that screws it up */
+ if (! invert) {
+ if (has_upper_latin1_only_utf8_matches) {
+ if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+
+ /* Here, we have both the flag and inversion list. Any
+ * character in 'has_upper_latin1_only_utf8_matches'
+ * matches when UTF-8 is in effect, but it also matches
+ * when UTF-8 is not in effect because of
+ * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches
+ * unconditionally, so can be added to the regular list,
+ * and 'has_upper_latin1_only_utf8_matches' cleared */
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
has_upper_latin1_only_utf8_matches = NULL;
}
+ else if (cp_list) {
+
+ /* Here, 'cp_list' gives chars that always match, and
+ * 'has_upper_latin1_only_utf8_matches' gives chars that
+ * were specified to match only if the target string is in
+ * UTF-8. It may be that these overlap, so we can subtract
+ * the unconditionally matching from the conditional ones,
+ * to make the conditional list as small as possible,
+ * perhaps even clearing it, in which case more
+ * optimizations are possible later */
+ _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ cp_list,
+ &has_upper_latin1_only_utf8_matches);
+ if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+ }
}
- }
- /* Similarly, if the unconditional matches include every upper latin1
- * character, we can clear that flag to permit later optimizations */
- if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
- SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
- _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
- if (_invlist_len(only_non_utf8_list) == 0) {
- ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ /* Similarly, if the unconditional matches include every upper
+ * latin1 character, we can clear that flag to permit later
+ * optimizations */
+ if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+ _invlist_subtract(only_non_utf8_list, cp_list,
+ &only_non_utf8_list);
+ if (_invlist_len(only_non_utf8_list) == 0) {
+ ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
+ SvREFCNT_dec_NN(only_non_utf8_list);
+ only_non_utf8_list = NULL;;
}
- SvREFCNT_dec_NN(only_non_utf8_list);
- only_non_utf8_list = NULL;;
}
/* If we haven't gotten rid of all conditional matching, we change the
bool doinit,
SV** listsvp,
SV** only_utf8_locale_ptr,
- SV* exclude_list)
+ SV** output_invlist)
{
/* For internal core use only.
* swash exists, by calling this function with 'doinit' set to false, in
* which case the components that will be used to eventually create the
* swash are returned (in a printable form).
- * If <exclude_list> is not NULL, it is an inversion list of things to
- * exclude from what's returned in <listsvp>.
+ * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
+ * store an inversion list of code points that should match only if the
+ * execution-time locale is a UTF-8 one.
+ * If <output_invlist> is not NULL, it is where this routine is to store an
+ * inversion list of the code points that would be instead returned in
+ * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
+ * when this parameter is used, is just the non-code point data that
+ * will go into creating the swash. This currently should be just
+ * user-defined properties whose definitions were not known at compile
+ * time. Using this parameter allows for easier manipulation of the
+ * swash's data by the caller. It is illegal to call this function with
+ * this parameter set, but not <listsvp>
+ *
* Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
* that, in spite of this function's name, the swash it returns may include
* the bitmap data as well */
SV *sw = NULL;
SV *si = NULL; /* Input swash initialization string */
- SV* invlist = NULL;
+ SV* invlist = NULL;
RXi_GET_DECL(prog,progi);
const struct reg_data * const data = prog ? progi->data : NULL;
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+ assert(! output_invlist || listsvp);
if (data && data->count) {
const U32 n = ARG(node);
/* If requested, return a printable version of what this swash matches */
if (listsvp) {
- SV* matches_string = newSVpvs("");
+ SV* matches_string = NULL;
/* The swash should be used, if possible, to get the data, as it
* contains the resolved data. But this function can be called at
if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
&& (si && si != &PL_sv_undef))
{
- sv_catsv(matches_string, si);
+ /* Here, we only have 'si' (and possibly some passed-in data in
+ * 'invlist', which is handled below) If the caller only wants
+ * 'si', use that. */
+ if (! output_invlist) {
+ matches_string = newSVsv(si);
+ }
+ else {
+ /* But if the caller wants an inversion list of the node, we
+ * need to parse 'si' and place as much as possible in the
+ * desired output inversion list, making 'matches_string' only
+ * contain the currently unresolvable things */
+ const char *si_string = SvPVX(si);
+ STRLEN remaining = SvCUR(si);
+ UV prev_cp = 0;
+ U8 count = 0;
+
+ /* Ignore everything before the first new-line */
+ while (*si_string != '\n' && remaining > 0) {
+ si_string++;
+ remaining--;
+ }
+ assert(remaining > 0);
+
+ si_string++;
+ remaining--;
+
+ while (remaining > 0) {
+
+ /* The data consists of just strings defining user-defined
+ * property names, but in prior incarnations, and perhaps
+ * somehow from pluggable regex engines, it could still
+ * hold hex code point definitions. Each component of a
+ * range would be separated by a tab, and each range by a
+ * new-line. If these are found, instead add them to the
+ * inversion list */
+ I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
+ |PERL_SCAN_SILENT_NON_PORTABLE;
+ STRLEN len = remaining;
+ UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
+
+ /* If the hex decode routine found something, it should go
+ * up to the next \n */
+ if ( *(si_string + len) == '\n') {
+ if (count) { /* 2nd code point on line */
+ *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
+ }
+ else {
+ *output_invlist = add_cp_to_invlist(*output_invlist, cp);
+ }
+ count = 0;
+ goto prepare_for_next_iteration;
+ }
+
+ /* If the hex decode was instead for the lower range limit,
+ * save it, and go parse the upper range limit */
+ if (*(si_string + len) == '\t') {
+ assert(count == 0);
+
+ prev_cp = cp;
+ count = 1;
+ prepare_for_next_iteration:
+ si_string += len + 1;
+ remaining -= len + 1;
+ continue;
+ }
+
+ /* Here, didn't find a legal hex number. Just add it from
+ * here to the next \n */
+
+ remaining -= len;
+ while (*(si_string + len) != '\n' && remaining > 0) {
+ remaining--;
+ len++;
+ }
+ if (*(si_string + len) == '\n') {
+ len++;
+ remaining--;
+ }
+ if (matches_string) {
+ sv_catpvn(matches_string, si_string, len - 1);
+ }
+ else {
+ matches_string = newSVpvn(si_string, len - 1);
+ }
+ si_string += len;
+ sv_catpvs(matches_string, " ");
+ } /* end of loop through the text */
+
+ assert(matches_string);
+ if (SvCUR(matches_string)) { /* Get rid of trailing blank */
+ SvCUR_set(matches_string, SvCUR(matches_string) - 1);
+ }
+ } /* end of has an 'si' but no swash */
}
- /* Add the inversion list to whatever we have. This may have come from
- * the swash, or from an input parameter */
- if (invlist) {
- if (exclude_list) {
- SV* clone = invlist_clone(invlist);
- _invlist_subtract(clone, exclude_list, &clone);
- sv_catsv(matches_string, _invlist_contents(clone));
- SvREFCNT_dec_NN(clone);
+ /* If we have a swash in place, its equivalent inversion list was above
+ * placed into 'invlist'. If not, this variable may contain a stored
+ * inversion list which is information beyond what is in 'si' */
+ if (invlist) {
+
+ /* Again, if the caller doesn't want the output inversion list, put
+ * everything in 'matches-string' */
+ if (! output_invlist) {
+ if ( ! matches_string) {
+ matches_string = newSVpvs("\n");
+ }
+ sv_catsv(matches_string, invlist_contents(invlist,
+ TRUE /* traditional style */
+ ));
+ }
+ else if (! *output_invlist) {
+ *output_invlist = invlist_clone(invlist);
}
else {
- sv_catsv(matches_string, _invlist_contents(invlist));
+ _invlist_union(*output_invlist, invlist, output_invlist);
}
- }
+ }
+
*listsvp = matches_string;
}
#endif /* DEBUGGING */
}
+/* Should be synchronized with ANYOF_ #defines in regcomp.h */
+#ifdef DEBUGGING
+
+# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
+ || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
+ || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
+ || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
+ || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
+ || _CC_VERTSPACE != 15
+# error Need to adjust order of anyofs[]
+# endif
+static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:cased:]",
+ "[:^cased:]",
+ "\\s",
+ "\\S",
+ "[:blank:]",
+ "[:^blank:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "\\v",
+ "\\V"
+};
+#endif
+
/*
- regprop - printable representation of opcode, with run time support
*/
{
#ifdef DEBUGGING
int k;
-
- /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
- static const char * const anyofs[] = {
-#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
- || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
- || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
- || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
- || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
- #error Need to adjust order of anyofs[]
-#endif
- "\\w",
- "\\W",
- "\\d",
- "\\D",
- "[:alpha:]",
- "[:^alpha:]",
- "[:lower:]",
- "[:^lower:]",
- "[:upper:]",
- "[:^upper:]",
- "[:punct:]",
- "[:^punct:]",
- "[:print:]",
- "[:^print:]",
- "[:alnum:]",
- "[:^alnum:]",
- "[:graph:]",
- "[:^graph:]",
- "[:cased:]",
- "[:^cased:]",
- "\\s",
- "\\S",
- "[:blank:]",
- "[:^blank:]",
- "[:xdigit:]",
- "[:^xdigit:]",
- "[:cntrl:]",
- "[:^cntrl:]",
- "[:ascii:]",
- "[:^ascii:]",
- "\\v",
- "\\V"
- };
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
(void) put_charclass_bitmap_innards(sv,
- (IS_ANYOF_TRIE(op))
+ ((IS_ANYOF_TRIE(op))
? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie),
- NULL);
+ : TRIE_BITMAP(trie)),
+ NULL,
+ NULL,
+ NULL
+ );
sv_catpvs(sv, "]");
}
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
- int do_sep = 0;
- SV* bitmap_invlist = NULL; /* Will hold what the bit map contains */
+ bool do_sep = FALSE; /* Do we need to separate various components of
+ the output? */
+ /* Set if there is still an unresolved user-defined property */
+ SV *unresolved = NULL;
+
+ /* Things that are ignored except when the runtime locale is UTF-8 */
+ SV *only_utf8_locale_invlist = NULL;
+ /* Code points that don't fit in the bitmap */
+ SV *nonbitmap_invlist = NULL;
+
+ /* And things that aren't in the bitmap, but are small enough to be */
+ SV* bitmap_range_not_in_bitmap = NULL;
if (OP(o) == ANYOFL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
- sv_catpvs(sv, "{utf8-loc}");
+ sv_catpvs(sv, "{utf8-locale-reqd}");
}
- else {
- sv_catpvs(sv, "{loc}");
+ if (flags & ANYOFL_FOLD) {
+ sv_catpvs(sv, "{i}");
}
}
- if (flags & ANYOFL_FOLD)
- sv_catpvs(sv, "{i}");
- Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (flags & ANYOF_INVERT)
- sv_catpvs(sv, "^");
- /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
- * */
- do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
- &bitmap_invlist);
+ /* If there is stuff outside the bitmap, get it */
+ if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ &unresolved,
+ &only_utf8_locale_invlist,
+ &nonbitmap_invlist);
+ /* The non-bitmap data may contain stuff that could fit in the
+ * bitmap. This could come from a user-defined property being
+ * finally resolved when this call was done; or much more likely
+ * because there are matches that require UTF-8 to be valid, and so
+ * aren't in the bitmap. This is teased apart later */
+ _invlist_intersection(nonbitmap_invlist,
+ PL_InBitmap,
+ &bitmap_range_not_in_bitmap);
+ /* Leave just the things that don't fit into the bitmap */
+ _invlist_subtract(nonbitmap_invlist,
+ PL_InBitmap,
+ &nonbitmap_invlist);
+ }
- /* output any special charclass tests (used entirely under use
- * locale) * */
- if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
- int i;
- for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
- if (ANYOF_POSIXL_TEST(o,i)) {
- sv_catpv(sv, anyofs[i]);
- do_sep = 1;
- }
- }
+ /* Obey this flag to add all above-the-bitmap code points */
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ NUM_ANYOF_CODE_POINTS,
+ UV_MAX);
}
- if ( ARG(o) != ANYOF_ONLY_HAS_BITMAP
- || (flags
- & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
- |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
- |ANYOFL_FOLD)))
- {
+ /* Ready to start outputting. First, the initial left bracket */
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(sv,
+ ANYOF_BITMAP(o),
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o);
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined yet,
+ * output them, in a separate [] from the bitmap range stuff */
+ if (unresolved) {
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
- if (flags & ANYOF_INVERT)
- /*make sure the invert info is in each */
- sv_catpvs(sv, "^");
}
-
- if (OP(o) == ANYOFD
- && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
- {
- sv_catpvs(sv, "{non-utf8-latin1-all}");
+ if (flags & ANYOF_INVERT) {
+ sv_catpvs(sv, "^");
}
+ sv_catsv(sv, unresolved);
+ do_sep = TRUE;
+ SvREFCNT_dec_NN(unresolved);
+ }
- if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
- sv_catpvs(sv, "{above_bitmap_all}");
-
- if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
- SV *lv; /* Set if there is something outside the bit map. */
- bool byte_output = FALSE; /* If something has been output */
- SV *only_utf8_locale;
-
- /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
- * is used to guarantee that nothing in the bitmap gets
- * returned */
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
- &lv, &only_utf8_locale,
- bitmap_invlist);
- if (lv && lv != &PL_sv_undef) {
- char *s = savesvpv(lv);
- const char * const orig_s = s; /* Save the beginning of
- 's', so can be freed */
-
- /* Ignore anything before the first \n */
- while (*s && *s != '\n')
- s++;
-
- /* The data are one range per line. A range is a single
- * entity; or two, separated by \t. So can just convert \n
- * to space and \t to '-' */
- if (*s == '\n') {
- const char * const t = ++s;
-
- if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
- if (OP(o) == ANYOFD) {
- sv_catpvs(sv, "{utf8}");
- }
- else {
- sv_catpvs(sv, "{outside bitmap}");
- }
- }
-
- if (byte_output) {
- sv_catpvs(sv, " ");
- }
+ /* And, finally, add the above-the-bitmap stuff */
+ if (nonbitmap_invlist) {
+ SV* contents;
- while (*s) {
- if (*s == '\n') {
+ /* See if truncation size is overridden */
+ const STRLEN dump_len = (PL_dump_re_max_len)
+ ? PL_dump_re_max_len
+ : 256;
- /* Truncate very long output */
- if ((UV) (s - t) > 256) {
- Perl_sv_catpvf(aTHX_ sv,
- "%.*s...",
- (int) (s - t),
- t);
- goto out_dump;
- }
- *s = ' ';
- }
- else if (*s == '\t') {
- *s = '-';
- }
- s++;
- }
+ /* This is output in a separate [] */
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ }
- /* Here, it fits in the allocated space. Replace a
- * final blank with a NUL */
- if (s[-1] == ' ')
- s[-1] = '\0';
+ /* And, for easy of understanding, it is always output not-shown as
+ * complemented */
+ if (flags & ANYOF_INVERT) {
+ _invlist_invert(nonbitmap_invlist);
+ _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
+ }
- sv_catpv(sv, t);
- }
+ contents = invlist_contents(nonbitmap_invlist,
+ FALSE /* output suitable for catsv */
+ );
- out_dump:
+ /* If the output is shorter than the permissible maximum, just do it. */
+ if (SvCUR(contents) <= dump_len) {
+ sv_catsv(sv, contents);
+ }
+ else {
+ const char * contents_string = SvPVX(contents);
+ STRLEN i = dump_len;
- Safefree(orig_s);
- SvREFCNT_dec_NN(lv);
+ /* Otherwise, start at the permissible max and work back to the
+ * first break possibility */
+ while (i > 0 && contents_string[i] != ' ') {
+ i--;
}
-
- if ((flags & ANYOFL_FOLD)
- && only_utf8_locale
- && only_utf8_locale != &PL_sv_undef)
- {
- UV start, end;
- int max_entries = 256;
-
- sv_catpvs(sv, "{utf8 locale}");
- invlist_iterinit(only_utf8_locale);
- while (invlist_iternext(only_utf8_locale,
- &start, &end)) {
- put_range(sv, start, end, FALSE);
- max_entries --;
- if (max_entries < 0) {
- sv_catpvs(sv, "...");
- break;
- }
- }
- invlist_iterfinish(only_utf8_locale);
+ if (i == 0) { /* Fail-safe. Use the max if we couldn't
+ find a legal break */
+ i = dump_len;
}
+
+ sv_catpvn(sv, contents_string, i);
+ sv_catpvs(sv, "...");
}
- }
- SvREFCNT_dec(bitmap_invlist);
+ SvREFCNT_dec_NN(contents);
+ SvREFCNT_dec_NN(nonbitmap_invlist);
+ }
+ /* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == POSIXD || k == NPOSIXD) {
}
else if (isPRINT(c)) {
const char string = (char) c;
- if (isBACKSLASHED_PUNCT(c))
+
+ /* We use {phrase} as metanotation in the class, so also escape literal
+ * braces */
+ if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
+ else if (isMNEMONIC_CNTRL(c)) {
+ Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
+ }
else {
- const char * const mnemonic = cntrl_to_mnemonic((char) c);
- if (mnemonic) {
- Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
- }
- else {
- Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
- }
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
}
}
S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end'. It assumes that only ASCII printables are displayable
- * as-is (though some of these will be escaped by put_code_point()). */
+ * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
+ * that have them, when they occur at the beginning or end of the range.
+ * It uses hex to output the remaining code points, unless 'allow_literals'
+ * is true, in which case the printable ASCII ones are output as-is (though
+ * some of these will be escaped by put_code_point()).
+ *
+ * NOTE: This is designed only for printing ranges of code points that fit
+ * inside an ANYOF bitmap. Higher code points are simply suppressed
+ */
const unsigned int min_range_count = 3;
if (end - start < min_range_count) {
- /* Individual chars in short ranges */
+ /* Output chars individually when they occur in short ranges */
for (; start <= end; start++) {
put_code_point(sv, start);
}
/* If permitted by the input options, and there is a possibility that
* this range contains a printable literal, look to see if there is
- * one. */
+ * one. */
if (allow_literals && start <= MAX_PRINT_A) {
- /* If the range begin isn't an ASCII printable, effectively split
- * the range into two parts:
+ /* If the character at the beginning of the range isn't an ASCII
+ * printable, effectively split the range into two parts:
* 1) the portion before the first such printable,
* 2) the rest
* and output them separately. */
temp_end = end + 1;
}
- /* Output the first part of the split range, the part that
- * doesn't have printables, with no looking for literals
- * (otherwise we would infinitely recurse) */
+ /* Output the first part of the split range: the part that
+ * doesn't have printables, with the parameter set to not look
+ * for literals (otherwise we would infinitely recurse) */
put_range(sv, start, temp_end - 1, FALSE);
/* The 2nd part of the range (if any) starts here. */
start = temp_end;
- /* We continue instead of dropping down because even if the 2nd
- * part is non-empty, it could be so short that we want to
- * output it specially, as tested for at the top of this loop.
- * */
+ /* We do a continue, instead of dropping down, because even if
+ * the 2nd part is non-empty, it could be so short that we want
+ * to output it as individual characters, as tested for at the
+ * top of this loop. */
continue;
}
temp_end--;
}
- /* And separately output the range that doesn't have mnemonics */
+ /* And separately output the interior range that doesn't start or
+ * end with mnemonics */
put_range(sv, start, temp_end, FALSE);
/* Then output the mnemonic trailing controls */
}
}
-STATIC bool
-S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
+STATIC void
+S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
{
- /* Appends to 'sv' a displayable version of the innards of the bracketed
- * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
- * output anything, and bitmap_invlist, if not NULL, will point to an
- * inversion list of what is in the bit map */
+ /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
+ * 'invlist' */
- int i;
UV start, end;
- unsigned int punct_count = 0;
- SV* invlist;
bool allow_literals = TRUE;
- bool inverted_for_output = FALSE;
-
- PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
-
- /* Worst case is exactly every-other code point is in the list */
- invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
- /* Convert the bit map to an inversion list, keeping track of how many
- * ASCII puncts are set, including an extra amount for the backslashed
- * ones. */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (BITMAP_TEST(bitmap, i)) {
- invlist = add_cp_to_invlist(invlist, i);
- if (isPUNCT_A(i)) {
- punct_count++;
- if isBACKSLASHED_PUNCT(i) {
- punct_count++;
- }
- }
- }
- }
-
- /* Nothing to output */
- if (_invlist_len(invlist) == 0) {
- SvREFCNT_dec_NN(invlist);
- return FALSE;
- }
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
/* Generally, it is more readable if printable characters are output as
* literals, but if a range (nearly) spans all of them, it's best to output
* it as a single range. This code will use a single range if all but 2
- * printables are in it */
+ * ASCII printables are in it */
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
- /* If range starts beyond final printable, it doesn't have any in it */
+ /* If the range starts beyond the final printable, it doesn't have any
+ * in it */
if (start > MAX_PRINT_A) {
break;
}
}
invlist_iterfinish(invlist);
- /* The legibility of the output depends mostly on how many punctuation
- * characters are output. There are 32 possible ASCII ones, and some have
- * an additional backslash, bringing it to currently 36, so if any more
- * than 18 are to be output, we can instead output it as its complement,
- * yielding fewer puncts, and making it more legible. But give some weight
- * to the fact that outputting it as a complement is less legible than a
- * straight output, so don't complement unless we are somewhat over the 18
- * mark */
- if (allow_literals && punct_count > 22) {
- sv_catpvs(sv, "^");
-
- /* Add everything remaining to the list, so when we invert it just
- * below, it will be excluded */
- _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
- _invlist_invert(invlist);
- inverted_for_output = TRUE;
- }
-
/* Here we have figured things out. Output each range */
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
}
invlist_iterfinish(invlist);
- if (bitmap_invlist) {
+ return;
+}
- /* Here, wants the inversion list returned. If we inverted it, we have
- * to restore it to the original */
- if (inverted_for_output) {
- _invlist_invert(invlist);
- _invlist_intersection(invlist, PL_InBitmap, &invlist);
- }
+STATIC SV*
+S_put_charclass_bitmap_innards_common(pTHX_
+ SV* invlist, /* The bitmap */
+ SV* posixes, /* Under /l, things like [:word:], \S */
+ SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
+ SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
+ SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
+ const bool invert /* Is the result to be inverted? */
+)
+{
+ /* Create and return an SV containing a displayable version of the bitmap
+ * and associated information determined by the input parameters. */
+
+ SV * output;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
- *bitmap_invlist = invlist;
+ if (invert) {
+ output = newSVpvs("^");
}
else {
- SvREFCNT_dec_NN(invlist);
+ output = newSVpvs("");
}
- return TRUE;
+ /* First, the code points in the bitmap that are unconditionally there */
+ put_charclass_bitmap_innards_invlist(output, invlist);
+
+ /* Traditionally, these have been placed after the main code points */
+ if (posixes) {
+ sv_catsv(output, posixes);
+ }
+
+ if (only_utf8 && _invlist_len(only_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8);
+ }
+
+ if (not_utf8 && _invlist_len(not_utf8)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, not_utf8);
+ }
+
+ if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
+ Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
+ put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
+
+ /* This is the only list in this routine that can legally contain code
+ * points outside the bitmap range. The call just above to
+ * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
+ * output them here. There's about a half-dozen possible, and none in
+ * contiguous ranges longer than 2 */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ UV start, end;
+ SV* above_bitmap = NULL;
+
+ _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
+
+ invlist_iterinit(above_bitmap);
+ while (invlist_iternext(above_bitmap, &start, &end)) {
+ UV i;
+
+ for (i = start; i <= end; i++) {
+ put_code_point(output, i);
+ }
+ }
+ invlist_iterfinish(above_bitmap);
+ SvREFCNT_dec_NN(above_bitmap);
+ }
+ }
+
+ /* If the only thing we output is the '^', clear it */
+ if (invert && SvCUR(output) == 1) {
+ SvCUR_set(output, 0);
+ }
+
+ return output;
+}
+
+STATIC bool
+S_put_charclass_bitmap_innards(pTHX_ SV *sv,
+ char *bitmap,
+ SV *nonbitmap_invlist,
+ SV *only_utf8_locale_invlist,
+ const regnode * const node)
+{
+ /* Appends to 'sv' a displayable version of the innards of the bracketed
+ * character class defined by the other arguments:
+ * 'bitmap' points to the bitmap.
+ * 'nonbitmap_invlist' is an inversion list of the code points that are in
+ * the bitmap range, but for some reason aren't in the bitmap; NULL if
+ * none. The reasons for this could be that they require some
+ * condition such as the target string being or not being in UTF-8
+ * (under /d), or because they came from a user-defined property that
+ * was not resolved at the time of the regex compilation (under /u)
+ * 'only_utf8_locale_invlist' is an inversion list of the code points that
+ * are valid only if the runtime locale is a UTF-8 one; NULL if none
+ * 'node' is the regex pattern node. It is needed only when the above two
+ * parameters are not null, and is passed so that this routine can
+ * tease apart the various reasons for them.
+ *
+ * It returns TRUE if there was actually something output. (It may be that
+ * the bitmap, etc is empty.)
+ *
+ * When called for outputting the bitmap of a non-ANYOF node, just pass the
+ * bitmap, with the succeeding parameters set to NULL.
+ *
+ */
+
+ /* In general, it tries to display the 'cleanest' representation of the
+ * innards, choosing whether to display them inverted or not, regardless of
+ * whether the class itself is to be inverted. However, there are some
+ * cases where it can't try inverting, as what actually matches isn't known
+ * until runtime, and hence the inversion isn't either. */
+ bool inverting_allowed = TRUE;
+
+ int i;
+ STRLEN orig_sv_cur = SvCUR(sv);
+
+ SV* invlist; /* Inversion list we accumulate of code points that
+ are unconditionally matched */
+ SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
+ UTF-8 */
+ SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
+ */
+ SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
+ SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
+ is UTF-8 */
+
+ SV* as_is_display; /* The output string when we take the inputs
+ literally */
+ SV* inverted_display; /* The output string when we invert the inputs */
+
+ U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
+
+ bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
+ to match? */
+ /* We are biased in favor of displaying things without them being inverted,
+ * as that is generally easier to understand */
+ const int bias = 5;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ /* Start off with whatever code points are passed in. (We clone, so we
+ * don't change the caller's list) */
+ if (nonbitmap_invlist) {
+ assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
+ invlist = invlist_clone(nonbitmap_invlist);
+ }
+ else { /* Worst case size is every other code point is matched */
+ invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+ }
+
+ if (flags) {
+ if (OP(node) == ANYOFD) {
+
+ /* This flag indicates that the code points below 0x100 in the
+ * nonbitmap list are precisely the ones that match only when the
+ * target is UTF-8 (they should all be non-ASCII). */
+ if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+ {
+ _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
+ _invlist_subtract(invlist, only_utf8, &invlist);
+ }
+
+ /* And this flag for matching all non-ASCII 0xFF and below */
+ if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ {
+ if (invert) {
+ not_utf8 = _new_invlist(0);
+ }
+ else {
+ not_utf8 = invlist_clone(PL_UpperLatin1);
+ }
+ inverting_allowed = FALSE; /* XXX needs more work to be able
+ to allow this */
+ }
+ }
+ else if (OP(node) == ANYOFL) {
+
+ /* If either of these flags are set, what matches isn't
+ * determinable except during execution, so don't know enough here
+ * to invert */
+ if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
+ inverting_allowed = FALSE;
+ }
+
+ /* What the posix classes match also varies at runtime, so these
+ * will be output symbolically. */
+ if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
+ int i;
+
+ posixes = newSVpvs("");
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
+ if (ANYOF_POSIXL_TEST(node,i)) {
+ sv_catpv(posixes, anyofs[i]);
+ }
+ }
+ }
+ }
+ }
+
+ /* Accumulate the bit map into the unconditional match list */
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ invlist = add_cp_to_invlist(invlist, i);
+ }
+ }
+
+ /* Make sure that the conditional match lists don't have anything in them
+ * that match unconditionally; otherwise the output is quite confusing.
+ * This could happen if the code that populates these misses some
+ * duplication. */
+ if (only_utf8) {
+ _invlist_subtract(only_utf8, invlist, &only_utf8);
+ }
+ if (not_utf8) {
+ _invlist_subtract(not_utf8, invlist, ¬_utf8);
+ }
+
+ if (only_utf8_locale_invlist) {
+
+ /* Since this list is passed in, we have to make a copy before
+ * modifying it */
+ only_utf8_locale = invlist_clone(only_utf8_locale_invlist);
+
+ _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
+
+ /* And, it can get really weird for us to try outputting an inverted
+ * form of this list when it has things above the bitmap, so don't even
+ * try */
+ if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
+ inverting_allowed = FALSE;
+ }
+ }
+
+ /* Calculate what the output would be if we take the input as-is */
+ as_is_display = put_charclass_bitmap_innards_common(invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale,
+ invert);
+
+ /* If have to take the output as-is, just do that */
+ if (! inverting_allowed) {
+ sv_catsv(sv, as_is_display);
+ }
+ else { /* But otherwise, create the output again on the inverted input, and
+ use whichever version is shorter */
+
+ int inverted_bias, as_is_bias;
+
+ /* We will apply our bias to whichever of the the results doesn't have
+ * the '^' */
+ if (invert) {
+ invert = FALSE;
+ as_is_bias = bias;
+ inverted_bias = 0;
+ }
+ else {
+ invert = TRUE;
+ as_is_bias = 0;
+ inverted_bias = bias;
+ }
+
+ /* Now invert each of the lists that contribute to the output,
+ * excluding from the result things outside the possible range */
+
+ /* For the unconditional inversion list, we have to add in all the
+ * conditional code points, so that when inverted, they will be gone
+ * from it */
+ _invlist_union(only_utf8, invlist, &invlist);
+ _invlist_union(only_utf8_locale, invlist, &invlist);
+ _invlist_invert(invlist);
+ _invlist_intersection(invlist, PL_InBitmap, &invlist);
+
+ if (only_utf8) {
+ _invlist_invert(only_utf8);
+ _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
+ }
+
+ if (not_utf8) {
+ _invlist_invert(not_utf8);
+ _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8);
+ }
+
+ if (only_utf8_locale) {
+ _invlist_invert(only_utf8_locale);
+ _invlist_intersection(only_utf8_locale,
+ PL_InBitmap,
+ &only_utf8_locale);
+ }
+
+ inverted_display = put_charclass_bitmap_innards_common(
+ invlist,
+ posixes,
+ only_utf8,
+ not_utf8,
+ only_utf8_locale, invert);
+
+ /* Use the shortest representation, taking into account our bias
+ * against showing it inverted */
+ if (SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)
+ {
+ sv_catsv(sv, inverted_display);
+ }
+ else {
+ sv_catsv(sv, as_is_display);
+ }
+
+ SvREFCNT_dec_NN(as_is_display);
+ SvREFCNT_dec_NN(inverted_display);
+ }
+
+ SvREFCNT_dec_NN(invlist);
+ SvREFCNT_dec(only_utf8);
+ SvREFCNT_dec(not_utf8);
+ SvREFCNT_dec(posixes);
+ SvREFCNT_dec(only_utf8_locale);
+
+ return SvCUR(sv) > orig_sv_cur;
}
#define CLEAR_OPTSTART \