#define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
+#ifndef MAX
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+#endif
+
/* this is a chain of data about sub patterns we are processing that
need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
* returned list must, and will, contain every code point that is a
* possibility. */
- SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* invlist = NULL;
SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
/* Here, no compile-time swash, and there are things that won't be
* known until runtime -- we have to assume it could be anything */
+ invlist = sv_2mortal(_new_invlist(1));
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
else if (ary[3] && ary[3] != &PL_sv_undef) {
}
}
+ if (! invlist) {
+ invlist = sv_2mortal(_new_invlist(0));
+ }
+
/* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
* code points, and an inversion list for the others, but if there are code
* points that should match only conditionally on the target string being
/* Add in the points from the bit map */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
- invlist = add_cp_to_invlist(invlist, i);
+ unsigned int start = i++;
+
+ for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
new_node_has_latin1 = TRUE;
}
}
#ifndef PERL_IN_XSUB_RE
STATIC void
-S_invlist_replace_list(pTHX_ SV * dest, SV * src)
+S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
{
/* Replaces the inversion list in 'src' with the one in 'dest'. It steals
* the list from 'src', so 'src' is made to have a NULL list. This is
const UV src_len = _invlist_len(src);
const bool src_offset = *get_invlist_offset_addr(src);
- const STRLEN src_byte_len = SvCUR(src);
+ const STRLEN src_byte_len = SvLEN(src);
char * array = SvPVX(src);
const int oldtainted = TAINT_get;
- PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST;
+ PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
assert(SvTYPE(src) == SVt_INVLIST);
assert(SvTYPE(dest) == SVt_INVLIST);
assert(! invlist_is_iterating(src));
+ assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
/* Make sure it ends in the right place with a NUL, as our inversion list
* manipulations aren't careful to keep this true, but sv_usepvn_flags()
PERL_STATIC_INLINE void
S_invlist_trim(SV* invlist)
{
+ /* Free the not currently-being-used space in an inversion list */
+
+ /* But don't free up the space needed for the 0 UV that is always at the
+ * beginning of the list, nor the trailing NUL */
+ const UV min_size = TO_INTERNAL_SIZE(1) + 1;
+
PERL_ARGS_ASSERT_INVLIST_TRIM;
assert(SvTYPE(invlist) == SVt_INVLIST);
- /* Change the length of the inversion list to how many entries it currently
- * has. But don't shorten it so that it would free up the required
- * initial 0 UV (and a trailing NUL byte) */
- if (SvCUR(invlist) > TO_INTERNAL_SIZE(1) + 1) {
- SvPV_shrink_to_cur(invlist);
- }
+ SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
+}
+
+PERL_STATIC_INLINE void
+S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
+{
+ PERL_ARGS_ASSERT_INVLIST_CLEAR;
+
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ invlist_set_len(invlist, 0, 0);
+ invlist_trim(invlist);
}
#endif /* ifndef PERL_IN_XSUB_RE */
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
- bool make_temp = FALSE; /* Should we mortalize the result? */
+ len_b = _invlist_len(b);
+ if (len_b == 0) {
- if (*output == a) {
- if (a != NULL) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* Here, 'b' is empty. If the output is the complement of 'b', the
+ * union is all possible code points, and we need not even look at 'a'.
+ * It's easiest to create a new inversion list that matches everything.
+ * */
+ if (complement_b) {
+ SV* everything = _new_invlist(1);
+ _append_range_to_invlist(everything, 0, UV_MAX);
+
+ /* If the output didn't exist, just point it at the new list */
+ if (*output == NULL) {
+ *output = everything;
+ return;
}
- }
- if (*output != b) {
- *output = invlist_clone(b);
- if (complement_b) {
- _invlist_invert(*output);
+
+ /* Otherwise, replace its contents with the new list */
+ invlist_replace_list_destroys_src(*output, everything);
+ SvREFCNT_dec_NN(everything);
+ return;
+ }
+
+ /* Here, we don't want the complement of 'b', and since it is empty,
+ * the union will come entirely from 'a'. If 'a' is NULL or empty, the
+ * output will be empty */
+
+ if (a == NULL) {
+ *output = _new_invlist(0);
+ return;
+ }
+
+ if (_invlist_len(a) == 0) {
+ invlist_clear(*output);
+ return;
+ }
+
+ /* Here, 'a' is not empty, and entirely determines the union. If the
+ * output is not to overwrite 'b', we can just return 'a'. */
+ if (*output != b) {
+
+ /* If the output is to overwrite 'a', we have a no-op, as it's
+ * already in 'a' */
+ if (*output == a) {
+ return;
}
- } /* else *output already = b; */
- if (make_temp) {
- sv_2mortal(*output);
+ /* But otherwise we have to copy 'a' to the output */
+ *output = invlist_clone(a);
+ return;
}
+
+ /* Here, 'b' is to be overwritten by the output, which will be 'a' */
+ u = invlist_clone(a);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+
return;
}
- else if ((len_b = _invlist_len(b)) == 0) {
- bool make_temp = FALSE;
- if (*output == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
+
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
+
+ /* Here, 'a' is empty. That means the union will come entirely from
+ * 'b'. If the output is not to overwrite 'a', we can just return
+ * what's in 'b'. */
+ if (*output != a) {
+
+ /* If the output is to overwrite 'b', it's already in 'b', but
+ * otherwise we have to copy 'b' to the output */
+ if (*output != b) {
+ *output = invlist_clone(b);
}
- }
- /* The complement of an empty list is a list that has everything in it,
- * so the union with <a> includes everything too */
- if (complement_b) {
- if (a == *output) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
+ /* And if the output is to be the inversion of 'b', do that */
+ if (complement_b) {
+ _invlist_invert(*output);
}
- *output = _new_invlist(1);
- _append_range_to_invlist(*output, 0, UV_MAX);
+
+ return;
}
- else if (*output != a) {
- *output = invlist_clone(a);
+
+ /* Here, 'a', which is empty or even NULL, is to be overwritten by the
+ * output, which will either be 'b' or the complement of 'b' */
+
+ if (a == NULL) {
+ *output = invlist_clone(b);
}
- /* else *output already = a; */
+ else {
+ u = invlist_clone(b);
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
+ }
- if (make_temp) {
- sv_2mortal(*output);
+ if (complement_b) {
+ _invlist_invert(*output);
}
+
return;
}
/* Here, have chosen which of the two inputs to look at. Only output
* if the running count changes to/from 0, which marks the
- * beginning/end of a range in that's in the set */
+ * beginning/end of a range that's in the set */
if (cp_in_set) {
if (count == 0) {
array_u[i_u++] = cp;
/* When 'count' is 0, the list that was exhausted (if one was shorter than
* the other) ended with everything above it not in its set. That means
* that the remaining part of the union is precisely the same as the
- * non-exhausted list, so can just copy it unchanged. (If both list were
+ * non-exhausted list, so can just copy it unchanged. (If both lists were
* exhausted at the same time, then the operations below will be both 0.)
*/
if (count == 0) {
*output = u;
}
else {
- invlist_replace_list(*output, u);
+ invlist_replace_list_destroys_src(*output, u);
SvREFCNT_dec_NN(u);
}
}
/* Special case if either one is empty */
len_a = (a == NULL) ? 0 : _invlist_len(a);
if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
- bool make_temp = FALSE;
-
if (len_a != 0 && complement_b) {
- /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
- * be empty. Here, also we are using 'b's complement, which hence
- * must be every possible code point. Thus the intersection is
- * simply 'a'. */
- if (*i != a) {
- if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
+ /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
+ * must be empty. Here, also we are using 'b's complement, which
+ * hence must be every possible code point. Thus the intersection
+ * is simply 'a'. */
- *i = invlist_clone(a);
+ if (*i == a) { /* No-op */
+ return;
}
- /* else *i is already 'a' */
- if (make_temp) {
- sv_2mortal(*i);
+ /* If not overwriting either input, just make a copy of 'a' */
+ if (*i != b) {
+ *i = invlist_clone(a);
+ return;
}
+
+ /* Here we are overwriting 'b' with 'a's contents */
+ r = invlist_clone(a);
+ invlist_replace_list_destroys_src(*i, r);
+ SvREFCNT_dec_NN(r);
return;
}
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
- if (*i == a) {
- if (a != NULL) {
- if (! (make_temp = cBOOL(SvTEMP(a)))) {
- SvREFCNT_dec_NN(a);
- }
- }
- }
- else if (*i == b) {
- if (! (make_temp = cBOOL(SvTEMP(b)))) {
- SvREFCNT_dec_NN(b);
- }
- }
- *i = _new_invlist(0);
- if (make_temp) {
- sv_2mortal(*i);
+ if (*i == NULL) {
+ *i = _new_invlist(0);
+ return;
}
+ invlist_clear(*i);
return;
}
}
else {
if (len_r) {
- invlist_replace_list(*i, r);
+ invlist_replace_list_destroys_src(*i, r);
}
else {
- invlist_set_len(*i, 0, 0);
- invlist_trim(*i);
+ invlist_clear(*i);
}
SvREFCNT_dec_NN(r);
}
: array[len - 1] - 1;
}
-SV *
+STATIC SV *
S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
/* Get the contents of an inversion list into a string SV so that they can
* indivisible */
bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
- assert(RExC_parse < RExC_end);
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Unmatched (");
+ }
if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse + 1;
/* 'posix_warnings' and 'warn_text' are names of variables in the following
* routine. q.v. */
#define ADD_POSIX_WARNING(p, text) STMT_START { \
- if (posix_warnings && ( posix_warnings != (AV **) -1 \
- || (PASS2 && ckWARN(WARN_REGEXP)))) \
- { \
+ if (posix_warnings) { \
if (! warn_text) warn_text = newAV(); \
av_push(warn_text, Perl_newSVpvf(aTHX_ \
WARNING_PREFIX \
besides RExC_parse. */
char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
NULL */
- AV ** posix_warnings /* Where to place any generated warnings, or -1
- if to output them, or NULL */
+ AV ** posix_warnings, /* Where to place any generated warnings, or
+ NULL */
+ const bool check_only /* Don't die if error */
)
{
/* This parses what the caller thinks may be one of the three POSIX
* 'updated_parse_ptr' is not changed. No warnings nor errors are
* raised.
*
- * In b) there may be warnings and even errors generated. What to do about
- * these is determined by the 'posix_warnings' parameter. If it is NULL,
- * this call is treated as a check-only, scouting-out-the-territory call,
- * and no warnings nor errors are generated at all. Otherwise, any errors
- * are raised if found. If 'posix_warnings' is -1 (appropriately cast),
- * warnings are generated and displayed (in pass 2), just as they would be
- * for any other message of the same type from this file. If it isn't NULL
- * and not -1, warnings aren't displayed, but instead an AV is generated
- * with all the warning messages (that aren't to be ignored) stored into
- * it, so that the caller can output them if it wants. This is done in all
+ * In b) there may be errors or warnings generated. If 'check_only' is
+ * TRUE, then any errors are discarded. Warnings are returned to the
+ * caller via an AV* created into '*posix_warnings' if it is not NULL. If
+ * instead it is NULL, warnings are suppressed. This is done in all
* passes. The reason for this is that the rest of the parsing is heavily
* dependent on whether this routine found a valid posix class or not. If
- * it did, the closing ']' is absorbed as part of the class. If no class
+ * it did, the closing ']' is absorbed as part of the class. If no class,
* or an invalid one is found, any ']' will be considered the terminator of
* the outer bracketed character class, leading to very different results.
* In particular, a '(?[ ])' construct will likely have a syntax error if
/* For [. .] and [= =]. These are quite different internally from [: :],
* so they are handled separately. */
- if (POSIXCC_NOTYET(*p)) {
+ if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
+ and 1 for at least one char in it
+ */
+ {
const char open_char = *p;
const char * temp_ptr = p + 1;
- unsigned int len = 0;
/* These two constructs are not handled by perl, and if we find a
- * syntactically valid one, we croak. It looks like just about any
- * byte can be in them, but they are likely very short, like [.ch.] to
- * denote a ligature 'ch' single character. If we find something that
- * started out to look like one of these constructs, but isn't, we
- * break so that it can be checked for being a class name with a typo
- * of '.' or '=' instead of a colon */
- while (temp_ptr < e) {
- len++;
-
- /* qr/[[.].]]/, for example, is valid. But otherwise we quit on an
- * unexpected ']'. It is possible, it appears, for such a ']' to
- * be not in the final position, but that's so unlikely that that
- * case is not handled. */
- if (*temp_ptr == ']' && temp_ptr[1] != open_char) {
- break;
- }
-
- /* XXX this could be cut down, but this value is certainly large
- * enough */
- if (len > 10) {
- break;
- }
+ * syntactically valid one, we croak. khw, who wrote this code, finds
+ * this explanation of them very unclear:
+ * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
+ * And searching the rest of the internet wasn't very helpful either.
+ * It looks like just about any byte can be in these constructs,
+ * depending on the locale. But unless the pattern is being compiled
+ * under /l, which is very rare, Perl runs under the C or POSIX locale.
+ * In that case, it looks like [= =] isn't allowed at all, and that
+ * [. .] could be any single code point, but for longer strings the
+ * constituent characters would have to be the ASCII alphabetics plus
+ * the minus-hyphen. Any sensible locale definition would limit itself
+ * to these. And any portable one definitely should. Trying to parse
+ * the general case is a nightmare (see [perl #127604]). So, this code
+ * looks only for interiors of these constructs that match:
+ * qr/.|[-\w]{2,}/
+ * Using \w relaxes the apparent rules a little, without adding much
+ * danger of mistaking something else for one of these constructs.
+ *
+ * [. .] in some implementations described on the internet is usable to
+ * escape a character that otherwise is special in bracketed character
+ * classes. For example [.].] means a literal right bracket instead of
+ * the ending of the class
+ *
+ * [= =] can legitimately contain a [. .] construct, but we don't
+ * handle this case, as that [. .] construct will later get parsed
+ * itself and croak then. And [= =] is checked for even when not under
+ * /l, as Perl has long done so.
+ *
+ * The code below relies on there being a trailing NUL, so it doesn't
+ * have to keep checking if the parse ptr < e.
+ */
+ if (temp_ptr[1] == open_char) {
+ temp_ptr++;
+ }
+ else while ( temp_ptr < e
+ && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
+ {
+ temp_ptr++;
+ }
- if (*temp_ptr == open_char) {
+ if (*temp_ptr == open_char) {
temp_ptr++;
if (*temp_ptr == ']') {
temp_ptr++;
- if (! found_problem && posix_warnings) {
+ if (! found_problem && ! check_only) {
RExC_parse = (char *) temp_ptr;
vFAIL3("POSIX syntax [%c %c] is reserved for future "
"extensions", open_char, open_char);
return OOB_NAMEDCLASS;
}
- }
- else if (*temp_ptr == '\\') {
-
- /* A backslash is treate as like any other character, unless it
- * precedes a comment starter. XXX multiple backslashes in a
- * row are not handled specially here, nor would they ever
- * likely to be handled specially in one of these constructs */
- if (temp_ptr[1] == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
- temp_ptr++;
- }
- temp_ptr++;
- }
- else if (*temp_ptr == '#' && (RExC_flags & RXf_PMf_EXTENDED)) {
- break; /* Under no circumstances can we look at the interior
- of a comment */
- }
- else if (*temp_ptr == '\n') { /* And we don't allow newlines
- either as it's extremely
- unlikely that one could be in an
- intended class */
- break;
- }
- else if (UTF && ! UTF8_IS_INVARIANT(*temp_ptr)) {
- /* XXX Since perl will never handle multi-byte locales, except
- * for UTF-8, we could break if we found a byte above latin1,
- * but perhaps the person intended to use one. */
- temp_ptr += UTF8SKIP(temp_ptr);
- }
- else {
- temp_ptr++;
- }
}
+ /* If we find something that started out to look like one of these
+ * constructs, but isn't, we continue below so that it can be checked
+ * for being a class name with a typo of '.' or '=' instead of a colon.
+ * */
}
/* Here, we think there is a possibility that a [: :] class was meant, and
}
if (warn_text) {
- if (posix_warnings != (AV **) -1) {
- *posix_warnings = warn_text;
+ if (posix_warnings) {
+ /* mortalize to avoid a leak with FATAL warnings */
+ *posix_warnings = (AV *) sv_2mortal((SV *) warn_text);
}
else {
- SV * msg;
- while ((msg = av_shift(warn_text)) != &PL_sv_undef) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP),
- "%s", SvPVX(msg));
- SvREFCNT_dec_NN(msg);
- }
SvREFCNT_dec_NN(warn_text);
}
}
* one */
return class_number + complement;
}
- else if (posix_warnings) {
+ else if (! check_only) {
/* Here, it is an unrecognized class. This is an error (unless the
* call is to check only, which we've already handled above) */
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
/* If it is a posix class, leave the parse pointer at the
* '[' to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
no_close:
/* We output the messages even if warnings are off, because we'll fail
* the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings) {
- SV * msg;
- while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
- SvREFCNT_dec_NN(msg);
- }
- SvREFCNT_dec_NN(posix_warnings);
+ if (posix_warnings && av_tindex(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
}
FAIL("Syntax error in (?[...])");
{
/* See if this is a [:posix:] class. */
bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL));
+ < handle_possible_posix(pRExC_state,
+ RExC_parse + 1,
+ NULL,
+ NULL,
+ TRUE /* checking only */));
/* If it is a posix class, leave the parse pointer at the '['
* to fool regclass() into thinking it is part of a
* '[[:posix:]]'. */
}
}
+STATIC void
+S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+{
+ /* If the final parameter is NULL, output the elements of the array given
+ * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
+ * pushed onto it, (creating if necessary) */
+
+ SV * msg;
+ const bool first_is_fatal = ! return_posix_warnings
+ && ckDEAD(packWARN(WARN_REGEXP));
+
+ PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+
+ while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
+ if (return_posix_warnings) {
+ if (! *return_posix_warnings) { /* mortalize to not leak if
+ warnings are fatal */
+ *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
+ }
+ av_push(*return_posix_warnings, msg);
+ }
+ else {
+ if (first_is_fatal) { /* Avoid leaking this */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ if (PASS2) {
+ SAVEFREESV(RExC_rx_sv);
+ }
+ }
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
+ }
+ }
+}
+
STATIC AV *
S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
bool optimizable, /* ? Allow a non-ANYOF return
node */
SV** ret_invlist, /* Return an inversion list, not a node */
- AV** posix_warnings
+ AV** return_posix_warnings
)
{
/* parse a bracketed class specification. Most of these will produce an
const SSize_t orig_size = RExC_size;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
- /* This variable is used to mark where in the input something that looks
- * like a POSIX construct ends. During the parse, when something looks
- * like it could be such a construct is encountered, it is checked for
- * being one, but not if we've already checked this area of the input.
- * Only after this position is reached do we check again */
- char *dont_check_for_posix_end = RExC_parse - 1;
+ /* This variable is used to mark where the end in the input is of something
+ * that looks like a POSIX construct but isn't. During the parse, when
+ * something looks like it could be such a construct is encountered, it is
+ * checked for being one, but not if we've already checked this area of the
+ * input. Only after this position is reached do we check again */
+ char *not_posix_region_end = RExC_parse - 1;
+
+ AV* posix_warnings = NULL;
+ const bool do_posix_warnings = return_posix_warnings
+ || (PASS2 && ckWARN(WARN_REGEXP));
GET_RE_DEBUG_FLAGS_DECL;
allow_multi_folds = FALSE;
#endif
- if (posix_warnings == NULL) {
- posix_warnings = (AV **) -1;
- }
-
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state,
(LOC)
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
- char *class_end;
- int maybe_class = handle_possible_posix(pRExC_state, RExC_parse,
- &class_end, NULL);
- if (maybe_class >= OOB_NAMEDCLASS) {
- dont_check_for_posix_end = class_end;
- if (PASS2 && posix_warnings == (AV **) -1) {
- SAVEFREESV(RExC_rx_sv);
- ckWARN4reg(class_end,
- "POSIX syntax [%c %c] belongs inside character classes%s",
- *RExC_parse, *RExC_parse,
- (maybe_class == OOB_NAMEDCLASS)
- ? ((POSIXCC_NOTYET(*RExC_parse))
- ? " (but this one isn't implemented)"
- : " (but this one isn't fully valid)")
- : ""
- );
- (void)ReREFCNT_inc(RExC_rx_sv);
- }
- }
+ int maybe_class = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ ¬_posix_region_end,
+ NULL,
+ TRUE /* checking only */);
+ if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
+ SAVEFREESV(RExC_rx_sv);
+ ckWARN4reg(not_posix_region_end,
+ "POSIX syntax [%c %c] belongs inside character classes%s",
+ *RExC_parse, *RExC_parse,
+ (maybe_class == OOB_NAMEDCLASS)
+ ? ((POSIXCC_NOTYET(*RExC_parse))
+ ? " (but this one isn't implemented)"
+ : " (but this one isn't fully valid)")
+ : ""
+ );
+ (void)ReREFCNT_inc(RExC_rx_sv);
+ }
}
/* If the caller wants us to just parse a single element, accomplish this
goto charclassloop;
while (1) {
+
+ if ( posix_warnings
+ && av_tindex(posix_warnings) >= 0
+ && RExC_parse > not_posix_region_end)
+ {
+ /* Warnings about posix class issues are considered tentative until
+ * we are far enough along in the parse that we can no longer
+ * change our mind, at which point we either output them or add
+ * them, if it has so specified, to what gets returned to the
+ * caller. This is done each time through the loop so that a later
+ * class won't zap them before they have been dealt with. */
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
if (RExC_parse >= stop_ptr) {
break;
}
value = UCHARAT(RExC_parse++);
if (value == '[') {
- namedclass = handle_possible_posix(pRExC_state, RExC_parse, &dont_check_for_posix_end, posix_warnings);
+ char * posix_class_end;
+ namedclass = handle_possible_posix(pRExC_state,
+ RExC_parse,
+ &posix_class_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ FALSE /* die if error */);
if (namedclass > OOB_NAMEDCLASS) {
- RExC_parse = dont_check_for_posix_end;
+
+ /* If there was an earlier attempt to parse this particular
+ * posix class, and it failed, it was a false alarm, as this
+ * successful one proves */
+ if ( posix_warnings
+ && av_tindex(posix_warnings) >= 0
+ && not_posix_region_end >= RExC_parse
+ && not_posix_region_end <= posix_class_end)
+ {
+ av_undef(posix_warnings);
+ }
+
+ RExC_parse = posix_class_end;
+ }
+ else if (namedclass == OOB_NAMEDCLASS) {
+ not_posix_region_end = posix_class_end;
}
else {
namedclass = OOB_NAMEDCLASS;
}
}
- else if ( RExC_parse - 1 > dont_check_for_posix_end
+ else if ( RExC_parse - 1 > not_posix_region_end
&& MAYBE_POSIXCC(value))
{
- (void) handle_possible_posix(pRExC_state, RExC_parse - 1, /* -1 because parse has already been advanced */
- &dont_check_for_posix_end, posix_warnings);
+ (void) handle_possible_posix(
+ pRExC_state,
+ RExC_parse - 1, /* -1 because parse has already been
+ advanced */
+ ¬_posix_region_end,
+ do_posix_warnings ? &posix_warnings : NULL,
+ TRUE /* checking only */);
}
else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
SV* invlist;
char* name;
char* base_name; /* name after any packages are stripped */
+ char* lookup_name = NULL;
const char * const colon_colon = "::";
/* Try to get the definition of the property into
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- name = savepv(Perl_form(aTHX_
- "%s%.*s%s\n",
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- ));
+ name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
+ if (FOLD) {
+ lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
+ }
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
SvREFCNT_dec(swash); /* Free any left-overs */
- swash = _core_swash_init("utf8", name, &PL_sv_undef,
+ swash = _core_swash_init("utf8",
+ (lookup_name)
+ ? lookup_name
+ : name,
+ &PL_sv_undef,
1, /* binary */
0, /* not tr/// */
NULL, /* No inversion list */
&swash_init_flags
);
+ if (lookup_name) {
+ Safefree(lookup_name);
+ }
if (! swash || ! (invlist = _get_swash_invlist(swash))) {
HV* curpkg = (IN_PERL_COMPILETIME)
? PL_curstash
name = savepvn(full_name, n);
}
}
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n",
(value == 'p' ? '+' : '!'),
- UTF8fARG(UTF, n, name));
+ (FOLD) ? "__" : "",
+ UTF8fARG(UTF, n, name),
+ (FOLD) ? "_i" : "");
has_user_defined_property = TRUE;
optimizable = FALSE; /* Will have to leave this an
ANYOF node */
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
+
+ if ( posix_warnings && av_tindex(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings,
+ return_posix_warnings);
+ }
+
/* If anything in the class expands to more than one character, we have to
* deal with them by building up a substitute parse string, and recursively
* calling reg() on it, instead of proceeding */
}
/* And, finally, add the above-the-bitmap stuff */
- if (nonbitmap_invlist) {
+ if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
SV* contents;
/* See if truncation size is overridden */
/* Accumulate the bit map into the unconditional match list */
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (BITMAP_TEST(bitmap, i)) {
- invlist = add_cp_to_invlist(invlist, i);
+ int start = i++;
+ for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
}
}