#endif
-static int
-S_deprecate_commaless_var_list(pTHX) {
- PL_expect = XTERM;
- deprecate("comma-less variable list");
- return REPORT(','); /* grandfather non-comma-format format */
-}
-
/*
* S_ao
*
*/
STATIC void
-S_missingterm(pTHX_ char *s)
+S_missingterm(pTHX_ char *s, STRLEN len)
{
char tmpbuf[UTF8_MAXBYTES + 1];
char q;
bool uni = FALSE;
SV *sv;
if (s) {
- char * const nl = strrchr(s,'\n');
- if (nl)
- *nl = '\0';
+ char * const nl = (char *) my_memrchr(s, '\n', len);
+ if (nl) {
+ *nl = '\0';
+ len = nl - s;
+ }
uni = UTF;
}
else if (PL_multi_close < 32) {
tmpbuf[1] = (char)toCTRL(PL_multi_close);
tmpbuf[2] = '\0';
s = tmpbuf;
+ len = 2;
}
else {
if (LIKELY(PL_multi_close < 256)) {
*tmpbuf = (char)PL_multi_close;
tmpbuf[1] = '\0';
+ len = 1;
}
else {
+ char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
+ *end = '\0';
+ len = end - tmpbuf;
uni = TRUE;
- *uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close) = 0;
}
s = tmpbuf;
}
- q = strchr(s,'"') ? '\'' : '"';
- sv = sv_2mortal(newSVpv(s,0));
+ q = memchr(s, '"', len) ? '\'' : '"';
+ sv = sv_2mortal(newSVpvn(s, len));
if (uni)
SvUTF8_on(sv);
- Perl_croak(aTHX_ "Can't find string terminator %c%" SVf
- "%c anywhere before EOF",q,SVfARG(sv),q);
+ Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c"
+ " anywhere before EOF", q, SVfARG(sv), q);
}
#include "feature.h"
Creates and initialises a new lexer/parser state object, supplying
a context in which to lex and parse from a new source of Perl code.
A pointer to the new state object is placed in L</PL_parser>. An entry
-is made on the save stack so that upon unwinding the new state object
+is made on the save stack so that upon unwinding, the new state object
will be destroyed and the former value of L</PL_parser> will be restored.
Nothing else need be done to clean up the parsing context.
{
const char *s = NULL;
yy_parser *parser, *oparser;
+
if (flags && flags & ~LEX_START_FLAGS)
Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
parser->lex_state = LEX_NORMAL;
parser->expect = XSTATE;
parser->rsfp = rsfp;
+ parser->recheck_utf8_validity = FALSE;
parser->rsfp_filters =
!(flags & LEX_START_SAME_FILTER) || !oparser
? NULL
if (line) {
STRLEN len;
+ const U8* first_bad_char_loc;
+
s = SvPV_const(line, len);
+
+ if ( SvUTF8(line)
+ && UNLIKELY(! is_utf8_string_loc((U8 *) s,
+ SvCUR(line),
+ &first_bad_char_loc)))
+ {
+ _force_out_malformed_utf8_message(first_bad_char_loc,
+ (U8 *) s + SvCUR(line),
+ 0,
+ 1 /* 1 means die */ );
+ NOT_REACHED; /* NOTREACHED */
+ }
+
parser->linestr = flags & LEX_START_COPIED
? SvREFCNT_inc_simple_NN(line)
: newSVpvn_flags(s, len, SvUTF8(line));
} else {
parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
}
+
parser->oldoldbufptr =
parser->oldbufptr =
parser->bufptr =
} else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
p++;
highhalf++;
- } else if (! UTF8_IS_INVARIANT(c)) {
- _force_out_malformed_utf8_message((U8 *) p, (U8 *) e,
- 0,
- 1 /* 1 means die */ );
- NOT_REACHED; /* NOTREACHED */
- }
+ } else assert(UTF8_IS_INVARIANT(c));
}
if (!highhalf)
goto plain_copy;
PL_parser->last_lop -= discard_len;
}
+void
+Perl_notify_parser_that_changed_to_utf8(pTHX)
+{
+ /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
+ * off to on. At compile time, this has the effect of entering a 'use
+ * utf8' section. This means that any input was not previously checked for
+ * UTF-8 (because it was off), but now we do need to check it, or our
+ * assumptions about the input being sane could be wrong, and we could
+ * segfault. This routine just sets a flag so that the next time we look
+ * at the input we do the well-formed UTF-8 check. If we aren't in the
+ * proper phase, there may not be a parser object, but if there is, setting
+ * the flag is harmless */
+
+ if (PL_parser) {
+ PL_parser->recheck_utf8_validity = TRUE;
+ }
+}
+
/*
=for apidoc Amx|bool|lex_next_chunk|U32 flags
STRLEN linestart_pos, last_uni_pos, last_lop_pos;
bool got_some_for_debugger = 0;
bool got_some;
- const U8* first_bad_char_loc;
if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
PL_parser->bufend = buf + new_bufend_pos;
PL_parser->bufptr = buf + bufptr_pos;
- if (UTF && ! is_utf8_string_loc((U8 *) PL_parser->bufptr,
- PL_parser->bufend - PL_parser->bufptr,
- &first_bad_char_loc))
- {
- _force_out_malformed_utf8_message(first_bad_char_loc,
- (U8 *) PL_parser->bufend,
- 0,
- 1 /* 1 means die */ );
- NOT_REACHED; /* NOTREACHED */
+ if (UTF) {
+ const U8* first_bad_char_loc;
+ if (UNLIKELY(! is_utf8_string_loc(
+ (U8 *) PL_parser->bufptr,
+ PL_parser->bufend - PL_parser->bufptr,
+ &first_bad_char_loc)))
+ {
+ _force_out_malformed_utf8_message(first_bad_char_loc,
+ (U8 *) PL_parser->bufend,
+ 0,
+ 1 /* 1 means die */ );
+ NOT_REACHED; /* NOTREACHED */
+ }
}
PL_parser->oldbufptr = buf + oldbufptr_pos;
if (s == bufend)
need_incline = 1;
else
- incline(s);
+ incline(s, bufend);
}
} else if (isSPACE(c)) {
s++;
if (!got_more)
break;
if (can_incline && need_incline && PL_parser->rsfp) {
- incline(s);
+ incline(s, bufend);
need_incline = 0;
}
} else if (!c) {
*/
bool
-Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn)
+Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
{
STRLEN len, origlen;
char *p;
origlen, UNI_DISPLAY_ISPRINT)
: pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
+ if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
+ SV *name2 = sv_2mortal(newSVsv(PL_curstname));
+ sv_catpvs(name2, "::");
+ sv_catsv(name2, (SV *)name);
+ name = name2;
+ }
+
if (proto_after_greedy_proto)
Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
"Prototype after '%c' for %" SVf " : %s",
*/
STATIC void
-S_incline(pTHX_ const char *s)
+S_incline(pTHX_ const char *s, const char *end)
{
const char *t;
const char *n;
PERL_ARGS_ASSERT_INCLINE;
+ assert(end >= s);
+
COPLINE_INC_WITH_HERELINES;
if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
&& s+1 == PL_bufend && *s == ';') {
return;
while (SPACE_OR_TAB(*s))
s++;
- if (strEQs(s, "line"))
- s += 4;
+ if (memBEGINs(s, (STRLEN) (end - s), "line"))
+ s += sizeof("line") - 1;
else
return;
if (SPACE_OR_TAB(*s))
return;
while (SPACE_OR_TAB(*s))
s++;
- if (*s == '"' && (t = strchr(s+1, '"'))) {
+ if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
s++;
e = t + 1;
}
S_check_uni(pTHX)
{
const char *s;
- const char *t;
if (PL_oldoldbufptr != PL_last_uni)
return;
s = PL_last_uni;
while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
s += UTF ? UTF8SKIP(s) : 1;
- if ((t = strchr(s, '(')) && t < PL_bufptr)
+ if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
return;
Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
if (check_keyword) {
char *s2 = PL_tokenbuf;
STRLEN len2 = len;
- if (allow_pack && len > 6 && strEQs(s2, "CORE::"))
- s2 += 6, len2 -= 6;
+ if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
+ s2 += sizeof("CORE::") - 1;
+ len2 -= sizeof("CORE::") - 1;
+ }
if (keyword(s2, len2, 0))
return start;
}
STRLEN len;
const char *start = SvPV_const(sv,len);
const char * const end = start + len;
- const bool utf = SvUTF8(sv) ? TRUE : FALSE;
+ const bool utf = cBOOL(SvUTF8(sv));
PERL_ARGS_ASSERT_STR_TO_VERSION;
/*
* S_tokeq
- * Tokenize a quoted string passed in as an SV. It finds the next
- * chunk, up to end of string or a backslash. It may make a new
- * SV containing that chunk (if HINT_NEW_STRING is on). It also
- * turns \\ into \.
+ * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
+ * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
+ * unchanged, and a new SV containing the modified input is returned.
*/
STATIC SV *
SV *cv;
SV *rv;
HV *stash;
- const U8* first_bad_char_loc;
const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */
PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
if (!SvCUR(res)) {
- Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
- "Unknown charname '' is deprecated");
- return res;
- }
-
- if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
- e - backslash_ptr,
- &first_bad_char_loc))
- {
- _force_out_malformed_utf8_message(first_bad_char_loc,
- (U8 *) PL_parser->bufend,
- 0,
- 0 /* 0 means don't die */ );
- yyerror_pv(Perl_form(aTHX_
- "Malformed UTF-8 character immediately after '%.*s'",
- (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
- SVf_UTF8);
- return NULL;
+ SvREFCNT_dec_NN(res);
+ /* diag_listed_as: Unknown charname '%s' */
+ yyerror("Unknown charname ''");
+ return NULL;
}
res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr,
SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
{
const char * const name = HvNAME(stash);
- if (HvNAMELEN(stash) == sizeof("_charnames")-1
- && strEQ(name, "_charnames")) {
+ if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
return res;
}
}
}
}
if (*(s-1) == ' ') {
+ /* diag_listed_as: charnames alias definitions may not contain
+ trailing white-space; marked by <-- HERE in %s
+ */
yyerror_pv(
Perl_form(aTHX_
"charnames alias definitions may not contain trailing "
const U8* first_bad_char_loc;
STRLEN len;
const char* const str = SvPV_const(res, len);
- if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) {
+ if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
+ &first_bad_char_loc)))
+ {
_force_out_malformed_utf8_message(first_bad_char_loc,
(U8 *) PL_parser->bufend,
0,
0 /* 0 means don't die */ );
+ /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
+ immediately after '%s' */
yyerror_pv(
Perl_form(aTHX_
"Malformed UTF-8 returned by %.*s immediately after '%.*s'",
/* The final %.*s makes sure that should the trailing NUL be missing
* that this print won't run off the end of the string */
+ /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
+ in \N{%s} */
yyerror_pv(
Perl_form(aTHX_
"Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
}
multi_spaces:
+ /* diag_listed_as: charnames alias definitions may not contain a
+ sequence of multiple spaces; marked by <-- HERE
+ in %s */
yyerror_pv(
Perl_form(aTHX_
"charnames alias definitions may not contain a sequence of "
} (end if backslash)
handle regular character
} (end while character to read)
-
+
*/
STATIC char *
bool didrange = FALSE; /* did we just finish a range? */
bool in_charclass = FALSE; /* within /[...]/ */
bool has_utf8 = FALSE; /* Output constant is UTF8 */
- bool has_above_latin1 = FALSE; /* does something require special
- handling in tr/// ? */
bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
UTF8? But, this can show as true
when the source isn't utf8, as for
STRLEN offset_to_max; /* The offset in the output to where the range
high-end character is temporarily placed */
+ /* Does something require special handling in tr/// ? This avoids extra
+ * work in a less likely case. As such, khw didn't feel it was worth
+ * adding any branches to the more mainline code to handle this, which
+ * means that this doesn't get set in some circumstances when things like
+ * \x{100} get expanded out. As a result there needs to be extra testing
+ * done in the tr code */
+ bool has_above_latin1 = FALSE;
+
/* Note on sizing: The scanned constant is placed into sv, which is
* initialized by newSV() assuming one byte of output for every byte of
* input. This routine expects newSV() to allocate an extra byte for a
* range, so for most cases we just drop down and handle the value
* as any other. There are two exceptions.
*
- * 1. A minus sign indicates that we are actually going to have
- * a range. In this case, skip the '-', set a flag, then drop
+ * 1. A hyphen indicates that we are actually going to have a
+ * range. In this case, skip the '-', set a flag, then drop
* down to handle what should be the end range value.
* 2. After we've handled that value, the next time through, that
* flag is set and we fix up the range.
if (! dorange) {
- /* Here, we don't think we're in a range. If we've processed
- * at least one character, then see if this next one is a '-',
- * indicating the previous one was the start of a range. But
- * don't bother if we're too close to the end for the minus to
- * mean that. */
- if (*s != '-' || s >= send - 1 || s == start) {
+ /* Here, we don't think we're in a range. If the new character
+ * is not a hyphen; or if it is a hyphen, but it's too close to
+ * either edge to indicate a range, or if we haven't output any
+ * characters yet then it's a regular character. */
+ if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv)) {
/* A regular character. Process like any other, but first
* clear any flags */
non_portable_endpoint = 0;
backslash_N = 0;
#endif
- /* The tests here and the following 'else' for being above
- * Latin1 suffice to find all such occurences in the
- * constant, except those added by a backslash escape
- * sequence, like \x{100}. And all those set
- * 'has_above_latin1' as appropriate */
+ /* The tests here for being above Latin1 and similar ones
+ * in the following 'else' suffice to find all such
+ * occurences in the constant, except those added by a
+ * backslash escape sequence, like \x{100}. Mostly, those
+ * set 'has_above_latin1' as appropriate */
if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
has_above_latin1 = TRUE;
}
/* Drops down to generic code to process current byte */
}
- else {
+ else { /* Is a '-' in the context where it means a range */
if (didrange) { /* Something like y/A-C-Z// */
- Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+ Perl_croak(aTHX_ "Ambiguous range in transliteration"
+ " operator");
}
dorange = TRUE;
- s++; /* Skip past the minus */
+ s++; /* Skip past the hyphen */
/* d now points to where the end-range character will be
* placed. Save it so won't have to go finding it later,
if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
has_above_latin1 = TRUE;
}
+
+ /* Drops down to generic code to process current byte */
}
} /* End of not a range */
else {
* 'd' points to just beyond the range end in the 'sv' string,
* where we would next place something
* 'offset_to_max' is the offset in 'sv' at which the character
- * before 'd' begins.
+ * (the range's maximum end point) before 'd' begins.
*/
- const char * max_ptr = SvPVX_const(sv) + offset_to_max;
- const char * min_ptr;
+ char * max_ptr = SvPVX(sv) + offset_to_max;
+ char * min_ptr;
IV range_min;
IV range_max; /* last character in range */
- STRLEN save_offset;
STRLEN grow;
+ Size_t offset_to_min = 0;
+ Size_t extras = 0;
#ifdef EBCDIC
bool convert_unicode;
IV real_range_max = 0;
#endif
-
- /* Get the range-ends code point values. */
+ /* Get the code point values of the range ends. */
if (has_utf8) {
/* We know the utf8 is valid, because we just constructed
* it ourselves in previous loop iterations */
min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
+
+ /* This compensates for not all code setting
+ * 'has_above_latin1', so that we don't skip stuff that
+ * should be executed */
+ if (range_max > 255) {
+ has_above_latin1 = TRUE;
+ }
}
else {
min_ptr = max_ptr - 1;
range_max = * (U8*) max_ptr;
}
+ /* If the range is just a single code point, like tr/a-a/.../,
+ * that code point is already in the output, twice. We can
+ * just back up over the second instance and avoid all the rest
+ * of the work. But if it is a variant character, it's been
+ * counted twice, so decrement. (This unlikely scenario is
+ * special cased, like the one for a range of 2 code points
+ * below, only because the main-line code below needs a range
+ * of 3 or more to work without special casing. Might as well
+ * get it out of the way now.) */
+ if (UNLIKELY(range_max == range_min)) {
+ d = max_ptr;
+ if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
+ utf8_variant_count--;
+ }
+ goto range_done;
+ }
+
#ifdef EBCDIC
/* On EBCDIC platforms, we may have to deal with portable
* ranges. These happen if at least one range endpoint is a
* [A-Z] or [a-z], and both ends are literal characters,
* like 'A', and not like \x{C1} */
convert_unicode =
- cBOOL(backslash_N) /* \N{} forces Unicode, hence
- portable range */
- || ( ! non_portable_endpoint
- && (( isLOWER_A(range_min) && isLOWER_A(range_max))
- || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+ cBOOL(backslash_N) /* \N{} forces Unicode,
+ hence portable range */
+ || ( ! non_portable_endpoint
+ && (( isLOWER_A(range_min) && isLOWER_A(range_max))
+ || (isUPPER_A(range_min) && isUPPER_A(range_max))));
if (convert_unicode) {
/* Special handling is needed for these portable ranges.
- * They are defined to all be in Unicode terms, which
- * include all Unicode code points between the end points.
+ * They are defined to be in Unicode terms, which includes
+ * all the Unicode code points between the end points.
* Convert to Unicode to get the Unicode range. Later we
* will convert each code point in the range back to
* native. */
range_max = UNI_TO_NATIVE(range_max);
}
#endif
-
/* Use the characters themselves for the error message if
* ASCII printables; otherwise some visible representation
* of them */
}
#ifdef EBCDIC
else if (convert_unicode) {
- /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
Perl_croak(aTHX_
- "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
- " in transliteration operator",
- range_min, range_max);
+ "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
+ UVXf "}\" in transliteration operator",
+ range_min, range_max);
}
#endif
else {
- /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
Perl_croak(aTHX_
- "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
- " in transliteration operator",
- range_min, range_max);
+ "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
+ " in transliteration operator",
+ range_min, range_max);
}
}
+ /* If the range is exactly two code points long, they are
+ * already both in the output */
+ if (UNLIKELY(range_min + 1 == range_max)) {
+ goto range_done;
+ }
+
+ /* Here the range contains at least 3 code points */
+
if (has_utf8) {
/* If everything in the transliteration is below 256, we
* can avoid special handling later. A translation table
- * of each of those bytes is created. And so we expand out
- * all ranges to their constituent code points. But if
- * we've encountered something above 255, the expanding
- * won't help, so skip doing that. But if it's EBCDIC, we
- * may have to look at each character below 256 if we have
- * to convert to/from Unicode values */
+ * for each of those bytes is created by op.c. So we
+ * expand out all ranges to their constituent code points.
+ * But if we've encountered something above 255, the
+ * expanding won't help, so skip doing that. But if it's
+ * EBCDIC, we may have to look at each character below 256
+ * if we have to convert to/from Unicode values */
if ( has_above_latin1
#ifdef EBCDIC
&& (range_min > 255 || ! convert_unicode)
/* Move the high character one byte to the right; then
* insert between it and the range begin, an illegal
* byte which serves to indicate this is a range (using
- * a '-' could be ambiguous). */
+ * a '-' would be ambiguous). */
char *e = d++;
while (e-- > max_ptr) {
*(e + 1) = *e;
}
/* Here we need to expand out the string to contain each
- * character in the range. Grow the output to handle this */
+ * character in the range. Grow the output to handle this.
+ * For non-UTF8, we need a byte for each code point in the
+ * range, minus the three that we've already allocated for: the
+ * hyphen, the min, and the max. For UTF-8, we need this
+ * plus an extra byte for each code point that occupies two
+ * bytes (is variant) when in UTF-8 (except we've already
+ * allocated for the end points, including if they are
+ * variants). For ASCII platforms and Unicode ranges on EBCDIC
+ * platforms, it's easy to calculate a precise number. To
+ * start, we count the variants in the range, which we need
+ * elsewhere in this function anyway. (For the case where it
+ * isn't easy to calculate, 'extras' has been initialized to 0,
+ * and the calculation is done in a loop further down.) */
+#ifdef EBCDIC
+ if (convert_unicode)
+#endif
+ {
+ /* This is executed unconditionally on ASCII, and for
+ * Unicode ranges on EBCDIC. Under these conditions, all
+ * code points above a certain value are variant; and none
+ * under that value are. We just need to find out how much
+ * of the range is above that value. We don't count the
+ * end points here, as they will already have been counted
+ * as they were parsed. */
+ if (range_min >= UTF_CONTINUATION_MARK) {
+
+ /* The whole range is made up of variants */
+ extras = (range_max - 1) - (range_min + 1) + 1;
+ }
+ else if (range_max >= UTF_CONTINUATION_MARK) {
- save_offset = min_ptr - SvPVX_const(sv);
+ /* Only the higher portion of the range is variants */
+ extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
+ }
- /* The base growth is the number of code points in the range */
- grow = range_max - range_min + 1;
- if (has_utf8) {
+ utf8_variant_count += extras;
+ }
+
+ /* The base growth is the number of code points in the range,
+ * not including the endpoints, which have already been sized
+ * for (and output). We don't subtract for the hyphen, as it
+ * has been parsed but not output, and the SvGROW below is
+ * based only on what's been output plus what's left to parse.
+ * */
+ grow = (range_max - 1) - (range_min + 1) + 1;
- /* But if the output is UTF-8, some of those characters may
- * need two bytes (since the maximum range value here is
- * 255, the max bytes per character is two). On ASCII
- * platforms, it's not much trouble to get an accurate
- * count of what's needed. But on EBCDIC, the ones that
- * need 2 bytes are scattered around, so just use a worst
- * case value instead of calculating for that platform. */
+ if (has_utf8) {
#ifdef EBCDIC
- grow *= 2;
-#else
- /* Only those above 127 require 2 bytes. This may be
- * everything in the range, or not */
- if (range_min > 127) {
+ /* In some cases in EBCDIC, we haven't yet calculated a
+ * precise amount needed for the UTF-8 variants. Just
+ * assume the worst case, that everything will expand by a
+ * byte */
+ if (! convert_unicode) {
grow *= 2;
}
- else if (range_max > 127) {
- grow += range_max - 127;
- }
+ else
#endif
+ {
+ /* Otherwise we know exactly how many variants there
+ * are in the range. */
+ grow += extras;
+ }
}
- /* Subtract 3 for the bytes that were already accounted for
- * (min, max, and the hyphen) */
- d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
+ /* Grow, but position the output to overwrite the range min end
+ * point, because in some cases we overwrite that */
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ offset_to_min = min_ptr - SvPVX_const(sv);
+
+ /* See Note on sizing above. */
+ d = offset_to_min + SvGROW(sv, SvCUR(sv)
+ + (send - s)
+ + grow
+ + 1 /* Trailing NUL */ );
+ /* Now, we can expand out the range. */
#ifdef EBCDIC
- /* Here, we expand out the range. */
if (convert_unicode) {
- IV i;
+ SSize_t i;
/* Recall that the min and max are now in Unicode terms, so
* we have to convert each character to its native
* equivalent */
if (has_utf8) {
for (i = range_min; i <= range_max; i++) {
- append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
- (U8 **) &d);
+ append_utf8_from_native_byte(
+ LATIN1_TO_NATIVE((U8) i),
+ (U8 **) &d);
}
}
else {
#endif
/* Always gets run for ASCII, and sometimes for EBCDIC. */
{
- IV i;
-
/* Here, no conversions are necessary, which means that the
* first character in the range is already in 'd' and
* valid, so we can skip overwriting it */
if (has_utf8) {
+ SSize_t i;
d += UTF8SKIP(d);
for (i = range_min + 1; i <= range_max; i++) {
append_utf8_from_native_byte((U8) i, (U8 **) &d);
}
}
else {
+ SSize_t i;
d++;
- for (i = range_min + 1; i <= range_max; i++) {
+ assert(range_min + 1 <= range_max);
+ for (i = range_min + 1; i < range_max; i++) {
+#ifdef EBCDIC
+ /* In this case on EBCDIC, we haven't calculated
+ * the variants. Do it here, as we go along */
+ if (! UVCHR_IS_INVARIANT(i)) {
+ utf8_variant_count++;
+ }
+#endif
*d++ = (char)i;
}
+
+ /* The range_max is done outside the loop so as to
+ * avoid having to special case not incrementing
+ * 'utf8_variant_count' on EBCDIC (it's already been
+ * counted when originally parsed) */
+ *d++ = (char) range_max;
}
}
#ifdef EBCDIC
- /* If the original range extended above 255, add in that portion. */
+ /* If the original range extended above 255, add in that
+ * portion. */
if (real_range_max) {
*d++ = (char) UTF8_TWO_BYTE_HI(0x100);
*d++ = (char) UTF8_TWO_BYTE_LO(0x100);
- if (real_range_max > 0x101)
- *d++ = (char) ILLEGAL_UTF8_BYTE;
- if (real_range_max > 0x100)
+ if (real_range_max > 0x100) {
+ if (real_range_max > 0x101) {
+ *d++ = (char) ILLEGAL_UTF8_BYTE;
+ }
d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
+ }
}
#endif
if (!esc)
in_charclass = FALSE;
}
-
- /* skip for regexp comments /(?#comment)/, except for the last
- * char, which will be done separately.
- * Stop on (?{..}) and friends */
-
+ /* skip for regexp comments /(?#comment)/, except for the last
+ * char, which will be done separately. Stop on (?{..}) and
+ * friends */
else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
if (s[2] == '#') {
while (s+1 < send && *s != ')')
break;
}
}
-
- /* likewise skip #-initiated comments in //x patterns */
+ /* likewise skip #-initiated comments in //x patterns */
else if (*s == '#'
&& PL_lex_inpat
&& !in_charclass
while (s < send && *s != '\n')
*d++ = *s++;
}
-
- /* no further processing of single-quoted regex */
+ /* no further processing of single-quoted regex */
else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
goto default_action;
- /* check for embedded arrays
- (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
- */
+ /* check for embedded arrays
+ * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
+ */
else if (*s == '@' && s[1]) {
if (UTF
? isIDFIRST_utf8_safe(s+1, send)
if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
break; /* in regexp, neither @+ nor @- are interpolated */
}
-
- /* check for embedded scalars. only stop if we're sure it's a
- variable.
- */
+ /* check for embedded scalars. only stop if we're sure it's a
+ * variable. */
else if (*s == '$') {
if (!PL_lex_inpat) /* not a regexp, so $ must be var */
break;
{
const char* error;
- bool valid = grok_bslash_o(&s, &uv, &error,
+ bool valid = grok_bslash_o(&s, PL_bufend,
+ &uv, &error,
TRUE, /* Output warning */
FALSE, /* Not strict */
TRUE, /* Output warnings for
UTF);
if (! valid) {
yyerror(error);
- continue;
+ uv = 0; /* drop through to ensure range ends are set */
}
goto NUM_ESCAPE_INSERT;
}
{
const char* error;
- bool valid = grok_bslash_x(&s, &uv, &error,
+ bool valid = grok_bslash_x(&s, PL_bufend,
+ &uv, &error,
TRUE, /* Output warning */
FALSE, /* Not strict */
TRUE, /* Output warnings for
UTF);
if (! valid) {
yyerror(error);
- continue;
+ uv = 0; /* drop through to ensure range ends are set */
}
}
NUM_ESCAPE_INSERT:
/* Insert oct or hex escaped character. */
-
+
/* Here uv is the ordinal of the next character being added */
if (UVCHR_IS_INVARIANT(uv)) {
*d++ = (char) uv;
* For non-patterns, the named characters are converted to
* their string equivalents. In patterns, named characters are
* not converted to their ultimate forms for the same reasons
- * that other escapes aren't. Instead, they are converted to
- * the \N{U+...} form to get the value from the charnames that
- * is in effect right now, while preserving the fact that it
- * was a named character, so that the regex compiler knows
- * this.
+ * that other escapes aren't (mainly that the ultimate
+ * character could be considered a meta-symbol by the regex
+ * compiler). Instead, they are converted to the \N{U+...}
+ * form to get the value from the charnames that is in effect
+ * right now, while preserving the fact that it was a named
+ * character, so that the regex compiler knows this.
*
* The structure of this section of code (besides checking for
* errors and upgrading to utf8) is:
s++;
if (*s != '{') {
yyerror("Missing braces on \\N{}");
+ *d++ = '\0';
continue;
}
s++;
/* If there is no matching '}', it is an error. */
- if (! (e = strchr(s, '}'))) {
+ if (! (e = (char *) memchr(s, '}', send - s))) {
if (! PL_lex_inpat) {
yyerror("Missing right brace on \\N{}");
} else {
yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
}
- continue;
+ yyquit(); /* Have exhausted the input. */
}
/* Here it looks like a named character */
"Invalid hexadecimal number in \\N{U+...}"
);
s = e + 1;
+ *d++ = '\0';
continue;
}
while (++s < e) {
" in transliteration operator",
/* +1 to include the "}" */
(int) (e + 1 - start), start));
+ *d++ = '\0';
goto end_backslash_N;
}
case 'c':
s++;
if (s < send) {
- *d++ = grok_bslash_c(*s++, 1);
+ *d++ = grok_bslash_c(*s, 1);
}
else {
yyerror("Missing control char name in \\c");
+ yyquit(); /* Are at end of input, no sense continuing */
}
#ifdef EBCDIC
non_portable_endpoint++;
#endif
- continue;
+ break;
/* printf-style backslashes, formfeeds, newlines, etc */
case 'b':
/* This is the one truly awful dwimmer necessary to conflate C and sed. */
STATIC int
-S_intuit_more(pTHX_ char *s)
+S_intuit_more(pTHX_ char *s, char *e)
{
PERL_ARGS_ASSERT_INTUIT_MORE;
/* this is terrifying, and it works */
int weight;
char seen[256];
- const char * const send = strchr(s,']');
+ const char * const send = (char *) memchr(s, ']', e - s);
unsigned char un_char, last_un_char;
char tmpbuf[sizeof PL_tokenbuf * 4];
weight -= seen[un_char] * 10;
if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
int len;
- char *tmp = PL_bufend;
- PL_bufend = (char*)send;
- scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
- PL_bufend = tmp;
+ scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
len = (int)strlen(tmpbuf);
if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
UTF ? SVf_UTF8 : 0, SVt_PV))
}
if (*start == '$') {
+ SSize_t start_off = start - SvPVX(PL_linestr);
if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
|| isUPPER(*PL_tokenbuf))
return 0;
- s = skipspace(s);
- PL_bufptr = start;
+ /* this could be $# */
+ if (isSPACE(*s))
+ s = skipspace(s);
+ PL_bufptr = SvPVX(PL_linestr) + start_off;
PL_expect = XREF;
return *s == '(' ? FUNCMETH : METHOD;
}
PL_parser->last_uni = buf + last_uni_pos;
if (PL_parser->last_lop)
PL_parser->last_lop = buf + last_lop_pos;
- SvLEN(linestr) = SvCUR(linestr);
- SvCUR(linestr) = s-SvPVX(linestr);
+ SvLEN_set(linestr, SvCUR(linestr));
+ SvCUR_set(linestr, s - SvPVX(linestr));
PL_parser->filtered = 1;
break;
}
Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
{
filter_t funcp;
+ I32 ret;
SV *datasv = NULL;
/* This API is bad. It should have been using unsigned int for maxlen.
Not sure if we want to change the API, but if not we should sanity
/* Call function. The function is expected to */
/* call "FILTER_READ(idx+1, buf_sv)" first. */
/* Return: <0:error, =0:eof, >0:not eof */
- return (*funcp)(aTHX_ idx, buf_sv, correct_length);
+ ENTER;
+ save_scalar(PL_errgv);
+ ret = (*funcp)(aTHX_ idx, buf_sv, correct_length);
+ LEAVE;
+ return ret;
}
STATIC char *
PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
- if (len == 11 && *pkgname == '_' && strEQ(pkgname, "__PACKAGE__"))
+ if (memEQs(pkgname, len, "__PACKAGE__"))
return PL_curstash;
if (len > 2
PERL_ARGS_ASSERT_TOKENIZE_USE;
if (PL_expect != XSTATE)
+ /* diag_listed_as: "use" not allowed in expression */
yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
is_use ? "use" : "no"));
PL_expect = XTERM;
GV *gv = NULL;
GV **gvp = NULL;
+ if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
+ const U8* first_bad_char_loc;
+ if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
+ PL_bufend - PL_bufptr,
+ &first_bad_char_loc)))
+ {
+ _force_out_malformed_utf8_message(first_bad_char_loc,
+ (U8 *) PL_bufend,
+ 0,
+ 1 /* 1 means die */ );
+ NOT_REACHED; /* NOTREACHED */
+ }
+ PL_parser->recheck_utf8_validity = FALSE;
+ }
DEBUG_T( {
SV* tmp = newSVpvs("");
PerlIO_printf(Perl_debug_log, "### %" IVdf ":LEX_%s/X%s %s\n",
}
else {
I32 tmp;
- if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3))
+ if ( memBEGINs(s, (STRLEN) (PL_bufend - s), "L\\u")
+ || memBEGINs(s, (STRLEN) (PL_bufend - s), "U\\l"))
+ {
tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */
+ }
if ((*s == 'L' || *s == 'U' || *s == 'F')
&& (strpbrk(PL_lex_casestack, "LUF")))
{
return yylex();
case LEX_INTERPENDMAYBE:
- if (intuit_more(PL_bufptr)) {
+ if (intuit_more(PL_bufptr, PL_bufend)) {
PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
break;
}
s = PL_bufend;
}
else {
+ int save_error_count = PL_error_count;
+
s = scan_const(PL_bufptr);
+
+ /* Set flag if this was a pattern and there were errors. op.c will
+ * refuse to compile a pattern with this flag set. Otherwise, we
+ * could get segfaults, etc. */
+ if (PL_lex_inpat && PL_error_count > save_error_count) {
+ ((PMOP*)PL_lex_inpat)->op_pmflags |= PMf_HAS_ERROR;
+ }
if (*s == '\\')
PL_lex_state = LEX_INTERPCASEMOD;
else
return yylex();
case LEX_FORMLINE:
+ assert(PL_lex_formbrack);
s = scan_formline(PL_bufptr);
if (!PL_lex_formbrack)
{
* as a var; e.g. ($, ...) would be seen as the var '$,'
*/
- char sigil;
+ U8 sigil;
s = skipspace(s);
sigil = *s++;
0, cBOOL(UTF), FALSE);
*dest = '\0';
assert(PL_tokenbuf[1]); /* we have a variable name */
+ }
+ else {
+ *PL_tokenbuf = 0;
+ PL_in_my = 0;
+ }
+
+ s = skipspace(s);
+ /* parse the = for the default ourselves to avoid '+=' etc being accepted here
+ * as the ASSIGNOP, and exclude other tokens that start with =
+ */
+ if (*s == '=' && (!s[1] || strchr("=~>", s[1]) == 0)) {
+ /* save now to report with the same context as we did when
+ * all ASSIGNOPS were accepted */
+ PL_oldbufptr = s;
+
+ ++s;
+ NEXTVAL_NEXTTOKE.ival = 0;
+ force_next(ASSIGNOP);
+ PL_expect = XTERM;
+ }
+ else if (*s == ',' || *s == ')') {
+ PL_expect = XOPERATOR;
+ }
+ else {
+ /* make sure the context shows the unexpected character and
+ * hopefully a bit more */
+ if (*s) ++s;
+ while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
+ s++;
+ PL_bufptr = s; /* for error reporting */
+ yyerror("Illegal operator following parameter in a subroutine signature");
+ PL_in_my = 0;
+ }
+ if (*PL_tokenbuf) {
NEXTVAL_NEXTTOKE.ival = sigil;
force_next('p'); /* force a signature pending identifier */
}
- else
- PL_in_my = 0;
- PL_expect = XOPERATOR;
break;
case ')':
switch (*s) {
default:
if (UTF) {
- if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) {
- _force_out_malformed_utf8_message((U8 *) s, (U8 *) PL_bufend,
- 0,
- 1 /* 1 means die */ );
- NOT_REACHED; /* NOTREACHED */
- }
if (isIDFIRST_utf8_safe(s, PL_bufend)) {
goto keylookup;
}
else {
c = Perl_form(aTHX_ "\\x%02X", (unsigned char)*s);
}
- len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart);
- if (len > UNRECOGNIZED_PRECEDE_COUNT) {
- d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - UNRECOGNIZED_PRECEDE_COUNT;
- } else {
+
+ if (s >= PL_linestart) {
d = PL_linestart;
}
+ else {
+ /* somehow (probably due to a parse failure), PL_linestart has advanced
+ * pass PL_bufptr, get a reasonable beginning of line
+ */
+ d = s;
+ while (d > SvPVX(PL_linestr) && d[-1] && d[-1] != '\n')
+ --d;
+ }
+ len = UTF ? Perl_utf8_length(aTHX_ (U8 *) d, (U8 *) s) : (STRLEN) (s - d);
+ if (len > UNRECOGNIZED_PRECEDE_COUNT) {
+ d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)d) : s - UNRECOGNIZED_PRECEDE_COUNT;
+ }
+
Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %" UTF8f "<-- HERE near column %d", c,
UTF8fARG(UTF, (s - d), d),
(int) len + 1);
sv_catpvs(PL_linestr,"chomp;");
if (PL_minus_a) {
if (PL_minus_F) {
- if ((*PL_splitstr == '/' || *PL_splitstr == '\''
- || *PL_splitstr == '"')
- && strchr(PL_splitstr + 1, *PL_splitstr))
+ if ( ( *PL_splitstr == '/'
+ || *PL_splitstr == '\''
+ || *PL_splitstr == '"')
+ && strchr(PL_splitstr + 1, *PL_splitstr))
+ {
+ /* strchr is ok, because -F pattern can't contain
+ * embeddded NULs */
Perl_sv_catpvf(aTHX_ PL_linestr, "our @F=split(%s);", PL_splitstr);
+ }
else {
/* "q\0${splitstr}\0" is legal perl. Yes, even NUL
bytes can be used as quoting characters. :-) */
}
do {
fake_eof = 0;
- bof = PL_rsfp ? TRUE : FALSE;
+ bof = cBOOL(PL_rsfp);
if (0) {
fake_eof:
fake_eof = LEX_FAKE_EOF;
/* If it looks like the start of a BOM or raw UTF-16,
* check if it in fact is. */
if (bof && PL_rsfp
- && (*s == 0
+ && ( *s == 0
|| *(U8*)s == BOM_UTF8_FIRST_BYTE
- || *(U8*)s >= 0xFE
- || s[1] == 0))
+ || *(U8*)s >= 0xFE
+ || s[1] == 0))
{
Off_t offset = (IV)PerlIO_tell(PL_rsfp);
bof = (offset == (Off_t)SvCUR(PL_linestr));
}
if (PL_parser->in_pod) {
/* Incest with pod. */
- if (*s == '=' && strEQs(s, "=cut") && !isALPHA(s[4])) {
+ if ( memBEGINPs(s, (STRLEN) (PL_bufend - s), "=cut")
+ && !isALPHA(s[4]))
+ {
SvPVCLEAR(PL_linestr);
PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
}
}
if (PL_rsfp || PL_parser->filtered)
- incline(s);
+ incline(s, PL_bufend);
} while (PL_parser->in_pod);
PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
d = instr(s,"perl -");
if (!d) {
d = instr(s,"perl");
- if (d && d[4] == '6')
- d = NULL;
#if defined(DOSISH)
/* avoid getting into infinite loops when shebang
* line contains "Perl" rather than "perl" */
&& !PL_rsfp && !PL_parser->filtered) {
/* handle eval qq[#line 1 "foo"\n ...] */
CopLINE_dec(PL_curcop);
- incline(s);
+ incline(s, PL_bufend);
}
d = s;
while (d < PL_bufend && *d != '\n')
d++;
if (d < PL_bufend)
d++;
- else if (d > PL_bufend)
- /* Found by Ilya: feed random input to Perl. */
- Perl_croak(aTHX_ "panic: input overflow, %p > %p",
- d, PL_bufend);
s = d;
if (in_comment && d == PL_bufend
&& PL_lex_state == LEX_INTERPNORMAL
&& PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
&& SvEVALED(PL_lex_repl) && d[-1] == '}') s--;
else
- incline(s);
+ incline(s, PL_bufend);
if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
PL_lex_state = LEX_FORMLINE;
force_next(FORMRBRACK);
{
s++;
if (s < PL_bufend)
- incline(s);
+ incline(s, PL_bufend);
}
- else if (s > PL_bufend)
- /* Found by Ilya: feed random input to Perl. */
- Perl_croak(aTHX_ "panic: input overflow");
}
goto retry;
case '-':
while (s < PL_bufend && SPACE_OR_TAB(*s))
s++;
- if (strEQs(s,"=>")) {
+ if (memBEGINs(s, (STRLEN) (PL_bufend - s), "=>")) {
s = force_word(PL_bufptr,BAREWORD,FALSE,FALSE);
DEBUG_T( { printbuf("### Saw unary minus before =>, forcing word %s\n", s); } );
OPERATOR('-'); /* unary minus */
}
else if (PL_expect == XPOSTDEREF) POSTDEREF('%');
PL_tokenbuf[0] = '%';
- s = scan_ident(s, PL_tokenbuf + 1,
- sizeof PL_tokenbuf - 1, FALSE);
+ s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
pl_yylval.ival = 0;
if (!PL_tokenbuf[1]) {
PREREF('%');
}
- if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
+ if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
+ && intuit_more(s, PL_bufend)) {
if (*s == '[')
PL_tokenbuf[0] = '@';
}
PL_lex_stuff = NULL;
}
else {
- if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) {
- sv_free(sv);
- if (PL_in_my == KEY_our) {
- deprecate_disappears_in("5.28",
- "Attribute \"unique\" is deprecated");
- }
- else
- Perl_croak(aTHX_ "The 'unique' attribute may only be applied to 'our' variables");
- }
-
/* NOTE: any CV attrs applied here need to be part of
the CVf_BUILTIN_ATTRS define in cv.h! */
- else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "lvalue", len)) {
+ if (!PL_in_my && memEQs(SvPVX(sv), len, "lvalue")) {
sv_free(sv);
CvLVALUE_on(PL_compcv);
}
- else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) {
- sv_free(sv);
- deprecate_disappears_in("5.28",
- "Attribute \"locked\" is deprecated");
- }
- else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
+ else if (!PL_in_my && memEQs(SvPVX(sv), len, "method")) {
sv_free(sv);
CvMETHOD_on(PL_compcv);
}
- else if (!PL_in_my && len == 5
- && strnEQ(SvPVX(sv), "const", len))
+ else if (!PL_in_my && memEQs(SvPVX(sv), len, "const"))
{
sv_free(sv);
Perl_ck_warner_d(aTHX_
PL_expect = XTERM;
break;
}
- if (strEQs(s, "sub")) {
+ if (memBEGINs(s, (STRLEN) (PL_bufend - s), "sub")) {
+ PL_bufptr = s;
d = s + 3;
d = skipspace(d);
+ s = PL_bufptr;
if (*d == ':') {
PL_expect = XTERM;
break;
if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
TOKEN(0);
rightbracket:
+ assert(s != PL_bufend);
s++;
if (PL_lex_brackets <= 0)
/* diag_listed_as: Unmatched right %s bracket */
return yylex(); /* ignore fake brackets */
}
force_next(formbrack ? '.' : '}');
- if (formbrack) LEAVE;
+ if (formbrack) LEAVE_with_name("lex_format");
if (formbrack == 2) { /* means . where arguments were expected */
force_next(';');
TOKEN(FORMRBRACK);
}
PL_tokenbuf[0] = '&';
- s = scan_ident(s - 1, PL_tokenbuf + 1,
- sizeof PL_tokenbuf - 1, TRUE);
+ s = scan_ident(s - 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE);
pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
if (PL_tokenbuf[1]) {
force_ident_maybe_lex('&');
{
const char tmp = *s++;
if (tmp == '=') {
- if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "=====")) {
+ if ( (s == PL_linestart+2 || s[-3] == '\n')
+ && memBEGINs(s, (STRLEN) (PL_bufend - s), "====="))
+ {
s = vcs_conflict_marker(s + 5);
goto retry;
}
&& isALPHA(tmp)
&& (s == PL_linestart+1 || s[-2] == '\n') )
{
- if ((PL_in_eval && !PL_rsfp && !PL_parser->filtered)
- || PL_lex_state != LEX_NORMAL) {
+ if ( (PL_in_eval && !PL_rsfp && !PL_parser->filtered)
+ || PL_lex_state != LEX_NORMAL)
+ {
d = PL_bufend;
while (s < d) {
if (*s++ == '\n') {
- incline(s);
- if (strEQs(s,"=cut")) {
- s = strchr(s,'\n');
+ incline(s, PL_bufend);
+ if (memBEGINs(s, (STRLEN) (PL_bufend - s), "=cut"))
+ {
+ s = (char *) memchr(s,'\n', d - s);
if (s)
s++;
else
s = d;
- incline(s);
+ incline(s, PL_bufend);
goto retry;
}
}
t++;
if (*t == '\n' || *t == '#') {
formbrack = 1;
- ENTER;
+ ENTER_with_name("lex_format");
SAVEI8(PL_parser->form_lex_state);
SAVEI32(PL_lex_formbrack);
PL_parser->form_lex_state = PL_lex_state;
OPERATOR('!');
case '<':
if (PL_expect != XOPERATOR) {
- if (s[1] != '<' && !strchr(s,'>'))
+ if (s[1] != '<' && !memchr(s,'>', PL_bufend - s))
check_uni();
if (s[1] == '<' && s[2] != '>') {
- if ((s == PL_linestart || s[-1] == '\n') && strEQs(s+2, "<<<<<")) {
+ if ( (s == PL_linestart || s[-1] == '\n')
+ && memBEGINs(s+2, (STRLEN) (PL_bufend - (s+2)), "<<<<<"))
+ {
s = vcs_conflict_marker(s + 7);
goto retry;
}
{
char tmp = *s++;
if (tmp == '<') {
- if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, "<<<<<")) {
+ if ( (s == PL_linestart+2 || s[-3] == '\n')
+ && memBEGINs(s, (STRLEN) (PL_bufend - s), "<<<<<"))
+ {
s = vcs_conflict_marker(s + 5);
goto retry;
}
{
const char tmp = *s++;
if (tmp == '>') {
- if ((s == PL_linestart+2 || s[-3] == '\n') && strEQs(s, ">>>>>")) {
+ if ( (s == PL_linestart+2 || s[-3] == '\n')
+ && memBEGINs(s, (STRLEN) (PL_bufend - s), ">>>>>"))
+ {
s = vcs_conflict_marker(s + 5);
goto retry;
}
case '$':
CLINE;
- if (PL_expect == XOPERATOR) {
- if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
- return deprecate_commaless_var_list();
- }
- }
- else if (PL_expect == XPOSTDEREF) {
+ if (PL_expect == XPOSTDEREF) {
if (s[1] == '#') {
s++;
POSTDEREF(DOLSHARP);
}
PL_tokenbuf[0] = '$';
- s = scan_ident(s, PL_tokenbuf + 1,
- sizeof PL_tokenbuf - 1, FALSE);
+ s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
if (PL_expect == XOPERATOR) {
d = s;
if (PL_bufptr > s) {
if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
s = skipspace(s);
- if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
- && intuit_more(s)) {
+ if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
+ && intuit_more(s, PL_bufend)) {
if (*s == '[') {
PL_tokenbuf[0] = '@';
if (ckWARN(WARN_SYNTAX)) {
else if (*s == '{') {
char *t;
PL_tokenbuf[0] = '%';
- if (strEQ(PL_tokenbuf+1, "SIG") && ckWARN(WARN_SYNTAX)
- && (t = strchr(s, '}')) && (t = strchr(t, '=')))
- {
- char tmpbuf[sizeof PL_tokenbuf];
- do {
- t++;
- } while (isSPACE(*t));
- if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
- STRLEN len;
- t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
- &len);
- while (isSPACE(*t))
- t++;
- if ( *t == ';'
- && get_cvn_flags(tmpbuf, len, UTF
- ? SVf_UTF8
- : 0))
- {
- Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
- "You need to quote \"%" UTF8f "\"",
- UTF8fARG(UTF, len, tmpbuf));
- }
- }
- }
+ if ( strEQ(PL_tokenbuf+1, "SIG")
+ && ckWARN(WARN_SYNTAX)
+ && (t = (char *) memchr(s, '}', PL_bufend - s))
+ && (t = (char *) memchr(t, '=', PL_bufend - t)))
+ {
+ char tmpbuf[sizeof PL_tokenbuf];
+ do {
+ t++;
+ } while (isSPACE(*t));
+ if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
+ STRLEN len;
+ t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
+ &len);
+ while (isSPACE(*t))
+ t++;
+ if ( *t == ';'
+ && get_cvn_flags(tmpbuf, len, UTF
+ ? SVf_UTF8
+ : 0))
+ {
+ Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
+ "You need to quote \"%" UTF8f "\"",
+ UTF8fARG(UTF, len, tmpbuf));
+ }
+ }
+ }
}
}
}
if (PL_lex_state == LEX_NORMAL)
s = skipspace(s);
- if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) && intuit_more(s)) {
+ if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
+ && intuit_more(s, PL_bufend))
+ {
if (*s == '{')
PL_tokenbuf[0] = '%';
TERM(THING);
case '\'':
- if ( PL_expect == XOPERATOR
- && (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack))
- return deprecate_commaless_var_list();
-
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
COPLINE_SET_FROM_MULTI_END;
DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
if (PL_expect == XOPERATOR) {
TERM(sublex_start());
case '"':
- if ( PL_expect == XOPERATOR
- && (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack))
- return deprecate_commaless_var_list();
-
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
DEBUG_T( {
if (s)
no_op("String",s);
}
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
pl_yylval.ival = OP_CONST;
/* FIXME. I think that this can be const if char *d is replaced by
more localised variables. */
if (PL_expect == XOPERATOR)
no_op("Backticks",s);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
pl_yylval.ival = OP_BACKTICK;
TERM(sublex_start());
/* x::* is just a word, unless x is "CORE" */
if (!anydelim && *s == ':' && s[1] == ':') {
- if (strEQ(PL_tokenbuf, "CORE")) goto case_KEY_CORE;
+ if (memEQs(PL_tokenbuf, len, "CORE")) goto case_KEY_CORE;
goto just_a_word;
}
else { /* no override */
tmp = -tmp;
if (tmp == KEY_dump) {
- Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
- "dump() better written as CORE::dump()");
+ Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
+ "dump() better written as CORE::dump(). "
+ "dump() will no longer be available "
+ "in Perl 5.30");
}
gv = NULL;
gvp = 0;
orig_keyword = 0;
lex = 0;
off = 0;
+ /* FALLTHROUGH */
default: /* not a keyword */
just_a_word: {
int pkgname = 0;
== OA_FILEREF))
{
bool immediate_paren = *s == '(';
+ SSize_t s_off;
/* (Now we can afford to cross potential line boundary.) */
s = skipspace(s);
+ /* intuit_method() can indirectly call lex_next_chunk(),
+ * invalidating s
+ */
+ s_off = s - SvPVX(PL_linestr);
/* Two barewords in a row may indicate method call. */
if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
|| *s == '$')
&& (tmp = intuit_method(s, lex ? NULL : sv, cv)))
{
+ /* the code at method: doesn't use s */
goto method;
}
+ s = SvPVX(PL_linestr) + s_off;
/* If not a declared subroutine, it's an indirect object. */
/* (But it's an indir obj regardless for sort.) */
*PL_tokenbuf = '&';
d = scan_word(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
1, &len);
- if (len && (len != 4 || strNE(PL_tokenbuf+1, "CORE"))
+ if (len && memNEs(PL_tokenbuf+1, len, "CORE")
&& !keyword(PL_tokenbuf + 1, len, 0)) {
SSize_t off = s-SvPVX(PL_linestr);
d = skipspace(d);
case KEY_exists:
UNI(OP_EXISTS);
-
+
case KEY_exit:
UNI(OP_EXIT);
&& isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
{
char *p = s;
+ SSize_t s_off = s - SvPVX(PL_linestr);
- if ((PL_bufend - p) >= 3
- && strEQs(p, "my") && isSPACE(*(p + 2)))
+ if ( memBEGINPs(p, (STRLEN) (PL_bufend - p), "my")
+ && isSPACE(*(p + 2)))
{
- p += 2;
+ p += 2;
+ }
+ else if ( memBEGINPs(p, (STRLEN) (PL_bufend - p), "our")
+ && isSPACE(*(p + 3)))
+ {
+ p += 3;
}
- else if ((PL_bufend - p) >= 4
- && strEQs(p, "our") && isSPACE(*(p + 3)))
- p += 3;
+
p = skipspace(p);
/* skip optional package name, as in "for my abc $x (..)" */
if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) {
}
if (*p != '$' && *p != '\\')
Perl_croak(aTHX_ "Missing $ on loop variable");
+
+ /* The buffer may have been reallocated, update s */
+ s = SvPVX(PL_linestr) + s_off;
}
OPERATOR(FOR);
case KEY_last:
LOOPX(OP_LAST);
-
+
case KEY_lc:
UNI(OP_LC);
s = skipspace(s);
if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
- if (len == 3 && strEQs(PL_tokenbuf, "sub"))
+ if (memEQs(PL_tokenbuf, len, "sub"))
goto really_sub;
PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
if (!PL_in_my_stash) {
case KEY_pos:
UNIDOR(OP_POS);
-
+
case KEY_pack:
LOP(OP_PACK,XTERM);
case KEY_q:
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
COPLINE_SET_FROM_MULTI_END;
pl_yylval.ival = OP_CONST;
TERM(sublex_start());
OP *words = NULL;
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
COPLINE_SET_FROM_MULTI_END;
PL_expect = XOPERATOR;
if (SvCUR(PL_lex_stuff)) {
case KEY_qq:
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
pl_yylval.ival = OP_STRINGIFY;
if (SvIVX(PL_lex_stuff) == '\'')
SvIV_set(PL_lex_stuff, 0); /* qq'$foo' should interpolate */
case KEY_qx:
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s)
- missingterm(NULL);
+ missingterm(NULL, 0);
pl_yylval.ival = OP_BACKTICK;
TERM(sublex_start());
case KEY_chomp:
UNI(OP_CHOMP);
-
+
case KEY_scalar:
UNI(OP_SCALAR);
COPLINE_SET_FROM_MULTI_END;
if (!s)
Perl_croak(aTHX_ "Prototype not terminated");
- (void)validate_proto(PL_subname, PL_lex_stuff, ckWARN(WARN_ILLEGALPROTO));
+ (void)validate_proto(PL_subname, PL_lex_stuff,
+ ckWARN(WARN_ILLEGALPROTO), 0);
have_proto = TRUE;
s = skipspace(s);
if (PL_in_my) {
if (PL_in_my == KEY_our) { /* "our" is merely analogous to "my" */
if (has_colon)
+ /* diag_listed_as: No package name allowed for variable %s
+ in "our" */
yyerror_pv(Perl_form(aTHX_ "No package name allowed for "
- "variable %s in \"our\"",
+ "%se %s in \"our\"",
+ *PL_tokenbuf=='&' ?"subroutin":"variabl",
PL_tokenbuf), UTF ? SVf_UTF8 : 0);
tmp = allocmy(PL_tokenbuf, tokenbuf_len, UTF ? SVf_UTF8 : 0);
}
s++;
if (*s == ',') {
GV* gv;
- PADOFFSET off;
if (keyword(w, s - w, 0))
return;
if (gv && GvCVu(gv))
return;
if (s - w <= 254) {
+ PADOFFSET off;
char tmpbuf[256];
Copy(w, tmpbuf+1, s - w, char);
*tmpbuf = '&';
|| ! SvOK(*cvp))
{
char *msg;
-
+
/* Here haven't found what we're looking for. If it is charnames,
* perhaps it needs to be loaded. Try doing that before giving up */
if (*key == 'c') {
|| isDIGIT_A((U8)s[1])
|| s[1] == '$'
|| s[1] == '{'
- || strEQs(s+1,"::")) )
+ || memBEGINs(s+1, (STRLEN) (PL_bufend - (s+1)), "::")) )
{
/* Dereferencing a value in a scalar variable.
The alternatives are different syntaxes for a scalar variable.
bool skip;
char *s2;
/* If we were processing {...} notation then... */
- if (isIDFIRST_lazy_if_safe(d, e, is_utf8)) {
- /* if it starts as a valid identifier, assume that it is one.
- (the later check for } being at the expected point will trap
- cases where this doesn't pan out.) */
- d += is_utf8 ? UTF8SKIP(d) : 1;
- parse_ident(&s, &d, e, 1, is_utf8, TRUE);
- *d = '\0';
+ if (isIDFIRST_lazy_if_safe(d, e, is_utf8)
+ || (!isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
+ && isWORDCHAR(*s))
+ ) {
+ /* note we have to check for a normal identifier first,
+ * as it handles utf8 symbols, and only after that has
+ * been ruled out can we look at the caret words */
+ if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) {
+ /* if it starts as a valid identifier, assume that it is one.
+ (the later check for } being at the expected point will trap
+ cases where this doesn't pan out.) */
+ d += is_utf8 ? UTF8SKIP(d) : 1;
+ parse_ident(&s, &d, e, 1, is_utf8, TRUE);
+ *d = '\0';
+ }
+ else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
+ d++;
+ while (isWORDCHAR(*s) && d < e) {
+ *d++ = *s++;
+ }
+ if (d >= e)
+ Perl_croak(aTHX_ "%s", ident_too_long);
+ *d = '\0';
+ }
tmp_copline = CopLINE(PL_curcop);
if (s < PL_bufend && isSPACE(*s)) {
s = skipspace(s);
}
if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
- /* ${foo[0]} and ${foo{bar}} notation. */
+ /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */
if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) {
const char * const brack =
(const char *)
return s;
}
}
- /* Handle extended ${^Foo} variables
- * 1999-02-27 mjd-perl-patch@plover.com */
- else if (! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
- && isWORDCHAR(*s))
- {
- d++;
- while (isWORDCHAR(*s) && d < e) {
- *d++ = *s++;
- }
- if (d >= e)
- Perl_croak(aTHX_ "%s", ident_too_long);
- *d = '\0';
- }
if ( !tmp_copline )
tmp_copline = CopLINE(PL_curcop);
- if ((skip = s < PL_bufend && isSPACE(*s)))
+ if ((skip = s < PL_bufend && isSPACE(*s))) {
/* Avoid incrementing line numbers or resetting PL_linestart,
in case we have to back up. */
+ STRLEN s_off = s - SvPVX(PL_linestr);
s2 = peekspace(s);
+ s = SvPVX(PL_linestr) + s_off;
+ }
else
s2 = s;
*dest = '\0';
}
}
- else if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets && !intuit_more(s))
+ else if ( PL_lex_state == LEX_INTERPNORMAL
+ && !PL_lex_brackets
+ && !intuit_more(s, PL_bufend))
PL_lex_state = LEX_INTERPEND;
return s;
}
PL_multi_end = 0;
pm->op_pmflags |= PMf_EVAL;
- while (es-- > 0) {
- if (es)
- sv_catpvs(repl, "eval ");
- else
- sv_catpvs(repl, "do ");
- }
- sv_catpvs(repl, "{");
+ for (; es > 1; es--) {
+ sv_catpvs(repl, "eval ");
+ }
+ sv_catpvs(repl, "do {");
sv_catsv(repl, PL_parser->lex_sub_repl);
sv_catpvs(repl, "}");
SvREFCNT_dec(PL_parser->lex_sub_repl);
PL_parser->lex_sub_repl = repl;
- es = 1;
}
else
term = '"';
if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
- deprecate("bare << to mean <<\"\"");
+ Perl_croak(aTHX_ "Use of bare << to mean <<\"\" is forbidden");
peek = s;
- while (
- isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
- {
+ while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) {
peek += UTF ? UTF8SKIP(peek) : 1;
}
len = (peek - s >= e - d) ? (e - d) : (peek - s);
len = d - PL_tokenbuf;
#ifndef PERL_STRICT_CR
- d = strchr(s, '\r');
+ d = (char *) memchr(s, '\r', PL_bufend - s);
if (d) {
char * const olds = s;
s = d;
/* No whitespace or all! */
if (backup == s || *backup == '\n') {
- Newxz(indent, indent_len + 1, char);
+ Newx(indent, indent_len + 1, char);
memcpy(indent, backup + 1, indent_len);
+ indent[indent_len] = 0;
s--; /* before our delimiter */
PL_parser->herelines--; /* this line doesn't count */
break;
/* All whitespace or none! */
if (backup == found || SPACE_OR_TAB(*backup)) {
- Newxz(indent, indent_len + 1, char);
+ Newx(indent, indent_len + 1, char);
memcpy(indent, backup, indent_len);
+ indent[indent_len] = 0;
SvREFCNT_dec(PL_linestr);
PL_linestr = linestr_save;
PL_linestart = SvPVX(linestr_save);
interminable:
SvREFCNT_dec(tmpstr);
CopLINE_set(PL_curcop, origline);
- missingterm(PL_tokenbuf + 1);
+ missingterm(PL_tokenbuf + 1, sizeof(PL_tokenbuf) - 1);
}
/* scan_inputsymbol
PERL_ARGS_ASSERT_SCAN_INPUTSYMBOL;
- end = strchr(s, '\n');
+ end = (char *) memchr(s, '\n', PL_bufend - s);
if (!end)
end = PL_bufend;
if (s[1] == '<' && s[2] == '>' && s[3] == '>') {
($*@) sub prototypes sub foo ($)
(stuff) sub attr parameters sub foo : attr(stuff)
<> readline or globs <FOO>, <>, <$fh>, or <*.c>
-
+
In most of these cases (all but <>, patterns and transliterate)
yylex() calls scan_str(). m// makes yylex() call scan_pat() which
calls scan_str(). s/// makes yylex() call scan_subst() which calls
const char * non_grapheme_msg = "Use of unassigned code point or"
" non-standalone grapheme for a delimiter"
" will be a fatal error starting in Perl"
- " v5.30";
+ " 5.30";
/* The only non-UTF character that isn't a stand alone grapheme is
* white-space, hence can't be a delimiter. So can skip for non-UTF-8 */
bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
}
/* terminate when run out of buffer (the for() condition), or
have found the terminator */
- else if (*s == term) {
- if (termlen == 1)
+ else if (*s == term) { /* First byte of terminator matches */
+ if (termlen == 1) /* If is the only byte, are done */
break;
- if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen))
+
+ /* If the remainder of the terminator matches, also are
+ * done, after checking that is a separate grapheme */
+ if ( s + termlen <= PL_bufend
+ && memEQ(s + 1, (char*)termstr + 1, termlen - 1))
+ {
if ( check_grapheme
&& UNLIKELY(! _is_grapheme((U8 *) start,
(U8 *) s,
"%s", non_grapheme_msg);
}
break;
+ }
}
- else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
+ else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) {
has_utf8 = TRUE;
+ }
+
*to = *s;
}
}
-
+
/* if the terminator isn't the same as the start character (e.g.,
matched brackets), we have to allow more in the quoting, and
be prepared for nested brackets.
else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
to[-1] = '\n';
#endif
-
+
/* if we're out of file, or a read fails, bail and reset the current
line marker so we can report where the unterminated string began
*/
bool floatit; /* boolean: int or float? */
const char *lastub = NULL; /* position of last underbar */
static const char* const number_too_long = "Number too long";
+ bool warned_about_underscore = 0;
+#define WARN_ABOUT_UNDERSCORE() \
+ do { \
+ if (!warned_about_underscore) { \
+ warned_about_underscore = 1; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
+ "Misplaced _ in number"); \
+ } \
+ } while(0)
/* Hexadecimal floating point.
*
* In many places (where we have quads and NV is IEEE 754 double)
}
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
/* _ are ignored -- but warned about if consecutive */
case '_':
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
break;
out:
/* final misplaced underbar check */
- if (s[-1] == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
- }
+ if (s[-1] == '_')
+ WARN_ABOUT_UNDERSCORE();
if (UNLIKELY(HEXFP_PEEK(s))) {
/* Do sloppy (on the underbars) but quick detection
*/
if (*s == '_') {
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
else {
}
/* final misplaced underbar check */
- if (lastub && s == lastub + 1) {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
- }
+ if (lastub && s == lastub + 1)
+ WARN_ABOUT_UNDERSCORE();
/* read a decimal portion if there is one. avoid
3..5 being interpreted as the number 3. followed
*d++ = *s++;
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s;
}
Perl_croak(aTHX_ "%s", number_too_long);
if (*s == '_') {
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s;
}
else
*d++ = *s;
}
/* fractional part ending in underbar? */
- if (s[-1] == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
- }
+ if (s[-1] == '_')
+ WARN_ABOUT_UNDERSCORE();
if (*s == '.' && isDIGIT(s[1])) {
/* oops, it's really a v-string, but without the "v" */
s = start;
|| UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p')))
&& strchr("+-0123456789_", s[1]))
{
- floatit = TRUE;
+ int exp_digits = 0;
+ const char *save_s = s;
+ char * save_d = d;
- /* regardless of whether user said 3E5 or 3e5, use lower 'e',
+ /* regardless of whether user said 3E5 or 3e5, use lower 'e',
ditto for p (hexfloats) */
if ((isALPHA_FOLD_EQ(*s, 'e'))) {
/* At least some Mach atof()s don't grok 'E' */
/* stray preinitial _ */
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
/* stray initial _ */
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
/* read digits of exponent */
while (isDIGIT(*s) || *s == '_') {
if (isDIGIT(*s)) {
+ ++exp_digits;
if (d >= e)
Perl_croak(aTHX_ "%s", number_too_long);
*d++ = *s++;
else {
if (((lastub && s == lastub + 1)
|| (!isDIGIT(s[1]) && s[1] != '_')))
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
}
+
+ if (!exp_digits) {
+ /* no exponent digits, the [eEpP] could be for something else,
+ * though in practice we don't get here for p since that's preparsed
+ * earlier, and results in only the 0xX being consumed, so behave similarly
+ * for decimal floats and consume only the D.DD, leaving the [eE] to the
+ * next token.
+ */
+ s = save_s;
+ d = save_d;
+ }
+ else {
+ floatit = TRUE;
+ }
}
STATIC char *
S_scan_formline(pTHX_ char *s)
{
- char *eol;
- char *t;
SV * const stuff = newSVpvs("");
bool needargs = FALSE;
bool eofmt = FALSE;
PERL_ARGS_ASSERT_SCAN_FORMLINE;
while (!needargs) {
+ char *eol;
if (*s == '.') {
- t = s+1;
+ char *t = s+1;
#ifdef PERL_STRICT_CR
while (SPACE_OR_TAB(*t))
t++;
if (!eol++)
eol = PL_bufend;
if (*s != '#') {
+ char *t;
for (t = s; t < eol; t++) {
if (*t == '~' && t[1] == '~' && SvCUR(stuff)) {
needargs = FALSE;
if (!got_some)
break;
}
- incline(s);
+ incline(s, PL_bufend);
}
enough:
if (!SvCUR(stuff) || needargs)
return 0;
}
+void
+Perl_abort_execution(pTHX_ const char * const msg, const char * const name)
+{
+ PERL_ARGS_ASSERT_ABORT_EXECUTION;
+
+ if (PL_minus_c)
+ Perl_croak(aTHX_ "%s%s had compilation errors.\n", msg, name);
+ else {
+ Perl_croak(aTHX_
+ "%sExecution of %s aborted due to compilation errors.\n", msg, name);
+ }
+ NOT_REACHED; /* NOTREACHED */
+}
+
+void
+Perl_yyquit(pTHX)
+{
+ /* Called, after at least one error has been found, to abort the parse now,
+ * instead of trying to forge ahead */
+
+ yyerror_pvn(NULL, 0, 0);
+}
+
int
Perl_yyerror(pTHX_ const char *const s)
{
SV * const where_sv = newSVpvs_flags("", SVs_TEMP);
int yychar = PL_parser->yychar;
- PERL_ARGS_ASSERT_YYERROR_PVN;
-
- if (!yychar || (yychar == ';' && !PL_rsfp))
- sv_catpvs(where_sv, "at EOF");
- else if ( PL_oldoldbufptr
- && PL_bufptr > PL_oldoldbufptr
- && PL_bufptr - PL_oldoldbufptr < 200
- && PL_oldoldbufptr != PL_oldbufptr
- && PL_oldbufptr != PL_bufptr)
- {
- /*
- Only for NetWare:
- The code below is removed for NetWare because it abends/crashes on NetWare
- when the script has error such as not having the closing quotes like:
- if ($var eq "value)
- Checking of white spaces is anyway done in NetWare code.
- */
+ /* Output error message 's' with length 'len'. 'flags' are SV flags that
+ * apply. If the number of errors found is large enough, it abandons
+ * parsing. If 's' is NULL, there is no message, and it abandons
+ * processing unconditionally */
+
+ if (s != NULL) {
+ if (!yychar || (yychar == ';' && !PL_rsfp))
+ sv_catpvs(where_sv, "at EOF");
+ else if ( PL_oldoldbufptr
+ && PL_bufptr > PL_oldoldbufptr
+ && PL_bufptr - PL_oldoldbufptr < 200
+ && PL_oldoldbufptr != PL_oldbufptr
+ && PL_oldbufptr != PL_bufptr)
+ {
+ /*
+ Only for NetWare:
+ The code below is removed for NetWare because it
+ abends/crashes on NetWare when the script has error such as
+ not having the closing quotes like:
+ if ($var eq "value)
+ Checking of white spaces is anyway done in NetWare code.
+ */
#ifndef NETWARE
- while (isSPACE(*PL_oldoldbufptr))
- PL_oldoldbufptr++;
+ while (isSPACE(*PL_oldoldbufptr))
+ PL_oldoldbufptr++;
#endif
- context = PL_oldoldbufptr;
- contlen = PL_bufptr - PL_oldoldbufptr;
- }
- else if ( PL_oldbufptr
- && PL_bufptr > PL_oldbufptr
- && PL_bufptr - PL_oldbufptr < 200
- && PL_oldbufptr != PL_bufptr) {
- /*
- Only for NetWare:
- The code below is removed for NetWare because it abends/crashes on NetWare
- when the script has error such as not having the closing quotes like:
- if ($var eq "value)
- Checking of white spaces is anyway done in NetWare code.
- */
+ context = PL_oldoldbufptr;
+ contlen = PL_bufptr - PL_oldoldbufptr;
+ }
+ else if ( PL_oldbufptr
+ && PL_bufptr > PL_oldbufptr
+ && PL_bufptr - PL_oldbufptr < 200
+ && PL_oldbufptr != PL_bufptr) {
+ /*
+ Only for NetWare:
+ The code below is removed for NetWare because it
+ abends/crashes on NetWare when the script has error such as
+ not having the closing quotes like:
+ if ($var eq "value)
+ Checking of white spaces is anyway done in NetWare code.
+ */
#ifndef NETWARE
- while (isSPACE(*PL_oldbufptr))
- PL_oldbufptr++;
+ while (isSPACE(*PL_oldbufptr))
+ PL_oldbufptr++;
#endif
- context = PL_oldbufptr;
- contlen = PL_bufptr - PL_oldbufptr;
- }
- else if (yychar > 255)
- sv_catpvs(where_sv, "next token ???");
- else if (yychar == YYEMPTY) {
- if (PL_lex_state == LEX_NORMAL)
- sv_catpvs(where_sv, "at end of line");
- else if (PL_lex_inpat)
- sv_catpvs(where_sv, "within pattern");
- else
- sv_catpvs(where_sv, "within string");
- }
- else {
- sv_catpvs(where_sv, "next char ");
- if (yychar < 32)
- Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar));
- else if (isPRINT_LC(yychar)) {
- const char string = yychar;
- sv_catpvn(where_sv, &string, 1);
- }
- else
- Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255);
- }
- msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP);
- Perl_sv_catpvf(aTHX_ msg, " at %s line %" IVdf ", ",
- OutCopFILE(PL_curcop),
- (IV)(PL_parser->preambling == NOLINE
- ? CopLINE(PL_curcop)
- : PL_parser->preambling));
- if (context)
- Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n",
- UTF8fARG(UTF, contlen, context));
- else
- Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv));
- if (PL_multi_start < PL_multi_end && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1) {
- Perl_sv_catpvf(aTHX_ msg,
- " (Might be a runaway multi-line %c%c string starting on line %" IVdf ")\n",
- (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start);
- PL_multi_end = 0;
- }
- if (PL_in_eval & EVAL_WARNONLY) {
- PL_in_eval &= ~EVAL_WARNONLY;
- Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg));
+ context = PL_oldbufptr;
+ contlen = PL_bufptr - PL_oldbufptr;
+ }
+ else if (yychar > 255)
+ sv_catpvs(where_sv, "next token ???");
+ else if (yychar == YYEMPTY) {
+ if (PL_lex_state == LEX_NORMAL)
+ sv_catpvs(where_sv, "at end of line");
+ else if (PL_lex_inpat)
+ sv_catpvs(where_sv, "within pattern");
+ else
+ sv_catpvs(where_sv, "within string");
+ }
+ else {
+ sv_catpvs(where_sv, "next char ");
+ if (yychar < 32)
+ Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar));
+ else if (isPRINT_LC(yychar)) {
+ const char string = yychar;
+ sv_catpvn(where_sv, &string, 1);
+ }
+ else
+ Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255);
+ }
+ msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP);
+ Perl_sv_catpvf(aTHX_ msg, " at %s line %" IVdf ", ",
+ OutCopFILE(PL_curcop),
+ (IV)(PL_parser->preambling == NOLINE
+ ? CopLINE(PL_curcop)
+ : PL_parser->preambling));
+ if (context)
+ Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n",
+ UTF8fARG(UTF, contlen, context));
+ else
+ Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv));
+ if ( PL_multi_start < PL_multi_end
+ && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1)
+ {
+ Perl_sv_catpvf(aTHX_ msg,
+ " (Might be a runaway multi-line %c%c string starting on"
+ " line %" IVdf ")\n",
+ (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start);
+ PL_multi_end = 0;
+ }
+ if (PL_in_eval & EVAL_WARNONLY) {
+ PL_in_eval &= ~EVAL_WARNONLY;
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg));
+ }
+ else {
+ qerror(msg);
+ }
}
- else
- qerror(msg);
- if (PL_error_count >= 10) {
- SV * errsv;
- if (PL_in_eval && ((errsv = ERRSV), SvCUR(errsv)))
- Perl_croak(aTHX_ "%" SVf "%s has too many errors.\n",
- SVfARG(errsv), OutCopFILE(PL_curcop));
- else
- Perl_croak(aTHX_ "%s has too many errors.\n",
- OutCopFILE(PL_curcop));
+ if (s == NULL || PL_error_count >= 10) {
+ const char * msg = "";
+ const char * const name = OutCopFILE(PL_curcop);
+
+ if (PL_in_eval) {
+ SV * errsv = ERRSV;
+ if (SvCUR(errsv)) {
+ msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv));
+ }
+ }
+
+ if (s == NULL) {
+ abort_execution(msg, name);
+ }
+ else {
+ Perl_croak(aTHX_ "%s%s has too many errors.\n", msg, name);
+ }
}
PL_in_my = 0;
PL_in_my_stash = NULL;
/* diag_listed_as: Unsupported script encoding %s */
Perl_croak(aTHX_ "Unsupported script encoding UTF-32LE");
#ifndef PERL_NO_UTF16_FILTER
+#ifdef DEBUGGING
if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (BOM)\n");
+#endif
s += 2;
if (PL_bufend > (char*)s) {
s = add_utf16_textfilter(s, TRUE);
case 0xFE:
if (s[1] == 0xFF) { /* UTF-16 big-endian? */
#ifndef PERL_NO_UTF16_FILTER
+#ifdef DEBUGGING
if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n");
+#endif
s += 2;
if (PL_bufend > (char *)s) {
s = add_utf16_textfilter(s, FALSE);
}
break;
case BOM_UTF8_FIRST_BYTE: {
- const STRLEN len = sizeof(BOM_UTF8_TAIL) - 1; /* Exclude trailing NUL */
- if (slen > len && memEQ(s+1, BOM_UTF8_TAIL, len)) {
+ if (memBEGINs(s+1, slen - 1, BOM_UTF8_TAIL)) {
+#ifdef DEBUGGING
if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
- s += len + 1; /* UTF-8 */
+#endif
+ s += sizeof(BOM_UTF8) - 1; /* UTF-8 */
}
break;
}
* 00 xx 00 xx
* are a good indicator of UTF-16BE. */
#ifndef PERL_NO_UTF16_FILTER
+#ifdef DEBUGGING
if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n");
+#endif
s = add_utf16_textfilter(s, FALSE);
#else
/* diag_listed_as: Unsupported script encoding %s */
* xx 00 xx 00
* are a good indicator of UTF-16LE. */
#ifndef PERL_NO_UTF16_FILTER
+#ifdef DEBUGGING
if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n");
+#endif
s = add_utf16_textfilter(s, TRUE);
#else
/* diag_listed_as: Unsupported script encoding %s */
}
}
+ /* 'chars' isn't quite the right name, as code points above 0xFFFF
+ * require 4 bytes per char */
chars = SvCUR(utf16_buffer) >> 1;
have = SvCUR(utf8_buffer);
- SvGROW(utf8_buffer, have + chars * 3 + 1);
+
+ /* Assume the worst case size as noted by the functions: twice the
+ * number of input bytes */
+ SvGROW(utf8_buffer, have + chars * 4 + 1);
if (reverse) {
end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer),
return KEYWORD_PLUGIN_DECLINE;
}
+/*
+=for apidoc Amx|void|wrap_keyword_plugin|Perl_keyword_plugin_t new_plugin|Perl_keyword_plugin_t *old_plugin_p
+
+Puts a C function into the chain of keyword plugins. This is the
+preferred way to manipulate the L</PL_keyword_plugin> variable.
+C<new_plugin> is a pointer to the C function that is to be added to the
+keyword plugin chain, and C<old_plugin_p> points to the storage location
+where a pointer to the next function in the chain will be stored. The
+value of C<new_plugin> is written into the L</PL_keyword_plugin> variable,
+while the value previously stored there is written to C<*old_plugin_p>.
+
+L</PL_keyword_plugin> is global to an entire process, and a module wishing
+to hook keyword parsing may find itself invoked more than once per
+process, typically in different threads. To handle that situation, this
+function is idempotent. The location C<*old_plugin_p> must initially
+(once per process) contain a null pointer. A C variable of static
+duration (declared at file scope, typically also marked C<static> to give
+it internal linkage) will be implicitly initialised appropriately, if it
+does not have an explicit initialiser. This function will only actually
+modify the plugin chain if it finds C<*old_plugin_p> to be null. This
+function is also thread safe on the small scale. It uses appropriate
+locking to avoid race conditions in accessing L</PL_keyword_plugin>.
+
+When this function is called, the function referenced by C<new_plugin>
+must be ready to be called, except for C<*old_plugin_p> being unfilled.
+In a threading situation, C<new_plugin> may be called immediately, even
+before this function has returned. C<*old_plugin_p> will always be
+appropriately set before C<new_plugin> is called. If C<new_plugin>
+decides not to do anything special with the identifier that it is given
+(which is the usual case for most calls to a keyword plugin), it must
+chain the plugin function referenced by C<*old_plugin_p>.
+
+Taken all together, XS code to install a keyword plugin should typically
+look something like this:
+
+ static Perl_keyword_plugin_t next_keyword_plugin;
+ static OP *my_keyword_plugin(pTHX_
+ char *keyword_plugin, STRLEN keyword_len, OP **op_ptr)
+ {
+ if (memEQs(keyword_ptr, keyword_len,
+ "my_new_keyword")) {
+ ...
+ } else {
+ return next_keyword_plugin(aTHX_
+ keyword_ptr, keyword_len, op_ptr);
+ }
+ }
+ BOOT:
+ wrap_keyword_plugin(my_keyword_plugin,
+ &next_keyword_plugin);
+
+Direct access to L</PL_keyword_plugin> should be avoided.
+
+=cut
+*/
+
+void
+Perl_wrap_keyword_plugin(pTHX_
+ Perl_keyword_plugin_t new_plugin, Perl_keyword_plugin_t *old_plugin_p)
+{
+ dVAR;
+
+ PERL_UNUSED_CONTEXT;
+ PERL_ARGS_ASSERT_WRAP_KEYWORD_PLUGIN;
+ if (*old_plugin_p) return;
+ KEYWORD_PLUGIN_MUTEX_LOCK;
+ if (!*old_plugin_p) {
+ *old_plugin_p = PL_keyword_plugin;
+ PL_keyword_plugin = new_plugin;
+ }
+ KEYWORD_PLUGIN_MUTEX_UNLOCK;
+}
+
#define parse_recdescent(g,p) S_parse_recdescent(aTHX_ g,p)
static void
S_parse_recdescent(pTHX_ int gramtype, I32 fakeeof)