{
const char *s = NULL;
yy_parser *parser, *oparser;
+ const U8* first_bad_char_loc;
+
if (flags && flags & ~LEX_START_FLAGS)
Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
if (line) {
STRLEN len;
s = SvPV_const(line, len);
+
+ if (SvUTF8(line) && ! is_utf8_string_loc((U8 *) s,
+ SvCUR(line),
+ &first_bad_char_loc))
+ {
+ _force_out_malformed_utf8_message(first_bad_char_loc,
+ (U8 *) s + SvCUR(line),
+ 0,
+ 1 /* 1 means die */ );
+ NOT_REACHED; /* NOTREACHED */
+ }
+
parser->linestr = flags & LEX_START_COPIED
? SvREFCNT_inc_simple_NN(line)
: newSVpvn_flags(s, len, SvUTF8(line));
STRLEN len;
const char *start = SvPV_const(sv,len);
const char * const end = start + len;
- const bool utf = SvUTF8(sv) ? TRUE : FALSE;
+ const bool utf = cBOOL(SvUTF8(sv));
PERL_ARGS_ASSERT_STR_TO_VERSION;
} (end if backslash)
handle regular character
} (end while character to read)
-
+
*/
STATIC char *
* range, so for most cases we just drop down and handle the value
* as any other. There are two exceptions.
*
- * 1. A minus sign indicates that we are actually going to have
- * a range. In this case, skip the '-', set a flag, then drop
+ * 1. A hyphen indicates that we are actually going to have a
+ * range. In this case, skip the '-', set a flag, then drop
* down to handle what should be the end range value.
* 2. After we've handled that value, the next time through, that
* flag is set and we fix up the range.
if (! dorange) {
- /* Here, we don't think we're in a range. If we've processed
- * at least one character, then see if this next one is a '-',
- * indicating the previous one was the start of a range. But
- * don't bother if we're too close to the end for the minus to
- * mean that. */
+ /* Here, we don't think we're in a range. If the new character
+ * is not a hyphen; or if it is a hyphen, but it's too close to
+ * either edge to indicate a range, then it's a regular
+ * character. */
if (*s != '-' || s >= send - 1 || s == start) {
/* A regular character. Process like any other, but first
non_portable_endpoint = 0;
backslash_N = 0;
#endif
- /* The tests here and the following 'else' for being above
- * Latin1 suffice to find all such occurences in the
- * constant, except those added by a backslash escape
- * sequence, like \x{100}. And all those set
- * 'has_above_latin1' as appropriate */
+ /* The tests here for being above Latin1 and similar ones
+ * in the following 'else' suffice to find all such
+ * occurences in the constant, except those added by a
+ * backslash escape sequence, like \x{100}. And all those
+ * set 'has_above_latin1' as appropriate */
if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
has_above_latin1 = TRUE;
}
/* Drops down to generic code to process current byte */
}
- else {
+ else { /* Is a '-' in the context where it means a range */
if (didrange) { /* Something like y/A-C-Z// */
- Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+ Perl_croak(aTHX_ "Ambiguous range in transliteration"
+ " operator");
}
dorange = TRUE;
- s++; /* Skip past the minus */
+ s++; /* Skip past the hyphen */
/* d now points to where the end-range character will be
* placed. Save it so won't have to go finding it later,
if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
has_above_latin1 = TRUE;
}
+
+ /* Drops down to generic code to process current byte */
}
} /* End of not a range */
else {
* 'd' points to just beyond the range end in the 'sv' string,
* where we would next place something
* 'offset_to_max' is the offset in 'sv' at which the character
- * before 'd' begins.
+ * (the range's maximum end point) before 'd' begins.
*/
- const char * max_ptr = SvPVX_const(sv) + offset_to_max;
- const char * min_ptr;
+ char * max_ptr = SvPVX(sv) + offset_to_max;
+ char * min_ptr;
IV range_min;
IV range_max; /* last character in range */
- STRLEN save_offset;
STRLEN grow;
+ Size_t offset_to_min = 0;
+ Size_t extras = 0;
#ifdef EBCDIC
bool convert_unicode;
IV real_range_max = 0;
#endif
- /* Get the range-ends code point values. */
+ /* Get the code point values of the range ends. */
if (has_utf8) {
/* We know the utf8 is valid, because we just constructed
* it ourselves in previous loop iterations */
range_max = * (U8*) max_ptr;
}
+ /* If the range is just a single code point, like tr/a-a/.../,
+ * that code point is already in the output, twice. We can
+ * just back up over the second instance and avoid all the rest
+ * of the work. But if it is a variant character, it's been
+ * counted twice, so decrement */
+ if (UNLIKELY(range_max == range_min)) {
+ d = max_ptr;
+ if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
+ utf8_variant_count--;
+ }
+ goto range_done;
+ }
+
#ifdef EBCDIC
/* On EBCDIC platforms, we may have to deal with portable
* ranges. These happen if at least one range endpoint is a
* [A-Z] or [a-z], and both ends are literal characters,
* like 'A', and not like \x{C1} */
convert_unicode =
- cBOOL(backslash_N) /* \N{} forces Unicode, hence
- portable range */
- || ( ! non_portable_endpoint
- && (( isLOWER_A(range_min) && isLOWER_A(range_max))
- || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+ cBOOL(backslash_N) /* \N{} forces Unicode,
+ hence portable range */
+ || ( ! non_portable_endpoint
+ && (( isLOWER_A(range_min) && isLOWER_A(range_max))
+ || (isUPPER_A(range_min) && isUPPER_A(range_max))));
if (convert_unicode) {
/* Special handling is needed for these portable ranges.
- * They are defined to all be in Unicode terms, which
- * include all Unicode code points between the end points.
+ * They are defined to be in Unicode terms, which includes
+ * all the Unicode code points between the end points.
* Convert to Unicode to get the Unicode range. Later we
* will convert each code point in the range back to
* native. */
range_max = UNI_TO_NATIVE(range_max);
}
#endif
-
/* Use the characters themselves for the error message if
* ASCII printables; otherwise some visible representation
* of them */
}
#ifdef EBCDIC
else if (convert_unicode) {
- /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
Perl_croak(aTHX_
- "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
- " in transliteration operator",
- range_min, range_max);
+ "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
+ UVXf "}\" in transliteration operator",
+ range_min, range_max);
}
#endif
else {
- /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
Perl_croak(aTHX_
- "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
- " in transliteration operator",
- range_min, range_max);
+ "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
+ " in transliteration operator",
+ range_min, range_max);
}
}
+ /* If the range is exactly two code points long, they are
+ * already both in the output */
+ if (UNLIKELY(range_min + 1 == range_max)) {
+ goto range_done;
+ }
+
+ /* Here the range contains at least 3 code points */
+
if (has_utf8) {
/* If everything in the transliteration is below 256, we
* can avoid special handling later. A translation table
- * of each of those bytes is created. And so we expand out
- * all ranges to their constituent code points. But if
- * we've encountered something above 255, the expanding
- * won't help, so skip doing that. But if it's EBCDIC, we
- * may have to look at each character below 256 if we have
- * to convert to/from Unicode values */
+ * for each of those bytes is created by op.c. So we
+ * expand out all ranges to their constituent code points.
+ * But if we've encountered something above 255, the
+ * expanding won't help, so skip doing that. But if it's
+ * EBCDIC, we may have to look at each character below 256
+ * if we have to convert to/from Unicode values */
if ( has_above_latin1
#ifdef EBCDIC
&& (range_min > 255 || ! convert_unicode)
/* Move the high character one byte to the right; then
* insert between it and the range begin, an illegal
* byte which serves to indicate this is a range (using
- * a '-' could be ambiguous). */
+ * a '-' would be ambiguous). */
char *e = d++;
while (e-- > max_ptr) {
*(e + 1) = *e;
}
/* Here we need to expand out the string to contain each
- * character in the range. Grow the output to handle this */
+ * character in the range. Grow the output to handle this.
+ * For non-UTF8, we need a byte for each code point in the
+ * range, minus the three that we've already allocated for: the
+ * hyphen, the min, and the max. For UTF-8, we need this
+ * plus an extra byte for each code point that occupies two
+ * bytes (is variant) when in UTF-8 (except we've already
+ * allocated for the end points, including if they are
+ * variants). For ASCII platforms and Unicode ranges on EBCDIC
+ * platforms, it's easy to calculate a precise number. To
+ * start, we count the variants in the range, which we need
+ * elsewhere in this function anyway. (For the case where it
+ * isn't easy to calculate, 'extras' has been initialized to 0,
+ * and the calculation is done in a loop further down.) */
+#ifdef EBCDIC
+ if (convert_unicode)
+#endif
+ {
+ /* This is executed unconditionally on ASCII, and for
+ * Unicode ranges on EBCDIC. Under these conditions, all
+ * code points above a certain value are variant; and none
+ * under that value are. We just need to find out how much
+ * of the range is above that value. We don't count the
+ * end points here, as they will already have been counted
+ * as they were parsed. */
+ if (range_min >= UTF_CONTINUATION_MARK) {
+
+ /* The whole range is made up of variants */
+ extras = (range_max - 1) - (range_min + 1) + 1;
+ }
+ else if (range_max >= UTF_CONTINUATION_MARK) {
- save_offset = min_ptr - SvPVX_const(sv);
+ /* Only the higher portion of the range is variants */
+ extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
+ }
- /* The base growth is the number of code points in the range */
- grow = range_max - range_min + 1;
- if (has_utf8) {
+ utf8_variant_count += extras;
+ }
+
+ /* The base growth is the number of code points in the range,
+ * not including the endpoints, which have already been sized
+ * for (and output). We don't subtract for the hyphen, as it
+ * has been parsed but not output, and the SvGROW below is
+ * based only on what's been output plus what's left to parse.
+ * */
+ grow = (range_max - 1) - (range_min + 1) + 1;
- /* But if the output is UTF-8, some of those characters may
- * need two bytes (since the maximum range value here is
- * 255, the max bytes per character is two). On ASCII
- * platforms, it's not much trouble to get an accurate
- * count of what's needed. But on EBCDIC, the ones that
- * need 2 bytes are scattered around, so just use a worst
- * case value instead of calculating for that platform. */
+ if (has_utf8) {
#ifdef EBCDIC
- grow *= 2;
-#else
- /* Only those above 127 require 2 bytes. This may be
- * everything in the range, or not */
- if (range_min > 127) {
+ /* In some cases in EBCDIC, we haven't yet calculated a
+ * precise amount needed for the UTF-8 variants. Just
+ * assume the worst case, that everything will expand by a
+ * byte */
+ if (! convert_unicode) {
grow *= 2;
}
- else if (range_max > 127) {
- grow += range_max - 127;
- }
+ else
#endif
+ {
+ /* Otherwise we know exactly how many variants there
+ * are in the range. */
+ grow += extras;
+ }
}
- /* Subtract 3 for the bytes that were already accounted for
- * (min, max, and the hyphen) */
- d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3);
+ /* Grow, but position the output to overwrite the range min end
+ * point, because in some cases we overwrite that */
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ offset_to_min = min_ptr - SvPVX_const(sv);
+
+ /* See Note on sizing above. */
+ d = offset_to_min + SvGROW(sv, SvCUR(sv)
+ + (send - s)
+ + grow
+ + 1 /* Trailing NUL */ );
+ /* Now, we can expand out the range. */
#ifdef EBCDIC
- /* Here, we expand out the range. */
if (convert_unicode) {
- IV i;
+ SSize_t i;
/* Recall that the min and max are now in Unicode terms, so
* we have to convert each character to its native
* equivalent */
if (has_utf8) {
for (i = range_min; i <= range_max; i++) {
- append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
- (U8 **) &d);
+ append_utf8_from_native_byte(
+ LATIN1_TO_NATIVE((U8) i),
+ (U8 **) &d);
}
}
else {
#endif
/* Always gets run for ASCII, and sometimes for EBCDIC. */
{
- IV i;
+ SSize_t i;
/* Here, no conversions are necessary, which means that the
* first character in the range is already in 'd' and
}
else {
d++;
- for (i = range_min + 1; i <= range_max; i++) {
+ assert(range_min + 1 <= range_max);
+ for (i = range_min + 1; i < range_max; i++) {
+#ifdef EBCDIC
+ /* In this case on EBCDIC, we haven't calculated
+ * the variants. Do it here, as we go along */
+ if (! UVCHR_IS_INVARIANT(i)) {
+ utf8_variant_count++;
+ }
+#endif
*d++ = (char)i;
}
+
+ /* The range_max is done outside the loop so as to
+ * avoid having to special case not incrementing
+ * 'utf8_variant_count' on EBCDIC (it's already been
+ * counted when originally parsed) */
+ *d++ = (char) range_max;
}
}
#ifdef EBCDIC
- /* If the original range extended above 255, add in that portion. */
+ /* If the original range extended above 255, add in that
+ * portion. */
if (real_range_max) {
*d++ = (char) UTF8_TWO_BYTE_HI(0x100);
*d++ = (char) UTF8_TWO_BYTE_LO(0x100);
- if (real_range_max > 0x101)
- *d++ = (char) ILLEGAL_UTF8_BYTE;
- if (real_range_max > 0x100)
+ if (real_range_max > 0x100) {
+ if (real_range_max > 0x101) {
+ *d++ = (char) ILLEGAL_UTF8_BYTE;
+ }
d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
+ }
}
#endif
if (!esc)
in_charclass = FALSE;
}
-
- /* skip for regexp comments /(?#comment)/, except for the last
- * char, which will be done separately.
- * Stop on (?{..}) and friends */
-
+ /* skip for regexp comments /(?#comment)/, except for the last
+ * char, which will be done separately. Stop on (?{..}) and
+ * friends */
else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
if (s[2] == '#') {
while (s+1 < send && *s != ')')
break;
}
}
-
- /* likewise skip #-initiated comments in //x patterns */
+ /* likewise skip #-initiated comments in //x patterns */
else if (*s == '#'
&& PL_lex_inpat
&& !in_charclass
while (s < send && *s != '\n')
*d++ = *s++;
}
-
- /* no further processing of single-quoted regex */
+ /* no further processing of single-quoted regex */
else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
goto default_action;
- /* check for embedded arrays
- (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
- */
+ /* check for embedded arrays
+ * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
+ */
else if (*s == '@' && s[1]) {
if (UTF
? isIDFIRST_utf8_safe(s+1, send)
if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
break; /* in regexp, neither @+ nor @- are interpolated */
}
-
- /* check for embedded scalars. only stop if we're sure it's a
- variable.
- */
+ /* check for embedded scalars. only stop if we're sure it's a
+ * variable. */
else if (*s == '$') {
if (!PL_lex_inpat) /* not a regexp, so $ must be var */
break;
UTF);
if (! valid) {
yyerror(error);
- continue;
+ uv = 0; /* drop through to ensure range ends are set */
}
goto NUM_ESCAPE_INSERT;
}
UTF);
if (! valid) {
yyerror(error);
- continue;
+ uv = 0; /* drop through to ensure range ends are set */
}
}
NUM_ESCAPE_INSERT:
/* Insert oct or hex escaped character. */
-
+
/* Here uv is the ordinal of the next character being added */
if (UVCHR_IS_INVARIANT(uv)) {
*d++ = (char) uv;
}
if (*start == '$') {
+ SSize_t start_off = start - SvPVX(PL_linestr);
if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
|| isUPPER(*PL_tokenbuf))
return 0;
- s = skipspace(s);
- PL_bufptr = start;
+ /* this could be $# */
+ if (isSPACE(*s))
+ s = skipspace(s);
+ PL_bufptr = SvPVX(PL_linestr) + start_off;
PL_expect = XREF;
return *s == '(' ? FUNCMETH : METHOD;
}
}
do {
fake_eof = 0;
- bof = PL_rsfp ? TRUE : FALSE;
+ bof = cBOOL(PL_rsfp);
if (0) {
fake_eof:
fake_eof = LEX_FAKE_EOF;
else { /* no override */
tmp = -tmp;
if (tmp == KEY_dump) {
- Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
- "dump() better written as CORE::dump()");
+ Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
+ "dump() better written as CORE::dump(). "
+ "dump() will no longer be available "
+ "in Perl 5.30");
}
gv = NULL;
gvp = 0;
== OA_FILEREF))
{
bool immediate_paren = *s == '(';
+ SSize_t s_off;
/* (Now we can afford to cross potential line boundary.) */
s = skipspace(s);
+ /* intuit_method() can indirectly call lex_next_chunk(),
+ * invalidating s
+ */
+ s_off = s - SvPVX(PL_linestr);
/* Two barewords in a row may indicate method call. */
if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
|| *s == '$')
&& (tmp = intuit_method(s, lex ? NULL : sv, cv)))
{
+ /* the code at method: doesn't use s */
goto method;
}
+ s = SvPVX(PL_linestr) + s_off;
/* If not a declared subroutine, it's an indirect object. */
/* (But it's an indir obj regardless for sort.) */
case KEY_exists:
UNI(OP_EXISTS);
-
+
case KEY_exit:
UNI(OP_EXIT);
case KEY_last:
LOOPX(OP_LAST);
-
+
case KEY_lc:
UNI(OP_LC);
case KEY_pos:
UNIDOR(OP_POS);
-
+
case KEY_pack:
LOP(OP_PACK,XTERM);
case KEY_chomp:
UNI(OP_CHOMP);
-
+
case KEY_scalar:
UNI(OP_SCALAR);
|| ! SvOK(*cvp))
{
char *msg;
-
+
/* Here haven't found what we're looking for. If it is charnames,
* perhaps it needs to be loaded. Try doing that before giving up */
if (*key == 'c') {
($*@) sub prototypes sub foo ($)
(stuff) sub attr parameters sub foo : attr(stuff)
<> readline or globs <FOO>, <>, <$fh>, or <*.c>
-
+
In most of these cases (all but <>, patterns and transliterate)
yylex() calls scan_str(). m// makes yylex() call scan_pat() which
calls scan_str(). s/// makes yylex() call scan_subst() which calls
const char * non_grapheme_msg = "Use of unassigned code point or"
" non-standalone grapheme for a delimiter"
" will be a fatal error starting in Perl"
- " v5.30";
+ " 5.30";
/* The only non-UTF character that isn't a stand alone grapheme is
* white-space, hence can't be a delimiter. So can skip for non-UTF-8 */
bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED);
*to = *s;
}
}
-
+
/* if the terminator isn't the same as the start character (e.g.,
matched brackets), we have to allow more in the quoting, and
be prepared for nested brackets.
else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
to[-1] = '\n';
#endif
-
+
/* if we're out of file, or a read fails, bail and reset the current
line marker so we can report where the unterminated string began
*/
bool floatit; /* boolean: int or float? */
const char *lastub = NULL; /* position of last underbar */
static const char* const number_too_long = "Number too long";
+ bool warned_about_underscore = 0;
+#define WARN_ABOUT_UNDERSCORE() \
+ do { \
+ if (!warned_about_underscore) { \
+ warned_about_underscore = 1; \
+ Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
+ "Misplaced _ in number"); \
+ } \
+ } while(0)
/* Hexadecimal floating point.
*
* In many places (where we have quads and NV is IEEE 754 double)
}
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
/* _ are ignored -- but warned about if consecutive */
case '_':
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
break;
out:
/* final misplaced underbar check */
- if (s[-1] == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
- }
+ if (s[-1] == '_')
+ WARN_ABOUT_UNDERSCORE();
if (UNLIKELY(HEXFP_PEEK(s))) {
/* Do sloppy (on the underbars) but quick detection
*/
if (*s == '_') {
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
else {
}
/* final misplaced underbar check */
- if (lastub && s == lastub + 1) {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
- }
+ if (lastub && s == lastub + 1)
+ WARN_ABOUT_UNDERSCORE();
/* read a decimal portion if there is one. avoid
3..5 being interpreted as the number 3. followed
*d++ = *s++;
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s;
}
Perl_croak(aTHX_ "%s", number_too_long);
if (*s == '_') {
if (lastub && s == lastub + 1)
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s;
}
else
*d++ = *s;
}
/* fractional part ending in underbar? */
- if (s[-1] == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
- }
+ if (s[-1] == '_')
+ WARN_ABOUT_UNDERSCORE();
if (*s == '.' && isDIGIT(s[1])) {
/* oops, it's really a v-string, but without the "v" */
s = start;
/* stray preinitial _ */
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
/* stray initial _ */
if (*s == '_') {
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
else {
if (((lastub && s == lastub + 1)
|| (!isDIGIT(s[1]) && s[1] != '_')))
- Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Misplaced _ in number");
+ WARN_ABOUT_UNDERSCORE();
lastub = s++;
}
}