#define PL_lex_brackstack (PL_parser->lex_brackstack)
#define PL_lex_casemods (PL_parser->lex_casemods)
#define PL_lex_casestack (PL_parser->lex_casestack)
-#define PL_lex_defer (PL_parser->lex_defer)
#define PL_lex_dojoin (PL_parser->lex_dojoin)
#define PL_lex_formbrack (PL_parser->lex_formbrack)
#define PL_lex_inpat (PL_parser->lex_inpat)
string or after \E, $foo, etc */
#define LEX_INTERPCONST 2 /* NOT USED */
#define LEX_FORMLINE 1 /* expecting a format line */
-#define LEX_KNOWNEXT 0 /* next token known; just return it */
#ifdef DEBUGGING
}
else {
assert(p < e -1 );
- *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
p += 2;
}
}
}
else {
t = s;
- while (!isSPACE(*t))
+ while (*t && !isSPACE(*t))
t++;
e = t;
}
assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
PL_nexttype[PL_nexttoke] = type;
PL_nexttoke++;
- if (PL_lex_state != LEX_KNOWNEXT) {
- PL_lex_defer = PL_lex_state;
- PL_lex_state = LEX_KNOWNEXT;
- }
}
/*
S_postderef(pTHX_ int const funny, char const next)
{
assert(funny == DOLSHARP || strchr("$@%&*", funny));
- assert(strchr("*[{", next));
if (next == '*') {
PL_expect = XOPERATOR;
if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
assert('@' == funny || '$' == funny || DOLSHARP == funny);
PL_lex_state = LEX_INTERPEND;
- force_next(POSTJOIN);
+ if ('@' == funny)
+ force_next(POSTJOIN);
}
force_next(next);
PL_bufptr+=2;
SAVEI32(PL_lex_casemods);
SAVEI32(PL_lex_starts);
SAVEI8(PL_lex_state);
- SAVEI8(PL_lex_defer);
SAVESPTR(PL_lex_repl);
SAVEVPTR(PL_lex_inpat);
SAVEI16(PL_lex_inwhat);
PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
- if (!SvCUR(res))
+ if (!SvCUR(res)) {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
+ "Unknown charname '' is deprecated");
return res;
+ }
if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
e - backslash_ptr,
if (*s == ' ' && *(s-1) == ' ') {
goto multi_spaces;
}
- if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
- Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
- "NO-BREAK SPACE in a charnames "
- "alias definition is deprecated");
- }
s++;
}
}
}
s++;
} else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
+ if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
goto bad_charname;
}
s += 2;
s++;
}
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
+ if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
{
goto bad_charname;
}
- if (*s == *NBSP_UTF8
- && *(s+1) == *(NBSP_UTF8+1)
- && ckWARN_d(WARN_DEPRECATED))
- {
- Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
- "NO-BREAK SPACE in a charnames "
- "alias definition is deprecated");
- }
s += 2;
}
else {
example when it is entirely composed
of hex constants */
SV *res; /* result from charnames */
+ STRLEN offset_to_max; /* The offset in the output to where the range
+ high-end character is temporarily placed */
/* Note on sizing: The scanned constant is placed into sv, which is
* initialized by newSV() assuming one byte of output for every byte of
UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
before set */
#ifdef EBCDIC
- UV literal_endpoint = 0;
- bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
+ int backslash_N = 0; /* ? was the character from \N{} */
+ int non_portable_endpoint = 0; /* ? In a range is an endpoint
+ platform-specific like \x65 */
#endif
PERL_ARGS_ASSERT_SCAN_CONST;
ENTER_with_name("scan_const");
SAVEFREESV(sv);
- while (s < send || dorange) {
+ while (s < send
+ || dorange /* Handle tr/// range at right edge of input */
+ ) {
/* get transliterations out of the way (they're most literal) */
if (PL_lex_inwhat == OP_TRANS) {
- /* expand a range A-Z to the full set of characters. AIE! */
- if (dorange) {
- I32 i; /* current expanded character */
- I32 min; /* first character in range */
- I32 max; /* last character in range */
+ /* But there isn't any special handling necessary unless there is a
+ * range, so for most cases we just drop down and handle the value
+ * as any other. There are two exceptions.
+ *
+ * 1. A minus sign indicates that we are actually going to have
+ * a range. In this case, skip the '-', set a flag, then drop
+ * down to handle what should be the end range value.
+ * 2. After we've handled that value, the next time through, that
+ * flag is set and we fix up the range.
+ *
+ * Ranges entirely within Latin1 are expanded out entirely, in
+ * order to avoid the significant overhead of making a swash.
+ * Ranges that extend above Latin1 have to have a swash, so there
+ * is no advantage to abbreviate them here, so they are stored here
+ * as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies a
+ * hyphen without any possible ambiguity. On EBCDIC machines, if
+ * the range is expressed as Unicode, the Latin1 portion is
+ * expanded out even if the entire range extends above Latin1.
+ * This is because each code point in it has to be processed here
+ * individually to get its native translation */
+
+ if (! dorange) {
+
+ /* Here, we don't think we're in a range. If we've processed
+ * at least one character, then see if this next one is a '-',
+ * indicating the previous one was the start of a range. But
+ * don't bother if we're too close to the end for the minus to
+ * mean that. */
+ if (*s != '-' || s >= send - 1 || s == start) {
+
+ /* A regular character. Process like any other, but first
+ * clear any flags */
+ didrange = FALSE;
+ dorange = FALSE;
#ifdef EBCDIC
- UV uvmax = 0;
+ non_portable_endpoint = 0;
+ backslash_N = 0;
#endif
+ /* Drops down to generic code to process current byte */
+ }
+ else {
+ if (didrange) { /* Something like y/A-C-Z// */
+ Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+ }
- if (has_utf8
-#ifdef EBCDIC
- && !native_range
-#endif
- ) {
- char * const c = (char*)utf8_hop((U8*)d, -1);
- char *e = d++;
- while (e-- > c)
- *(e + 1) = *e;
- *c = (char) ILLEGAL_UTF8_BYTE;
- /* mark the range as done, and continue */
- dorange = FALSE;
- didrange = TRUE;
- continue;
- }
+ dorange = TRUE;
- i = d - SvPVX_const(sv); /* remember current offset */
-#ifdef EBCDIC
- SvGROW(sv,
- SvLEN(sv) + ((has_utf8)
- ? (512 - UTF_CONTINUATION_MARK
- + UNISKIP(0x100))
- : 256));
- /* How many two-byte within 0..255: 128 in UTF-8,
- * 96 in UTF-8-mod. */
+ s++; /* Skip past the minus */
+
+ /* d now points to where the end-range character will be
+ * placed. Save it so won't have to go finding it later,
+ * and drop down to get that character. (Actually we
+ * instead save the offset, to handle the case where a
+ * realloc in the meantime could change the actual
+ * pointer). We'll finish processing the range the next
+ * time through the loop */
+ offset_to_max = d - SvPVX_const(sv);
+ }
+ } /* End of not a range */
+ else {
+ /* Here we have parsed a range. Now must handle it. At this
+ * point:
+ * 'sv' is a SV* that contains the output string we are
+ * constructing. The final two characters in that string
+ * are the range start and range end, in order.
+ * 'd' points to just beyond the range end in the 'sv' string,
+ * where we would next place something
+ * 'offset_to_max' is the offset in 'sv' at which the character
+ * before 'd' begins.
+ */
+ const char * max_ptr = SvPVX_const(sv) + offset_to_max;
+ const char * min_ptr;
+ IV range_min;
+ IV range_max; /* last character in range */
+ STRLEN save_offset;
+ STRLEN grow;
+#ifndef EBCDIC /* Not meaningful except in EBCDIC, so initialize to false */
+ const bool convert_unicode = FALSE;
+ const IV real_range_max = 0;
#else
- SvGROW(sv, SvLEN(sv) + 256); /* never more than 256 chars in a range */
+ bool convert_unicode;
+ IV real_range_max = 0;
#endif
- d = SvPVX(sv) + i; /* refresh d after realloc */
-#ifdef EBCDIC
+
+ /* Get the range-ends code point values. */
if (has_utf8) {
- int j;
- for (j = 0; j <= 1; j++) {
- char * const c = (char*)utf8_hop((U8*)d, -1);
- const UV uv = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
- if (j)
- min = (U8)uv;
- else if (uv < 256)
- max = (U8)uv;
- else {
- max = (U8)0xff; /* only to \xff */
- uvmax = uv; /* \x{100} to uvmax */
- }
- d = c; /* eat endpoint chars */
- }
+ /* We know the utf8 is valid, because we just constructed
+ * it ourselves in previous loop iterations */
+ min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
+ range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
+ range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
}
- else {
-#endif
- d -= 2; /* eat the first char and the - */
- min = (U8)*d; /* first char in range */
- max = (U8)d[1]; /* last char in range */
+ else {
+ min_ptr = max_ptr - 1;
+ range_min = * (U8*) min_ptr;
+ range_max = * (U8*) max_ptr;
+ }
+
#ifdef EBCDIC
- }
+ /* On EBCDIC platforms, we may have to deal with portable
+ * ranges. These happen if at least one range endpoint is a
+ * Unicode value (\N{...}), or if the range is a subset of
+ * [A-Z] or [a-z], and both ends are literal characters,
+ * like 'A', and not like \x{C1} */
+ if ((convert_unicode
+ = cBOOL(backslash_N) /* \N{} forces Unicode, hence
+ portable range */
+ || ( ! non_portable_endpoint
+ && (( isLOWER_A(range_min) && isLOWER_A(range_max))
+ || (isUPPER_A(range_min) && isUPPER_A(range_max))))
+ )) {
+
+ /* Special handling is needed for these portable ranges.
+ * They are defined to all be in Unicode terms, which
+ * include all Unicode code points between the end points.
+ * Convert to Unicode to get the Unicode range. Later we
+ * will convert each code point in the range back to
+ * native. */
+ range_min = NATIVE_TO_UNI(range_min);
+ range_max = NATIVE_TO_UNI(range_max);
+ }
#endif
- if (min > max) {
- Perl_croak(aTHX_
- "Invalid range \"%c-%c\" in transliteration operator",
- (char)min, (char)max);
+ if (range_min > range_max) {
+ if (convert_unicode) {
+ /* Need to convert back to native for meaningful
+ * messages for this platform */
+ range_min = UNI_TO_NATIVE(range_min);
+ range_max = UNI_TO_NATIVE(range_max);
+ }
+
+ /* Use the characters themselves for the error message if
+ * ASCII printables; otherwise some visible representation
+ * of them */
+ if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
+ Perl_croak(aTHX_
+ "Invalid range \"%c-%c\" in transliteration operator",
+ (char)range_min, (char)range_max);
+ }
+ else if (convert_unicode) {
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ Perl_croak(aTHX_
+ "Invalid range \"\\N{U+%04"UVXf"}-\\N{U+%04"UVXf"}\""
+ " in transliteration operator",
+ range_min, range_max);
+ }
+ else {
+ /* diag_listed_as: Invalid range "%s" in transliteration operator */
+ Perl_croak(aTHX_
+ "Invalid range \"\\x{%04"UVXf"}-\\x{%04"UVXf"}\""
+ " in transliteration operator",
+ range_min, range_max);
+ }
}
+ if (has_utf8) {
+
+ /* We try to avoid creating a swash. If the upper end of
+ * this range is below 256, this range won't force a swash;
+ * otherwise it does force a swash, and as long as we have
+ * to have one, we might as well not expand things out.
+ * But if it's EBCDIC, we may have to look at each
+ * character below 256 if we have to convert to/from
+ * Unicode values */
+ if (range_max > 255
#ifdef EBCDIC
- /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
- * any subsets of these ranges into individual characters */
- if (literal_endpoint == 2
- && ((isLOWER_A(min) && isLOWER_A(max))
- || (isUPPER_A(min) && isUPPER_A(max))))
- {
- for (i = min; i <= max; i++) {
- if (isALPHA_A(i))
- *d++ = i;
- }
- }
- else
+ && (range_min > 255 || ! convert_unicode)
#endif
- for (i = min; i <= max; i++)
-#ifdef EBCDIC
- if (has_utf8) {
- append_utf8_from_native_byte(i, &d);
+ ) {
+ /* Move the high character one byte to the right; then
+ * insert between it and the range begin, an illegal
+ * byte which serves to indicate this is a range (using
+ * a '-' could be ambiguous). */
+ char *e = d++;
+ while (e-- > max_ptr) {
+ *(e + 1) = *e;
}
- else
-#endif
- *d++ = (char)i;
-
+ *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
+ goto range_done;
+ }
+
+ /* Here, we're going to expand out the range. For EBCDIC
+ * the range can extend above 255 (not so in ASCII), so
+ * for EBCDIC, split it into the parts above and below
+ * 255/256 */
#ifdef EBCDIC
- if (uvmax) {
- d = (char*)uvchr_to_utf8((U8*)d, 0x100);
- if (uvmax > 0x101)
- *d++ = (char) ILLEGAL_UTF8_BYTE;
- if (uvmax > 0x100)
- d = (char*)uvchr_to_utf8((U8*)d, uvmax);
- }
+ if (range_max > 255) {
+ real_range_max = range_max;
+ range_max = 255;
+ }
#endif
+ }
- /* mark the range as done, and continue */
- dorange = FALSE;
- didrange = TRUE;
+ /* Here we need to expand out the string to contain each
+ * character in the range. Grow the output to handle this */
+
+ save_offset = min_ptr - SvPVX_const(sv);
+
+ /* The base growth is the number of code points in the range */
+ grow = range_max - range_min + 1;
+ if (has_utf8) {
+
+ /* But if the output is UTF-8, some of those characters may
+ * need two bytes (since the maximum range value here is
+ * 255, the max bytes per character is two). On ASCII
+ * platforms, it's not much trouble to get an accurate
+ * count of what's needed. But on EBCDIC, the ones that
+ * need 2 bytes are scattered around, so just use a worst
+ * case value instead of calculating for that platform. */
#ifdef EBCDIC
- literal_endpoint = 0;
+ grow *= 2;
+#else
+ /* Only those above 127 require 2 bytes. This may be
+ * everything in the range, or not */
+ if (range_min > 127) {
+ grow *= 2;
+ }
+ else if (range_max > 127) {
+ grow += range_max - 127;
+ }
#endif
- continue;
- }
+ }
+
+ /* Subtract 3 for the bytes that were already accounted for
+ * (min, max, and the hyphen) */
+ SvGROW(sv, SvLEN(sv) + grow - 3);
+ d = SvPVX(sv) + save_offset; /* refresh d after realloc */
- /* range begins (ignore - as first or last char) */
- else if (*s == '-' && s+1 < send && s != start) {
- if (didrange) {
- Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+ /* Here, we expand out the range. On ASCII platforms, the
+ * compiler should optimize out the 'convert_unicode==TRUE'
+ * portion of this */
+ if (convert_unicode) {
+ IV i;
+
+ /* Recall that the min and max are now in Unicode terms, so
+ * we have to convert each character to its native
+ * equivalent */
+ if (has_utf8) {
+ for (i = range_min; i <= range_max; i++) {
+ append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
+ (U8 **) &d);
+ }
+ }
+ else {
+ for (i = range_min; i <= range_max; i++) {
+ *d++ = (char)LATIN1_TO_NATIVE((U8) i);
+ }
+ }
}
- if (has_utf8
-#ifdef EBCDIC
- && !native_range
-#endif
- ) {
- *d++ = (char) ILLEGAL_UTF8_BYTE; /* use illegal utf8 byte--see pmtrans */
- s++;
- continue;
+ else {
+ IV i;
+
+ /* Here, no conversions are necessary, which means that the
+ * first character in the range is already in 'd' and
+ * valid, so we can skip overwriting it */
+ if (has_utf8) {
+ d += UTF8SKIP(d);
+ for (i = range_min + 1; i <= range_max; i++) {
+ append_utf8_from_native_byte((U8) i, (U8 **) &d);
+ }
+ }
+ else {
+ d++;
+ for (i = range_min + 1; i <= range_max; i++) {
+ *d++ = (char)i;
+ }
+ }
}
- dorange = TRUE;
- s++;
- }
- else {
- didrange = FALSE;
-#ifdef EBCDIC
- literal_endpoint = 0;
- native_range = TRUE;
-#endif
- }
- }
- /* if we get to any of these else's, we're not doing a
- * transliteration. */
+ /* (Compilers should optimize this out for non-EBCDIC). If the
+ * original range extended above 255, add in that portion */
+ if (real_range_max) {
+ *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
+ *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
+ if (real_range_max > 0x101)
+ *d++ = (char) ILLEGAL_UTF8_BYTE;
+ if (real_range_max > 0x100)
+ d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
+ }
+ range_done:
+ /* mark the range as done, and continue */
+ didrange = TRUE;
+ dorange = FALSE;
+#ifdef EBCDIC
+ non_portable_endpoint = 0;
+ backslash_N = 0;
+#endif
+ continue;
+ } /* End of is a range */
+ } /* End of transliteration. Joins main code after these else's */
else if (*s == '[' && PL_lex_inpat && !in_charclass) {
char *s1 = s-1;
int esc = 0;
}
switch (*s) {
-
- /* quoted - in transliterations */
- case '-':
- if (PL_lex_inwhat == OP_TRANS) {
- *d++ = *s++;
- continue;
- }
- /* FALLTHROUGH */
default:
{
if ((isALPHANUMERIC(*s)))
}
NUM_ESCAPE_INSERT:
- /* Insert oct or hex escaped character. There will always be
- * enough room in sv since such escapes will be longer than any
- * UTF-8 sequence they can end up as, except if they force us
- * to recode the rest of the string into utf8 */
+ /* Insert oct or hex escaped character. */
/* Here uv is the ordinal of the next character being added */
- if (!UVCHR_IS_INVARIANT(uv)) {
+ if (UVCHR_IS_INVARIANT(uv)) {
+ *d++ = (char) uv;
+ }
+ else {
if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have accumulated so
* far if it contains any chars variant in utf8 or
*d = '\0';
/* See Note on sizing above. */
sv_utf8_upgrade_flags_grow(
- sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
/* Above-latin1 in string
* implies no encoding */
|SV_UTF8_NO_ENCODING,
- UNISKIP(uv) + (STRLEN)(send - s) + 1);
+ UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
d = SvPVX(sv) + SvCUR(sv);
has_utf8 = TRUE;
}
if (has_utf8) {
+ /* Usually, there will already be enough room in 'sv'
+ * since such escapes are likely longer than any UTF-8
+ * sequence they can end up as. This isn't the case on
+ * EBCDIC where \x{40000000} contains 12 bytes, and the
+ * UTF-8 for it contains 14. And, we have to allow for
+ * a trailing NUL. It probably can't happen on ASCII
+ * platforms, but be safe */
+ const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
+ + 1;
+ if (UNLIKELY(needed > SvLEN(sv))) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ d = sv_grow(sv, needed) + SvCUR(sv);
+ }
+
d = (char*)uvchr_to_utf8((U8*)d, uv);
if (PL_lex_inwhat == OP_TRANS
&& PL_sublex_info.sub_op)
(PL_lex_repl ? OPpTRANS_FROM_UTF
: OPpTRANS_TO_UTF);
}
-#ifdef EBCDIC
- if (uv > 255 && !dorange)
- native_range = FALSE;
-#endif
}
else {
*d++ = (char)uv;
}
}
- else {
- *d++ = (char) uv;
- }
+#ifdef EBCDIC
+ non_portable_endpoint++;
+#endif
continue;
case 'N':
/* In a non-pattern \N must be like \N{U+0041}, or it can be a
* named character, like \N{LATIN SMALL LETTER A}, or a named
* sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
- * GRAVE}. For convenience all three forms are referred to as
- * "named characters" below.
+ * GRAVE} (except y/// can't handle the latter, croaking). For
+ * convenience all three forms are referred to as "named
+ * characters" below.
*
* For patterns, \N also can mean to match a non-newline. Code
* before this 'switch' statement should already have handled
*
* The structure of this section of code (besides checking for
* errors and upgrading to utf8) is:
- * If the named character is of the form \N{U+...}, pass it
+ * If the named character is of the form \N{U+...}, pass it
* through if a pattern; otherwise convert the code point
* to utf8
- * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
- * if a pattern; otherwise convert to utf8
+ * Otherwise must be some \N{NAME}: convert to
+ * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
+ *
+ * Transliteration is an exception. The conversion to utf8 is
+ * only done if the code point requires it to be representable.
*
* Here, 's' points to the 'N'; the test below is guaranteed to
* succeed if we are being called on a pattern, as we already
if (len == 0 || (len != (STRLEN)(e - s)))
goto bad_NU;
- /* If the destination is not in utf8, unconditionally
- * recode it to be so. This is because \N{} implies
- * Unicode semantics, and scalars have to be in utf8
- * to guarantee those semantics */
- if (! has_utf8) {
+ /* For non-tr///, if the destination is not in utf8,
+ * unconditionally recode it to be so. This is
+ * because \N{} implies Unicode semantics, and scalars
+ * have to be in utf8 to guarantee those semantics.
+ * tr/// doesn't care about Unicode rules, so no need
+ * there to upgrade to UTF-8 for small enough code
+ * points */
+ if (! has_utf8 && ( uv > 0xFF
+ || PL_lex_inwhat != OP_TRANS))
+ {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
/* See Note on sizing above. */
sv_utf8_upgrade_flags_grow(
- sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- UNISKIP(uv) + (STRLEN)(send - e) + 1);
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ UVCHR_SKIP(uv) + (STRLEN)(send - e) + 1);
d = SvPVX(sv) + SvCUR(sv);
has_utf8 = TRUE;
}
/* Add the (Unicode) code point to the output. */
- if (UNI_IS_INVARIANT(uv)) {
+ if (OFFUNI_IS_INVARIANT(uv)) {
*d++ = (char) LATIN1_TO_NATIVE(uv);
}
else {
/* The regex compiler is
* expecting Unicode, not
* native */
- (U8) NATIVE_TO_LATIN1(*str));
+ NATIVE_TO_LATIN1(*str));
PERL_MY_SNPRINTF_POST_GUARD(len,
sizeof(hex_string));
Copy(hex_string, d, 3, char);
else { /* Here, not in a pattern. Convert the name to a
* string. */
- /* If destination is not in utf8, unconditionally
- * recode it to be so. This is because \N{} implies
- * Unicode semantics, and scalars have to be in utf8
- * to guarantee those semantics */
- if (! has_utf8) {
+ if (PL_lex_inwhat == OP_TRANS) {
+ str = SvPV_const(res, len);
+ if (len > ((SvUTF8(res))
+ ? UTF8SKIP(str)
+ : 1U))
+ {
+ yyerror(Perl_form(aTHX_
+ "%.*s must not be a named sequence"
+ " in transliteration operator",
+ /* +1 to include the "}" */
+ (int) (e + 1 - start), start));
+ goto end_backslash_N;
+ }
+ }
+ else if (! SvUTF8(res)) {
+ /* Make sure \N{} return is UTF-8. This is because
+ * \N{} implies Unicode semantics, and scalars have to
+ * be in utf8 to guarantee those semantics; but not
+ * needed in tr/// */
+ sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+ str = SvPV_const(res, len);
+ }
+
+ /* Upgrade destination to be utf8 if this new
+ * component is */
+ if (! has_utf8 && SvUTF8(res)) {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
const STRLEN off = d - SvPVX_const(sv);
d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
}
- if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8 */
- sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
- str = SvPV_const(res, len);
- }
Copy(str, d, len, char);
d += len;
}
SvREFCNT_dec(res);
} /* End \N{NAME} */
+
+ end_backslash_N:
#ifdef EBCDIC
- if (!dorange)
- native_range = FALSE; /* \N{} is defined to be Unicode */
+ backslash_N++; /* \N{} is defined to be Unicode */
#endif
s = e + 1; /* Point to just after the '}' */
continue;
else {
yyerror("Missing control char name in \\c");
}
+#ifdef EBCDIC
+ non_portable_endpoint++;
+#endif
continue;
/* printf-style backslashes, formfeeds, newlines, etc */
s++;
continue;
} /* end if (backslash) */
-#ifdef EBCDIC
- else
- literal_endpoint++;
-#endif
default_action:
/* If we started with encoded form, or already know we want it,
if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = 1;
-
/* One might think that it is wasted effort in the case of the
* source being utf8 (this_utf8 == TRUE) to take the next character
* in the source, convert it to an unsigned value, and then convert
const UV nextuv = (this_utf8)
? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
: (UV) ((U8) *s);
- const STRLEN need = UNISKIP(nextuv);
+ const STRLEN need = UVCHR_SKIP(nextuv);
if (!has_utf8) {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
s += len;
d = (char*)uvchr_to_utf8((U8*)d, nextuv);
-#ifdef EBCDIC
- if (uv > 255 && !dorange)
- native_range = FALSE;
-#endif
}
else {
*d++ = *s++;
The type of the next token
Structure:
+ Check if we have already built the token; if so, use it.
Switch based on the current state:
- - if we already built the token before, use it
- if we have a case modifier in a string, deal with that
- handle other cases of interpolation inside a string
- scan the next line if we are inside a format
- In the normal state switch on the next character:
+ In the normal state, switch on the next character:
- default:
if alphabetic, go to key lookup
- unrecoginized character - croak
+ unrecognized character - croak
- 0/4/26: handle end-of-line or EOF
- cases for whitespace
- \n and #: handle comments and line numbers
if (PL_nexttoke) {
PL_nexttoke--;
pl_yylval = PL_nextval[PL_nexttoke];
- if (!PL_nexttoke) {
- PL_lex_state = PL_lex_defer;
- PL_lex_defer = LEX_NORMAL;
- }
{
I32 next_type;
next_type = PL_nexttype[PL_nexttoke];
/* FALLTHROUGH */
case LEX_INTERPEND:
- /* Treat state as LEX_NORMAL if we have no inner lexing scope.
- XXX This hack can be removed if we stop setting PL_lex_state to
- LEX_KNOWNEXT, as can the hack under LEX_INTREPCONCAT below. */
- if (UNLIKELY(!PL_lex_inwhat)) {
- PL_lex_state = LEX_NORMAL;
- break;
- }
-
if (PL_lex_dojoin) {
const U8 dojoin_was = PL_lex_dojoin;
PL_lex_dojoin = FALSE;
Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
(long) PL_lex_brackets);
#endif
- /* Treat state as LEX_NORMAL when not in an inner lexing scope.
- XXX This hack can be removed if we stop setting PL_lex_state to
- LEX_KNOWNEXT. */
- if (UNLIKELY(!PL_lex_inwhat)) {
- PL_lex_state = LEX_NORMAL;
- break;
- }
-
if (PL_bufptr == PL_bufend)
return REPORT(sublex_done());
retry:
switch (*s) {
default:
- if (UTF ? isIDFIRST_utf8((U8*)s) : isALNUMC(*s))
+ if (UTF) {
+ if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) {
+ ENTER;
+ SAVESPTR(PL_warnhook);
+ PL_warnhook = PERL_WARNHOOK_FATAL;
+ utf8n_to_uvchr((U8*)s, PL_bufend-s, NULL, 0);
+ LEAVE;
+ }
+ if (isIDFIRST_utf8((U8*)s)) {
+ goto keylookup;
+ }
+ }
+ else if (isALNUMC(*s)) {
goto keylookup;
- {
+ }
+ {
SV *dsv = newSVpvs_flags("", SVs_TEMP);
const char *c = UTF ? sv_uni_display(dsv, newSVpvn_flags(s,
UTF8SKIP(s),
else
/* skip plain q word */
while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
- t += UTF8SKIP(t);
+ t += UTF ? UTF8SKIP(t) : 1;
}
else if (isWORDCHAR_lazy_if(t,UTF)) {
- t += UTF8SKIP(t);
+ t += UTF ? UTF8SKIP(t) : 1;
while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
- t += UTF8SKIP(t);
+ t += UTF ? UTF8SKIP(t) : 1;
}
while (t < PL_bufend && isSPACE(*t))
t++;
{
const char tmp = *s++;
if (tmp == '=') {
+ if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, "=====", 5))
+ Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
if (!PL_lex_allbrackets
&& PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
{
if (PL_expect != XOPERATOR) {
if (s[1] != '<' && !strchr(s,'>'))
check_uni();
- if (s[1] == '<' && s[2] != '>')
+ if (s[1] == '<' && s[2] != '>') {
+ if ((s == PL_linestart || s[-1] == '\n') && strnEQ(s+2, "<<<<<", 5))
+ Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s);
s = scan_heredoc(s);
+ }
else
s = scan_inputsymbol(s);
PL_expect = XOPERATOR;
{
char tmp = *s++;
if (tmp == '<') {
+ if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, "<<<<<", 5))
+ Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
if (*s == '=' && !PL_lex_allbrackets
&& PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
{
{
const char tmp = *s++;
if (tmp == '>') {
+ if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, ">>>>>", 5))
+ Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
if (*s == '=' && !PL_lex_allbrackets
&& PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
{
TOKEN('$');
case '@':
- if (PL_expect == XOPERATOR)
- no_op("Array", s);
- else if (PL_expect == XPOSTDEREF) POSTDEREF('@');
+ if (PL_expect == XPOSTDEREF)
+ POSTDEREF('@');
PL_tokenbuf[0] = '@';
s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
+ if (PL_expect == XOPERATOR) {
+ d = s;
+ if (PL_bufptr > s) {
+ d = PL_bufptr-1;
+ PL_bufptr = PL_oldbufptr;
+ }
+ no_op("Array", d);
+ }
pl_yylval.ival = 0;
if (!PL_tokenbuf[1]) {
PREREF('@');
UNI(OP_LCFIRST);
case KEY_local:
- pl_yylval.ival = 0;
OPERATOR(LOCAL);
case KEY_length:
if (isIDFIRST_lazy_if(s,UTF)) {
s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
if (len == 3 && strnEQ(PL_tokenbuf, "sub", 3))
- {
- if (!FEATURE_LEXSUBS_IS_ENABLED)
- Perl_croak(aTHX_
- "Experimental \"%s\" subs not enabled",
- tmp == KEY_my ? "my" :
- tmp == KEY_state ? "state" : "our");
- Perl_ck_warner_d(aTHX_
- packWARN(WARN_EXPERIMENTAL__LEXICAL_SUBS),
- "The lexical_subs feature is experimental");
goto really_sub;
- }
PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
if (!PL_in_my_stash) {
char tmpbuf[1024];
yyerror_pv(tmpbuf, UTF ? SVf_UTF8 : 0);
}
}
- pl_yylval.ival = 1;
OPERATOR(MY);
case KEY_next:
* 2) '{'
* The final case currently doesn't get this far in the program, so we
* don't test for it. If that were to change, it would be ok to allow it.
- * c) When not under Unicode rules, any upper Latin1 character
- * d) Otherwise, when unicode rules are used, all XIDS characters.
+ * b) When not under Unicode rules, any upper Latin1 character
+ * c) Otherwise, when unicode rules are used, all XIDS characters.
*
* Because all ASCII characters have the same representation whether
* encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
- * '{' without knowing if is UTF-8 or not.
- * EBCDIC already uses the rules that ASCII platforms will use after the
- * deprecation cycle; see comment below about the deprecation. */
-#ifdef EBCDIC
-# define VALID_LEN_ONE_IDENT(s, is_utf8) \
+ * '{' without knowing if is UTF-8 or not. */
+#define VALID_LEN_ONE_IDENT(s, is_utf8) \
(isGRAPH_A(*(s)) || ((is_utf8) \
? isIDFIRST_utf8((U8*) (s)) \
: (isGRAPH_L1(*s) \
&& LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
-#else
-# define VALID_LEN_ONE_IDENT(s, is_utf8) \
- (isGRAPH_A(*(s)) || ((is_utf8) \
- ? isIDFIRST_utf8((U8*) (s)) \
- : ! isASCII_utf8((U8*) (s))))
-#endif
STATIC char *
S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
: 1)
&& VALID_LEN_ONE_IDENT(s, is_utf8))
{
- /* Deprecate all non-graphic characters. Include SHY as a non-graphic,
- * because often it has no graphic representation. (We can't get to
- * here with SHY when 'is_utf8' is true, so no need to include a UTF-8
- * test for it.) */
- if ((is_utf8)
- ? ! isGRAPH_utf8( (U8*) s)
- : (! isGRAPH_L1( (U8) *s)
- || UNLIKELY((U8) *(s) == LATIN1_TO_NATIVE(0xAD))))
- {
- deprecate("literal non-graphic characters in variable names");
- }
-
if (is_utf8) {
const STRLEN skip = UTF8SKIP(s);
STRLEN i;
"Use of /c modifier is meaningless without /g" );
}
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+ if (UNLIKELY((x_mod_count) > 1)) {
+ yyerror("Only one /x regex modifier is allowed");
+ }
PL_lex_op = (OP*)pm;
pl_yylval.ival = OP_MATCH;
}
}
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+ if (UNLIKELY((x_mod_count) > 1)) {
+ yyerror("Only one /x regex modifier is allowed");
+ }
if ((pm->op_pmflags & PMf_CONTINUE)) {
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
SV *linestr;
char *bufend;
char * const olds = s;
- PERL_CONTEXT * const cx = &cxstack[cxstack_ix];
+ PERL_CONTEXT * const cx = CX_CUR();
/* These two fields are not set until an inner lexing scope is
entered. But we need them set here. */
shared->ls_bufptr = s;
goto streaming;
}
}
- else { /* eval */
+ else { /* eval or we've already hit EOF */
s = (char*)memchr((void*)s, '\n', PL_bufend - s);
- assert(s);
+ if (!s)
+ goto interminable;
}
linestr = shared->ls_linestr;
bufend = SvEND(linestr);
else
{
SV *linestr_save;
+ char *oldbufptr_save;
streaming:
sv_setpvs(tmpstr,""); /* avoid "uninitialized" warning */
term = PL_tokenbuf[1];
len--;
linestr_save = PL_linestr; /* must restore this afterwards */
d = s; /* and this */
+ oldbufptr_save = PL_oldbufptr;
PL_linestr = newSVpvs("");
PL_bufend = SvPVX(PL_linestr);
while (1) {
restore PL_linestr. */
SvREFCNT_dec_NN(PL_linestr);
PL_linestr = linestr_save;
+ PL_oldbufptr = oldbufptr_save;
goto interminable;
}
CopLINE_set(PL_curcop, origline);
PL_linestr = linestr_save;
PL_linestart = SvPVX(linestr_save);
PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+ PL_oldbufptr = oldbufptr_save;
s = d;
break;
}
* multiple fp operations. */
bool hexfp = FALSE;
int total_bits = 0;
+ int significant_bits = 0;
#if NVSIZE == 8 && defined(HAS_QUAD) && defined(Uquad_t)
# define HEXFP_UQUAD
Uquad_t hexfp_uquad = 0;
#endif
NV hexfp_mult = 1.0;
UV high_non_zero = 0; /* highest digit */
+ int non_zero_integer_digits = 0;
PERL_ARGS_ASSERT_SCAN_NUM;
if (high_non_zero == 0 && b > 0)
high_non_zero = b;
+ if (high_non_zero)
+ non_zero_integer_digits++;
+
/* this could be hexfp, but peek ahead
* to avoid matching ".." */
if (UNLIKELY(HEXFP_PEEK(s))) {
* detection will shortly be more thorough with the
* underbar checks. */
const char* h = s;
+ significant_bits = non_zero_integer_digits * shift;
#ifdef HEXFP_UQUAD
hexfp_uquad = u;
#else /* HEXFP_NV */
hexfp_nv = u;
#endif
+ /* Ignore the leading zero bits of
+ * the high (first) non-zero digit. */
+ if (high_non_zero) {
+ if (high_non_zero < 0x8)
+ significant_bits--;
+ if (high_non_zero < 0x4)
+ significant_bits--;
+ if (high_non_zero < 0x2)
+ significant_bits--;
+ }
+
if (*h == '.') {
#ifdef HEXFP_NV
- NV mult = 1 / 16.0;
+ NV nv_mult = 1.0;
#endif
- h++;
- while (isXDIGIT(*h) || *h == '_') {
+ bool accumulate = TRUE;
+ for (h++; (isXDIGIT(*h) || *h == '_'); h++) {
if (isXDIGIT(*h)) {
U8 b = XDIGIT_VALUE(*h);
- total_bits += shift;
+ significant_bits += shift;
#ifdef HEXFP_UQUAD
- hexfp_uquad <<= shift;
- hexfp_uquad |= b;
- hexfp_frac_bits += shift;
+ if (accumulate) {
+ if (significant_bits < NV_MANT_DIG) {
+ /* We are in the long "run" of xdigits,
+ * accumulate the full four bits. */
+ hexfp_uquad <<= shift;
+ hexfp_uquad |= b;
+ hexfp_frac_bits += shift;
+ } else {
+ /* We are at a hexdigit either at,
+ * or straddling, the edge of mantissa.
+ * We will try grabbing as many as
+ * possible bits. */
+ int tail =
+ significant_bits - NV_MANT_DIG;
+ if (tail <= 0)
+ tail += shift;
+ hexfp_uquad <<= tail;
+ hexfp_uquad |= b >> (shift - tail);
+ hexfp_frac_bits += tail;
+
+ /* Ignore the trailing zero bits
+ * of the last non-zero xdigit.
+ *
+ * The assumption here is that if
+ * one has input of e.g. the xdigit
+ * eight (0x8), there is only one
+ * bit being input, not the full
+ * four bits. Conversely, if one
+ * specifies a zero xdigit, the
+ * assumption is that one really
+ * wants all those bits to be zero. */
+ if (b) {
+ if ((b & 0x1) == 0x0) {
+ significant_bits--;
+ if ((b & 0x2) == 0x0) {
+ significant_bits--;
+ if ((b & 0x4) == 0x0) {
+ significant_bits--;
+ }
+ }
+ }
+ }
+
+ accumulate = FALSE;
+ }
+ } else {
+ /* Keep skipping the xdigits, and
+ * accumulating the significant bits,
+ * but do not shift the uquad
+ * (which would catastrophically drop
+ * high-order bits) or accumulate the
+ * xdigits anymore. */
+ }
#else /* HEXFP_NV */
- hexfp_nv += b * mult;
- mult /= 16.0;
+ if (accumulate) {
+ nv_mult /= 16.0;
+ if (nv_mult > 0.0)
+ hexfp_nv += b * nv_mult;
+ else
+ accumulate = FALSE;
+ }
#endif
}
- h++;
+ if (significant_bits >= NV_MANT_DIG)
+ accumulate = FALSE;
}
}
- if (total_bits >= 4) {
- if (high_non_zero < 0x8)
- total_bits--;
- if (high_non_zero < 0x4)
- total_bits--;
- if (high_non_zero < 0x2)
- total_bits--;
- }
-
- if (total_bits > 0 && (isALPHA_FOLD_EQ(*h, 'p'))) {
+ if ((total_bits > 0 || significant_bits > 0) &&
+ isALPHA_FOLD_EQ(*h, 'p')) {
bool negexp = FALSE;
h++;
if (*h == '+')
*d = '\0';
if (UNLIKELY(hexfp)) {
# ifdef NV_MANT_DIG
- if (total_bits > NV_MANT_DIG)
+ if (significant_bits > NV_MANT_DIG)
Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
"Hexadecimal float: mantissa overflow");
# endif
else if (yychar > 255)
sv_catpvs(where_sv, "next token ???");
else if (yychar == YYEMPTY) {
- if ( PL_lex_state == LEX_NORMAL
- || (PL_lex_state == LEX_KNOWNEXT && PL_lex_defer == LEX_NORMAL))
+ if (PL_lex_state == LEX_NORMAL)
sv_catpvs(where_sv, "at end of line");
else if (PL_lex_inpat)
sv_catpvs(where_sv, "within pattern");
"Integer overflow in decimal number");
}
}
-#ifdef EBCDIC
- if (rev > 0x7FFFFFFF)
- Perl_croak(aTHX_ "In EBCDIC the v-string components cannot exceed 2147483647");
-#endif
+
/* Append native character for the rev point */
tmpend = uvchr_to_utf8(tmpbuf, rev);
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
{
if (flags & ~PARSE_OPTIONAL)
Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_label");
- if (PL_lex_state == LEX_KNOWNEXT) {
+ if (PL_nexttoke) {
PL_parser->yychar = yylex();
if (PL_parser->yychar == LABEL) {
char * const lpv = pl_yylval.pval;