podcheck.t: Move a declaration

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index b341c96..2c6cd85 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -52,7 +52,6 @@ Individual members of C<PL_parser> have their own documentation.
  #define PL_lex_brackstack      (PL_parser->lex_brackstack)
  #define PL_lex_casemods                (PL_parser->lex_casemods)
  #define PL_lex_casestack        (PL_parser->lex_casestack)
-#define PL_lex_defer           (PL_parser->lex_defer)
  #define PL_lex_dojoin          (PL_parser->lex_dojoin)
  #define PL_lex_formbrack        (PL_parser->lex_formbrack)
  #define PL_lex_inpat           (PL_parser->lex_inpat)
@@ -142,7 +141,6 @@ static const char* const ident_too_long = "Identifier too long";
                                         string or after \E, $foo, etc       */
  #define LEX_INTERPCONST                 2 /* NOT USED */
  #define LEX_FORMLINE            1 /* expecting a format line               */
-#define LEX_KNOWNEXT            0 /* next token known; just return it      */
  
  
  #ifdef DEBUGGING
@@ -1033,7 +1031,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
                 }
                 else {
                      assert(p < e -1 );
-                   *bufptr++ = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+                   *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
                     p += 2;
                  }
             }
@@ -1694,7 +1692,7 @@ S_incline(pTHX_ const char *s)
      }
      else {
         t = s;
-       while (!isSPACE(*t))
+       while (*t && !isSPACE(*t))
             t++;
         e = t;
      }
@@ -1919,10 +1917,6 @@ S_force_next(pTHX_ I32 type)
      assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
      PL_nexttype[PL_nexttoke] = type;
      PL_nexttoke++;
-    if (PL_lex_state != LEX_KNOWNEXT) {
-       PL_lex_defer = PL_lex_state;
-       PL_lex_state = LEX_KNOWNEXT;
-    }
  }
  
  /*
@@ -1938,13 +1932,13 @@ static int
  S_postderef(pTHX_ int const funny, char const next)
  {
      assert(funny == DOLSHARP || strchr("$@%&*", funny));
-    assert(strchr("*[{", next));
      if (next == '*') {
         PL_expect = XOPERATOR;
         if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
             assert('@' == funny || '$' == funny || DOLSHARP == funny);
             PL_lex_state = LEX_INTERPEND;
-           force_next(POSTJOIN);
+           if ('@' == funny)
+               force_next(POSTJOIN);
         }
         force_next(next);
         PL_bufptr+=2;
@@ -2346,7 +2340,6 @@ S_sublex_push(pTHX)
      SAVEI32(PL_lex_casemods);
      SAVEI32(PL_lex_starts);
      SAVEI8(PL_lex_state);
-    SAVEI8(PL_lex_defer);
      SAVESPTR(PL_lex_repl);
      SAVEVPTR(PL_lex_inpat);
      SAVEI16(PL_lex_inwhat);
@@ -2518,8 +2511,11 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
  
      PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
  
-    if (!SvCUR(res))
+    if (!SvCUR(res)) {
+        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
+                       "Unknown charname '' is deprecated");
          return res;
+    }
  
      if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr,
                                       e - backslash_ptr,
@@ -2585,11 +2581,6 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
             if (*s == ' ' && *(s-1) == ' ') {
                  goto multi_spaces;
              }
-           if ((U8) *s == NBSP_NATIVE && ckWARN_d(WARN_DEPRECATED)) {
-                Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                           "NO-BREAK SPACE in a charnames "
-                           "alias definition is deprecated");
-            }
              s++;
          }
      }
@@ -2603,7 +2594,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
              }
              s++;
          } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-            if (! isALPHAU(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)))) {
+            if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
                  goto bad_charname;
              }
              s += 2;
@@ -2633,18 +2624,10 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
                  s++;
              }
              else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                if (! isCHARNAME_CONT(TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1))))
+                if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
                  {
                      goto bad_charname;
                  }
-                if (*s == *NBSP_UTF8
-                    && *(s+1) == *(NBSP_UTF8+1)
-                    && ckWARN_d(WARN_DEPRECATED))
-                {
-                    Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
-                                "NO-BREAK SPACE in a charnames "
-                                "alias definition is deprecated");
-                }
                  s += 2;
              }
              else {
@@ -2833,6 +2816,8 @@ S_scan_const(pTHX_ char *start)
                                             example when it is entirely composed
                                             of hex constants */
      SV *res;                           /* result from charnames */
+    STRLEN offset_to_max;   /* The offset in the output to where the range
+                               high-end character is temporarily placed */
  
      /* Note on sizing:  The scanned constant is placed into sv, which is
       * initialized by newSV() assuming one byte of output for every byte of
@@ -2849,8 +2834,9 @@ S_scan_const(pTHX_ char *start)
      UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
                         before set */
  #ifdef EBCDIC
-    UV literal_endpoint = 0;
-    bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */
+    int backslash_N = 0;            /* ? was the character from \N{} */
+    int non_portable_endpoint = 0;  /* ? In a range is an endpoint
+                                       platform-specific like \x65 */
  #endif
  
      PERL_ARGS_ASSERT_SCAN_CONST;
@@ -2866,152 +2852,304 @@ S_scan_const(pTHX_ char *start)
      ENTER_with_name("scan_const");
      SAVEFREESV(sv);
  
-    while (s < send || dorange) {
+    while (s < send
+           || dorange   /* Handle tr/// range at right edge of input */
+    ) {
  
          /* get transliterations out of the way (they're most literal) */
         if (PL_lex_inwhat == OP_TRANS) {
-           /* expand a range A-Z to the full set of characters.  AIE! */
-           if (dorange) {
-               I32 i;                          /* current expanded character */
-               I32 min;                        /* first character in range */
-               I32 max;                        /* last character in range */
  
+            /* But there isn't any special handling necessary unless there is a
+             * range, so for most cases we just drop down and handle the value
+             * as any other.  There are two exceptions.
+             *
+             * 1.  A minus sign indicates that we are actually going to have
+             *     a range.  In this case, skip the '-', set a flag, then drop
+             *     down to handle what should be the end range value.
+             * 2.  After we've handled that value, the next time through, that
+             *     flag is set and we fix up the range.
+             *
+             * Ranges entirely within Latin1 are expanded out entirely, in
+             * order to avoid the significant overhead of making a swash.
+             * Ranges that extend above Latin1 have to have a swash, so there
+             * is no advantage to abbreviate them here, so they are stored here
+             * as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte signifies a
+             * hyphen without any possible ambiguity.  On EBCDIC machines, if
+             * the range is expressed as Unicode, the Latin1 portion is
+             * expanded out even if the entire range extends above Latin1.
+             * This is because each code point in it has to be processed here
+             * individually to get its native translation */
+
+           if (! dorange) {
+
+                /* Here, we don't think we're in a range.  If we've processed
+                 * at least one character, then see if this next one is a '-',
+                 * indicating the previous one was the start of a range.  But
+                 * don't bother if we're too close to the end for the minus to
+                 * mean that. */
+                if (*s != '-' || s >= send - 1 || s == start) {
+
+                    /* A regular character.  Process like any other, but first
+                     * clear any flags */
+                    didrange = FALSE;
+                    dorange = FALSE;
  #ifdef EBCDIC
-               UV uvmax = 0;
+                    non_portable_endpoint = 0;
+                    backslash_N = 0;
  #endif
+                    /* Drops down to generic code to process current byte */
+                }
+                else {
+                    if (didrange) { /* Something like y/A-C-Z// */
+                        Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+                    }
  
-               if (has_utf8
-#ifdef EBCDIC
-                   && !native_range
-#endif
-                ) {
-                   char * const c = (char*)utf8_hop((U8*)d, -1);
-                   char *e = d++;
-                   while (e-- > c)
-                       *(e + 1) = *e;
-                   *c = (char) ILLEGAL_UTF8_BYTE;
-                   /* mark the range as done, and continue */
-                   dorange = FALSE;
-                   didrange = TRUE;
-                   continue;
-               }
+                    dorange = TRUE;
  
-               i = d - SvPVX_const(sv);                /* remember current offset */
-#ifdef EBCDIC
-                SvGROW(sv,
-                      SvLEN(sv) + ((has_utf8)
-                                    ?  (512 - UTF_CONTINUATION_MARK
-                                        + UNISKIP(0x100))
-                                   : 256));
-                /* How many two-byte within 0..255: 128 in UTF-8,
-                * 96 in UTF-8-mod. */
+                    s++;    /* Skip past the minus */
+
+                    /* d now points to where the end-range character will be
+                     * placed.  Save it so won't have to go finding it later,
+                     * and drop down to get that character.  (Actually we
+                     * instead save the offset, to handle the case where a
+                     * realloc in the meantime could change the actual
+                     * pointer).  We'll finish processing the range the next
+                     * time through the loop */
+                    offset_to_max = d - SvPVX_const(sv);
+                }
+            }  /* End of not a range */
+            else {
+                /* Here we have parsed a range.  Now must handle it.  At this
+                 * point:
+                 * 'sv' is a SV* that contains the output string we are
+                 *      constructing.  The final two characters in that string
+                 *      are the range start and range end, in order.
+                 * 'd'  points to just beyond the range end in the 'sv' string,
+                 *      where we would next place something
+                 * 'offset_to_max' is the offset in 'sv' at which the character
+                 *      before 'd' begins.
+                 */
+                const char * max_ptr = SvPVX_const(sv) + offset_to_max;
+                const char * min_ptr;
+                IV range_min;
+               IV range_max;   /* last character in range */
+                STRLEN save_offset;
+                STRLEN grow;
+#ifndef EBCDIC  /* Not meaningful except in EBCDIC, so initialize to false */
+                const bool convert_unicode = FALSE;
+                const IV real_range_max = 0;
  #else
-               SvGROW(sv, SvLEN(sv) + 256);    /* never more than 256 chars in a range */
+                bool convert_unicode;
+                IV real_range_max = 0;
  #endif
-               d = SvPVX(sv) + i;              /* refresh d after realloc */
-#ifdef EBCDIC
+
+                /* Get the range-ends code point values. */
                  if (has_utf8) {
-                    int j;
-                    for (j = 0; j <= 1; j++) {
-                        char * const c = (char*)utf8_hop((U8*)d, -1);
-                        const UV uv    = utf8n_to_uvchr((U8*)c, d - c, NULL, 0);
-                        if (j)
-                            min = (U8)uv;
-                        else if (uv < 256)
-                            max = (U8)uv;
-                        else {
-                            max = (U8)0xff; /* only to \xff */
-                            uvmax = uv; /* \x{100} to uvmax */
-                        }
-                        d = c; /* eat endpoint chars */
-                     }
+                    /* We know the utf8 is valid, because we just constructed
+                     * it ourselves in previous loop iterations */
+                    min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
+                    range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
+                    range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
                  }
-               else {
-#endif
-                  d -= 2;              /* eat the first char and the - */
-                  min = (U8)*d;        /* first char in range */
-                  max = (U8)d[1];      /* last char in range  */
+                else {
+                    min_ptr = max_ptr - 1;
+                    range_min = * (U8*) min_ptr;
+                    range_max = * (U8*) max_ptr;
+                }
+
  #ifdef EBCDIC
-              }
+                /* On EBCDIC platforms, we may have to deal with portable
+                 * ranges.  These happen if at least one range endpoint is a
+                 * Unicode value (\N{...}), or if the range is a subset of
+                 * [A-Z] or [a-z], and both ends are literal characters,
+                 * like 'A', and not like \x{C1} */
+                if ((convert_unicode
+                     = cBOOL(backslash_N)   /* \N{} forces Unicode, hence
+                                               portable range */
+                      || (   ! non_portable_endpoint
+                          && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
+                             || (isUPPER_A(range_min) && isUPPER_A(range_max))))
+                )) {
+
+                    /* Special handling is needed for these portable ranges.
+                     * They are defined to all be in Unicode terms, which
+                     * include all Unicode code points between the end points.
+                     * Convert to Unicode to get the Unicode range.  Later we
+                     * will convert each code point in the range back to
+                     * native.  */
+                    range_min = NATIVE_TO_UNI(range_min);
+                    range_max = NATIVE_TO_UNI(range_max);
+                }
  #endif
  
-                if (min > max) {
-                   Perl_croak(aTHX_
-                              "Invalid range \"%c-%c\" in transliteration operator",
-                              (char)min, (char)max);
+                if (range_min > range_max) {
+                    if (convert_unicode) {
+                        /* Need to convert back to native for meaningful
+                         * messages for this platform */
+                        range_min = UNI_TO_NATIVE(range_min);
+                        range_max = UNI_TO_NATIVE(range_max);
+                    }
+
+                    /* Use the characters themselves for the error message if
+                     * ASCII printables; otherwise some visible representation
+                     * of them */
+                    if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
+                        Perl_croak(aTHX_
+                        "Invalid range \"%c-%c\" in transliteration operator",
+                        (char)range_min, (char)range_max);
+                    }
+                    else if (convert_unicode) {
+                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+                        Perl_croak(aTHX_
+                              "Invalid range \"\\N{U+%04"UVXf"}-\\N{U+%04"UVXf"}\""
+                               " in transliteration operator",
+                              range_min, range_max);
+                    }
+                    else {
+                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+                        Perl_croak(aTHX_
+                              "Invalid range \"\\x{%04"UVXf"}-\\x{%04"UVXf"}\""
+                               " in transliteration operator",
+                              range_min, range_max);
+                    }
                  }
  
+               if (has_utf8) {
+
+                    /* We try to avoid creating a swash.  If the upper end of
+                     * this range is below 256, this range won't force a swash;
+                     * otherwise it does force a swash, and as long as we have
+                     * to have one, we might as well not expand things out.
+                     * But if it's EBCDIC, we may have to look at each
+                     * character below 256 if we have to convert to/from
+                     * Unicode values */
+                    if (range_max > 255
  #ifdef EBCDIC
-                /* Because of the discontinuities in EBCDIC A-Z and a-z, expand
-                 * any subsets of these ranges into individual characters */
-               if (literal_endpoint == 2
-                    && ((isLOWER_A(min) && isLOWER_A(max))
-                     || (isUPPER_A(min) && isUPPER_A(max))))
-                {
-                    for (i = min; i <= max; i++) {
-                        if (isALPHA_A(i))
-                            *d++ = i;
-                   }
-               }
-               else
+                       && (range_min > 255 || ! convert_unicode)
  #endif
-                   for (i = min; i <= max; i++)
-#ifdef EBCDIC
-                        if (has_utf8) {
-                            append_utf8_from_native_byte(i, &d);
+                    ) {
+                        /* Move the high character one byte to the right; then
+                         * insert between it and the range begin, an illegal
+                         * byte which serves to indicate this is a range (using
+                         * a '-' could be ambiguous). */
+                        char *e = d++;
+                        while (e-- > max_ptr) {
+                            *(e + 1) = *e;
                          }
-                        else
-#endif
-                            *d++ = (char)i;
- 
+                        *(e + 1) = (char) ILLEGAL_UTF8_BYTE;
+                        goto range_done;
+                    }
+
+                    /* Here, we're going to expand out the range.  For EBCDIC
+                     * the range can extend above 255 (not so in ASCII), so
+                     * for EBCDIC, split it into the parts above and below
+                     * 255/256 */
  #ifdef EBCDIC
-                if (uvmax) {
-                    d = (char*)uvchr_to_utf8((U8*)d, 0x100);
-                    if (uvmax > 0x101)
-                        *d++ = (char) ILLEGAL_UTF8_BYTE;
-                    if (uvmax > 0x100)
-                        d = (char*)uvchr_to_utf8((U8*)d, uvmax);
-                }
+                    if (range_max > 255) {
+                        real_range_max = range_max;
+                        range_max = 255;
+                    }
  #endif
+               }
  
-               /* mark the range as done, and continue */
-               dorange = FALSE;
-               didrange = TRUE;
+                /* Here we need to expand out the string to contain each
+                 * character in the range.  Grow the output to handle this */
+
+                save_offset  = min_ptr - SvPVX_const(sv);
+
+                /* The base growth is the number of code points in the range */
+                grow = range_max - range_min + 1;
+                if (has_utf8) {
+
+                    /* But if the output is UTF-8, some of those characters may
+                     * need two bytes (since the maximum range value here is
+                     * 255, the max bytes per character is two).  On ASCII
+                     * platforms, it's not much trouble to get an accurate
+                     * count of what's needed.  But on EBCDIC, the ones that
+                     * need 2 bytes are scattered around, so just use a worst
+                     * case value instead of calculating for that platform.  */
  #ifdef EBCDIC
-               literal_endpoint = 0;
+                    grow *= 2;
+#else
+                    /* Only those above 127 require 2 bytes.  This may be
+                     * everything in the range, or not */
+                    if (range_min > 127) {
+                        grow *= 2;
+                    }
+                    else if (range_max > 127) {
+                        grow += range_max - 127;
+                    }
  #endif
-               continue;
-           }
+                }
+
+                /* Subtract 3 for the bytes that were already accounted for
+                 * (min, max, and the hyphen) */
+                SvGROW(sv, SvLEN(sv) + grow - 3);
+               d = SvPVX(sv) + save_offset;    /* refresh d after realloc */
  
-           /* range begins (ignore - as first or last char) */
-           else if (*s == '-' && s+1 < send  && s != start) {
-               if (didrange) {
-                   Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+                /* Here, we expand out the range.  On ASCII platforms, the
+                 * compiler should optimize out the 'convert_unicode==TRUE'
+                 * portion of this */
+                if (convert_unicode) {
+                    IV i;
+
+                    /* Recall that the min and max are now in Unicode terms, so
+                     * we have to convert each character to its native
+                     * equivalent */
+                    if (has_utf8) {
+                        for (i = range_min; i <= range_max; i++) {
+                            append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
+                                                         (U8 **) &d);
+                        }
+                    }
+                    else {
+                        for (i = range_min; i <= range_max; i++) {
+                            *d++ = (char)LATIN1_TO_NATIVE((U8) i);
+                        }
+                   }
                 }
-               if (has_utf8
-#ifdef EBCDIC
-                   && !native_range
-#endif
-                   ) {
-                   *d++ = (char) ILLEGAL_UTF8_BYTE;    /* use illegal utf8 byte--see pmtrans */
-                   s++;
-                   continue;
+                else {
+                    IV i;
+
+                    /* Here, no conversions are necessary, which means that the
+                     * first character in the range is already in 'd' and
+                     * valid, so we can skip overwriting it */
+                    if (has_utf8) {
+                        d += UTF8SKIP(d);
+                        for (i = range_min + 1; i <= range_max; i++) {
+                            append_utf8_from_native_byte((U8) i, (U8 **) &d);
+                        }
+                    }
+                    else {
+                        d++;
+                        for (i = range_min + 1; i <= range_max; i++) {
+                            *d++ = (char)i;
+                        }
+                   }
                 }
-               dorange = TRUE;
-               s++;
-           }
-           else {
-               didrange = FALSE;
-#ifdef EBCDIC
-               literal_endpoint = 0;
-               native_range = TRUE;
-#endif
-           }
-       }
  
-        /* if we get to any of these else's, we're not doing a
-         * transliteration. */
+                /* (Compilers should optimize this out for non-EBCDIC).  If the
+                 * original range extended above 255, add in that portion */
+                if (real_range_max) {
+                    *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
+                    *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
+                    if (real_range_max > 0x101)
+                        *d++ = (char) ILLEGAL_UTF8_BYTE;
+                    if (real_range_max > 0x100)
+                        d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
+                }
  
+              range_done:
+               /* mark the range as done, and continue */
+               didrange = TRUE;
+               dorange = FALSE;
+#ifdef EBCDIC
+               non_portable_endpoint = 0;
+                backslash_N = 0;
+#endif
+               continue;
+           } /* End of is a range */
+        } /* End of transliteration.  Joins main code after these else's */
         else if (*s == '[' && PL_lex_inpat && !in_charclass) {
             char *s1 = s-1;
             int esc = 0;
@@ -3139,14 +3277,6 @@ S_scan_const(pTHX_ char *start)
             }
  
             switch (*s) {
-
-           /* quoted - in transliterations */
-           case '-':
-               if (PL_lex_inwhat == OP_TRANS) {
-                   *d++ = *s++;
-                   continue;
-               }
-               /* FALLTHROUGH */
             default:
                 {
                     if ((isALPHANUMERIC(*s)))
@@ -3210,13 +3340,13 @@ S_scan_const(pTHX_ char *start)
                 }
  
               NUM_ESCAPE_INSERT:
-               /* Insert oct or hex escaped character.  There will always be
-                * enough room in sv since such escapes will be longer than any
-                * UTF-8 sequence they can end up as, except if they force us
-                * to recode the rest of the string into utf8 */
+               /* Insert oct or hex escaped character. */
                 
                 /* Here uv is the ordinal of the next character being added */
-               if (!UVCHR_IS_INVARIANT(uv)) {
+               if (UVCHR_IS_INVARIANT(uv)) {
+                   *d++ = (char) uv;
+               }
+               else {
                     if (!has_utf8 && uv > 255) {
                         /* Might need to recode whatever we have accumulated so
                          * far if it contains any chars variant in utf8 or
@@ -3227,17 +3357,31 @@ S_scan_const(pTHX_ char *start)
                         *d = '\0';
                         /* See Note on sizing above.  */
                         sv_utf8_upgrade_flags_grow(
-                                         sv,
-                                         SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
+                                       sv,
+                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
                                                    /* Above-latin1 in string
                                                     * implies no encoding */
                                                    |SV_UTF8_NO_ENCODING,
-                                         UNISKIP(uv) + (STRLEN)(send - s) + 1);
+                                       UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
                         d = SvPVX(sv) + SvCUR(sv);
                         has_utf8 = TRUE;
                      }
  
                      if (has_utf8) {
+                       /* Usually, there will already be enough room in 'sv'
+                        * since such escapes are likely longer than any UTF-8
+                        * sequence they can end up as.  This isn't the case on
+                        * EBCDIC where \x{40000000} contains 12 bytes, and the
+                        * UTF-8 for it contains 14.  And, we have to allow for
+                        * a trailing NUL.  It probably can't happen on ASCII
+                        * platforms, but be safe */
+                        const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
+                                            + 1;
+                        if (UNLIKELY(needed > SvLEN(sv))) {
+                            SvCUR_set(sv, d - SvPVX_const(sv));
+                            d = sv_grow(sv, needed) + SvCUR(sv);
+                        }
+
                         d = (char*)uvchr_to_utf8((U8*)d, uv);
                         if (PL_lex_inwhat == OP_TRANS
                              && PL_sublex_info.sub_op)
@@ -3246,26 +3390,23 @@ S_scan_const(pTHX_ char *start)
                                 (PL_lex_repl ? OPpTRANS_FROM_UTF
                                              : OPpTRANS_TO_UTF);
                         }
-#ifdef EBCDIC
-                       if (uv > 255 && !dorange)
-                           native_range = FALSE;
-#endif
                      }
                     else {
                         *d++ = (char)uv;
                     }
                 }
-               else {
-                   *d++ = (char) uv;
-               }
+#ifdef EBCDIC
+                non_portable_endpoint++;
+#endif
                 continue;
  
             case 'N':
                  /* In a non-pattern \N must be like \N{U+0041}, or it can be a
                   * named character, like \N{LATIN SMALL LETTER A}, or a named
                   * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
-                 * GRAVE}.  For convenience all three forms are referred to as
-                 * "named characters" below.
+                 * GRAVE} (except y/// can't handle the latter, croaking).  For
+                 * convenience all three forms are referred to as "named
+                 * characters" below.
                   *
                   * For patterns, \N also can mean to match a non-newline.  Code
                   * before this 'switch' statement should already have handled
@@ -3283,11 +3424,14 @@ S_scan_const(pTHX_ char *start)
                   *
                  * The structure of this section of code (besides checking for
                  * errors and upgrading to utf8) is:
-                 *  If the named character is of the form \N{U+...}, pass it
+                 *    If the named character is of the form \N{U+...}, pass it
                   *      through if a pattern; otherwise convert the code point
                   *      to utf8
-                 *  Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
-                 *      if a pattern; otherwise convert to utf8
+                 *    Otherwise must be some \N{NAME}: convert to
+                 *      \N{U+c1.c2...} if a pattern; otherwise convert to utf8
+                 *
+                 * Transliteration is an exception.  The conversion to utf8 is
+                 * only done if the code point requires it to be representable.
                   *
                   * Here, 's' points to the 'N'; the test below is guaranteed to
                  * succeed if we are being called on a pattern, as we already
@@ -3352,25 +3496,30 @@ S_scan_const(pTHX_ char *start)
                          if (len == 0 || (len != (STRLEN)(e - s)))
                              goto bad_NU;
  
-                         /* If the destination is not in utf8, unconditionally
-                         * recode it to be so.  This is because \N{} implies
-                         * Unicode semantics, and scalars have to be in utf8
-                         * to guarantee those semantics */
-                       if (! has_utf8) {
+                         /* For non-tr///, if the destination is not in utf8,
+                          * unconditionally recode it to be so.  This is
+                          * because \N{} implies Unicode semantics, and scalars
+                          * have to be in utf8 to guarantee those semantics.
+                          * tr/// doesn't care about Unicode rules, so no need
+                          * there to upgrade to UTF-8 for small enough code
+                          * points */
+                       if (! has_utf8 && (   uv > 0xFF
+                                           || PL_lex_inwhat != OP_TRANS))
+                        {
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
                             /* See Note on sizing above.  */
                             sv_utf8_upgrade_flags_grow(
-                                       sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       UNISKIP(uv) + (STRLEN)(send - e) + 1);
+                                    sv,
+                                    SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                   UVCHR_SKIP(uv) + (STRLEN)(send - e) + 1);
                             d = SvPVX(sv) + SvCUR(sv);
                             has_utf8 = TRUE;
                         }
  
                          /* Add the (Unicode) code point to the output. */
-                       if (UNI_IS_INVARIANT(uv)) {
+                       if (OFFUNI_IS_INVARIANT(uv)) {
                             *d++ = (char) LATIN1_TO_NATIVE(uv);
                         }
                         else {
@@ -3427,7 +3576,7 @@ S_scan_const(pTHX_ char *start)
                                                    /* The regex compiler is
                                                     * expecting Unicode, not
                                                     * native */
-                                                  (U8) NATIVE_TO_LATIN1(*str));
+                                                  NATIVE_TO_LATIN1(*str));
                                      PERL_MY_SNPRINTF_POST_GUARD(len,
                                                             sizeof(hex_string));
                                      Copy(hex_string, d, 3, char);
@@ -3498,11 +3647,32 @@ S_scan_const(pTHX_ char *start)
                     else { /* Here, not in a pattern.  Convert the name to a
                             * string. */
  
-                        /* If destination is not in utf8, unconditionally
-                         * recode it to be so.  This is because \N{} implies
-                         * Unicode semantics, and scalars have to be in utf8
-                         * to guarantee those semantics */
-                       if (! has_utf8) {
+                        if (PL_lex_inwhat == OP_TRANS) {
+                            str = SvPV_const(res, len);
+                            if (len > ((SvUTF8(res))
+                                       ? UTF8SKIP(str)
+                                       : 1U))
+                            {
+                                yyerror(Perl_form(aTHX_
+                                    "%.*s must not be a named sequence"
+                                    " in transliteration operator",
+                                        /*  +1 to include the "}" */
+                                    (int) (e + 1 - start), start));
+                                goto end_backslash_N;
+                            }
+                        }
+                        else if (! SvUTF8(res)) {
+                            /* Make sure \N{} return is UTF-8.  This is because
+                            * \N{} implies Unicode semantics, and scalars have to
+                            * be in utf8 to guarantee those semantics; but not
+                            * needed in tr/// */
+                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+                            str = SvPV_const(res, len);
+                        }
+
+                         /* Upgrade destination to be utf8 if this new
+                          * component is */
+                       if (! has_utf8 && SvUTF8(res)) {
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
@@ -3519,10 +3689,6 @@ S_scan_const(pTHX_ char *start)
                             const STRLEN off = d - SvPVX_const(sv);
                             d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
                         }
-                        if (! SvUTF8(res)) {    /* Make sure \N{} return is UTF-8 */
-                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
-                            str = SvPV_const(res, len);
-                        }
                         Copy(str, d, len, char);
                         d += len;
                     }
@@ -3530,9 +3696,10 @@ S_scan_const(pTHX_ char *start)
                     SvREFCNT_dec(res);
  
                 } /* End \N{NAME} */
+
+              end_backslash_N:
  #ifdef EBCDIC
-               if (!dorange) 
-                   native_range = FALSE; /* \N{} is defined to be Unicode */
+                backslash_N++; /* \N{} is defined to be Unicode */
  #endif
                 s = e + 1;  /* Point to just after the '}' */
                 continue;
@@ -3546,6 +3713,9 @@ S_scan_const(pTHX_ char *start)
                 else {
                     yyerror("Missing control char name in \\c");
                 }
+#ifdef EBCDIC
+                non_portable_endpoint++;
+#endif
                 continue;
  
             /* printf-style backslashes, formfeeds, newlines, etc */
@@ -3575,10 +3745,6 @@ S_scan_const(pTHX_ char *start)
             s++;
             continue;
         } /* end if (backslash) */
-#ifdef EBCDIC
-       else
-           literal_endpoint++;
-#endif
  
      default_action:
         /* If we started with encoded form, or already know we want it,
@@ -3586,7 +3752,6 @@ S_scan_const(pTHX_ char *start)
         if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
             STRLEN len  = 1;
  
-
             /* One might think that it is wasted effort in the case of the
              * source being utf8 (this_utf8 == TRUE) to take the next character
              * in the source, convert it to an unsigned value, and then convert
@@ -3597,7 +3762,7 @@ S_scan_const(pTHX_ char *start)
             const UV nextuv   = (this_utf8)
                                  ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                  : (UV) ((U8) *s);
-           const STRLEN need = UNISKIP(nextuv);
+           const STRLEN need = UVCHR_SKIP(nextuv);
             if (!has_utf8) {
                 SvCUR_set(sv, d - SvPVX_const(sv));
                 SvPOK_on(sv);
@@ -3618,10 +3783,6 @@ S_scan_const(pTHX_ char *start)
             s += len;
  
             d = (char*)uvchr_to_utf8((U8*)d, nextuv);
-#ifdef EBCDIC
-           if (uv > 255 && !dorange)
-               native_range = FALSE;
-#endif
         }
         else {
             *d++ = *s++;
@@ -4272,15 +4433,15 @@ S_check_scalar_slice(pTHX_ char *s)
      The type of the next token
  
    Structure:
+      Check if we have already built the token; if so, use it.
        Switch based on the current state:
-         - if we already built the token before, use it
           - if we have a case modifier in a string, deal with that
           - handle other cases of interpolation inside a string
           - scan the next line if we are inside a format
-      In the normal state switch on the next character:
+      In the normal state, switch on the next character:
           - default:
             if alphabetic, go to key lookup
-           unrecoginized character - croak
+           unrecognized character - croak
           - 0/4/26: handle end-of-line or EOF
           - cases for whitespace
           - \n and #: handle comments and line numbers
@@ -4339,10 +4500,6 @@ Perl_yylex(pTHX)
      if (PL_nexttoke) {
         PL_nexttoke--;
         pl_yylval = PL_nextval[PL_nexttoke];
-       if (!PL_nexttoke) {
-           PL_lex_state = PL_lex_defer;
-           PL_lex_defer = LEX_NORMAL;
-       }
         {
             I32 next_type;
             next_type = PL_nexttype[PL_nexttoke];
@@ -4516,14 +4673,6 @@ Perl_yylex(pTHX)
         /* FALLTHROUGH */
  
      case LEX_INTERPEND:
-       /* Treat state as LEX_NORMAL if we have no inner lexing scope.
-          XXX This hack can be removed if we stop setting PL_lex_state to
-          LEX_KNOWNEXT, as can the hack under LEX_INTREPCONCAT below.  */
-       if (UNLIKELY(!PL_lex_inwhat)) {
-           PL_lex_state = LEX_NORMAL;
-           break;
-       }
-
         if (PL_lex_dojoin) {
             const U8 dojoin_was = PL_lex_dojoin;
             PL_lex_dojoin = FALSE;
@@ -4575,14 +4724,6 @@ Perl_yylex(pTHX)
             Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
                        (long) PL_lex_brackets);
  #endif
-       /* Treat state as LEX_NORMAL when not in an inner lexing scope.
-          XXX This hack can be removed if we stop setting PL_lex_state to
-          LEX_KNOWNEXT.  */
-       if (UNLIKELY(!PL_lex_inwhat)) {
-           PL_lex_state = LEX_NORMAL;
-           break;
-       }
-
         if (PL_bufptr == PL_bufend)
             return REPORT(sublex_done());
  
@@ -4640,9 +4781,22 @@ Perl_yylex(pTHX)
    retry:
      switch (*s) {
      default:
-       if (UTF ? isIDFIRST_utf8((U8*)s) : isALNUMC(*s))
+       if (UTF) {
+            if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) {
+                ENTER;
+                SAVESPTR(PL_warnhook);
+                PL_warnhook = PERL_WARNHOOK_FATAL;
+                utf8n_to_uvchr((U8*)s, PL_bufend-s, NULL, 0);
+                LEAVE;
+            }
+            if (isIDFIRST_utf8((U8*)s)) {
+                goto keylookup;
+            }
+        }
+        else if (isALNUMC(*s)) {
             goto keylookup;
-       {
+       }
+    {
          SV *dsv = newSVpvs_flags("", SVs_TEMP);
          const char *c = UTF ? sv_uni_display(dsv, newSVpvn_flags(s,
                                                      UTF8SKIP(s),
@@ -5678,12 +5832,12 @@ Perl_yylex(pTHX)
                     else
                         /* skip plain q word */
                         while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
-                            t += UTF8SKIP(t);
+                           t += UTF ? UTF8SKIP(t) : 1;
                 }
                 else if (isWORDCHAR_lazy_if(t,UTF)) {
-                   t += UTF8SKIP(t);
+                   t += UTF ? UTF8SKIP(t) : 1;
                     while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
-                        t += UTF8SKIP(t);
+                       t += UTF ? UTF8SKIP(t) : 1;
                 }
                 while (t < PL_bufend && isSPACE(*t))
                     t++;
@@ -5838,6 +5992,8 @@ Perl_yylex(pTHX)
         {
             const char tmp = *s++;
             if (tmp == '=') {
+               if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, "=====", 5))
+                   Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
                 if (!PL_lex_allbrackets
                      && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
                  {
@@ -5952,8 +6108,11 @@ Perl_yylex(pTHX)
         if (PL_expect != XOPERATOR) {
             if (s[1] != '<' && !strchr(s,'>'))
                 check_uni();
-           if (s[1] == '<' && s[2] != '>')
+           if (s[1] == '<' && s[2] != '>') {
+               if ((s == PL_linestart || s[-1] == '\n') && strnEQ(s+2, "<<<<<", 5))
+                   Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s);
                 s = scan_heredoc(s);
+           }
             else
                 s = scan_inputsymbol(s);
             PL_expect = XOPERATOR;
@@ -5963,6 +6122,8 @@ Perl_yylex(pTHX)
         {
             char tmp = *s++;
             if (tmp == '<') {
+               if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, "<<<<<", 5))
+                   Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
                 if (*s == '=' && !PL_lex_allbrackets
                      && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
                  {
@@ -6003,6 +6164,8 @@ Perl_yylex(pTHX)
         {
             const char tmp = *s++;
             if (tmp == '>') {
+               if ((s == PL_linestart+2 || s[-3] == '\n') && strnEQ(s, ">>>>>", 5))
+                   Perl_croak(aTHX_ "Version control conflict marker '%.*s'", 7, s - 2);
                 if (*s == '=' && !PL_lex_allbrackets
                      && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
                  {
@@ -6186,11 +6349,18 @@ Perl_yylex(pTHX)
         TOKEN('$');
  
      case '@':
-       if (PL_expect == XOPERATOR)
-           no_op("Array", s);
-       else if (PL_expect == XPOSTDEREF) POSTDEREF('@');
+        if (PL_expect == XPOSTDEREF)
+            POSTDEREF('@');
         PL_tokenbuf[0] = '@';
         s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
+       if (PL_expect == XOPERATOR) {
+            d = s;
+            if (PL_bufptr > s) {
+                d = PL_bufptr-1;
+                PL_bufptr = PL_oldbufptr;
+            }
+           no_op("Array", d);
+        }
         pl_yylval.ival = 0;
         if (!PL_tokenbuf[1]) {
             PREREF('@');
@@ -7548,7 +7718,6 @@ Perl_yylex(pTHX)
             UNI(OP_LCFIRST);
  
         case KEY_local:
-           pl_yylval.ival = 0;
             OPERATOR(LOCAL);
  
         case KEY_length:
@@ -7620,17 +7789,7 @@ Perl_yylex(pTHX)
             if (isIDFIRST_lazy_if(s,UTF)) {
                 s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
                 if (len == 3 && strnEQ(PL_tokenbuf, "sub", 3))
-               {
-                   if (!FEATURE_LEXSUBS_IS_ENABLED)
-                       Perl_croak(aTHX_
-                                 "Experimental \"%s\" subs not enabled",
-                                  tmp == KEY_my    ? "my"    :
-                                  tmp == KEY_state ? "state" : "our");
-                   Perl_ck_warner_d(aTHX_
-                       packWARN(WARN_EXPERIMENTAL__LEXICAL_SUBS),
-                       "The lexical_subs feature is experimental");
                     goto really_sub;
-               }
                 PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
                 if (!PL_in_my_stash) {
                     char tmpbuf[1024];
@@ -7641,7 +7800,6 @@ Perl_yylex(pTHX)
                     yyerror_pv(tmpbuf, UTF ? SVf_UTF8 : 0);
                 }
             }
-           pl_yylval.ival = 1;
             OPERATOR(MY);
  
         case KEY_next:
@@ -8703,26 +8861,17 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN
   *          2) '{'
   *     The final case currently doesn't get this far in the program, so we
   *     don't test for it.  If that were to change, it would be ok to allow it.
- *  c) When not under Unicode rules, any upper Latin1 character
- *  d) Otherwise, when unicode rules are used, all XIDS characters.
+ *  b) When not under Unicode rules, any upper Latin1 character
+ *  c) Otherwise, when unicode rules are used, all XIDS characters.
   *
   *      Because all ASCII characters have the same representation whether
   *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
- *      '{' without knowing if is UTF-8 or not.
- * EBCDIC already uses the rules that ASCII platforms will use after the
- * deprecation cycle; see comment below about the deprecation. */
-#ifdef EBCDIC
-#   define VALID_LEN_ONE_IDENT(s, is_utf8)                                    \
+ *      '{' without knowing if is UTF-8 or not. */
+#define VALID_LEN_ONE_IDENT(s, is_utf8)                                       \
      (isGRAPH_A(*(s)) || ((is_utf8)                                            \
                           ? isIDFIRST_utf8((U8*) (s))                          \
                           : (isGRAPH_L1(*s)                                    \
                              && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
-#else
-#   define VALID_LEN_ONE_IDENT(s, is_utf8)                                    \
-    (isGRAPH_A(*(s)) || ((is_utf8)                                            \
-                         ? isIDFIRST_utf8((U8*) (s))                          \
-                         : ! isASCII_utf8((U8*) (s))))
-#endif
  
  STATIC char *
  S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
@@ -8787,18 +8936,6 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
                            : 1)
          && VALID_LEN_ONE_IDENT(s, is_utf8))
      {
-        /* Deprecate all non-graphic characters.  Include SHY as a non-graphic,
-         * because often it has no graphic representation.  (We can't get to
-         * here with SHY when 'is_utf8' is true, so no need to include a UTF-8
-         * test for it.) */
-        if ((is_utf8)
-            ? ! isGRAPH_utf8( (U8*) s)
-            : (! isGRAPH_L1( (U8) *s)
-               || UNLIKELY((U8) *(s) == LATIN1_TO_NATIVE(0xAD))))
-        {
-            deprecate("literal non-graphic characters in variable names");
-        }
-
          if (is_utf8) {
              const STRLEN skip = UTF8SKIP(s);
              STRLEN i;
@@ -9081,7 +9218,9 @@ S_scan_pat(pTHX_ char *start, I32 type)
                        "Use of /c modifier is meaningless without /g" );
      }
  
-    STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+    if (UNLIKELY((x_mod_count) > 1)) {
+        yyerror("Only one /x regex modifier is allowed");
+    }
  
      PL_lex_op = (OP*)pm;
      pl_yylval.ival = OP_MATCH;
@@ -9136,7 +9275,9 @@ S_scan_subst(pTHX_ char *start)
         }
      }
  
-    STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+    if (UNLIKELY((x_mod_count) > 1)) {
+        yyerror("Only one /x regex modifier is allowed");
+    }
  
      if ((pm->op_pmflags & PMf_CONTINUE)) {
          Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
@@ -9357,7 +9498,7 @@ S_scan_heredoc(pTHX_ char *s)
         SV *linestr;
         char *bufend;
         char * const olds = s;
-       PERL_CONTEXT * const cx = &cxstack[cxstack_ix];
+       PERL_CONTEXT * const cx = CX_CUR();
         /* These two fields are not set until an inner lexing scope is
            entered.  But we need them set here. */
         shared->ls_bufptr  = s;
@@ -9392,9 +9533,10 @@ S_scan_heredoc(pTHX_ char *s)
                 goto streaming;
             }
           }
-       else {  /* eval */
+       else {  /* eval or we've already hit EOF */
             s = (char*)memchr((void*)s, '\n', PL_bufend - s);
-           assert(s);
+           if (!s)
+                goto interminable;
         }
         linestr = shared->ls_linestr;
         bufend = SvEND(linestr);
@@ -9450,12 +9592,14 @@ S_scan_heredoc(pTHX_ char *s)
      else
      {
        SV *linestr_save;
+      char *oldbufptr_save;
       streaming:
        sv_setpvs(tmpstr,"");   /* avoid "uninitialized" warning */
        term = PL_tokenbuf[1];
        len--;
        linestr_save = PL_linestr; /* must restore this afterwards */
        d = s;                    /* and this */
+      oldbufptr_save = PL_oldbufptr;
        PL_linestr = newSVpvs("");
        PL_bufend = SvPVX(PL_linestr);
        while (1) {
@@ -9472,6 +9616,7 @@ S_scan_heredoc(pTHX_ char *s)
                restore PL_linestr. */
             SvREFCNT_dec_NN(PL_linestr);
             PL_linestr = linestr_save;
+            PL_oldbufptr = oldbufptr_save;
             goto interminable;
         }
         CopLINE_set(PL_curcop, origline);
@@ -9506,6 +9651,7 @@ S_scan_heredoc(pTHX_ char *s)
             PL_linestr = linestr_save;
             PL_linestart = SvPVX(linestr_save);
             PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
+            PL_oldbufptr = oldbufptr_save;
             s = d;
             break;
         }
@@ -10106,6 +10252,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
       * multiple fp operations. */
      bool hexfp = FALSE;
      int total_bits = 0;
+    int significant_bits = 0;
  #if NVSIZE == 8 && defined(HAS_QUAD) && defined(Uquad_t)
  #  define HEXFP_UQUAD
      Uquad_t hexfp_uquad = 0;
@@ -10116,6 +10263,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
  #endif
      NV hexfp_mult = 1.0;
      UV high_non_zero = 0; /* highest digit */
+    int non_zero_integer_digits = 0;
  
      PERL_ARGS_ASSERT_SCAN_NUM;
  
@@ -10268,6 +10416,9 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                      if (high_non_zero == 0 && b > 0)
                          high_non_zero = b;
  
+                    if (high_non_zero)
+                        non_zero_integer_digits++;
+
                      /* this could be hexfp, but peek ahead
                       * to avoid matching ".." */
                      if (UNLIKELY(HEXFP_PEEK(s))) {
@@ -10294,43 +10445,103 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                   * detection will shortly be more thorough with the
                   * underbar checks. */
                  const char* h = s;
+                significant_bits = non_zero_integer_digits * shift;
  #ifdef HEXFP_UQUAD
                  hexfp_uquad = u;
  #else /* HEXFP_NV */
                  hexfp_nv = u;
  #endif
+                /* Ignore the leading zero bits of
+                 * the high (first) non-zero digit. */
+                if (high_non_zero) {
+                    if (high_non_zero < 0x8)
+                        significant_bits--;
+                    if (high_non_zero < 0x4)
+                        significant_bits--;
+                    if (high_non_zero < 0x2)
+                        significant_bits--;
+                }
+
                  if (*h == '.') {
  #ifdef HEXFP_NV
-                    NV mult = 1 / 16.0;
+                    NV nv_mult = 1.0;
  #endif
-                    h++;
-                    while (isXDIGIT(*h) || *h == '_') {
+                    bool accumulate = TRUE;
+                    for (h++; (isXDIGIT(*h) || *h == '_'); h++) {
                          if (isXDIGIT(*h)) {
                              U8 b = XDIGIT_VALUE(*h);
-                            total_bits += shift;
+                            significant_bits += shift;
  #ifdef HEXFP_UQUAD
-                            hexfp_uquad <<= shift;
-                            hexfp_uquad |= b;
-                            hexfp_frac_bits += shift;
+                            if (accumulate) {
+                                if (significant_bits < NV_MANT_DIG) {
+                                    /* We are in the long "run" of xdigits,
+                                     * accumulate the full four bits. */
+                                    hexfp_uquad <<= shift;
+                                    hexfp_uquad |= b;
+                                    hexfp_frac_bits += shift;
+                                } else {
+                                    /* We are at a hexdigit either at,
+                                     * or straddling, the edge of mantissa.
+                                     * We will try grabbing as many as
+                                     * possible bits. */
+                                    int tail =
+                                      significant_bits - NV_MANT_DIG;
+                                    if (tail <= 0)
+                                       tail += shift;
+                                    hexfp_uquad <<= tail;
+                                    hexfp_uquad |= b >> (shift - tail);
+                                    hexfp_frac_bits += tail;
+
+                                    /* Ignore the trailing zero bits
+                                     * of the last non-zero xdigit.
+                                     *
+                                     * The assumption here is that if
+                                     * one has input of e.g. the xdigit
+                                     * eight (0x8), there is only one
+                                     * bit being input, not the full
+                                     * four bits.  Conversely, if one
+                                     * specifies a zero xdigit, the
+                                     * assumption is that one really
+                                     * wants all those bits to be zero. */
+                                    if (b) {
+                                        if ((b & 0x1) == 0x0) {
+                                            significant_bits--;
+                                            if ((b & 0x2) == 0x0) {
+                                                significant_bits--;
+                                                if ((b & 0x4) == 0x0) {
+                                                    significant_bits--;
+                                                }
+                                            }
+                                        }
+                                    }
+
+                                    accumulate = FALSE;
+                                }
+                            } else {
+                                /* Keep skipping the xdigits, and
+                                 * accumulating the significant bits,
+                                 * but do not shift the uquad
+                                 * (which would catastrophically drop
+                                 * high-order bits) or accumulate the
+                                 * xdigits anymore. */
+                            }
  #else /* HEXFP_NV */
-                            hexfp_nv += b * mult;
-                            mult /= 16.0;
+                            if (accumulate) {
+                                nv_mult /= 16.0;
+                                if (nv_mult > 0.0)
+                                    hexfp_nv += b * nv_mult;
+                                else
+                                    accumulate = FALSE;
+                            }
  #endif
                          }
-                        h++;
+                        if (significant_bits >= NV_MANT_DIG)
+                            accumulate = FALSE;
                      }
                  }
  
-                if (total_bits >= 4) {
-                    if (high_non_zero < 0x8)
-                        total_bits--;
-                    if (high_non_zero < 0x4)
-                        total_bits--;
-                    if (high_non_zero < 0x2)
-                        total_bits--;
-                }
-
-                if (total_bits > 0 && (isALPHA_FOLD_EQ(*h, 'p'))) {
+                if ((total_bits > 0 || significant_bits > 0) &&
+                    isALPHA_FOLD_EQ(*h, 'p')) {
                      bool negexp = FALSE;
                      h++;
                      if (*h == '+')
@@ -10574,7 +10785,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             *d = '\0';
              if (UNLIKELY(hexfp)) {
  #  ifdef NV_MANT_DIG
-                if (total_bits > NV_MANT_DIG)
+                if (significant_bits > NV_MANT_DIG)
                      Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
                                     "Hexadecimal float: mantissa overflow");
  #  endif
@@ -10823,8 +11034,7 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags)
      else if (yychar > 255)
         sv_catpvs(where_sv, "next token ???");
      else if (yychar == YYEMPTY) {
-       if (    PL_lex_state == LEX_NORMAL
-            || (PL_lex_state == LEX_KNOWNEXT && PL_lex_defer == LEX_NORMAL))
+       if (PL_lex_state == LEX_NORMAL)
             sv_catpvs(where_sv, "at end of line");
         else if (PL_lex_inpat)
             sv_catpvs(where_sv, "within pattern");
@@ -11185,10 +11395,7 @@ Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
                                          "Integer overflow in decimal number");
                 }
             }
-#ifdef EBCDIC
-           if (rev > 0x7FFFFFFF)
-                Perl_croak(aTHX_ "In EBCDIC the v-string components cannot exceed 2147483647");
-#endif
+
             /* Append native character for the rev point */
             tmpend = uvchr_to_utf8(tmpbuf, rev);
             sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
@@ -11496,7 +11703,7 @@ Perl_parse_label(pTHX_ U32 flags)
  {
      if (flags & ~PARSE_OPTIONAL)
         Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_label");
-    if (PL_lex_state == LEX_KNOWNEXT) {
+    if (PL_nexttoke) {
         PL_parser->yychar = yylex();
         if (PL_parser->yychar == LABEL) {
             char * const lpv = pl_yylval.pval;