toke.c: Avoid work for tr/a-a/.../

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index 0b9b8fc..8409bb6 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -467,7 +467,7 @@ S_printbuf(pTHX_ const char *const fmt, const char *const s)
  static int
  S_deprecate_commaless_var_list(pTHX) {
      PL_expect = XTERM;
-    deprecate("comma-less variable list");
+    deprecate_fatal_in("5.28", "Use of comma-less variable list is deprecated");
      return REPORT(','); /* grandfather non-comma-format format */
  }
  
@@ -2543,7 +2543,7 @@ S_sublex_done(pTHX)
      }
  }
  
-PERL_STATIC_INLINE SV*
+STATIC SV*
  S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
  {
      /* <s> points to first character of interior of \N{}, <e> to one beyond the
@@ -2563,8 +2563,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
      PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
  
      if (!SvCUR(res)) {
-        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
-                       "Unknown charname '' is deprecated");
+        deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
          return res;
      }
  
@@ -2785,15 +2784,17 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
  
    In transliterations:
      characters are VERY literal, except for - not at the start or end
-    of the string, which indicates a range. If the range is in bytes,
+    of the string, which indicates a range.  However some backslash sequences
+    are recognized: \r, \n, and the like
+                    \007 \o{}, \x{}, \N{}
+    If all elements in the transliteration are below 256,
      scan_const expands the range to the full set of intermediate
      characters. If the range is in utf8, the hyphen is replaced with
      a certain range mark which will be handled by pmtrans() in op.c.
  
    In double-quoted strings:
      backslashes:
-      double-quoted style: \r and \n
-      constants: \x31, etc.
+      all those recognized in transliterations
        deprecated backrefs: \1 (in substitution replacements)
        case and quoting: \U \Q \E
      stops on @ and $
@@ -2836,7 +2837,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
           } (end if backslash)
            handle regular character
      } (end while character to read)
-               
+
  */
  
  STATIC char *
@@ -2851,11 +2852,18 @@ S_scan_const(pTHX_ char *start)
      bool didrange = FALSE;              /* did we just finish a range? */
      bool in_charclass = FALSE;          /* within /[...]/ */
      bool has_utf8 = FALSE;              /* Output constant is UTF8 */
+    bool has_above_latin1 = FALSE;      /* does something require special
+                                           handling in tr/// ? */
      bool  this_utf8 = cBOOL(UTF);       /* Is the source string assumed to be
                                             UTF8?  But, this can show as true
                                             when the source isn't utf8, as for
                                             example when it is entirely composed
                                             of hex constants */
+    STRLEN utf8_variant_count = 0;      /* When not in UTF-8, this counts the
+                                           number of characters found so far
+                                           that will expand (into 2 bytes)
+                                           should we have to convert to
+                                           UTF-8) */
      SV *res;                           /* result from charnames */
      STRLEN offset_to_max;   /* The offset in the output to where the range
                                 high-end character is temporarily placed */
@@ -2904,30 +2912,29 @@ S_scan_const(pTHX_ char *start)
               * range, so for most cases we just drop down and handle the value
               * as any other.  There are two exceptions.
               *
-             * 1.  A minus sign indicates that we are actually going to have
-             *     a range.  In this case, skip the '-', set a flag, then drop
+             * 1.  A hyphen indicates that we are actually going to have a
+             *     range.  In this case, skip the '-', set a flag, then drop
               *     down to handle what should be the end range value.
               * 2.  After we've handled that value, the next time through, that
               *     flag is set and we fix up the range.
               *
               * Ranges entirely within Latin1 are expanded out entirely, in
-             * order to avoid the significant overhead of making a swash.
-             * Ranges that extend above Latin1 have to have a swash, so there
-             * is no advantage to abbreviating them here, so they are stored
-             * here as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte signifies
-             * a hyphen without any possible ambiguity.  On EBCDIC machines, if
-             * the range is expressed as Unicode, the Latin1 portion is
-             * expanded out even if the entire range extends above Latin1.
-             * This is because each code point in it has to be processed here
-             * individually to get its native translation */
+             * order to make the transliteration a simple table look-up.
+             * Ranges that extend above Latin1 have to be done differently, so
+             * there is no advantage to expanding them here, so they are
+             * stored here as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte
+             * signifies a hyphen without any possible ambiguity.  On EBCDIC
+             * machines, if the range is expressed as Unicode, the Latin1
+             * portion is expanded out even if the range extends above
+             * Latin1.  This is because each code point in it has to be
+             * processed here individually to get its native translation */
  
             if (! dorange) {
  
-                /* Here, we don't think we're in a range.  If we've processed
-                 * at least one character, then see if this next one is a '-',
-                 * indicating the previous one was the start of a range.  But
-                 * don't bother if we're too close to the end for the minus to
-                 * mean that. */
+                /* Here, we don't think we're in a range.  If the new character
+                 * is not a hyphen; or if it is a hyphen, but it's too close to
+                 * either edge to indicate a range, then it's a regular
+                 * character. */
                  if (*s != '-' || s >= send - 1 || s == start) {
  
                      /* A regular character.  Process like any other, but first
@@ -2938,16 +2945,26 @@ S_scan_const(pTHX_ char *start)
                      non_portable_endpoint = 0;
                      backslash_N = 0;
  #endif
+                    /* The tests here for being above Latin1 and similar ones
+                     * in the following 'else' suffice to find all such
+                     * occurences in the constant, except those added by a
+                     * backslash escape sequence, like \x{100}.  And all those
+                     * set 'has_above_latin1' as appropriate */
+                    if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
+                        has_above_latin1 = TRUE;
+                    }
+
                      /* Drops down to generic code to process current byte */
                  }
-                else {
+                else {  /* Is a '-' in the context where it means a range */
                      if (didrange) { /* Something like y/A-C-Z// */
-                        Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+                        Perl_croak(aTHX_ "Ambiguous range in transliteration"
+                                         " operator");
                      }
  
                      dorange = TRUE;
  
-                    s++;    /* Skip past the minus */
+                    s++;    /* Skip past the hyphen */
  
                      /* d now points to where the end-range character will be
                       * placed.  Save it so won't have to go finding it later,
@@ -2957,6 +2974,12 @@ S_scan_const(pTHX_ char *start)
                       * pointer).  We'll finish processing the range the next
                       * time through the loop */
                      offset_to_max = d - SvPVX_const(sv);
+
+                    if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
+                        has_above_latin1 = TRUE;
+                    }
+
+                    /* Drops down to generic code to process current byte */
                  }
              }  /* End of not a range */
              else {
@@ -2968,9 +2991,9 @@ S_scan_const(pTHX_ char *start)
                   * 'd'  points to just beyond the range end in the 'sv' string,
                   *      where we would next place something
                   * 'offset_to_max' is the offset in 'sv' at which the character
-                 *      before 'd' begins.
+                 *      (the range's maximum end point) before 'd'  begins.
                   */
-                const char * max_ptr = SvPVX_const(sv) + offset_to_max;
+                char * max_ptr = SvPVX(sv) + offset_to_max;
                  const char * min_ptr;
                  IV range_min;
                 IV range_max;   /* last character in range */
@@ -2981,7 +3004,7 @@ S_scan_const(pTHX_ char *start)
                  IV real_range_max = 0;
  #endif
  
-                /* Get the range-ends code point values. */
+                /* Get the code point values of the range ends. */
                  if (has_utf8) {
                      /* We know the utf8 is valid, because we just constructed
                       * it ourselves in previous loop iterations */
@@ -2995,23 +3018,36 @@ S_scan_const(pTHX_ char *start)
                      range_max = * (U8*) max_ptr;
                  }
  
+                /* If the range is just a single code point, like tr/a-a/.../,
+                 * that code point is already in the output, twice.  We can
+                 * just back up over the second instance and avoid all the rest
+                 * of the work.  But if it is a variant character, it's been
+                 * counted twice, so decrement */
+                if (UNLIKELY(range_max == range_min)) {
+                    d = max_ptr;
+                    if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
+                        utf8_variant_count--;
+                    }
+                    goto range_done;
+                }
+
  #ifdef EBCDIC
                  /* On EBCDIC platforms, we may have to deal with portable
                   * ranges.  These happen if at least one range endpoint is a
                   * Unicode value (\N{...}), or if the range is a subset of
                   * [A-Z] or [a-z], and both ends are literal characters,
                   * like 'A', and not like \x{C1} */
-                if ((convert_unicode
-                     = cBOOL(backslash_N)   /* \N{} forces Unicode, hence
-                                               portable range */
-                      || (   ! non_portable_endpoint
-                          && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
-                             || (isUPPER_A(range_min) && isUPPER_A(range_max))))
-                )) {
+                convert_unicode =
+                               cBOOL(backslash_N)   /* \N{} forces Unicode,
+                                                       hence portable range */
+                    || (     ! non_portable_endpoint
+                        && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
+                           || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+                if (convert_unicode) {
  
                      /* Special handling is needed for these portable ranges.
-                     * They are defined to all be in Unicode terms, which
-                     * include all Unicode code points between the end points.
+                     * They are defined to be in Unicode terms, which includes
+                     * all the Unicode code points between the end points.
                       * Convert to Unicode to get the Unicode range.  Later we
                       * will convert each code point in the range back to
                       * native.  */
@@ -3040,32 +3076,33 @@ S_scan_const(pTHX_ char *start)
                      }
  #ifdef EBCDIC
                      else if (convert_unicode) {
-                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+        /* diag_listed_as: Invalid range "%s" in transliteration operator */
                          Perl_croak(aTHX_
-                              "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
-                               " in transliteration operator",
-                              range_min, range_max);
+                           "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
+                           UVXf "}\" in transliteration operator",
+                           range_min, range_max);
                      }
  #endif
                      else {
-                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+        /* diag_listed_as: Invalid range "%s" in transliteration operator */
                          Perl_croak(aTHX_
-                              "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
-                               " in transliteration operator",
-                              range_min, range_max);
+                           "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
+                           " in transliteration operator",
+                           range_min, range_max);
                      }
                  }
  
                 if (has_utf8) {
  
-                    /* We try to avoid creating a swash.  If the upper end of
-                     * this range is below 256, this range won't force a swash;
-                     * otherwise it does force a swash, and as long as we have
-                     * to have one, we might as well not expand things out.
-                     * But if it's EBCDIC, we may have to look at each
-                     * character below 256 if we have to convert to/from
-                     * Unicode values */
-                    if (range_max > 255
+                    /* If everything in the transliteration is below 256, we
+                     * can avoid special handling later.  A translation table
+                     * for each of those bytes is created by op.c.  So we
+                     * expand out all ranges to their constituent code points.
+                     * But if we've encountered something above 255, the
+                     * expanding won't help, so skip doing that.  But if it's
+                     * EBCDIC, we may have to look at each character below 256
+                     * if we have to convert to/from Unicode values */
+                    if (   has_above_latin1
  #ifdef EBCDIC
                         && (range_min > 255 || ! convert_unicode)
  #endif
@@ -3073,7 +3110,7 @@ S_scan_const(pTHX_ char *start)
                          /* Move the high character one byte to the right; then
                           * insert between it and the range begin, an illegal
                           * byte which serves to indicate this is a range (using
-                         * a '-' could be ambiguous). */
+                         * a '-' would be ambiguous). */
                          char *e = d++;
                          while (e-- > max_ptr) {
                              *(e + 1) = *e;
@@ -3138,8 +3175,9 @@ S_scan_const(pTHX_ char *start)
                       * equivalent */
                      if (has_utf8) {
                          for (i = range_min; i <= range_max; i++) {
-                            append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
-                                                         (U8 **) &d);
+                            append_utf8_from_native_byte(
+                                                    LATIN1_TO_NATIVE((U8) i),
+                                                    (U8 **) &d);
                          }
                      }
                      else {
@@ -3172,14 +3210,17 @@ S_scan_const(pTHX_ char *start)
                 }
  
  #ifdef EBCDIC
-                /* If the original range extended above 255, add in that portion. */
+                /* If the original range extended above 255, add in that
+                 * portion. */
                  if (real_range_max) {
                      *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
                      *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
-                    if (real_range_max > 0x101)
-                        *d++ = (char) ILLEGAL_UTF8_BYTE;
-                    if (real_range_max > 0x100)
+                    if (real_range_max > 0x100) {
+                        if (real_range_max > 0x101) {
+                            *d++ = (char) ILLEGAL_UTF8_BYTE;
+                        }
                          d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
+                    }
                  }
  #endif
  
@@ -3202,8 +3243,7 @@ S_scan_const(pTHX_ char *start)
             if (!esc)
                 in_charclass = TRUE;
         }
-
-       else if (*s == ']' && PL_lex_inpat &&  in_charclass) {
+       else if (*s == ']' && PL_lex_inpat && in_charclass) {
             char *s1 = s-1;
             int esc = 0;
             while (s1 >= start && *s1-- == '\\')
@@ -3211,11 +3251,9 @@ S_scan_const(pTHX_ char *start)
             if (!esc)
                 in_charclass = FALSE;
         }
-
-       /* skip for regexp comments /(?#comment)/, except for the last
-        * char, which will be done separately.
-        * Stop on (?{..}) and friends */
-
+            /* skip for regexp comments /(?#comment)/, except for the last
+             * char, which will be done separately.  Stop on (?{..}) and
+             * friends */
         else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
             if (s[2] == '#') {
                 while (s+1 < send && *s != ')')
@@ -3228,24 +3266,22 @@ S_scan_const(pTHX_ char *start)
                 break;
             }
         }
-
-       /* likewise skip #-initiated comments in //x patterns */
+            /* likewise skip #-initiated comments in //x patterns */
         else if (*s == '#'
                   && PL_lex_inpat
                   && !in_charclass
                   && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
          {
-           while (s+1 < send && *s != '\n')
+           while (s < send && *s != '\n')
                 *d++ = *s++;
         }
-
-       /* no further processing of single-quoted regex */
+            /* no further processing of single-quoted regex */
         else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
             goto default_action;
  
-       /* check for embedded arrays
-          (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
-          */
+            /* check for embedded arrays
+             * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
+             */
         else if (*s == '@' && s[1]) {
             if (UTF
                 ? isIDFIRST_utf8_safe(s+1, send)
@@ -3258,10 +3294,8 @@ S_scan_const(pTHX_ char *start)
             if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
                 break; /* in regexp, neither @+ nor @- are interpolated */
         }
-
-       /* check for embedded scalars.  only stop if we're sure it's a
-          variable.
-        */
+            /* check for embedded scalars.  only stop if we're sure it's a
+             * variable.  */
         else if (*s == '$') {
             if (!PL_lex_inpat)  /* not a regexp, so $ must be var */
                 break;
@@ -3276,6 +3310,11 @@ S_scan_const(pTHX_ char *start)
  
         /* End of else if chain - OP_TRANS rejoin rest */
  
+        if (UNLIKELY(s >= send)) {
+            assert(s == send);
+            break;
+        }
+
         /* backslashes */
         if (*s == '\\' && s+1 < send) {
             char* e;    /* Can be used for ending '}', etc. */
@@ -3365,7 +3404,7 @@ S_scan_const(pTHX_ char *start)
                                                 UTF);
                     if (! valid) {
                         yyerror(error);
-                       continue;
+                       uv = 0; /* drop through to ensure range ends are set */
                     }
                     goto NUM_ESCAPE_INSERT;
                 }
@@ -3383,47 +3422,64 @@ S_scan_const(pTHX_ char *start)
                                                 UTF);
                     if (! valid) {
                         yyerror(error);
-                       continue;
+                       uv = 0; /* drop through to ensure range ends are set */
                     }
                 }
  
               NUM_ESCAPE_INSERT:
                 /* Insert oct or hex escaped character. */
-               
+
                 /* Here uv is the ordinal of the next character being added */
                 if (UVCHR_IS_INVARIANT(uv)) {
                     *d++ = (char) uv;
                 }
                 else {
                     if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have accumulated so
-                        * far if it contains any chars variant in utf8 or
-                        * utf-ebcdic. */
  
-                       SvCUR_set(sv, d - SvPVX_const(sv));
-                       SvPOK_on(sv);
-                       *d = '\0';
-                       /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(
-                                       sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
-                                                  /* Above-latin1 in string
-                                                   * implies no encoding */
-                                                  |SV_UTF8_NO_ENCODING,
-                                       UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
-                       d = SvPVX(sv) + SvCUR(sv);
-                       has_utf8 = TRUE;
+                        /* Here, 'uv' won't fit unless we convert to UTF-8.
+                         * If we've only seen invariants so far, all we have to
+                         * do is turn on the flag */
+                        if (utf8_variant_count == 0) {
+                            SvUTF8_on(sv);
+                        }
+                        else {
+                            SvCUR_set(sv, d - SvPVX_const(sv));
+                            SvPOK_on(sv);
+                            *d = '\0';
+
+                            sv_utf8_upgrade_flags_grow(
+                                           sv,
+                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+
+                                           /* Since we're having to grow here,
+                                            * make sure we have enough room for
+                                            * this escape and a NUL, so the
+                                            * code immediately below won't have
+                                            * to actually grow again */
+                                          UVCHR_SKIP(uv)
+                                        + (STRLEN)(send - s) + 1);
+                            d = SvPVX(sv) + SvCUR(sv);
+                        }
+
+                        has_above_latin1 = TRUE;
+                        has_utf8 = TRUE;
                      }
  
-                    if (has_utf8) {
+                    if (! has_utf8) {
+                       *d++ = (char)uv;
+                        utf8_variant_count++;
+                    }
+                   else {
                         /* Usually, there will already be enough room in 'sv'
                          * since such escapes are likely longer than any UTF-8
                          * sequence they can end up as.  This isn't the case on
                          * EBCDIC where \x{40000000} contains 12 bytes, and the
                          * UTF-8 for it contains 14.  And, we have to allow for
                          * a trailing NUL.  It probably can't happen on ASCII
-                        * platforms, but be safe */
-                        const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
+                        * platforms, but be safe.  See Note on sizing above. */
+                        const STRLEN needed = d - SvPVX(sv)
+                                            + UVCHR_SKIP(uv)
+                                            + (send - s)
                                              + 1;
                          if (UNLIKELY(needed > SvLEN(sv))) {
                              SvCUR_set(sv, d - SvPVX_const(sv));
@@ -3438,9 +3494,6 @@ S_scan_const(pTHX_ char *start)
                                 (PL_lex_repl ? OPpTRANS_FROM_UTF
                                              : OPpTRANS_TO_UTF);
                         }
-                    }
-                   else {
-                       *d++ = (char)uv;
                     }
                 }
  #ifdef EBCDIC
@@ -3554,16 +3607,27 @@ S_scan_const(pTHX_ char *start)
                         if (! has_utf8 && (   uv > 0xFF
                                             || PL_lex_inwhat != OP_TRANS))
                          {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(
-                                    sv,
-                                    SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                   OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(
+                                               sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
+
                             has_utf8 = TRUE;
+                            has_above_latin1 = TRUE;
                         }
  
                          /* Add the (Unicode) code point to the output. */
@@ -3708,27 +3772,41 @@ S_scan_const(pTHX_ char *start)
                                      (int) (e + 1 - start), start));
                                  goto end_backslash_N;
                              }
+
+                            if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
+                                has_above_latin1 = TRUE;
+                            }
+
                          }
                          else if (! SvUTF8(res)) {
                              /* Make sure \N{} return is UTF-8.  This is because
                               * \N{} implies Unicode semantics, and scalars have
                               * to be in utf8 to guarantee those semantics; but
                               * not needed in tr/// */
-                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+                            sv_utf8_upgrade_flags(res, 0);
                              str = SvPV_const(res, len);
                          }
  
                           /* Upgrade destination to be utf8 if this new
                            * component is */
                         if (! has_utf8 && SvUTF8(res)) {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = len + (send - s) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(sv,
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(sv,
                                                 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               len + (STRLEN)(send - s) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
                             has_utf8 = TRUE;
                         } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
  
@@ -3800,12 +3878,17 @@ S_scan_const(pTHX_ char *start)
           * to/from UTF-8.
           *
           * If the input has the same representation in UTF-8 as not, it will be
-         * a single byte, and we don't care about UTF8ness; or if neither
-         * source nor output is UTF-8, just copy the byte */
-        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
-        {
+         * a single byte, and we don't care about UTF8ness; just copy the byte */
+        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
             *d++ = *s++;
          }
+        else if (! this_utf8 && ! has_utf8) {
+            /* If neither source nor output is UTF-8, is also a single byte,
+             * just copy it; but this byte counts should we later have to
+             * convert to UTF-8 */
+           *d++ = *s++;
+            utf8_variant_count++;
+        }
          else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
             const STRLEN len = UTF8SKIP(s);
  
@@ -3822,16 +3905,26 @@ S_scan_const(pTHX_ char *start)
             const UV nextuv   = (this_utf8)
                                  ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                  : (UV) ((U8) *s);
-           const STRLEN need = UVCHR_SKIP(nextuv);
+           STRLEN need = UVCHR_SKIP(nextuv);
+
             if (!has_utf8) {
                 SvCUR_set(sv, d - SvPVX_const(sv));
                 SvPOK_on(sv);
                 *d = '\0';
-               /* See Note on sizing above.  */
-               sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       need + (STRLEN)(send - s) + 1);
-               d = SvPVX(sv) + SvCUR(sv);
+
+                /* See Note on sizing above. */
+                need += (STRLEN)(send - s) + 1;
+
+                if (utf8_variant_count == 0) {
+                    SvUTF8_on(sv);
+                    d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+                }
+                else {
+                    sv_utf8_upgrade_flags_grow(sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               need);
+                    d = SvPVX(sv) + SvCUR(sv);
+                }
                 has_utf8 = TRUE;
             } else if (need > len) {
                 /* encoded value larger than old, may need extra space (NOTE:
@@ -5690,7 +5783,8 @@ Perl_yylex(pTHX)
                     if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) {
                         sv_free(sv);
                         if (PL_in_my == KEY_our) {
-                           deprecate(":unique");
+                            deprecate_disappears_in("5.28",
+                                "Attribute \"unique\" is deprecated");
                         }
                         else
                             Perl_croak(aTHX_ "The 'unique' attribute may only be applied to 'our' variables");
@@ -5704,7 +5798,8 @@ Perl_yylex(pTHX)
                     }
                     else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) {
                         sv_free(sv);
-                       deprecate(":locked");
+                        deprecate_disappears_in("5.28",
+                            "Attribute \"locked\" is deprecated");
                     }
                     else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
                         sv_free(sv);
@@ -6947,8 +7042,10 @@ Perl_yylex(pTHX)
             else {                      /* no override */
                 tmp = -tmp;
                 if (tmp == KEY_dump) {
-                   Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
-                                  "dump() better written as CORE::dump()");
+                   Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
+                                    "dump() better written as CORE::dump(). "
+                                     "dump() will no longer be available "
+                                     "in Perl 5.30");
                 }
                 gv = NULL;
                 gvp = 0;
@@ -7633,7 +7730,7 @@ Perl_yylex(pTHX)
  
         case KEY_exists:
             UNI(OP_EXISTS);
-       
+
         case KEY_exit:
             UNI(OP_EXIT);
  
@@ -7869,7 +7966,7 @@ Perl_yylex(pTHX)
  
         case KEY_last:
             LOOPX(OP_LAST);
-       
+
         case KEY_lc:
             UNI(OP_LC);
  
@@ -8049,7 +8146,7 @@ Perl_yylex(pTHX)
  
         case KEY_pos:
             UNIDOR(OP_POS);
-       
+
         case KEY_pack:
             LOP(OP_PACK,XTERM);
  
@@ -8237,7 +8334,7 @@ Perl_yylex(pTHX)
  
         case KEY_chomp:
             UNI(OP_CHOMP);
-       
+
         case KEY_scalar:
             UNI(OP_SCALAR);
  
@@ -8877,7 +8974,7 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
         || ! SvOK(*cvp))
      {
         char *msg;
-       
+
         /* Here haven't found what we're looking for.  If it is charnames,
          * perhaps it needs to be loaded.  Try doing that before giving up */
         if (*key == 'c') {
@@ -9427,10 +9524,6 @@ S_scan_pat(pTHX_ char *start, I32 type)
                        "Use of /c modifier is meaningless without /g" );
      }
  
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
      PL_lex_op = (OP*)pm;
      pl_yylval.ival = OP_MATCH;
      return s;
@@ -9485,10 +9578,6 @@ S_scan_subst(pTHX_ char *start)
         }
      }
  
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
      if ((pm->op_pmflags & PMf_CONTINUE)) {
          Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
      }
@@ -9665,7 +9754,7 @@ S_scan_heredoc(pTHX_ char *s)
         else
             term = '"';
         if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
-           deprecate("bare << to mean <<\"\"");
+           deprecate_fatal_in("5.28", "Use of bare << to mean <<\"\" is deprecated");
         peek = s;
          while (
                 isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
@@ -10215,7 +10304,7 @@ S_scan_inputsymbol(pTHX_ char *start)
         ($*@)           sub prototypes          sub foo ($)
         (stuff)         sub attr parameters     sub foo : attr(stuff)
         <>              readline or globs       <FOO>, <>, <$fh>, or <*.c>
-       
+
     In most of these cases (all but <>, patterns and transliterate)
     yylex() calls scan_str().  m// makes yylex() call scan_pat() which
     calls scan_str().  s/// makes yylex() call scan_subst() which calls
@@ -10378,7 +10467,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                 *to = *s;
             }
         }
-       
+
         /* if the terminator isn't the same as the start character (e.g.,
            matched brackets), we have to allow more in the quoting, and
            be prepared for nested brackets.
@@ -10436,7 +10525,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
         else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
             to[-1] = '\n';
  #endif
-       
+
         /* if we're out of file, or a read fails, bail and reset the current
            line marker so we can report where the unterminated string began
         */
@@ -10515,6 +10604,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
      bool floatit;                      /* boolean: int or float? */
      const char *lastub = NULL;         /* position of last underbar */
      static const char* const number_too_long = "Number too long";
+    bool warned_about_underscore = 0;
+#define WARN_ABOUT_UNDERSCORE() \
+       do { \
+           if (!warned_about_underscore) { \
+               warned_about_underscore = 1; \
+               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
+                              "Misplaced _ in number"); \
+           } \
+       } while(0)
      /* Hexadecimal floating point.
       *
       * In many places (where we have quads and NV is IEEE 754 double)
@@ -10599,8 +10697,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             }
  
             if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                lastub = s++;
             }
  
@@ -10623,8 +10720,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                 /* _ are ignored -- but warned about if consecutive */
                 case '_':
                     if (lastub && s == lastub + 1)
-                       Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                      "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                     lastub = s++;
                     break;
  
@@ -10709,9 +10805,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
           out:
  
             /* final misplaced underbar check */
-           if (s[-1] == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
-           }
+           if (s[-1] == '_')
+               WARN_ABOUT_UNDERSCORE();
  
              if (UNLIKELY(HEXFP_PEEK(s))) {
                  /* Do sloppy (on the underbars) but quick detection
@@ -10920,8 +11015,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             */
             if (*s == '_') {
                 if (lastub && s == lastub + 1)
-                   Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                  "Misplaced _ in number");
+                   WARN_ABOUT_UNDERSCORE();
                 lastub = s++;
             }
             else {
@@ -10934,9 +11028,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
         }
  
         /* final misplaced underbar check */
-       if (lastub && s == lastub + 1) {
-           Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
-       }
+       if (lastub && s == lastub + 1)
+           WARN_ABOUT_UNDERSCORE();
  
         /* read a decimal portion if there is one.  avoid
            3..5 being interpreted as the number 3. followed
@@ -10947,8 +11040,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
             *d++ = *s++;
  
             if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                 lastub = s;
             }
  
@@ -10964,18 +11056,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                     Perl_croak(aTHX_ "%s", number_too_long);
                 if (*s == '_') {
                    if (lastub && s == lastub + 1)
-                      Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                     "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                    lastub = s;
                 }
                 else
                     *d++ = *s;
             }
             /* fractional part ending in underbar? */
-           if (s[-1] == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
-           }
+           if (s[-1] == '_')
+               WARN_ABOUT_UNDERSCORE();
             if (*s == '.' && isDIGIT(s[1])) {
                 /* oops, it's really a v-string, but without the "v" */
                 s = start;
@@ -11005,8 +11094,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
  
             /* stray preinitial _ */
             if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                 lastub = s++;
             }
  
@@ -11016,8 +11104,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
  
             /* stray initial _ */
             if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                 lastub = s++;
             }
  
@@ -11031,8 +11118,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                 else {
                    if (((lastub && s == lastub + 1)
                          || (!isDIGIT(s[1]) && s[1] != '_')))
-                      Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                     "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                    lastub = s++;
                 }
             }