This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Add, clarify some comments, white-space
[perl5.git] / toke.c
diff --git a/toke.c b/toke.c
index 7887907..9700d35 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -467,7 +467,7 @@ S_printbuf(pTHX_ const char *const fmt, const char *const s)
 static int
 S_deprecate_commaless_var_list(pTHX) {
     PL_expect = XTERM;
-    deprecate("comma-less variable list");
+    deprecate_fatal_in("5.28", "Use of comma-less variable list is deprecated");
     return REPORT(','); /* grandfather non-comma-format format */
 }
 
@@ -2543,7 +2543,7 @@ S_sublex_done(pTHX)
     }
 }
 
-PERL_STATIC_INLINE SV*
+STATIC SV*
 S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
 {
     /* <s> points to first character of interior of \N{}, <e> to one beyond the
@@ -2563,8 +2563,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
     PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
 
     if (!SvCUR(res)) {
-        Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
-                       "Unknown charname '' is deprecated");
+        deprecate_fatal_in("5.28", "Unknown charname '' is deprecated");
         return res;
     }
 
@@ -2785,15 +2784,17 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
 
   In transliterations:
     characters are VERY literal, except for - not at the start or end
-    of the string, which indicates a range. If the range is in bytes,
+    of the string, which indicates a range.  However some backslash sequences
+    are recognized: \r, \n, and the like
+                    \007 \o{}, \x{}, \N{}
+    If all elements in the transliteration are below 256,
     scan_const expands the range to the full set of intermediate
     characters. If the range is in utf8, the hyphen is replaced with
     a certain range mark which will be handled by pmtrans() in op.c.
 
   In double-quoted strings:
     backslashes:
-      double-quoted style: \r and \n
-      constants: \x31, etc.
+      all those recognized in transliterations
       deprecated backrefs: \1 (in substitution replacements)
       case and quoting: \U \Q \E
     stops on @ and $
@@ -2836,7 +2837,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
          } (end if backslash)
           handle regular character
     } (end while character to read)
-               
+
 */
 
 STATIC char *
@@ -2851,11 +2852,18 @@ S_scan_const(pTHX_ char *start)
     bool didrange = FALSE;              /* did we just finish a range? */
     bool in_charclass = FALSE;          /* within /[...]/ */
     bool has_utf8 = FALSE;              /* Output constant is UTF8 */
+    bool has_above_latin1 = FALSE;      /* does something require special
+                                           handling in tr/// ? */
     bool  this_utf8 = cBOOL(UTF);       /* Is the source string assumed to be
                                            UTF8?  But, this can show as true
                                            when the source isn't utf8, as for
                                            example when it is entirely composed
                                            of hex constants */
+    STRLEN utf8_variant_count = 0;      /* When not in UTF-8, this counts the
+                                           number of characters found so far
+                                           that will expand (into 2 bytes)
+                                           should we have to convert to
+                                           UTF-8) */
     SV *res;                           /* result from charnames */
     STRLEN offset_to_max;   /* The offset in the output to where the range
                                high-end character is temporarily placed */
@@ -2904,30 +2912,29 @@ S_scan_const(pTHX_ char *start)
              * range, so for most cases we just drop down and handle the value
              * as any other.  There are two exceptions.
              *
-             * 1.  A minus sign indicates that we are actually going to have
-             *     range.  In this case, skip the '-', set a flag, then drop
+             * 1.  A hyphen indicates that we are actually going to have a
+             *     range.  In this case, skip the '-', set a flag, then drop
              *     down to handle what should be the end range value.
              * 2.  After we've handled that value, the next time through, that
              *     flag is set and we fix up the range.
              *
              * Ranges entirely within Latin1 are expanded out entirely, in
-             * order to avoid the significant overhead of making a swash.
-             * Ranges that extend above Latin1 have to have a swash, so there
-             * is no advantage to abbreviating them here, so they are stored
-             * here as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte signifies
-             * a hyphen without any possible ambiguity.  On EBCDIC machines, if
-             * the range is expressed as Unicode, the Latin1 portion is
-             * expanded out even if the entire range extends above Latin1.
-             * This is because each code point in it has to be processed here
-             * individually to get its native translation */
+             * order to make the transliteration a simple table look-up.
+             * Ranges that extend above Latin1 have to be done differently, so
+             * there is no advantage to expanding them here, so they are
+             * stored here as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte
+             * signifies a hyphen without any possible ambiguity.  On EBCDIC
+             * machines, if the range is expressed as Unicode, the Latin1
+             * portion is expanded out even if the range extends above
+             * Latin1.  This is because each code point in it has to be
+             * processed here individually to get its native translation */
 
            if (! dorange) {
 
-                /* Here, we don't think we're in a range.  If we've processed
-                 * at least one character, then see if this next one is a '-',
-                 * indicating the previous one was the start of a range.  But
-                 * don't bother if we're too close to the end for the minus to
-                 * mean that. */
+                /* Here, we don't think we're in a range.  If the new character
+                 * is not a hyphen; or if it is a hyphen, but it's too close to
+                 * either edge to indicate a range, then it's a regular
+                 * character. */
                 if (*s != '-' || s >= send - 1 || s == start) {
 
                     /* A regular character.  Process like any other, but first
@@ -2938,16 +2945,26 @@ S_scan_const(pTHX_ char *start)
                     non_portable_endpoint = 0;
                     backslash_N = 0;
 #endif
+                    /* The tests here for being above Latin1 and similar ones
+                     * in the following 'else' suffice to find all such
+                     * occurences in the constant, except those added by a
+                     * backslash escape sequence, like \x{100}.  And all those
+                     * set 'has_above_latin1' as appropriate */
+                    if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
+                        has_above_latin1 = TRUE;
+                    }
+
                     /* Drops down to generic code to process current byte */
                 }
-                else {
+                else {  /* Is a '-' in the context where it means a range */
                     if (didrange) { /* Something like y/A-C-Z// */
-                        Perl_croak(aTHX_ "Ambiguous range in transliteration operator");
+                        Perl_croak(aTHX_ "Ambiguous range in transliteration"
+                                         " operator");
                     }
 
                     dorange = TRUE;
 
-                    s++;    /* Skip past the minus */
+                    s++;    /* Skip past the hyphen */
 
                     /* d now points to where the end-range character will be
                      * placed.  Save it so won't have to go finding it later,
@@ -2957,6 +2974,12 @@ S_scan_const(pTHX_ char *start)
                      * pointer).  We'll finish processing the range the next
                      * time through the loop */
                     offset_to_max = d - SvPVX_const(sv);
+
+                    if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
+                        has_above_latin1 = TRUE;
+                    }
+
+                    /* Drops down to generic code to process current byte */
                 }
             }  /* End of not a range */
             else {
@@ -2968,7 +2991,7 @@ S_scan_const(pTHX_ char *start)
                  * 'd'  points to just beyond the range end in the 'sv' string,
                  *      where we would next place something
                  * 'offset_to_max' is the offset in 'sv' at which the character
-                 *      before 'd' begins.
+                 *      (the range's maximum end point) before 'd'  begins.
                  */
                 const char * max_ptr = SvPVX_const(sv) + offset_to_max;
                 const char * min_ptr;
@@ -2981,7 +3004,7 @@ S_scan_const(pTHX_ char *start)
                 IV real_range_max = 0;
 #endif
 
-                /* Get the range-ends code point values. */
+                /* Get the code point values of the range ends. */
                 if (has_utf8) {
                     /* We know the utf8 is valid, because we just constructed
                      * it ourselves in previous loop iterations */
@@ -3001,17 +3024,17 @@ S_scan_const(pTHX_ char *start)
                  * Unicode value (\N{...}), or if the range is a subset of
                  * [A-Z] or [a-z], and both ends are literal characters,
                  * like 'A', and not like \x{C1} */
-                if ((convert_unicode
-                     = cBOOL(backslash_N)   /* \N{} forces Unicode, hence
-                                               portable range */
-                      || (   ! non_portable_endpoint
-                          && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
-                             || (isUPPER_A(range_min) && isUPPER_A(range_max))))
-                )) {
+                convert_unicode =
+                               cBOOL(backslash_N)   /* \N{} forces Unicode,
+                                                       hence portable range */
+                    || (     ! non_portable_endpoint
+                        && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
+                           || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+                if (convert_unicode) {
 
                     /* Special handling is needed for these portable ranges.
-                     * They are defined to all be in Unicode terms, which
-                     * include all Unicode code points between the end points.
+                     * They are defined to be in Unicode terms, which includes
+                     * all the Unicode code points between the end points.
                      * Convert to Unicode to get the Unicode range.  Later we
                      * will convert each code point in the range back to
                      * native.  */
@@ -3040,32 +3063,33 @@ S_scan_const(pTHX_ char *start)
                     }
 #ifdef EBCDIC
                     else if (convert_unicode) {
-                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+        /* diag_listed_as: Invalid range "%s" in transliteration operator */
                         Perl_croak(aTHX_
-                              "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" UVXf "}\""
-                               " in transliteration operator",
-                              range_min, range_max);
+                           "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
+                           UVXf "}\" in transliteration operator",
+                           range_min, range_max);
                     }
 #endif
                     else {
-                        /* diag_listed_as: Invalid range "%s" in transliteration operator */
+        /* diag_listed_as: Invalid range "%s" in transliteration operator */
                         Perl_croak(aTHX_
-                              "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
-                               " in transliteration operator",
-                              range_min, range_max);
+                           "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
+                           " in transliteration operator",
+                           range_min, range_max);
                     }
                 }
 
                if (has_utf8) {
 
-                    /* We try to avoid creating a swash.  If the upper end of
-                     * this range is below 256, this range won't force a swash;
-                     * otherwise it does force a swash, and as long as we have
-                     * to have one, we might as well not expand things out.
-                     * But if it's EBCDIC, we may have to look at each
-                     * character below 256 if we have to convert to/from
-                     * Unicode values */
-                    if (range_max > 255
+                    /* If everything in the transliteration is below 256, we
+                     * can avoid special handling later.  A translation table
+                     * for each of those bytes is created by op.c.  So we
+                     * expand out all ranges to their constituent code points.
+                     * But if we've encountered something above 255, the
+                     * expanding won't help, so skip doing that.  But if it's
+                     * EBCDIC, we may have to look at each character below 256
+                     * if we have to convert to/from Unicode values */
+                    if (   has_above_latin1
 #ifdef EBCDIC
                        && (range_min > 255 || ! convert_unicode)
 #endif
@@ -3073,7 +3097,7 @@ S_scan_const(pTHX_ char *start)
                         /* Move the high character one byte to the right; then
                          * insert between it and the range begin, an illegal
                          * byte which serves to indicate this is a range (using
-                         * a '-' could be ambiguous). */
+                         * a '-' would be ambiguous). */
                         char *e = d++;
                         while (e-- > max_ptr) {
                             *(e + 1) = *e;
@@ -3138,8 +3162,9 @@ S_scan_const(pTHX_ char *start)
                      * equivalent */
                     if (has_utf8) {
                         for (i = range_min; i <= range_max; i++) {
-                            append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i),
-                                                         (U8 **) &d);
+                            append_utf8_from_native_byte(
+                                                    LATIN1_TO_NATIVE((U8) i),
+                                                    (U8 **) &d);
                         }
                     }
                     else {
@@ -3172,7 +3197,8 @@ S_scan_const(pTHX_ char *start)
                }
 
 #ifdef EBCDIC
-                /* If the original range extended above 255, add in that portion. */
+                /* If the original range extended above 255, add in that
+                 * portion. */
                 if (real_range_max) {
                     *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
                     *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
@@ -3202,8 +3228,7 @@ S_scan_const(pTHX_ char *start)
            if (!esc)
                in_charclass = TRUE;
        }
-
-       else if (*s == ']' && PL_lex_inpat &&  in_charclass) {
+       else if (*s == ']' && PL_lex_inpat && in_charclass) {
            char *s1 = s-1;
            int esc = 0;
            while (s1 >= start && *s1-- == '\\')
@@ -3211,11 +3236,9 @@ S_scan_const(pTHX_ char *start)
            if (!esc)
                in_charclass = FALSE;
        }
-
-       /* skip for regexp comments /(?#comment)/, except for the last
-        * char, which will be done separately.
-        * Stop on (?{..}) and friends */
-
+            /* skip for regexp comments /(?#comment)/, except for the last
+             * char, which will be done separately.  Stop on (?{..}) and
+             * friends */
        else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) {
            if (s[2] == '#') {
                while (s+1 < send && *s != ')')
@@ -3228,24 +3251,22 @@ S_scan_const(pTHX_ char *start)
                break;
            }
        }
-
-       /* likewise skip #-initiated comments in //x patterns */
+            /* likewise skip #-initiated comments in //x patterns */
        else if (*s == '#'
                  && PL_lex_inpat
                  && !in_charclass
                  && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
         {
-           while (s+1 < send && *s != '\n')
+           while (s < send && *s != '\n')
                *d++ = *s++;
        }
-
-       /* no further processing of single-quoted regex */
+            /* no further processing of single-quoted regex */
        else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
            goto default_action;
 
-       /* check for embedded arrays
-          (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
-          */
+            /* check for embedded arrays
+             * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
+             */
        else if (*s == '@' && s[1]) {
            if (UTF
                ? isIDFIRST_utf8_safe(s+1, send)
@@ -3258,10 +3279,8 @@ S_scan_const(pTHX_ char *start)
            if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
                break; /* in regexp, neither @+ nor @- are interpolated */
        }
-
-       /* check for embedded scalars.  only stop if we're sure it's a
-          variable.
-        */
+            /* check for embedded scalars.  only stop if we're sure it's a
+             * variable.  */
        else if (*s == '$') {
            if (!PL_lex_inpat)  /* not a regexp, so $ must be var */
                break;
@@ -3276,6 +3295,11 @@ S_scan_const(pTHX_ char *start)
 
        /* End of else if chain - OP_TRANS rejoin rest */
 
+        if (UNLIKELY(s >= send)) {
+            assert(s == send);
+            break;
+        }
+
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
            char* e;    /* Can be used for ending '}', etc. */
@@ -3365,7 +3389,7 @@ S_scan_const(pTHX_ char *start)
                                                UTF);
                    if (! valid) {
                        yyerror(error);
-                       continue;
+                       uv = 0; /* drop through to ensure range ends are set */
                    }
                    goto NUM_ESCAPE_INSERT;
                }
@@ -3383,51 +3407,68 @@ S_scan_const(pTHX_ char *start)
                                                UTF);
                    if (! valid) {
                        yyerror(error);
-                       continue;
+                       uv = 0; /* drop through to ensure range ends are set */
                    }
                }
 
              NUM_ESCAPE_INSERT:
                /* Insert oct or hex escaped character. */
-               
+
                /* Here uv is the ordinal of the next character being added */
                if (UVCHR_IS_INVARIANT(uv)) {
                    *d++ = (char) uv;
                }
                else {
                    if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have accumulated so
-                        * far if it contains any chars variant in utf8 or
-                        * utf-ebcdic. */
 
-                       SvCUR_set(sv, d - SvPVX_const(sv));
-                       SvPOK_on(sv);
-                       *d = '\0';
-                       /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(
-                                       sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
-                                                  /* Above-latin1 in string
-                                                   * implies no encoding */
-                                                  |SV_UTF8_NO_ENCODING,
-                                       UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
-                       d = SvPVX(sv) + SvCUR(sv);
-                       has_utf8 = TRUE;
+                        /* Here, 'uv' won't fit unless we convert to UTF-8.
+                         * If we've only seen invariants so far, all we have to
+                         * do is turn on the flag */
+                        if (utf8_variant_count == 0) {
+                            SvUTF8_on(sv);
+                        }
+                        else {
+                            SvCUR_set(sv, d - SvPVX_const(sv));
+                            SvPOK_on(sv);
+                            *d = '\0';
+
+                            sv_utf8_upgrade_flags_grow(
+                                           sv,
+                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+
+                                           /* Since we're having to grow here,
+                                            * make sure we have enough room for
+                                            * this escape and a NUL, so the
+                                            * code immediately below won't have
+                                            * to actually grow again */
+                                          UVCHR_SKIP(uv)
+                                        + (STRLEN)(send - s) + 1);
+                            d = SvPVX(sv) + SvCUR(sv);
+                        }
+
+                        has_above_latin1 = TRUE;
+                        has_utf8 = TRUE;
                     }
 
-                    if (has_utf8) {
+                    if (! has_utf8) {
+                       *d++ = (char)uv;
+                        utf8_variant_count++;
+                    }
+                   else {
                        /* Usually, there will already be enough room in 'sv'
                         * since such escapes are likely longer than any UTF-8
                         * sequence they can end up as.  This isn't the case on
                         * EBCDIC where \x{40000000} contains 12 bytes, and the
                         * UTF-8 for it contains 14.  And, we have to allow for
                         * a trailing NUL.  It probably can't happen on ASCII
-                        * platforms, but be safe */
-                        const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv)
+                        * platforms, but be safe.  See Note on sizing above. */
+                        const STRLEN needed = d - SvPVX(sv)
+                                            + UVCHR_SKIP(uv)
+                                            + (send - s)
                                             + 1;
                         if (UNLIKELY(needed > SvLEN(sv))) {
                             SvCUR_set(sv, d - SvPVX_const(sv));
-                            d = sv_grow(sv, needed) + SvCUR(sv);
+                            d = SvCUR(sv) + SvGROW(sv, needed);
                         }
 
                        d = (char*)uvchr_to_utf8((U8*)d, uv);
@@ -3438,9 +3479,6 @@ S_scan_const(pTHX_ char *start)
                                (PL_lex_repl ? OPpTRANS_FROM_UTF
                                             : OPpTRANS_TO_UTF);
                        }
-                    }
-                   else {
-                       *d++ = (char)uv;
                    }
                }
 #ifdef EBCDIC
@@ -3554,16 +3592,27 @@ S_scan_const(pTHX_ char *start)
                        if (! has_utf8 && (   uv > 0xFF
                                            || PL_lex_inwhat != OP_TRANS))
                         {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(
-                                    sv,
-                                    SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                   OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(
+                                               sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
+
                            has_utf8 = TRUE;
+                            has_above_latin1 = TRUE;
                        }
 
                         /* Add the (Unicode) code point to the output. */
@@ -3708,34 +3757,49 @@ S_scan_const(pTHX_ char *start)
                                     (int) (e + 1 - start), start));
                                 goto end_backslash_N;
                             }
+
+                            if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
+                                has_above_latin1 = TRUE;
+                            }
+
                         }
                         else if (! SvUTF8(res)) {
                             /* Make sure \N{} return is UTF-8.  This is because
                              * \N{} implies Unicode semantics, and scalars have
                              * to be in utf8 to guarantee those semantics; but
                              * not needed in tr/// */
-                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+                            sv_utf8_upgrade_flags(res, 0);
                             str = SvPV_const(res, len);
                         }
 
                          /* Upgrade destination to be utf8 if this new
                           * component is */
                        if (! has_utf8 && SvUTF8(res)) {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = len + (send - s) + 1;
+
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(sv,
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(sv,
                                                SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               len + (STRLEN)(send - s) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
                            has_utf8 = TRUE;
                        } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
 
                            /* See Note on sizing above.  (NOTE: SvCUR() is not
                             * set correctly here). */
+                            const STRLEN extra = len + (send - e) + 1;
                            const STRLEN off = d - SvPVX_const(sv);
-                           d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
+                           d = off + SvGROW(sv, off + extra);
                        }
                        Copy(str, d, len, char);
                        d += len;
@@ -3799,11 +3863,16 @@ S_scan_const(pTHX_ char *start)
          * to/from UTF-8.
          *
          * If the input has the same representation in UTF-8 as not, it will be
-         * a single byte, and we don't care about UTF8ness; or if neither
-         * source nor output is UTF-8, just copy the byte */
-        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
-        {
+         * a single byte, and we don't care about UTF8ness; just copy the byte */
+        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
+           *d++ = *s++;
+        }
+        else if (! this_utf8 && ! has_utf8) {
+            /* If neither source nor output is UTF-8, is also a single byte,
+             * just copy it; but this byte counts should we later have to
+             * convert to UTF-8 */
            *d++ = *s++;
+            utf8_variant_count++;
         }
         else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
            const STRLEN len = UTF8SKIP(s);
@@ -3821,23 +3890,34 @@ S_scan_const(pTHX_ char *start)
            const UV nextuv   = (this_utf8)
                                 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                 : (UV) ((U8) *s);
-           const STRLEN need = UVCHR_SKIP(nextuv);
+           STRLEN need = UVCHR_SKIP(nextuv);
+
            if (!has_utf8) {
                SvCUR_set(sv, d - SvPVX_const(sv));
                SvPOK_on(sv);
                *d = '\0';
-               /* See Note on sizing above.  */
-               sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       need + (STRLEN)(send - s) + 1);
-               d = SvPVX(sv) + SvCUR(sv);
+
+                /* See Note on sizing above. */
+                need += (STRLEN)(send - s) + 1;
+
+                if (utf8_variant_count == 0) {
+                    SvUTF8_on(sv);
+                    d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+                }
+                else {
+                    sv_utf8_upgrade_flags_grow(sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               need);
+                    d = SvPVX(sv) + SvCUR(sv);
+                }
                has_utf8 = TRUE;
            } else if (need > len) {
                /* encoded value larger than old, may need extra space (NOTE:
                 * SvCUR() is not set correctly here).   See Note on sizing
                 * above.  */
+                const STRLEN extra = need + (send - s) + 1;
                const STRLEN off = d - SvPVX_const(sv);
-               d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off;
+               d = off + SvGROW(sv, off + extra);
            }
            s += len;
 
@@ -5688,7 +5768,8 @@ Perl_yylex(pTHX)
                    if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) {
                        sv_free(sv);
                        if (PL_in_my == KEY_our) {
-                           deprecate(":unique");
+                            deprecate_disappears_in("5.28",
+                                "Attribute \"unique\" is deprecated");
                        }
                        else
                            Perl_croak(aTHX_ "The 'unique' attribute may only be applied to 'our' variables");
@@ -5702,7 +5783,8 @@ Perl_yylex(pTHX)
                    }
                    else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) {
                        sv_free(sv);
-                       deprecate(":locked");
+                        deprecate_disappears_in("5.28",
+                            "Attribute \"locked\" is deprecated");
                    }
                    else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) {
                        sv_free(sv);
@@ -6945,8 +7027,10 @@ Perl_yylex(pTHX)
            else {                      /* no override */
                tmp = -tmp;
                if (tmp == KEY_dump) {
-                   Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
-                                  "dump() better written as CORE::dump()");
+                   Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED),
+                                    "dump() better written as CORE::dump(). "
+                                     "dump() will no longer be available "
+                                     "in Perl 5.30");
                }
                gv = NULL;
                gvp = 0;
@@ -7631,7 +7715,7 @@ Perl_yylex(pTHX)
 
        case KEY_exists:
            UNI(OP_EXISTS);
-       
+
        case KEY_exit:
            UNI(OP_EXIT);
 
@@ -7867,7 +7951,7 @@ Perl_yylex(pTHX)
 
        case KEY_last:
            LOOPX(OP_LAST);
-       
+
        case KEY_lc:
            UNI(OP_LC);
 
@@ -8047,7 +8131,7 @@ Perl_yylex(pTHX)
 
        case KEY_pos:
            UNIDOR(OP_POS);
-       
+
        case KEY_pack:
            LOP(OP_PACK,XTERM);
 
@@ -8235,7 +8319,7 @@ Perl_yylex(pTHX)
 
        case KEY_chomp:
            UNI(OP_CHOMP);
-       
+
        case KEY_scalar:
            UNI(OP_SCALAR);
 
@@ -8875,7 +8959,7 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
        || ! SvOK(*cvp))
     {
        char *msg;
-       
+
        /* Here haven't found what we're looking for.  If it is charnames,
         * perhaps it needs to be loaded.  Try doing that before giving up */
        if (*key == 'c') {
@@ -9425,10 +9509,6 @@ S_scan_pat(pTHX_ char *start, I32 type)
                       "Use of /c modifier is meaningless without /g" );
     }
 
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
     PL_lex_op = (OP*)pm;
     pl_yylval.ival = OP_MATCH;
     return s;
@@ -9483,10 +9563,6 @@ S_scan_subst(pTHX_ char *start)
        }
     }
 
-    if (UNLIKELY((x_mod_count) > 1)) {
-        yyerror("Only one /x regex modifier is allowed");
-    }
-
     if ((pm->op_pmflags & PMf_CONTINUE)) {
         Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
     }
@@ -9663,7 +9739,7 @@ S_scan_heredoc(pTHX_ char *s)
        else
            term = '"';
        if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
-           deprecate("bare << to mean <<\"\"");
+           deprecate_fatal_in("5.28", "Use of bare << to mean <<\"\" is deprecated");
        peek = s;
         while (
                isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
@@ -10213,7 +10289,7 @@ S_scan_inputsymbol(pTHX_ char *start)
        ($*@)           sub prototypes          sub foo ($)
        (stuff)         sub attr parameters     sub foo : attr(stuff)
        <>              readline or globs       <FOO>, <>, <$fh>, or <*.c>
-       
+
    In most of these cases (all but <>, patterns and transliterate)
    yylex() calls scan_str().  m// makes yylex() call scan_pat() which
    calls scan_str().  s/// makes yylex() call scan_subst() which calls
@@ -10376,7 +10452,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                *to = *s;
            }
        }
-       
+
        /* if the terminator isn't the same as the start character (e.g.,
           matched brackets), we have to allow more in the quoting, and
           be prepared for nested brackets.
@@ -10434,7 +10510,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
        else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
            to[-1] = '\n';
 #endif
-       
+
        /* if we're out of file, or a read fails, bail and reset the current
           line marker so we can report where the unterminated string began
        */
@@ -10513,6 +10589,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
     bool floatit;                      /* boolean: int or float? */
     const char *lastub = NULL;         /* position of last underbar */
     static const char* const number_too_long = "Number too long";
+    bool warned_about_underscore = 0;
+#define WARN_ABOUT_UNDERSCORE() \
+       do { \
+           if (!warned_about_underscore) { \
+               warned_about_underscore = 1; \
+               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
+                              "Misplaced _ in number"); \
+           } \
+       } while(0)
     /* Hexadecimal floating point.
      *
      * In many places (where we have quads and NV is IEEE 754 double)
@@ -10597,8 +10682,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
            }
 
            if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
               lastub = s++;
            }
 
@@ -10621,8 +10705,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                /* _ are ignored -- but warned about if consecutive */
                case '_':
                    if (lastub && s == lastub + 1)
-                       Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                      "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                    lastub = s++;
                    break;
 
@@ -10707,9 +10790,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
          out:
 
            /* final misplaced underbar check */
-           if (s[-1] == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
-           }
+           if (s[-1] == '_')
+               WARN_ABOUT_UNDERSCORE();
 
             if (UNLIKELY(HEXFP_PEEK(s))) {
                 /* Do sloppy (on the underbars) but quick detection
@@ -10918,8 +11000,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
            */
            if (*s == '_') {
                if (lastub && s == lastub + 1)
-                   Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                  "Misplaced _ in number");
+                   WARN_ABOUT_UNDERSCORE();
                lastub = s++;
            }
            else {
@@ -10932,9 +11013,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
        }
 
        /* final misplaced underbar check */
-       if (lastub && s == lastub + 1) {
-           Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number");
-       }
+       if (lastub && s == lastub + 1)
+           WARN_ABOUT_UNDERSCORE();
 
        /* read a decimal portion if there is one.  avoid
           3..5 being interpreted as the number 3. followed
@@ -10945,8 +11025,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
            *d++ = *s++;
 
            if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                lastub = s;
            }
 
@@ -10962,18 +11041,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                    Perl_croak(aTHX_ "%s", number_too_long);
                if (*s == '_') {
                   if (lastub && s == lastub + 1)
-                      Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                     "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                   lastub = s;
                }
                else
                    *d++ = *s;
            }
            /* fractional part ending in underbar? */
-           if (s[-1] == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
-           }
+           if (s[-1] == '_')
+               WARN_ABOUT_UNDERSCORE();
            if (*s == '.' && isDIGIT(s[1])) {
                /* oops, it's really a v-string, but without the "v" */
                s = start;
@@ -11003,8 +11079,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 
            /* stray preinitial _ */
            if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                lastub = s++;
            }
 
@@ -11014,8 +11089,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
 
            /* stray initial _ */
            if (*s == '_') {
-               Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                              "Misplaced _ in number");
+               WARN_ABOUT_UNDERSCORE();
                lastub = s++;
            }
 
@@ -11029,8 +11103,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
                else {
                   if (((lastub && s == lastub + 1)
                         || (!isDIGIT(s[1]) && s[1] != '_')))
-                      Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
-                                     "Misplaced _ in number");
+                       WARN_ABOUT_UNDERSCORE();
                   lastub = s++;
                }
            }