toke.c: Fix comment that said the opposite of what was meant

[perl5.git] / toke.c
diff --git a/toke.c b/toke.c

index 7836710..178a51f 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2856,6 +2856,11 @@ S_scan_const(pTHX_ char *start)
                                             when the source isn't utf8, as for
                                             example when it is entirely composed
                                             of hex constants */
+    STRLEN utf8_variant_count = 0;      /* When not in UTF-8, this counts the
+                                           number of characters found so far
+                                           that will expand (into 2 bytes)
+                                           should we have to convert to
+                                           UTF-8 */
      SV *res;                           /* result from charnames */
      STRLEN offset_to_max;   /* The offset in the output to where the range
                                 high-end character is temporarily placed */
@@ -2913,7 +2918,7 @@ S_scan_const(pTHX_ char *start)
               * Ranges entirely within Latin1 are expanded out entirely, in
               * order to avoid the significant overhead of making a swash.
               * Ranges that extend above Latin1 have to have a swash, so there
-             * is no advantage to abbreviating them here, so they are stored
+             * is no advantage to expanding them here, so they are stored
               * here as Min, ILLEGAL_UTF8_BYTE, Max.  The illegal byte signifies
               * a hyphen without any possible ambiguity.  On EBCDIC machines, if
               * the range is expressed as Unicode, the Latin1 portion is
@@ -3001,13 +3006,13 @@ S_scan_const(pTHX_ char *start)
                   * Unicode value (\N{...}), or if the range is a subset of
                   * [A-Z] or [a-z], and both ends are literal characters,
                   * like 'A', and not like \x{C1} */
-                if ((convert_unicode
-                     = cBOOL(backslash_N)   /* \N{} forces Unicode, hence
+                convert_unicode =
+                       cBOOL(backslash_N)   /* \N{} forces Unicode, hence
                                                 portable range */
                        || (   ! non_portable_endpoint
                            && ((  isLOWER_A(range_min) && isLOWER_A(range_max))
-                             || (isUPPER_A(range_min) && isUPPER_A(range_max))))
-                )) {
+                             || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+                if (convert_unicode) {
  
                      /* Special handling is needed for these portable ranges.
                       * They are defined to all be in Unicode terms, which
@@ -3202,8 +3207,7 @@ S_scan_const(pTHX_ char *start)
             if (!esc)
                 in_charclass = TRUE;
         }
-
-       else if (*s == ']' && PL_lex_inpat &&  in_charclass) {
+       else if (*s == ']' && PL_lex_inpat && in_charclass) {
             char *s1 = s-1;
             int esc = 0;
             while (s1 >= start && *s1-- == '\\')
@@ -3396,31 +3400,40 @@ S_scan_const(pTHX_ char *start)
                 }
                 else {
                     if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have accumulated so
-                        * far if it contains any chars variant in utf8 or
-                        * utf-ebcdic. */
  
-                       SvCUR_set(sv, d - SvPVX_const(sv));
-                       SvPOK_on(sv);
-                       *d = '\0';
-                       /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(
-                                       sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
-                                                  /* Above-latin1 in string
-                                                   * implies no encoding */
-                                                  |SV_UTF8_NO_ENCODING,
+                        /* Here, 'uv' won't fit unless we convert to UTF-8.
+                         * If we've only seen invariants so far, all we have to
+                         * do is turn on the flag */
+                        if (utf8_variant_count == 0) {
+                            SvUTF8_on(sv);
+                        }
+                        else {
+                            SvCUR_set(sv, d - SvPVX_const(sv));
+                            SvPOK_on(sv);
+                            *d = '\0';
+
+                            sv_utf8_upgrade_flags_grow(
+                                           sv,
+                                           SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+
                                             /* Since we're having to grow here,
                                              * make sure we have enough room for
                                              * this escape and a NUL, so the
                                              * code immediately below won't have
                                              * to actually grow again */
-                                       UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
-                       d = SvPVX(sv) + SvCUR(sv);
-                       has_utf8 = TRUE;
+                                          UVCHR_SKIP(uv)
+                                        + (STRLEN)(send - s) + 1);
+                            d = SvPVX(sv) + SvCUR(sv);
+                        }
+
+                        has_utf8 = TRUE;
                      }
  
-                    if (has_utf8) {
+                    if (! has_utf8) {
+                       *d++ = (char)uv;
+                        utf8_variant_count++;
+                    }
+                   else {
                         /* Usually, there will already be enough room in 'sv'
                          * since such escapes are likely longer than any UTF-8
                          * sequence they can end up as.  This isn't the case on
@@ -3445,9 +3458,6 @@ S_scan_const(pTHX_ char *start)
                                 (PL_lex_repl ? OPpTRANS_FROM_UTF
                                              : OPpTRANS_TO_UTF);
                         }
-                    }
-                   else {
-                       *d++ = (char)uv;
                     }
                 }
  #ifdef EBCDIC
@@ -3561,15 +3571,25 @@ S_scan_const(pTHX_ char *start)
                         if (! has_utf8 && (   uv > 0xFF
                                             || PL_lex_inwhat != OP_TRANS))
                          {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(
-                                    sv,
-                                    SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                   OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(
+                                               sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
+
                             has_utf8 = TRUE;
                         }
  
@@ -3721,21 +3741,30 @@ S_scan_const(pTHX_ char *start)
                               * \N{} implies Unicode semantics, and scalars have
                               * to be in utf8 to guarantee those semantics; but
                               * not needed in tr/// */
-                            sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+                            sv_utf8_upgrade_flags(res, 0);
                              str = SvPV_const(res, len);
                          }
  
                           /* Upgrade destination to be utf8 if this new
                            * component is */
                         if (! has_utf8 && SvUTF8(res)) {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = len + (send - s) + 1;
+
                             SvCUR_set(sv, d - SvPVX_const(sv));
                             SvPOK_on(sv);
                             *d = '\0';
-                           /* See Note on sizing above.  */
-                           sv_utf8_upgrade_flags_grow(sv,
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
+                                sv_utf8_upgrade_flags_grow(sv,
                                                 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               len + (STRLEN)(send - s) + 1);
-                           d = SvPVX(sv) + SvCUR(sv);
+                                               extra);
+                                d = SvPVX(sv) + SvCUR(sv);
+                            }
                             has_utf8 = TRUE;
                         } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
  
@@ -3807,12 +3836,17 @@ S_scan_const(pTHX_ char *start)
           * to/from UTF-8.
           *
           * If the input has the same representation in UTF-8 as not, it will be
-         * a single byte, and we don't care about UTF8ness; or if neither
-         * source nor output is UTF-8, just copy the byte */
-        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
-        {
+         * a single byte, and we don't care about UTF8ness; just copy the byte */
+        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
             *d++ = *s++;
          }
+        else if (! this_utf8 && ! has_utf8) {
+            /* If neither source nor output is UTF-8, this is also a single
+             * byte, so just copy it; but count it, because it would expand
+             * should we later have to convert to UTF-8 */
+           *d++ = *s++;
+            utf8_variant_count++;
+        }
          else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
             const STRLEN len = UTF8SKIP(s);
  
@@ -3829,16 +3863,26 @@ S_scan_const(pTHX_ char *start)
             const UV nextuv   = (this_utf8)
                                  ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                  : (UV) ((U8) *s);
-           const STRLEN need = UVCHR_SKIP(nextuv);
+           STRLEN need = UVCHR_SKIP(nextuv);
+
             if (!has_utf8) {
                 SvCUR_set(sv, d - SvPVX_const(sv));
                 SvPOK_on(sv);
                 *d = '\0';
-               /* See Note on sizing above.  */
-               sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       need + (STRLEN)(send - s) + 1);
-               d = SvPVX(sv) + SvCUR(sv);
+
+                /* See Note on sizing above. */
+                need += (STRLEN)(send - s) + 1;
+
+                if (utf8_variant_count == 0) {
+                    SvUTF8_on(sv);
+                    d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+                }
+                else {
+                    sv_utf8_upgrade_flags_grow(sv,
+                                               SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+                                               need);
+                    d = SvPVX(sv) + SvCUR(sv);
+                }
                 has_utf8 = TRUE;
             } else if (need > len) {
                 /* encoded value larger than old, may need extra space (NOTE: