toke.c: Don't force \N{} into utf8 unnecessarily
authorKarl Williamson <public@khwilliamson.com>
Sun, 11 Nov 2012 21:47:53 +0000 (14:47 -0700)
committerKarl Williamson <public@khwilliamson.com>
Sun, 11 Nov 2012 22:34:03 +0000 (15:34 -0700)
regcomp.c no longer requires everything containing \N{} to be in UTF-8.
I'm not sure of the exact commit; it might even have been in 5.16.  But
it was done by the time 86e88272fdabc40e3b168a3cc43af5e86284d01b was
done.

Therefore we can remove the temporary code that forced utf8, and replace
it with code that handles the non-utf8 case.

Note that outside patterns, \N{} still forces utf8.  This is so that
Unicode semantics will be imposed on the string it resides in, no matter
how it is used.  Patterns have a flag that indicates Unicode semantics,
so don't need to be in utf8.

t/re/pat_advanced.t
toke.c

index 7b43fd1..7a38d41 100644 (file)
@@ -977,6 +977,9 @@ sub run_tests {
         use Cname;
 
         ok 'fooB'  =~ /\N{foo}[\N{B}\N{b}]/, "Passthrough charname";
+        my $name = "foo\xDF";
+        my $result = eval "'A${name}B'  =~ /^A\\N{$name}B\$/";
+        ok !$@ && $result,  "Passthrough charname of non-ASCII, Latin1";
         #
         # Why doesn't must_warn work here?
         #
@@ -1021,7 +1024,7 @@ sub run_tests {
         ok $@ && $@ =~ /Invalid character/, 'Verify that leading digit in name gives error';
         eval 'q() =~ /\N{COM,MA}/';
         ok $@ && $@ =~ /Invalid character/, 'Verify that comma in name gives error';
-        my $name = "A\x{D7}O";
+        $name = "A\x{D7}O";
         eval "q(W) =~ /\\N{$name}/";
         ok $@ && $@ =~ /Invalid character/, 'Verify that latin1 symbol in name gives error';
         my $utf8_name = "7 CITIES OF GOLD";
diff --git a/toke.c b/toke.c
index 4b2937a..2118baf 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2777,14 +2777,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         }
     }
 
-    /* A custom translator can leave res not in UTF-8, so make sure.  XXX This
-     * can be revisited to not use utf8 for characters that don't need it when
-     * regexes don't have to be in utf8 for Unicode semantics.  If doing so,
-     * remember EBCDIC */
-    if (! SvUTF8(res)) {
-        sv_utf8_upgrade(res);
-    }
-    else { /* Don't accept malformed input */
+    if (SvUTF8(res)) { /* Don't accept malformed input */
         const U8* first_bad_char_loc;
         STRLEN len;
         const char* const str = SvPV_const(res, len);
@@ -3398,31 +3391,6 @@ S_scan_const(pTHX_ char *start)
 
                /* Here it looks like a named character */
 
-               if (PL_lex_inpat) {
-
-                   /* XXX This block is temporary code.  \N{} implies that the
-                    * pattern is to have Unicode semantics, and therefore
-                    * currently has to be encoded in utf8.  By putting it in
-                    * utf8 now, we save a whole pass in the regular expression
-                    * compiler.  Once that code is changed so Unicode
-                    * semantics doesn't necessarily have to be in utf8, this
-                    * block should be removed.  However, the code that parses
-                    * the output of this would have to be changed to not
-                    * necessarily expect utf8 */
-                   if (!has_utf8) {
-                       SvCUR_set(sv, d - SvPVX_const(sv));
-                       SvPOK_on(sv);
-                       *d = '\0';
-                       /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       /* 5 = '\N{' + cur char + NUL */
-                                       (STRLEN)(send - s) + 5);
-                       d = SvPVX(sv) + SvCUR(sv);
-                       has_utf8 = TRUE;
-                   }
-               }
-
                if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
                    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
                                | PERL_SCAN_DISALLOW_PREFIX;
@@ -3504,11 +3472,36 @@ S_scan_const(pTHX_ char *start)
 
                            const char *str_end = str + len;
                            const STRLEN off = d - SvPVX_const(sv);
+
+                            if (! SvUTF8(res)) {
+                                /* For the non-UTF-8 case, we can determine the
+                                 * exact length needed without having to parse
+                                 * through the string.  Each character takes up
+                                 * 2 hex digits plus either a trailing dot or
+                                 * the "}" */
+                                d = off + SvGROW(sv, off
+                                                    + 3 * len
+                                                    + 6 /* For the "\N{U+", and
+                                                           trailing NUL */
+                                                    + (STRLEN)(send - e));
+                                Copy("\\N{U+", d, 5, char);
+                                d += 5;
+                                while (str < str_end) {
+                                    char hex_string[4];
+                                    my_snprintf(hex_string, sizeof(hex_string),
+                                                "%02X.", (U8) *str);
+                                    Copy(hex_string, d, 3, char);
+                                    d += 3;
+                                    str++;
+                                }
+                                d--;    /* We will overwrite below the final
+                                           dot with a right brace */
+                            }
+                            else {
                            STRLEN char_length;     /* cur char's byte length */
                            STRLEN output_length;   /* and the number of bytes
                                                       after this is translated
                                                       into hex digits */
-
                            /* 2 hex per byte; 2 chars for '\N'; 2 chars for
                             * max('U+', '.'); and 1 for NUL */
                            char hex_string[2 * UTF8_MAXBYTES + 5];
@@ -3556,6 +3549,7 @@ S_scan_const(pTHX_ char *start)
                                Copy(hex_string, d, output_length, char);
                                d += output_length;
                            }
+                           }
 
                            *d++ = '}'; /* Done.  Add the trailing brace */
                        }