This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Simplify \N{U+...} code
author Father Chrysostomos <sprout@cpan.org>
Mon, 2 Feb 2015 06:38:00 +0000 (22:38 -0800)
committer Father Chrysostomos <sprout@cpan.org>
Mon, 2 Feb 2015 20:38:33 +0000 (12:38 -0800)
If we are parsing a \N{U+XXX.YYY} construct in a regexp literal, we do
not need to pass it to grok_hex, because we do not need the numeric
value at this point.  The regexp engine will be calling grok_hex
again, after all.  A simple scan for hex digits should be faster, and
makes the code a little simpler, too.

toke.c

diff --git a/toke.c b/toke.c
index 67b6096..559c74c 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3288,44 +3288,43 @@ S_scan_const(pTHX_ char *start)
                /* Here it looks like a named character */
 
                if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
-                   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-                               | PERL_SCAN_SILENT_ILLDIGIT
-                               | PERL_SCAN_DISALLOW_PREFIX;
-                   STRLEN len;
-
                    s += 2;         /* Skip to next char after the 'U+' */
-                   len = e - s;
-                   uv = grok_hex(s, &len, &flags, NULL);
-                   if (len == 0
-                    || (  len != (STRLEN)(e - s) && s[len] != '.'
-                       && PL_lex_inpat))
-                   {
-                     bad_NU:
-                       yyerror("Invalid hexadecimal number in \\N{U+...}");
-                       s = e + 1;
-                       continue;
-                   }
-
                    if (PL_lex_inpat) {
 
                         /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
-                        const char * const orig_s = s - 5;
-                        while (*s == '.') {
-                            s++;
-                            len = e - s;
-                            uv = grok_hex(s, &len, &flags, NULL);
-                            if (!len
-                             || (len != (STRLEN)(e - s) && s[len] != '.'))
-                                goto bad_NU;
+                        /* Check the syntax.  */
+                        const char *orig_s;
+                        orig_s = s - 5;
+                        if (!isXDIGIT(*s)) {
+                          bad_NU:
+                            yyerror(
+                                "Invalid hexadecimal number in \\N{U+...}"
+                            );
+                            s = e + 1;
+                            continue;
+                        }
+                        while (++s < e) {
+                            if (isXDIGIT(*s))
+                                continue;
+                            else if ((*s == '.' || *s == '_')
+                                  && isXDIGIT(s[1]))
+                                continue;
+                            goto bad_NU;
                         }
 
-                        /* Pass everything through unchanged.  The reason we
-                         * evaluate the numbers is to make sure there wasn't a
-                         * syntax error.  +1 is for the '}' */
+                        /* Pass everything through unchanged.
+                         * +1 is for the '}' */
                         Copy(orig_s, d, e - orig_s + 1, char);
                         d += e - orig_s + 1;
                    }
                    else {  /* Not a pattern: convert the hex to string */
+                        I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+                               | PERL_SCAN_SILENT_ILLDIGIT
+                               | PERL_SCAN_DISALLOW_PREFIX;
+                        STRLEN len = e - s;
+                        uv = grok_hex(s, &len, &flags, NULL);
+                        if (len == 0 || (len != (STRLEN)(e - s)))
+                            goto bad_NU;
 
                          /* If the destination is not in utf8, unconditionally
                          * recode it to be so.  This is because \N{} implies