X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b304119758005f7ab21373539113204710b204cd..aede6a0450b5f9e2edd1c2fbd17dabeb526d8098:/toke.c

diff --git a/toke.c b/toke.c
index 39e0b79..44d0fef 100644
--- a/toke.c
+++ b/toke.c
@@ -3084,6 +3084,7 @@ S_scan_const(pTHX_ char *start)
 	     * symbol meaning, e.g. \x{2E} would be confused with a dot.  But
 	     * in spite of this, we do have to process \N here while the proper
 	     * charnames handler is in scope.  See bugs #56444 and #62056.
+             *
 	     * There is a complication because \N in a pattern may also stand
 	     * for 'match a non-nl', and not mean a charname, in which case its
 	     * processing should be deferred to the regex compiler.  To be a
@@ -3218,31 +3219,44 @@ S_scan_const(pTHX_ char *start)
 		continue;
 
  	    case 'N':
-		/* In a non-pattern \N must be a named character, like \N{LATIN
-		 * SMALL LETTER A} or \N{U+0041}.  For patterns, it also can
-		 * mean to match a non-newline.  For non-patterns, named
-		 * characters are converted to their string equivalents. In
-		 * patterns, named characters are not converted to their
-		 * ultimate forms for the same reasons that other escapes
-		 * aren't.  Instead, they are converted to the \N{U+...} form
-		 * to get the value from the charnames that is in effect right
-		 * now, while preserving the fact that it was a named character
-		 * so that the regex compiler knows this */
-
-		/* The structure of this section of code (besides checking for
+                /* In a non-pattern \N must be like \N{U+0041}, or it can be a
+                 * named character, like \N{LATIN SMALL LETTER A}, or a named
+                 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
+                 * GRAVE}.  For convenience all three forms are referred to as
+                 * "named characters" below.
+                 *
+                 * For patterns, \N also can mean to match a non-newline.  Code
+                 * before this 'switch' statement should already have handled
+                 * this situation, and hence this code only has to deal with
+                 * the named character cases.
+                 *
+                 * For non-patterns, the named characters are converted to
+                 * their string equivalents.  In patterns, named characters are
+                 * not converted to their ultimate forms for the same reasons
+                 * that other escapes aren't.  Instead, they are converted to
+                 * the \N{U+...} form to get the value from the charnames that
+                 * is in effect right now, while preserving the fact that it
+                 * was a named character, so that the regex compiler knows
+                 * this.
+                 *
+		 * The structure of this section of code (besides checking for
 		 * errors and upgrading to utf8) is:
-		 *  Further disambiguate between the two meanings of \N, and if
-		 *	not a charname, go process it elsewhere
-		 *  If of form \N{U+...}, pass it through if a pattern;
-		 *	otherwise convert to utf8
-		 *  Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a
-		 *  pattern; otherwise convert to utf8 */
-
-		/* Here, s points to the 'N'; the test below is guaranteed to
-		 * succeed if we are being called on a pattern as we already
-		 * know from a test above that the next character is a '{'.
-		 * On a non-pattern \N must mean 'named sequence, which
-		 * requires braces */
+                 *  If the named character is of the form \N{U+...}, pass it
+                 *      through if a pattern; otherwise convert the code point
+                 *      to utf8
+                 *  Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
+                 *      if a pattern; otherwise convert to utf8
+                 *
+                 * If the regex compiler should ever need to differentiate
+                 * between the \N{U+...} and \N{name} forms, that could easily
+                 * be done here by stripping any leading zeros from the
+                 * \N{U+...} case, and adding them to the other one. */
+
+                /* Here, 's' points to the 'N'; the test below is guaranteed to
+		 * succeed if we are being called on a pattern, as we already
+                 * know from a test above that the next character is a '{'.  A
+                 * non-pattern \N must mean 'named character', which requires
+                 * braces */
 		s++;
 		if (*s != '{') {
 		    yyerror("Missing braces on \\N{}"); 
@@ -3267,8 +3281,6 @@ S_scan_const(pTHX_ char *start)
 				| PERL_SCAN_DISALLOW_PREFIX;
 		    STRLEN len;
 
-		    /* For \N{U+...}, the '...' is a unicode value even on
-		     * EBCDIC machines */
 		    s += 2;	    /* Skip to next char after the 'U+' */
 		    len = e - s;
 		    uv = grok_hex(s, &len, &flags, NULL);
@@ -3279,27 +3291,26 @@ S_scan_const(pTHX_ char *start)
 		    }
 
 		    if (PL_lex_inpat) {
-
-			/* On non-EBCDIC platforms, pass through to the regex
-			 * compiler unchanged.  The reason we evaluated the
-			 * number above is to make sure there wasn't a syntax
-			 * error.  But on EBCDIC we convert to native so
-			 * downstream code can continue to assume it's native
-			 */
 			s -= 5;	    /* Include the '\N{U+' */
 #ifdef EBCDIC
-			d += my_snprintf(d, e - s + 1 + 1,  /* includes the }
+                        /* On EBCDIC platforms, in \N{U+...}, the '...' is a
+                         * Unicode value, so convert to native so downstream
+                         * code can continue to assume it's native */
+			d += my_snprintf(d, e - s + 1 + 1,  /* includes the '}'
 							       and the \0 */
-				    "\\N{U+%X}",
-				    (unsigned int) UNI_TO_NATIVE(uv));
+                                         "\\N{U+%X}",
+                                         (unsigned int) UNI_TO_NATIVE(uv));
 #else
-			Copy(s, d, e - s + 1, char);	/* 1 = include the } */
+                        /* On non-EBCDIC platforms, pass it through unchanged.
+                         * The reason we evaluated the number above is to make
+                         * sure there wasn't a syntax error. */
+			Copy(s, d, e - s + 1, char);	/* +1 is for the '}' */
 			d += e - s + 1;
 #endif
 		    }
 		    else {  /* Not a pattern: convert the hex to string */
 
-			 /* If destination is not in utf8, unconditionally
+                         /* If the destination is not in utf8, unconditionally
 			  * recode it to be so.  This is because \N{} implies
 			  * Unicode semantics, and scalars have to be in utf8
 			  * to guarantee those semantics */
@@ -3352,13 +3363,18 @@ S_scan_const(pTHX_ char *start)
                                  * through the string.  Each character takes up
                                  * 2 hex digits plus either a trailing dot or
                                  * the "}" */
+                                const char initial_text[] = "\\N{U+";
+                                const STRLEN initial_len = sizeof(initial_text)
+                                                           - 1;
                                 d = off + SvGROW(sv, off
                                                     + 3 * len
-                                                    + 6 /* For the "\N{U+", and
-                                                           trailing NUL */
+
+                                                    /* +1 for trailing NUL */
+                                                    + initial_len + 1
+
                                                     + (STRLEN)(send - e));
-                                Copy("\\N{U+", d, 5, char);
-                                d += 5;
+                                Copy(initial_text, d, initial_len, char);
+                                d += initial_len;
                                 while (str < str_end) {
                                     char hex_string[4];
                                     int len =
@@ -3370,7 +3386,7 @@ S_scan_const(pTHX_ char *start)
                                     d += 3;
                                     str++;
                                 }
-                                d--;    /* We will overwrite below the final
+                                d--;    /* Below, we will overwrite the final
                                            dot with a right brace */
                             }
                             else {
@@ -11385,7 +11401,6 @@ S_parse_opt_lexvar(pTHX)
     PL_bufptr = s;
     if (d == PL_tokenbuf+1)
 	return NULL;
-    *d = 0;
     var = newOP(sigil == '$' ? OP_PADSV : sigil == '@' ? OP_PADAV : OP_PADHV,
 		OPf_MOD | (OPpLVAL_INTRO<<8));
     var->op_targ = allocmy(PL_tokenbuf, d - PL_tokenbuf, UTF ? SVf_UTF8 : 0);