+ if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ STRLEN len;
+
+ /* For \N{U+...}, the '...' is a unicode value even on
+ * EBCDIC machines */
+ s += 2; /* Skip to next char after the 'U+' */
+ len = e - s;
+ uv = grok_hex(s, &len, &flags, NULL);
+ if (len == 0 || len != (STRLEN)(e - s)) {
+ yyerror("Invalid hexadecimal number in \\N{U+...}");
+ s = e + 1;
+ continue;
+ }
+
+ if (PL_lex_inpat) {
+
+ /* Pass through to the regex compiler unchanged. The
+ * reason we evaluated the number above is to make sure
+ * there wasn't a syntax error. */
+ s -= 5; /* Include the '\N{U+' */
+ Copy(s, d, e - s + 1, char); /* 1 = include the } */
+ d += e - s + 1;
+ }
+ else { /* Not a pattern: convert the hex to string */
+
+ /* If destination is not in utf8, unconditionally
+ * recode it to be so. This is because \N{} implies
+ * Unicode semantics, and scalars have to be in utf8
+ * to guarantee those semantics */
+ if (! has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ UNISKIP(uv) + (STRLEN)(send - e) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ }
+
+ /* Add the string to the output */
+ if (UNI_IS_INVARIANT(uv)) {
+ *d++ = (char) uv;
+ }
+ else d = (char*)uvuni_to_utf8((U8*)d, uv);
+ }
+ }
+ else { /* Here is \N{NAME} but not \N{U+...}. */
+
+ SV *res; /* result from charnames */
+ const char *str; /* the string in 'res' */
+ STRLEN len; /* its length */
+
+ /* Get the value for NAME */
+ res = newSVpvn(s, e - s);
+ res = new_constant( NULL, 0, "charnames",
+ /* includes all of: \N{...} */
+ res, NULL, s - 3, e - s + 4 );
+
+ /* Most likely res will be in utf8 already since the
+ * standard charnames uses pack U, but a custom translator
+ * can leave it otherwise, so make sure. XXX This can be
+ * revisited to not have charnames use utf8 for characters
+ * that don't need it when regexes don't have to be in utf8
+ * for Unicode semantics. If doing so, remember EBCDIC */
+ sv_utf8_upgrade(res);
+ str = SvPV_const(res, len);
+
+ /* Don't accept malformed input */
+ if (! is_utf8_string((U8 *) str, len)) {
+ yyerror("Malformed UTF-8 returned by \\N");
+ }
+ else if (PL_lex_inpat) {
+
+ if (! len) { /* The name resolved to an empty string */
+ Copy("\\N{}", d, 4, char);
+ d += 4;
+ }
+ else {
+ /* In order to not lose information for the regex
+ * compiler, pass the result in the specially made
+ * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
+ * the code points in hex of each character
+ * returned by charnames */
+
+ const char *str_end = str + len;
+ STRLEN char_length; /* cur char's byte length */
+ STRLEN output_length; /* and the number of bytes
+ after this is translated
+ into hex digits */
+ const STRLEN off = d - SvPVX_const(sv);
+
+ /* 2 hex per byte; 2 chars for '\N'; 2 chars for
+ * max('U+', '.'); and 1 for NUL */
+ char hex_string[2 * UTF8_MAXBYTES + 5];
+
+ /* Get the first character of the result. */
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ len,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+
+ /* The call to is_utf8_string() above hopefully
+ * guarantees that there won't be an error. But
+ * it's easy here to make sure. The function just
+ * above warns and returns 0 if invalid utf8, but
+ * it can also return 0 if the input is validly a
+ * NUL. Disambiguate */
+ if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Convert first code point to hex, including the
+ * boiler plate before it */
+ sprintf(hex_string, "\\N{U+%X", (unsigned int) uv);
+ output_length = strlen(hex_string);
+
+ /* Make sure there is enough space to hold it */
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ /* And output it */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+
+ /* For each subsequent character, append dot and
+ * its ordinal in hex */
+ while ((str += char_length) < str_end) {
+ const STRLEN off = d - SvPVX_const(sv);
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ str_end - str,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+ if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ sprintf(hex_string, ".%X", (unsigned int) uv);
+ output_length = strlen(hex_string);
+
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+ }
+
+ *d++ = '}'; /* Done. Add the trailing brace */
+ }
+ }
+ else { /* Here, not in a pattern. Convert the name to a
+ * string. */
+
+ /* If destination is not in utf8, unconditionally
+ * recode it to be so. This is because \N{} implies
+ * Unicode semantics, and scalars have to be in utf8
+ * to guarantee those semantics */
+ if (! has_utf8) {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+ /* See Note on sizing above. */
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ len + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ has_utf8 = TRUE;
+ } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
+
+ /* See Note on sizing above. (NOTE: SvCUR() is not
+ * set correctly here). */
+ const STRLEN off = d - SvPVX_const(sv);
+ d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1);
+ }
+ Copy(str, d, len, char);
+ d += len;